Netdev List
 help / color / mirror / Atom feed
* [PATCH 08/12] first fruits - kill l2cap ->memcpy_fromiovec()
From: Al Viro @ 2014-12-05  5:58 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-kernel
In-Reply-To: <20141205055623.GQ29748@ZenIV.linux.org.uk>

From: Al Viro <viro@zeniv.linux.org.uk>

Just use copy_from_iter().  That's what this method is trying to do
in all cases, in a very convoluted fashion.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/net/bluetooth/l2cap.h | 29 -----------------------------
 net/bluetooth/6lowpan.c       |  3 +--
 net/bluetooth/a2mp.c          |  3 +--
 net/bluetooth/l2cap_core.c    |  7 +++----
 net/bluetooth/l2cap_sock.c    |  8 --------
 net/bluetooth/smp.c           |  4 +---
 6 files changed, 6 insertions(+), 48 deletions(-)

diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h
index bca6fc0..692f786 100644
--- a/include/net/bluetooth/l2cap.h
+++ b/include/net/bluetooth/l2cap.h
@@ -606,10 +606,6 @@ struct l2cap_ops {
 	struct sk_buff		*(*alloc_skb) (struct l2cap_chan *chan,
 					       unsigned long hdr_len,
 					       unsigned long len, int nb);
-	int			(*memcpy_fromiovec) (struct l2cap_chan *chan,
-						     unsigned char *kdata,
-						     struct msghdr *msg,
-						     int len);
 };
 
 struct l2cap_conn {
@@ -903,31 +899,6 @@ static inline long l2cap_chan_no_get_sndtimeo(struct l2cap_chan *chan)
 	return 0;
 }
 
-static inline int l2cap_chan_no_memcpy_fromiovec(struct l2cap_chan *chan,
-						 unsigned char *kdata,
-						 struct msghdr *msg,
-						 int len)
-{
-	/* Following is safe since for compiler definitions of kvec and
-	 * iovec are identical, yielding the same in-core layout and alignment
-	 */
-	struct kvec *vec = (struct kvec *)msg->msg_iter.iov;
-
-	while (len > 0) {
-		if (vec->iov_len) {
-			int copy = min_t(unsigned int, len, vec->iov_len);
-			memcpy(kdata, vec->iov_base, copy);
-			len -= copy;
-			kdata += copy;
-			vec->iov_base += copy;
-			vec->iov_len -= copy;
-		}
-		vec++;
-	}
-
-	return 0;
-}
-
 extern bool disable_ertm;
 
 int l2cap_init_sockets(void);
diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c
index d8c67a5..76617be 100644
--- a/net/bluetooth/6lowpan.c
+++ b/net/bluetooth/6lowpan.c
@@ -541,7 +541,7 @@ static int send_pkt(struct l2cap_chan *chan, struct sk_buff *skb,
 	iv.iov_len = skb->len;
 
 	memset(&msg, 0, sizeof(msg));
-	iov_iter_init(&msg.msg_iter, WRITE, (struct iovec *) &iv, 1, skb->len);
+	iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, &iv, 1, skb->len);
 
 	err = l2cap_chan_send(chan, &msg, skb->len);
 	if (err > 0) {
@@ -1050,7 +1050,6 @@ static const struct l2cap_ops bt_6lowpan_chan_ops = {
 	.suspend		= chan_suspend_cb,
 	.get_sndtimeo		= chan_get_sndtimeo_cb,
 	.alloc_skb		= chan_alloc_skb_cb,
-	.memcpy_fromiovec	= l2cap_chan_no_memcpy_fromiovec,
 
 	.teardown		= l2cap_chan_no_teardown,
 	.defer			= l2cap_chan_no_defer,
diff --git a/net/bluetooth/a2mp.c b/net/bluetooth/a2mp.c
index 716d2a3..cedfbda 100644
--- a/net/bluetooth/a2mp.c
+++ b/net/bluetooth/a2mp.c
@@ -60,7 +60,7 @@ void a2mp_send(struct amp_mgr *mgr, u8 code, u8 ident, u16 len, void *data)
 
 	memset(&msg, 0, sizeof(msg));
 
-	iov_iter_init(&msg.msg_iter, WRITE, (struct iovec *)&iv, 1, total_len);
+	iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, &iv, 1, total_len);
 
 	l2cap_chan_send(chan, &msg, total_len);
 
@@ -719,7 +719,6 @@ static const struct l2cap_ops a2mp_chan_ops = {
 	.resume = l2cap_chan_no_resume,
 	.set_shutdown = l2cap_chan_no_set_shutdown,
 	.get_sndtimeo = l2cap_chan_no_get_sndtimeo,
-	.memcpy_fromiovec = l2cap_chan_no_memcpy_fromiovec,
 };
 
 static struct l2cap_chan *a2mp_chan_open(struct l2cap_conn *conn, bool locked)
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index 5201d61..1754040 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -2096,8 +2096,7 @@ static inline int l2cap_skbuff_fromiovec(struct l2cap_chan *chan,
 	struct sk_buff **frag;
 	int sent = 0;
 
-	if (chan->ops->memcpy_fromiovec(chan, skb_put(skb, count),
-					msg, count))
+	if (copy_from_iter(skb_put(skb, count), count, &msg->msg_iter) != count)
 		return -EFAULT;
 
 	sent += count;
@@ -2117,8 +2116,8 @@ static inline int l2cap_skbuff_fromiovec(struct l2cap_chan *chan,
 
 		*frag = tmp;
 
-		if (chan->ops->memcpy_fromiovec(chan, skb_put(*frag, count),
-						msg, count))
+		if (copy_from_iter(skb_put(*frag, count), count,
+				   &msg->msg_iter) != count)
 			return -EFAULT;
 
 		sent += count;
diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c
index 205b298..f65caf4 100644
--- a/net/bluetooth/l2cap_sock.c
+++ b/net/bluetooth/l2cap_sock.c
@@ -1336,13 +1336,6 @@ static struct sk_buff *l2cap_sock_alloc_skb_cb(struct l2cap_chan *chan,
 	return skb;
 }
 
-static int l2cap_sock_memcpy_fromiovec_cb(struct l2cap_chan *chan,
-					  unsigned char *kdata,
-					  struct msghdr *msg, int len)
-{
-	return memcpy_from_msg(kdata, msg, len);
-}
-
 static void l2cap_sock_ready_cb(struct l2cap_chan *chan)
 {
 	struct sock *sk = chan->data;
@@ -1427,7 +1420,6 @@ static const struct l2cap_ops l2cap_chan_ops = {
 	.set_shutdown		= l2cap_sock_set_shutdown_cb,
 	.get_sndtimeo		= l2cap_sock_get_sndtimeo_cb,
 	.alloc_skb		= l2cap_sock_alloc_skb_cb,
-	.memcpy_fromiovec	= l2cap_sock_memcpy_fromiovec_cb,
 };
 
 static void l2cap_sock_destruct(struct sock *sk)
diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c
index 21f555b..de7dc75 100644
--- a/net/bluetooth/smp.c
+++ b/net/bluetooth/smp.c
@@ -268,7 +268,7 @@ static void smp_send_cmd(struct l2cap_conn *conn, u8 code, u16 len, void *data)
 
 	memset(&msg, 0, sizeof(msg));
 
-	iov_iter_init(&msg.msg_iter, WRITE, (struct iovec *)iv, 2, 1 + len);
+	iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, iv, 2, 1 + len);
 
 	l2cap_chan_send(chan, &msg, 1 + len);
 
@@ -1629,7 +1629,6 @@ static const struct l2cap_ops smp_chan_ops = {
 	.suspend		= l2cap_chan_no_suspend,
 	.set_shutdown		= l2cap_chan_no_set_shutdown,
 	.get_sndtimeo		= l2cap_chan_no_get_sndtimeo,
-	.memcpy_fromiovec	= l2cap_chan_no_memcpy_fromiovec,
 };
 
 static inline struct l2cap_chan *smp_new_conn_cb(struct l2cap_chan *pchan)
@@ -1678,7 +1677,6 @@ static const struct l2cap_ops smp_root_chan_ops = {
 	.resume			= l2cap_chan_no_resume,
 	.set_shutdown		= l2cap_chan_no_set_shutdown,
 	.get_sndtimeo		= l2cap_chan_no_get_sndtimeo,
-	.memcpy_fromiovec	= l2cap_chan_no_memcpy_fromiovec,
 };
 
 int smp_register(struct hci_dev *hdev)
-- 
2.1.3

^ permalink raw reply related

* [PATCH 07/12] put iov_iter into msghdr
From: Al Viro @ 2014-12-05  5:58 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-kernel
In-Reply-To: <20141205055623.GQ29748@ZenIV.linux.org.uk>

From: Al Viro <viro@zeniv.linux.org.uk>

Note that the code _using_ ->msg_iter at this point in the series will be
very unhappy with anything other than an unshifted iovec-backed iov_iter.
We still need to convert users to proper primitives.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 crypto/algif_hash.c            |  4 ++--
 crypto/algif_skcipher.c        |  4 ++--
 drivers/net/macvtap.c          |  8 ++------
 drivers/net/tun.c              |  8 ++------
 drivers/vhost/net.c            |  8 +++-----
 fs/afs/rxrpc.c                 | 14 ++++++--------
 include/linux/skbuff.h         | 16 ++++++++++------
 include/linux/socket.h         |  3 +--
 include/net/bluetooth/l2cap.h  |  2 +-
 include/net/udplite.h          |  3 ++-
 net/atm/common.c               |  5 +----
 net/bluetooth/6lowpan.c        |  6 +++---
 net/bluetooth/a2mp.c           |  3 +--
 net/bluetooth/smp.c            |  3 +--
 net/caif/caif_socket.c         |  2 +-
 net/compat.c                   | 10 ++++++----
 net/ipv4/ip_output.c           |  6 ++++--
 net/ipv4/ping.c                |  3 ++-
 net/ipv4/raw.c                 |  3 ++-
 net/ipv4/tcp.c                 |  6 +++---
 net/ipv4/tcp_output.c          |  2 +-
 net/ipv6/ping.c                |  3 ++-
 net/ipv6/raw.c                 |  3 ++-
 net/netlink/af_netlink.c       |  2 +-
 net/packet/af_packet.c         |  7 ++-----
 net/rds/recv.c                 |  7 ++++---
 net/rds/send.c                 |  4 +---
 net/rxrpc/ar-output.c          |  8 +++-----
 net/sctp/socket.c              |  5 +----
 net/socket.c                   | 27 ++++++++++++---------------
 net/tipc/msg.c                 |  4 ++--
 net/tipc/socket.c              |  4 ++--
 net/unix/af_unix.c             | 10 ++--------
 net/vmw_vsock/vmci_transport.c |  3 ++-
 34 files changed, 92 insertions(+), 114 deletions(-)

diff --git a/crypto/algif_hash.c b/crypto/algif_hash.c
index 35c93ff..83cd2cc 100644
--- a/crypto/algif_hash.c
+++ b/crypto/algif_hash.c
@@ -42,7 +42,7 @@ static int hash_sendmsg(struct kiocb *unused, struct socket *sock,
 	struct alg_sock *ask = alg_sk(sk);
 	struct hash_ctx *ctx = ask->private;
 	unsigned long iovlen;
-	struct iovec *iov;
+	const struct iovec *iov;
 	long copied = 0;
 	int err;
 
@@ -58,7 +58,7 @@ static int hash_sendmsg(struct kiocb *unused, struct socket *sock,
 
 	ctx->more = 0;
 
-	for (iov = msg->msg_iov, iovlen = msg->msg_iovlen; iovlen > 0;
+	for (iov = msg->msg_iter.iov, iovlen = msg->msg_iter.nr_segs; iovlen > 0;
 	     iovlen--, iov++) {
 		unsigned long seglen = iov->iov_len;
 		char __user *from = iov->iov_base;
diff --git a/crypto/algif_skcipher.c b/crypto/algif_skcipher.c
index c3b482b..4f45dab 100644
--- a/crypto/algif_skcipher.c
+++ b/crypto/algif_skcipher.c
@@ -429,13 +429,13 @@ static int skcipher_recvmsg(struct kiocb *unused, struct socket *sock,
 	struct skcipher_sg_list *sgl;
 	struct scatterlist *sg;
 	unsigned long iovlen;
-	struct iovec *iov;
+	const struct iovec *iov;
 	int err = -EAGAIN;
 	int used;
 	long copied = 0;
 
 	lock_sock(sk);
-	for (iov = msg->msg_iov, iovlen = msg->msg_iovlen; iovlen > 0;
+	for (iov = msg->msg_iter.iov, iovlen = msg->msg_iter.nr_segs; iovlen > 0;
 	     iovlen--, iov++) {
 		unsigned long seglen = iov->iov_len;
 		char __user *from = iov->iov_base;
diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index 22b4cf2..4fb1222 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -1092,9 +1092,7 @@ static int macvtap_sendmsg(struct kiocb *iocb, struct socket *sock,
 			   struct msghdr *m, size_t total_len)
 {
 	struct macvtap_queue *q = container_of(sock, struct macvtap_queue, sock);
-	struct iov_iter from;
-	iov_iter_init(&from, WRITE, m->msg_iov, m->msg_iovlen, total_len);
-	return macvtap_get_user(q, m, &from, m->msg_flags & MSG_DONTWAIT);
+	return macvtap_get_user(q, m, &m->msg_iter, m->msg_flags & MSG_DONTWAIT);
 }
 
 static int macvtap_recvmsg(struct kiocb *iocb, struct socket *sock,
@@ -1102,12 +1100,10 @@ static int macvtap_recvmsg(struct kiocb *iocb, struct socket *sock,
 			   int flags)
 {
 	struct macvtap_queue *q = container_of(sock, struct macvtap_queue, sock);
-	struct iov_iter to;
 	int ret;
 	if (flags & ~(MSG_DONTWAIT|MSG_TRUNC))
 		return -EINVAL;
-	iov_iter_init(&to, READ, m->msg_iov, m->msg_iovlen, total_len);
-	ret = macvtap_do_read(q, &to, flags & MSG_DONTWAIT);
+	ret = macvtap_do_read(q, &m->msg_iter, flags & MSG_DONTWAIT);
 	if (ret > total_len) {
 		m->msg_flags |= MSG_TRUNC;
 		ret = flags & MSG_TRUNC ? ret : total_len;
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 6d44da1..8d8bede 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1446,13 +1446,11 @@ static int tun_sendmsg(struct kiocb *iocb, struct socket *sock,
 	int ret;
 	struct tun_file *tfile = container_of(sock, struct tun_file, socket);
 	struct tun_struct *tun = __tun_get(tfile);
-	struct iov_iter from;
 
 	if (!tun)
 		return -EBADFD;
 
-	iov_iter_init(&from, WRITE, m->msg_iov, m->msg_iovlen, total_len);
-	ret = tun_get_user(tun, tfile, m->msg_control, &from,
+	ret = tun_get_user(tun, tfile, m->msg_control, &m->msg_iter,
 			   m->msg_flags & MSG_DONTWAIT);
 	tun_put(tun);
 	return ret;
@@ -1464,7 +1462,6 @@ static int tun_recvmsg(struct kiocb *iocb, struct socket *sock,
 {
 	struct tun_file *tfile = container_of(sock, struct tun_file, socket);
 	struct tun_struct *tun = __tun_get(tfile);
-	struct iov_iter to;
 	int ret;
 
 	if (!tun)
@@ -1479,8 +1476,7 @@ static int tun_recvmsg(struct kiocb *iocb, struct socket *sock,
 					 SOL_PACKET, TUN_TX_TIMESTAMP);
 		goto out;
 	}
-	iov_iter_init(&to, READ, m->msg_iov, m->msg_iovlen, total_len);
-	ret = tun_do_read(tun, tfile, &to, flags & MSG_DONTWAIT);
+	ret = tun_do_read(tun, tfile, &m->msg_iter, flags & MSG_DONTWAIT);
 	if (ret > total_len) {
 		m->msg_flags |= MSG_TRUNC;
 		ret = flags & MSG_TRUNC ? ret : total_len;
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 8dae2f7..9f06e70 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -342,7 +342,6 @@ static void handle_tx(struct vhost_net *net)
 		.msg_namelen = 0,
 		.msg_control = NULL,
 		.msg_controllen = 0,
-		.msg_iov = vq->iov,
 		.msg_flags = MSG_DONTWAIT,
 	};
 	size_t len, total_len = 0;
@@ -396,8 +395,8 @@ static void handle_tx(struct vhost_net *net)
 		}
 		/* Skip header. TODO: support TSO. */
 		s = move_iovec_hdr(vq->iov, nvq->hdr, hdr_size, out);
-		msg.msg_iovlen = out;
 		len = iov_length(vq->iov, out);
+		iov_iter_init(&msg.msg_iter, WRITE, vq->iov, out, len);
 		/* Sanity check */
 		if (!len) {
 			vq_err(vq, "Unexpected header len for TX: "
@@ -562,7 +561,6 @@ static void handle_rx(struct vhost_net *net)
 		.msg_namelen = 0,
 		.msg_control = NULL, /* FIXME: get and handle RX aux data. */
 		.msg_controllen = 0,
-		.msg_iov = vq->iov,
 		.msg_flags = MSG_DONTWAIT,
 	};
 	struct virtio_net_hdr_mrg_rxbuf hdr = {
@@ -600,7 +598,7 @@ static void handle_rx(struct vhost_net *net)
 			break;
 		/* On overrun, truncate and discard */
 		if (unlikely(headcount > UIO_MAXIOV)) {
-			msg.msg_iovlen = 1;
+			iov_iter_init(&msg.msg_iter, READ, vq->iov, 1, 1);
 			err = sock->ops->recvmsg(NULL, sock, &msg,
 						 1, MSG_DONTWAIT | MSG_TRUNC);
 			pr_debug("Discarded rx packet: len %zd\n", sock_len);
@@ -626,7 +624,7 @@ static void handle_rx(struct vhost_net *net)
 			/* Copy the header for use in VIRTIO_NET_F_MRG_RXBUF:
 			 * needed because recvmsg can modify msg_iov. */
 			copy_iovec_hdr(vq->iov, nvq->hdr, sock_hlen, in);
-		msg.msg_iovlen = in;
+		iov_iter_init(&msg.msg_iter, READ, vq->iov, in, sock_len);
 		err = sock->ops->recvmsg(NULL, sock, &msg,
 					 sock_len, MSG_DONTWAIT | MSG_TRUNC);
 		/* Userspace might have consumed the packet meanwhile:
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 03a3beb..06e14bf 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -306,8 +306,8 @@ static int afs_send_pages(struct afs_call *call, struct msghdr *msg,
 
 			_debug("- range %u-%u%s",
 			       offset, to, msg->msg_flags ? " [more]" : "");
-			msg->msg_iov = (struct iovec *) iov;
-			msg->msg_iovlen = 1;
+			iov_iter_init(&msg->msg_iter, WRITE,
+				      (struct iovec *) iov, 1, to - offset);
 
 			/* have to change the state *before* sending the last
 			 * packet as RxRPC might give us the reply before it
@@ -384,8 +384,8 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
 
 	msg.msg_name		= NULL;
 	msg.msg_namelen		= 0;
-	msg.msg_iov		= (struct iovec *) iov;
-	msg.msg_iovlen		= 1;
+	iov_iter_init(&msg.msg_iter, WRITE, (struct iovec *)iov, 1,
+		      call->request_size);
 	msg.msg_control		= NULL;
 	msg.msg_controllen	= 0;
 	msg.msg_flags		= (call->send_pages ? MSG_MORE : 0);
@@ -778,8 +778,7 @@ void afs_send_empty_reply(struct afs_call *call)
 	iov[0].iov_len		= 0;
 	msg.msg_name		= NULL;
 	msg.msg_namelen		= 0;
-	msg.msg_iov		= iov;
-	msg.msg_iovlen		= 0;
+	iov_iter_init(&msg.msg_iter, WRITE, iov, 0, 0);	/* WTF? */
 	msg.msg_control		= NULL;
 	msg.msg_controllen	= 0;
 	msg.msg_flags		= 0;
@@ -815,8 +814,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
 	iov[0].iov_len		= len;
 	msg.msg_name		= NULL;
 	msg.msg_namelen		= 0;
-	msg.msg_iov		= iov;
-	msg.msg_iovlen		= 1;
+	iov_iter_init(&msg.msg_iter, WRITE, iov, 1, len);
 	msg.msg_control		= NULL;
 	msg.msg_controllen	= 0;
 	msg.msg_flags		= 0;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 7691ad5..4199dfa 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2644,22 +2644,24 @@ unsigned int datagram_poll(struct file *file, struct socket *sock,
 			   struct poll_table_struct *wait);
 int skb_copy_datagram_iovec(const struct sk_buff *from, int offset,
 			    struct iovec *to, int size);
+int skb_copy_datagram_iter(const struct sk_buff *from, int offset,
+			   struct iov_iter *to, int size);
 static inline int skb_copy_datagram_msg(const struct sk_buff *from, int offset,
 					struct msghdr *msg, int size)
 {
-	return skb_copy_datagram_iovec(from, offset, msg->msg_iov, size);
+	/* XXX: stripping const */
+	return skb_copy_datagram_iovec(from, offset, (struct iovec *)msg->msg_iter.iov, size);
 }
 int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb, int hlen,
 				     struct iovec *iov);
 static inline int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, int hlen,
 			    struct msghdr *msg)
 {
-	return skb_copy_and_csum_datagram_iovec(skb, hlen, msg->msg_iov);
+	/* XXX: stripping const */
+	return skb_copy_and_csum_datagram_iovec(skb, hlen, (struct iovec *)msg->msg_iter.iov);
 }
 int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset,
 				 struct iov_iter *from, int len);
-int skb_copy_datagram_iter(const struct sk_buff *from, int offset,
-			   struct iov_iter *to, int size);
 int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *frm);
 void skb_free_datagram(struct sock *sk, struct sk_buff *skb);
 void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb);
@@ -2687,12 +2689,14 @@ int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci);
 
 static inline int memcpy_from_msg(void *data, struct msghdr *msg, int len)
 {
-	return memcpy_fromiovec(data, msg->msg_iov, len);
+	/* XXX: stripping const */
+	return memcpy_fromiovec(data, (struct iovec *)msg->msg_iter.iov, len);
 }
 
 static inline int memcpy_to_msg(struct msghdr *msg, void *data, int len)
 {
-	return memcpy_toiovec(msg->msg_iov, data, len);
+	/* XXX: stripping const */
+	return memcpy_toiovec((struct iovec *)msg->msg_iter.iov, data, len);
 }
 
 struct skb_checksum_ops {
diff --git a/include/linux/socket.h b/include/linux/socket.h
index de52228..048d6d6 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -47,8 +47,7 @@ struct linger {
 struct msghdr {
 	void		*msg_name;	/* ptr to socket address structure */
 	int		msg_namelen;	/* size of socket address structure */
-	struct iovec	*msg_iov;	/* scatter/gather array */
-	__kernel_size_t	msg_iovlen;	/* # elements in msg_iov */
+	struct iov_iter	msg_iter;	/* data */
 	void		*msg_control;	/* ancillary data */
 	__kernel_size_t	msg_controllen;	/* ancillary data buffer length */
 	unsigned int	msg_flags;	/* flags on received message */
diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h
index 4e23674..bca6fc0 100644
--- a/include/net/bluetooth/l2cap.h
+++ b/include/net/bluetooth/l2cap.h
@@ -911,7 +911,7 @@ static inline int l2cap_chan_no_memcpy_fromiovec(struct l2cap_chan *chan,
 	/* Following is safe since for compiler definitions of kvec and
 	 * iovec are identical, yielding the same in-core layout and alignment
 	 */
-	struct kvec *vec = (struct kvec *)msg->msg_iov;
+	struct kvec *vec = (struct kvec *)msg->msg_iter.iov;
 
 	while (len > 0) {
 		if (vec->iov_len) {
diff --git a/include/net/udplite.h b/include/net/udplite.h
index d5baaba..ae7c8d1 100644
--- a/include/net/udplite.h
+++ b/include/net/udplite.h
@@ -20,7 +20,8 @@ static __inline__ int udplite_getfrag(void *from, char *to, int  offset,
 				      int len, int odd, struct sk_buff *skb)
 {
 	struct msghdr *msg = from;
-	return memcpy_fromiovecend(to, msg->msg_iov, offset, len);
+	/* XXX: stripping const */
+	return memcpy_fromiovecend(to, (struct iovec *)msg->msg_iter.iov, offset, len);
 }
 
 /* Designate sk as UDP-Lite socket */
diff --git a/net/atm/common.c b/net/atm/common.c
index f591129..b84057e 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -577,9 +577,6 @@ int vcc_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
 	struct atm_vcc *vcc;
 	struct sk_buff *skb;
 	int eff, error;
-	struct iov_iter from;
-
-	iov_iter_init(&from, WRITE, m->msg_iov, m->msg_iovlen, size);
 
 	lock_sock(sk);
 	if (sock->state != SS_CONNECTED) {
@@ -634,7 +631,7 @@ int vcc_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
 		goto out;
 	skb->dev = NULL; /* for paths shared with net_device interfaces */
 	ATM_SKB(skb)->atm_options = vcc->atm_options;
-	if (copy_from_iter(skb_put(skb, size), size, &from) != size) {
+	if (copy_from_iter(skb_put(skb, size), size, &m->msg_iter) != size) {
 		kfree_skb(skb);
 		error = -EFAULT;
 		goto out;
diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c
index bdcaefd..d8c67a5 100644
--- a/net/bluetooth/6lowpan.c
+++ b/net/bluetooth/6lowpan.c
@@ -537,12 +537,12 @@ static int send_pkt(struct l2cap_chan *chan, struct sk_buff *skb,
 	 */
 	chan->data = skb;
 
-	memset(&msg, 0, sizeof(msg));
-	msg.msg_iov = (struct iovec *) &iv;
-	msg.msg_iovlen = 1;
 	iv.iov_base = skb->data;
 	iv.iov_len = skb->len;
 
+	memset(&msg, 0, sizeof(msg));
+	iov_iter_init(&msg.msg_iter, WRITE, (struct iovec *) &iv, 1, skb->len);
+
 	err = l2cap_chan_send(chan, &msg, skb->len);
 	if (err > 0) {
 		netdev->stats.tx_bytes += err;
diff --git a/net/bluetooth/a2mp.c b/net/bluetooth/a2mp.c
index 5dcade5..716d2a3 100644
--- a/net/bluetooth/a2mp.c
+++ b/net/bluetooth/a2mp.c
@@ -60,8 +60,7 @@ void a2mp_send(struct amp_mgr *mgr, u8 code, u8 ident, u16 len, void *data)
 
 	memset(&msg, 0, sizeof(msg));
 
-	msg.msg_iov = (struct iovec *) &iv;
-	msg.msg_iovlen = 1;
+	iov_iter_init(&msg.msg_iter, WRITE, (struct iovec *)&iv, 1, total_len);
 
 	l2cap_chan_send(chan, &msg, total_len);
 
diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c
index 069b76e..21f555b 100644
--- a/net/bluetooth/smp.c
+++ b/net/bluetooth/smp.c
@@ -268,8 +268,7 @@ static void smp_send_cmd(struct l2cap_conn *conn, u8 code, u16 len, void *data)
 
 	memset(&msg, 0, sizeof(msg));
 
-	msg.msg_iov = (struct iovec *) &iv;
-	msg.msg_iovlen = 2;
+	iov_iter_init(&msg.msg_iter, WRITE, (struct iovec *)iv, 2, 1 + len);
 
 	l2cap_chan_send(chan, &msg, 1 + len);
 
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index ac618b0..769b185 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -535,7 +535,7 @@ static int caif_seqpkt_sendmsg(struct kiocb *kiocb, struct socket *sock,
 		goto err;
 
 	ret = -EINVAL;
-	if (unlikely(msg->msg_iov->iov_base == NULL))
+	if (unlikely(msg->msg_iter.iov->iov_base == NULL))
 		goto err;
 	noblock = msg->msg_flags & MSG_DONTWAIT;
 
diff --git a/net/compat.c b/net/compat.c
index 062f157..3236b41 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -37,13 +37,14 @@ ssize_t get_compat_msghdr(struct msghdr *kmsg,
 			  struct iovec **iov)
 {
 	compat_uptr_t uaddr, uiov, tmp3;
+	compat_size_t nr_segs;
 	ssize_t err;
 
 	if (!access_ok(VERIFY_READ, umsg, sizeof(*umsg)) ||
 	    __get_user(uaddr, &umsg->msg_name) ||
 	    __get_user(kmsg->msg_namelen, &umsg->msg_namelen) ||
 	    __get_user(uiov, &umsg->msg_iov) ||
-	    __get_user(kmsg->msg_iovlen, &umsg->msg_iovlen) ||
+	    __get_user(nr_segs, &umsg->msg_iovlen) ||
 	    __get_user(tmp3, &umsg->msg_control) ||
 	    __get_user(kmsg->msg_controllen, &umsg->msg_controllen) ||
 	    __get_user(kmsg->msg_flags, &umsg->msg_flags))
@@ -68,14 +69,15 @@ ssize_t get_compat_msghdr(struct msghdr *kmsg,
 		kmsg->msg_namelen = 0;
 	}
 
-	if (kmsg->msg_iovlen > UIO_MAXIOV)
+	if (nr_segs > UIO_MAXIOV)
 		return -EMSGSIZE;
 
 	err = compat_rw_copy_check_uvector(save_addr ? READ : WRITE,
-					   compat_ptr(uiov), kmsg->msg_iovlen,
+					   compat_ptr(uiov), nr_segs,
 					   UIO_FASTIOV, *iov, iov);
 	if (err >= 0)
-		kmsg->msg_iov = *iov;
+		iov_iter_init(&kmsg->msg_iter, save_addr ? READ : WRITE,
+			      *iov, nr_segs, err);
 	return err;
 }
 
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index cdedcf1..b50861b2 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -755,11 +755,13 @@ ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk
 	struct msghdr *msg = from;
 
 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
-		if (memcpy_fromiovecend(to, msg->msg_iov, offset, len) < 0)
+		/* XXX: stripping const */
+		if (memcpy_fromiovecend(to, (struct iovec *)msg->msg_iter.iov, offset, len) < 0)
 			return -EFAULT;
 	} else {
 		__wsum csum = 0;
-		if (csum_partial_copy_fromiovecend(to, msg->msg_iov, offset, len, &csum) < 0)
+		/* XXX: stripping const */
+		if (csum_partial_copy_fromiovecend(to, (struct iovec *)msg->msg_iter.iov, offset, len, &csum) < 0)
 			return -EFAULT;
 		skb->csum = csum_block_add(skb->csum, csum, odd);
 	}
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 8dd4ae0..c0d82f7 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -811,7 +811,8 @@ back_from_confirm:
 	pfh.icmph.checksum = 0;
 	pfh.icmph.un.echo.id = inet->inet_sport;
 	pfh.icmph.un.echo.sequence = user_icmph.un.echo.sequence;
-	pfh.iov = msg->msg_iov;
+	/* XXX: stripping const */
+	pfh.iov = (struct iovec *)msg->msg_iter.iov;
 	pfh.wcheck = 0;
 	pfh.family = AF_INET;
 
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 5d83bd2..0bb68df 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -625,7 +625,8 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 back_from_confirm:
 
 	if (inet->hdrincl)
-		err = raw_send_hdrinc(sk, &fl4, msg->msg_iov, len,
+		/* XXX: stripping const */
+		err = raw_send_hdrinc(sk, &fl4, (struct iovec *)msg->msg_iter.iov, len,
 				      &rt, msg->msg_flags);
 
 	 else {
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 4a96f37..54ba620 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1085,7 +1085,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
 int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		size_t size)
 {
-	struct iovec *iov;
+	const struct iovec *iov;
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
 	int iovlen, flags, err, copied = 0;
@@ -1136,8 +1136,8 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	mss_now = tcp_send_mss(sk, &size_goal, flags);
 
 	/* Ok commence sending. */
-	iovlen = msg->msg_iovlen;
-	iov = msg->msg_iov;
+	iovlen = msg->msg_iter.nr_segs;
+	iov = msg->msg_iter.iov;
 	copied = 0;
 
 	err = -EPIPE;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index f5bd4bd..3e225b0 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3050,7 +3050,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
 	syn_data->ip_summed = CHECKSUM_PARTIAL;
 	memcpy(syn_data->cb, syn->cb, sizeof(syn->cb));
 	if (unlikely(memcpy_fromiovecend(skb_put(syn_data, space),
-					 fo->data->msg_iov, 0, space))) {
+					 fo->data->msg_iter.iov, 0, space))) {
 		kfree_skb(syn_data);
 		goto fallback;
 	}
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index 5b7a1ed..2d31483 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -163,7 +163,8 @@ int ping_v6_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	pfh.icmph.checksum = 0;
 	pfh.icmph.un.echo.id = inet->inet_sport;
 	pfh.icmph.un.echo.sequence = user_icmph.icmp6_sequence;
-	pfh.iov = msg->msg_iov;
+	/* XXX: stripping const */
+	pfh.iov = (struct iovec *)msg->msg_iter.iov;
 	pfh.wcheck = 0;
 	pfh.family = AF_INET6;
 
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 11a9283..ee25631 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -886,7 +886,8 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk,
 
 back_from_confirm:
 	if (inet->hdrincl)
-		err = rawv6_send_hdrinc(sk, msg->msg_iov, len, &fl6, &dst, msg->msg_flags);
+		/* XXX: stripping const */
+		err = rawv6_send_hdrinc(sk, (struct iovec *)msg->msg_iter.iov, len, &fl6, &dst, msg->msg_flags);
 	else {
 		lock_sock(sk);
 		err = ip6_append_data(sk, raw6_getfrag, &rfv,
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 63aa5c8..cc9bcf0 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -2305,7 +2305,7 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock,
 	}
 
 	if (netlink_tx_is_mmaped(sk) &&
-	    msg->msg_iov->iov_base == NULL) {
+	    msg->msg_iter.iov->iov_base == NULL) {
 		err = netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group,
 					   siocb);
 		goto out;
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index efa8445..ed2e620 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2408,11 +2408,8 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
 	unsigned short gso_type = 0;
 	int hlen, tlen;
 	int extra_len = 0;
-	struct iov_iter from;
 	ssize_t n;
 
-	iov_iter_init(&from, WRITE, msg->msg_iov, msg->msg_iovlen, len);
-
 	/*
 	 *	Get and verify the address.
 	 */
@@ -2451,7 +2448,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
 		len -= vnet_hdr_len;
 
 		err = -EFAULT;
-		n = copy_from_iter(&vnet_hdr, vnet_hdr_len, &from);
+		n = copy_from_iter(&vnet_hdr, vnet_hdr_len, &msg->msg_iter);
 		if (n != vnet_hdr_len)
 			goto out_unlock;
 
@@ -2522,7 +2519,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
 	}
 
 	/* Returns -EFAULT on error */
-	err = skb_copy_datagram_from_iter(skb, offset, &from, len);
+	err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
 	if (err)
 		goto out_free;
 
diff --git a/net/rds/recv.c b/net/rds/recv.c
index 47d7b10..f9ec1ac 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -404,7 +404,6 @@ int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 	int ret = 0, nonblock = msg_flags & MSG_DONTWAIT;
 	DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
 	struct rds_incoming *inc = NULL;
-	struct iov_iter to;
 
 	/* udp_recvmsg()->sock_recvtimeo() gets away without locking too.. */
 	timeo = sock_rcvtimeo(sk, nonblock);
@@ -415,6 +414,7 @@ int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 		goto out;
 
 	while (1) {
+		struct iov_iter save;
 		/* If there are pending notifications, do those - and nothing else */
 		if (!list_empty(&rs->rs_notify_queue)) {
 			ret = rds_notify_queue_get(rs, msg);
@@ -450,8 +450,8 @@ int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 		rdsdebug("copying inc %p from %pI4:%u to user\n", inc,
 			 &inc->i_conn->c_faddr,
 			 ntohs(inc->i_hdr.h_sport));
-		iov_iter_init(&to, READ, msg->msg_iov, msg->msg_iovlen, size);
-		ret = inc->i_conn->c_trans->inc_copy_to_user(inc, &to);
+		save = msg->msg_iter;
+		ret = inc->i_conn->c_trans->inc_copy_to_user(inc, &msg->msg_iter);
 		if (ret < 0)
 			break;
 
@@ -464,6 +464,7 @@ int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 			rds_inc_put(inc);
 			inc = NULL;
 			rds_stats_inc(s_recv_deliver_raced);
+			msg->msg_iter = save;
 			continue;
 		}
 
diff --git a/net/rds/send.c b/net/rds/send.c
index 4de62ea..40a5629a 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -934,9 +934,7 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 	int queued = 0, allocated_mr = 0;
 	int nonblock = msg->msg_flags & MSG_DONTWAIT;
 	long timeo = sock_sndtimeo(sk, nonblock);
-	struct iov_iter from;
 
-	iov_iter_init(&from, WRITE, msg->msg_iov, msg->msg_iovlen, payload_len);
 	/* Mirror Linux UDP mirror of BSD error message compatibility */
 	/* XXX: Perhaps MSG_MORE someday */
 	if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_CMSG_COMPAT)) {
@@ -984,7 +982,7 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 			ret = -ENOMEM;
 			goto out;
 		}
-		ret = rds_message_copy_from_user(rm, &from);
+		ret = rds_message_copy_from_user(rm, &msg->msg_iter);
 		if (ret)
 			goto out;
 	}
diff --git a/net/rxrpc/ar-output.c b/net/rxrpc/ar-output.c
index 0b4b9a7..86e0f10 100644
--- a/net/rxrpc/ar-output.c
+++ b/net/rxrpc/ar-output.c
@@ -531,14 +531,12 @@ static int rxrpc_send_data(struct kiocb *iocb,
 	struct rxrpc_skb_priv *sp;
 	unsigned char __user *from;
 	struct sk_buff *skb;
-	struct iovec *iov;
+	const struct iovec *iov;
 	struct sock *sk = &rx->sk;
 	long timeo;
 	bool more;
 	int ret, ioc, segment, copied;
 
-	_enter(",,,{%zu},%zu", msg->msg_iovlen, len);
-
 	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
 
 	/* this should be in poll */
@@ -547,8 +545,8 @@ static int rxrpc_send_data(struct kiocb *iocb,
 	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
 		return -EPIPE;
 
-	iov = msg->msg_iov;
-	ioc = msg->msg_iovlen - 1;
+	iov = msg->msg_iter.iov;
+	ioc = msg->msg_iter.nr_segs - 1;
 	from = iov->iov_base;
 	segment = iov->iov_len;
 	iov++;
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 0397ac9..c92f96cd 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1609,9 +1609,6 @@ static int sctp_sendmsg(struct kiocb *iocb, struct sock *sk,
 	__u16 sinfo_flags = 0;
 	long timeo;
 	int err;
-	struct iov_iter from;
-
-	iov_iter_init(&from, WRITE, msg->msg_iov, msg->msg_iovlen, msg_len);
 
 	err = 0;
 	sp = sctp_sk(sk);
@@ -1950,7 +1947,7 @@ static int sctp_sendmsg(struct kiocb *iocb, struct sock *sk,
 	}
 
 	/* Break the message into multiple chunks of maximum size. */
-	datamsg = sctp_datamsg_from_user(asoc, sinfo, &from);
+	datamsg = sctp_datamsg_from_user(asoc, sinfo, &msg->msg_iter);
 	if (IS_ERR(datamsg)) {
 		err = PTR_ERR(datamsg);
 		goto out_free;
diff --git a/net/socket.c b/net/socket.c
index ee3ee39..46571ee 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -691,8 +691,7 @@ int kernel_sendmsg(struct socket *sock, struct msghdr *msg,
 	 * the following is safe, since for compiler definitions of kvec and
 	 * iovec are identical, yielding the same in-core layout and alignment
 	 */
-	msg->msg_iov = (struct iovec *)vec;
-	msg->msg_iovlen = num;
+	iov_iter_init(&msg->msg_iter, WRITE, (struct iovec *)vec, num, size);
 	result = sock_sendmsg(sock, msg, size);
 	set_fs(oldfs);
 	return result;
@@ -855,7 +854,7 @@ int kernel_recvmsg(struct socket *sock, struct msghdr *msg,
 	 * the following is safe, since for compiler definitions of kvec and
 	 * iovec are identical, yielding the same in-core layout and alignment
 	 */
-	msg->msg_iov = (struct iovec *)vec, msg->msg_iovlen = num;
+	iov_iter_init(&msg->msg_iter, READ, (struct iovec *)vec, num, size);
 	result = sock_recvmsg(sock, msg, size, flags);
 	set_fs(oldfs);
 	return result;
@@ -915,8 +914,7 @@ static ssize_t do_sock_read(struct msghdr *msg, struct kiocb *iocb,
 	msg->msg_namelen = 0;
 	msg->msg_control = NULL;
 	msg->msg_controllen = 0;
-	msg->msg_iov = (struct iovec *)iov;
-	msg->msg_iovlen = nr_segs;
+	iov_iter_init(&msg->msg_iter, READ, iov, nr_segs, size);
 	msg->msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0;
 
 	return __sock_recvmsg(iocb, sock, msg, size, msg->msg_flags);
@@ -955,8 +953,7 @@ static ssize_t do_sock_write(struct msghdr *msg, struct kiocb *iocb,
 	msg->msg_namelen = 0;
 	msg->msg_control = NULL;
 	msg->msg_controllen = 0;
-	msg->msg_iov = (struct iovec *)iov;
-	msg->msg_iovlen = nr_segs;
+	iov_iter_init(&msg->msg_iter, WRITE, iov, nr_segs, size);
 	msg->msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0;
 	if (sock->type == SOCK_SEQPACKET)
 		msg->msg_flags |= MSG_EOR;
@@ -1800,8 +1797,7 @@ SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len,
 	iov.iov_base = buff;
 	iov.iov_len = len;
 	msg.msg_name = NULL;
-	msg.msg_iov = &iov;
-	msg.msg_iovlen = 1;
+	iov_iter_init(&msg.msg_iter, WRITE, &iov, 1, len);
 	msg.msg_control = NULL;
 	msg.msg_controllen = 0;
 	msg.msg_namelen = 0;
@@ -1858,10 +1854,9 @@ SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size,
 
 	msg.msg_control = NULL;
 	msg.msg_controllen = 0;
-	msg.msg_iovlen = 1;
-	msg.msg_iov = &iov;
 	iov.iov_len = size;
 	iov.iov_base = ubuf;
+	iov_iter_init(&msg.msg_iter, READ, &iov, 1, size);
 	/* Save some cycles and don't copy the address if not needed */
 	msg.msg_name = addr ? (struct sockaddr *)&address : NULL;
 	/* We assume all kernel code knows the size of sockaddr_storage */
@@ -1995,13 +1990,14 @@ static ssize_t copy_msghdr_from_user(struct msghdr *kmsg,
 {
 	struct sockaddr __user *uaddr;
 	struct iovec __user *uiov;
+	size_t nr_segs;
 	ssize_t err;
 
 	if (!access_ok(VERIFY_READ, umsg, sizeof(*umsg)) ||
 	    __get_user(uaddr, &umsg->msg_name) ||
 	    __get_user(kmsg->msg_namelen, &umsg->msg_namelen) ||
 	    __get_user(uiov, &umsg->msg_iov) ||
-	    __get_user(kmsg->msg_iovlen, &umsg->msg_iovlen) ||
+	    __get_user(nr_segs, &umsg->msg_iovlen) ||
 	    __get_user(kmsg->msg_control, &umsg->msg_control) ||
 	    __get_user(kmsg->msg_controllen, &umsg->msg_controllen) ||
 	    __get_user(kmsg->msg_flags, &umsg->msg_flags))
@@ -2031,14 +2027,15 @@ static ssize_t copy_msghdr_from_user(struct msghdr *kmsg,
 		kmsg->msg_namelen = 0;
 	}
 
-	if (kmsg->msg_iovlen > UIO_MAXIOV)
+	if (nr_segs > UIO_MAXIOV)
 		return -EMSGSIZE;
 
 	err = rw_copy_check_uvector(save_addr ? READ : WRITE,
-				    uiov, kmsg->msg_iovlen,
+				    uiov, nr_segs,
 				    UIO_FASTIOV, *iov, iov);
 	if (err >= 0)
-		kmsg->msg_iov = *iov;
+		iov_iter_init(&kmsg->msg_iter, save_addr ? READ : WRITE,
+			      *iov, nr_segs, err);
 	return err;
 }
 
diff --git a/net/tipc/msg.c b/net/tipc/msg.c
index 5b06597..a687b30 100644
--- a/net/tipc/msg.c
+++ b/net/tipc/msg.c
@@ -194,7 +194,7 @@ int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, int offset,
 		__skb_queue_tail(list, skb);
 		skb_copy_to_linear_data(skb, mhdr, mhsz);
 		pktpos = skb->data + mhsz;
-		if (!dsz || !memcpy_fromiovecend(pktpos, m->msg_iov, offset,
+		if (!dsz || !memcpy_fromiovecend(pktpos, m->msg_iter.iov, offset,
 						 dsz))
 			return dsz;
 		rc = -EFAULT;
@@ -224,7 +224,7 @@ int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, int offset,
 		if (drem < pktrem)
 			pktrem = drem;
 
-		if (memcpy_fromiovecend(pktpos, m->msg_iov, offset, pktrem)) {
+		if (memcpy_fromiovecend(pktpos, m->msg_iter.iov, offset, pktrem)) {
 			rc = -EFAULT;
 			goto error;
 		}
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 9658d9b..4e78a7d 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -850,9 +850,9 @@ static int dest_name_check(struct sockaddr_tipc *dest, struct msghdr *m)
 	if (likely(dest->addr.name.name.type != TIPC_CFG_SRV))
 		return -EACCES;
 
-	if (!m->msg_iovlen || (m->msg_iov[0].iov_len < sizeof(hdr)))
+	if (!m->msg_iter.nr_segs || (m->msg_iter.iov[0].iov_len < sizeof(hdr)))
 		return -EMSGSIZE;
-	if (copy_from_user(&hdr, m->msg_iov[0].iov_base, sizeof(hdr)))
+	if (copy_from_user(&hdr, m->msg_iter.iov[0].iov_base, sizeof(hdr)))
 		return -EFAULT;
 	if ((ntohs(hdr.tcm_type) & 0xC000) && (!capable(CAP_NET_ADMIN)))
 		return -EACCES;
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 4450d62..8e1b102 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1459,9 +1459,6 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
 	struct scm_cookie tmp_scm;
 	int max_level;
 	int data_len = 0;
-	struct iov_iter from;
-
-	iov_iter_init(&from, WRITE, msg->msg_iov, msg->msg_iovlen, len);
 
 	if (NULL == siocb->scm)
 		siocb->scm = &tmp_scm;
@@ -1519,7 +1516,7 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
 	skb_put(skb, len - data_len);
 	skb->data_len = data_len;
 	skb->len = len;
-	err = skb_copy_datagram_from_iter(skb, 0, &from, len);
+	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
 	if (err)
 		goto out_free;
 
@@ -1641,9 +1638,6 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
 	bool fds_sent = false;
 	int max_level;
 	int data_len;
-	struct iov_iter from;
-
-	iov_iter_init(&from, WRITE, msg->msg_iov, msg->msg_iovlen, len);
 
 	if (NULL == siocb->scm)
 		siocb->scm = &tmp_scm;
@@ -1700,7 +1694,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
 		skb_put(skb, size - data_len);
 		skb->data_len = data_len;
 		skb->len = size;
-		err = skb_copy_datagram_from_iter(skb, 0, &from, size);
+		err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
 		if (err) {
 			kfree_skb(skb);
 			goto out_err;
diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c
index 20a0ba3..02d2e52 100644
--- a/net/vmw_vsock/vmci_transport.c
+++ b/net/vmw_vsock/vmci_transport.c
@@ -1850,7 +1850,8 @@ static ssize_t vmci_transport_stream_enqueue(
 	struct msghdr *msg,
 	size_t len)
 {
-	return vmci_qpair_enquev(vmci_trans(vsk)->qpair, msg->msg_iov, len, 0);
+	/* XXX: stripping const */
+	return vmci_qpair_enquev(vmci_trans(vsk)->qpair, (struct iovec *)msg->msg_iter.iov, len, 0);
 }
 
 static s64 vmci_transport_stream_has_data(struct vsock_sock *vsk)
-- 
2.1.3

^ permalink raw reply related

* [PATCH 06/12] vmci: propagate msghdr all way down to __qp_memcpy_from_queue()
From: Al Viro @ 2014-12-05  5:58 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-kernel
In-Reply-To: <20141205055623.GQ29748@ZenIV.linux.org.uk>

From: Al Viro <viro@zeniv.linux.org.uk>

... and switch it to memcpy_to_msg()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/misc/vmw_vmci/vmci_queue_pair.c | 17 +++++++++--------
 include/linux/vmw_vmci_api.h            |  5 +++--
 net/vmw_vsock/vmci_transport.c          |  4 ++--
 3 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/drivers/misc/vmw_vmci/vmci_queue_pair.c b/drivers/misc/vmw_vmci/vmci_queue_pair.c
index 1b7b303..7aaaf51 100644
--- a/drivers/misc/vmw_vmci/vmci_queue_pair.c
+++ b/drivers/misc/vmw_vmci/vmci_queue_pair.c
@@ -27,6 +27,7 @@
 #include <linux/uio.h>
 #include <linux/wait.h>
 #include <linux/vmalloc.h>
+#include <linux/skbuff.h>
 
 #include "vmci_handle_array.h"
 #include "vmci_queue_pair.h"
@@ -429,11 +430,11 @@ static int __qp_memcpy_from_queue(void *dest,
 			to_copy = size - bytes_copied;
 
 		if (is_iovec) {
-			struct iovec *iov = (struct iovec *)dest;
+			struct msghdr *msg = dest;
 			int err;
 
 			/* The iovec will track bytes_copied internally. */
-			err = memcpy_toiovec(iov, (u8 *)va + page_offset,
+			err = memcpy_to_msg(msg, (u8 *)va + page_offset,
 					     to_copy);
 			if (err != 0) {
 				if (kernel_if->host)
@@ -3264,13 +3265,13 @@ EXPORT_SYMBOL_GPL(vmci_qpair_enquev);
  * of bytes dequeued or < 0 on error.
  */
 ssize_t vmci_qpair_dequev(struct vmci_qp *qpair,
-			  void *iov,
+			  struct msghdr *msg,
 			  size_t iov_size,
 			  int buf_type)
 {
 	ssize_t result;
 
-	if (!qpair || !iov)
+	if (!qpair)
 		return VMCI_ERROR_INVALID_ARGS;
 
 	qp_lock(qpair);
@@ -3279,7 +3280,7 @@ ssize_t vmci_qpair_dequev(struct vmci_qp *qpair,
 		result = qp_dequeue_locked(qpair->produce_q,
 					   qpair->consume_q,
 					   qpair->consume_q_size,
-					   iov, iov_size,
+					   msg, iov_size,
 					   qp_memcpy_from_queue_iov,
 					   true);
 
@@ -3308,13 +3309,13 @@ EXPORT_SYMBOL_GPL(vmci_qpair_dequev);
  * of bytes peeked or < 0 on error.
  */
 ssize_t vmci_qpair_peekv(struct vmci_qp *qpair,
-			 void *iov,
+			 struct msghdr *msg,
 			 size_t iov_size,
 			 int buf_type)
 {
 	ssize_t result;
 
-	if (!qpair || !iov)
+	if (!qpair)
 		return VMCI_ERROR_INVALID_ARGS;
 
 	qp_lock(qpair);
@@ -3323,7 +3324,7 @@ ssize_t vmci_qpair_peekv(struct vmci_qp *qpair,
 		result = qp_dequeue_locked(qpair->produce_q,
 					   qpair->consume_q,
 					   qpair->consume_q_size,
-					   iov, iov_size,
+					   msg, iov_size,
 					   qp_memcpy_from_queue_iov,
 					   false);
 
diff --git a/include/linux/vmw_vmci_api.h b/include/linux/vmw_vmci_api.h
index 023430e..5691f75 100644
--- a/include/linux/vmw_vmci_api.h
+++ b/include/linux/vmw_vmci_api.h
@@ -24,6 +24,7 @@
 #define VMCI_KERNEL_API_VERSION_2 2
 #define VMCI_KERNEL_API_VERSION   VMCI_KERNEL_API_VERSION_2
 
+struct msghdr;
 typedef void (vmci_device_shutdown_fn) (void *device_registration,
 					void *user_data);
 
@@ -75,8 +76,8 @@ ssize_t vmci_qpair_peek(struct vmci_qp *qpair, void *buf, size_t buf_size,
 ssize_t vmci_qpair_enquev(struct vmci_qp *qpair,
 			  void *iov, size_t iov_size, int mode);
 ssize_t vmci_qpair_dequev(struct vmci_qp *qpair,
-			  void *iov, size_t iov_size, int mode);
-ssize_t vmci_qpair_peekv(struct vmci_qp *qpair, void *iov, size_t iov_size,
+			  struct msghdr *msg, size_t iov_size, int mode);
+ssize_t vmci_qpair_peekv(struct vmci_qp *qpair, struct msghdr *msg, size_t iov_size,
 			 int mode);
 
 #endif /* !__VMW_VMCI_API_H__ */
diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c
index c1c0389..20a0ba3 100644
--- a/net/vmw_vsock/vmci_transport.c
+++ b/net/vmw_vsock/vmci_transport.c
@@ -1840,9 +1840,9 @@ static ssize_t vmci_transport_stream_dequeue(
 	int flags)
 {
 	if (flags & MSG_PEEK)
-		return vmci_qpair_peekv(vmci_trans(vsk)->qpair, msg->msg_iov, len, 0);
+		return vmci_qpair_peekv(vmci_trans(vsk)->qpair, msg, len, 0);
 	else
-		return vmci_qpair_dequev(vmci_trans(vsk)->qpair, msg->msg_iov, len, 0);
+		return vmci_qpair_dequev(vmci_trans(vsk)->qpair, msg, len, 0);
 }
 
 static ssize_t vmci_transport_stream_enqueue(
-- 
2.1.3

^ permalink raw reply related

* [PATCH 05/12] switch l2cap ->memcpy_fromiovec() to msghdr
From: Al Viro @ 2014-12-05  5:58 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-kernel
In-Reply-To: <20141205055623.GQ29748@ZenIV.linux.org.uk>

From: Al Viro <viro@zeniv.linux.org.uk>

The ->memcpy_fromiovec() method will die soon enough - now that
kvec-backed iov_iter works regardless of set_fs(), both instances will
become copy_from_iter() as soon as we introduce ->msg_iter...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/net/bluetooth/l2cap.h | 6 +++---
 net/bluetooth/l2cap_core.c    | 4 ++--
 net/bluetooth/l2cap_sock.c    | 4 ++--
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h
index 061e648..4e23674 100644
--- a/include/net/bluetooth/l2cap.h
+++ b/include/net/bluetooth/l2cap.h
@@ -608,7 +608,7 @@ struct l2cap_ops {
 					       unsigned long len, int nb);
 	int			(*memcpy_fromiovec) (struct l2cap_chan *chan,
 						     unsigned char *kdata,
-						     struct iovec *iov,
+						     struct msghdr *msg,
 						     int len);
 };
 
@@ -905,13 +905,13 @@ static inline long l2cap_chan_no_get_sndtimeo(struct l2cap_chan *chan)
 
 static inline int l2cap_chan_no_memcpy_fromiovec(struct l2cap_chan *chan,
 						 unsigned char *kdata,
-						 struct iovec *iov,
+						 struct msghdr *msg,
 						 int len)
 {
 	/* Following is safe since for compiler definitions of kvec and
 	 * iovec are identical, yielding the same in-core layout and alignment
 	 */
-	struct kvec *vec = (struct kvec *)iov;
+	struct kvec *vec = (struct kvec *)msg->msg_iov;
 
 	while (len > 0) {
 		if (vec->iov_len) {
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index 8e12731..5201d61 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -2097,7 +2097,7 @@ static inline int l2cap_skbuff_fromiovec(struct l2cap_chan *chan,
 	int sent = 0;
 
 	if (chan->ops->memcpy_fromiovec(chan, skb_put(skb, count),
-					msg->msg_iov, count))
+					msg, count))
 		return -EFAULT;
 
 	sent += count;
@@ -2118,7 +2118,7 @@ static inline int l2cap_skbuff_fromiovec(struct l2cap_chan *chan,
 		*frag = tmp;
 
 		if (chan->ops->memcpy_fromiovec(chan, skb_put(*frag, count),
-						msg->msg_iov, count))
+						msg, count))
 			return -EFAULT;
 
 		sent += count;
diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c
index b0efb72..205b298 100644
--- a/net/bluetooth/l2cap_sock.c
+++ b/net/bluetooth/l2cap_sock.c
@@ -1338,9 +1338,9 @@ static struct sk_buff *l2cap_sock_alloc_skb_cb(struct l2cap_chan *chan,
 
 static int l2cap_sock_memcpy_fromiovec_cb(struct l2cap_chan *chan,
 					  unsigned char *kdata,
-					  struct iovec *iov, int len)
+					  struct msghdr *msg, int len)
 {
-	return memcpy_fromiovec(kdata, iov, len);
+	return memcpy_from_msg(kdata, msg, len);
 }
 
 static void l2cap_sock_ready_cb(struct l2cap_chan *chan)
-- 
2.1.3

^ permalink raw reply related

* [PATCH 04/12] switch tcp_sock->ucopy from iovec (ucopy.iov) to msghdr (ucopy.msg)
From: Al Viro @ 2014-12-05  5:58 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-kernel
In-Reply-To: <20141205055623.GQ29748@ZenIV.linux.org.uk>

From: Al Viro <viro@zeniv.linux.org.uk>

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/tcp.h  | 2 +-
 net/ipv4/tcp.c       | 2 +-
 net/ipv4/tcp_input.c | 7 +++----
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index f566b85..5d9cc9c 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -162,7 +162,7 @@ struct tcp_sock {
 	struct {
 		struct sk_buff_head	prequeue;
 		struct task_struct	*task;
-		struct iovec		*iov;
+		struct msghdr		*msg;
 		int			memory;
 		int			len;
 	} ucopy;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index dc13a36..4a96f37 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1729,7 +1729,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 			if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
 				user_recv = current;
 				tp->ucopy.task = user_recv;
-				tp->ucopy.iov = msg->msg_iov;
+				tp->ucopy.msg = msg;
 			}
 
 			tp->ucopy.len = len;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 69de1a1..075ab4d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4421,7 +4421,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 			__set_current_state(TASK_RUNNING);
 
 			local_bh_enable();
-			if (!skb_copy_datagram_iovec(skb, 0, tp->ucopy.iov, chunk)) {
+			if (!skb_copy_datagram_msg(skb, 0, tp->ucopy.msg, chunk)) {
 				tp->ucopy.len -= chunk;
 				tp->copied_seq += chunk;
 				eaten = (chunk == skb->len);
@@ -4941,10 +4941,9 @@ static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
 
 	local_bh_enable();
 	if (skb_csum_unnecessary(skb))
-		err = skb_copy_datagram_iovec(skb, hlen, tp->ucopy.iov, chunk);
+		err = skb_copy_datagram_msg(skb, hlen, tp->ucopy.msg, chunk);
 	else
-		err = skb_copy_and_csum_datagram_iovec(skb, hlen,
-						       tp->ucopy.iov);
+		err = skb_copy_and_csum_datagram_msg(skb, hlen, tp->ucopy.msg);
 
 	if (!err) {
 		tp->ucopy.len -= chunk;
-- 
2.1.3

^ permalink raw reply related

* [PATCH 02/12] ipv6 equivalent of "ipv4: Avoid reading user iov twice after raw_probe_proto_opt"
From: Al Viro @ 2014-12-05  5:58 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-kernel
In-Reply-To: <20141205055623.GQ29748@ZenIV.linux.org.uk>

From: Al Viro <viro@zeniv.linux.org.uk>

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 net/ipv6/raw.c | 112 ++++++++++++++++++++++++++++-----------------------------
 1 file changed, 56 insertions(+), 56 deletions(-)

diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 8baa53e..942f67b 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -672,65 +672,62 @@ error:
 	return err;
 }
 
-static int rawv6_probe_proto_opt(struct flowi6 *fl6, struct msghdr *msg)
+struct raw6_frag_vec {
+	struct msghdr *msg;
+	int hlen;
+	char c[4];
+};
+
+static int rawv6_probe_proto_opt(struct raw6_frag_vec *rfv, struct flowi6 *fl6)
 {
-	struct iovec *iov;
-	u8 __user *type = NULL;
-	u8 __user *code = NULL;
-	u8 len = 0;
-	int probed = 0;
-	int i;
-
-	if (!msg->msg_iov)
-		return 0;
+	int err = 0;
+	switch (fl6->flowi6_proto) {
+	case IPPROTO_ICMPV6:
+		rfv->hlen = 2;
+		err = memcpy_from_msg(rfv->c, rfv->msg, rfv->hlen);
+		if (!err) {
+			fl6->fl6_icmp_type = rfv->c[0];
+			fl6->fl6_icmp_code = rfv->c[1];
+		}
+		break;
+	case IPPROTO_MH:
+		rfv->hlen = 4;
+		err = memcpy_from_msg(rfv->c, rfv->msg, rfv->hlen);
+		if (!err)
+			fl6->fl6_mh_type = rfv->c[2];
+	}
+	return err;
+}
 
-	for (i = 0; i < msg->msg_iovlen; i++) {
-		iov = &msg->msg_iov[i];
-		if (!iov)
-			continue;
+static int raw6_getfrag(void *from, char *to, int offset, int len, int odd,
+		       struct sk_buff *skb)
+{
+	struct raw6_frag_vec *rfv = from;
 
-		switch (fl6->flowi6_proto) {
-		case IPPROTO_ICMPV6:
-			/* check if one-byte field is readable or not. */
-			if (iov->iov_base && iov->iov_len < 1)
-				break;
-
-			if (!type) {
-				type = iov->iov_base;
-				/* check if code field is readable or not. */
-				if (iov->iov_len > 1)
-					code = type + 1;
-			} else if (!code)
-				code = iov->iov_base;
-
-			if (type && code) {
-				if (get_user(fl6->fl6_icmp_type, type) ||
-				    get_user(fl6->fl6_icmp_code, code))
-					return -EFAULT;
-				probed = 1;
-			}
-			break;
-		case IPPROTO_MH:
-			if (iov->iov_base && iov->iov_len < 1)
-				break;
-			/* check if type field is readable or not. */
-			if (iov->iov_len > 2 - len) {
-				u8 __user *p = iov->iov_base;
-				if (get_user(fl6->fl6_mh_type, &p[2 - len]))
-					return -EFAULT;
-				probed = 1;
-			} else
-				len += iov->iov_len;
+	if (offset < rfv->hlen) {
+		int copy = min(rfv->hlen - offset, len);
 
-			break;
-		default:
-			probed = 1;
-			break;
-		}
-		if (probed)
-			break;
+		if (skb->ip_summed == CHECKSUM_PARTIAL)
+			memcpy(to, rfv->c + offset, copy);
+		else
+			skb->csum = csum_block_add(
+				skb->csum,
+				csum_partial_copy_nocheck(rfv->c + offset,
+							  to, copy, 0),
+				odd);
+
+		odd = 0;
+		offset += copy;
+		to += copy;
+		len -= copy;
+
+		if (!len)
+			return 0;
 	}
-	return 0;
+
+	offset -= rfv->hlen;
+
+	return ip_generic_getfrag(rfv->msg->msg_iov, to, offset, len, odd, skb);
 }
 
 static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk,
@@ -745,6 +742,7 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk,
 	struct ipv6_txoptions *opt = NULL;
 	struct ip6_flowlabel *flowlabel = NULL;
 	struct dst_entry *dst = NULL;
+	struct raw6_frag_vec rfv;
 	struct flowi6 fl6;
 	int addr_len = msg->msg_namelen;
 	int hlimit = -1;
@@ -848,7 +846,9 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk,
 	opt = ipv6_fixup_options(&opt_space, opt);
 
 	fl6.flowi6_proto = proto;
-	err = rawv6_probe_proto_opt(&fl6, msg);
+	rfv.msg = msg;
+	rfv.hlen = 0;
+	err = rawv6_probe_proto_opt(&rfv, &fl6);
 	if (err)
 		goto out;
 
@@ -889,7 +889,7 @@ back_from_confirm:
 		err = rawv6_send_hdrinc(sk, msg->msg_iov, len, &fl6, &dst, msg->msg_flags);
 	else {
 		lock_sock(sk);
-		err = ip6_append_data(sk, ip_generic_getfrag, msg->msg_iov,
+		err = ip6_append_data(sk, raw6_getfrag, &rfv,
 			len, 0, hlimit, tclass, opt, &fl6, (struct rt6_info *)dst,
 			msg->msg_flags, dontfrag);
 
-- 
2.1.3

^ permalink raw reply related

* [PATCH 01/12] raw.c: stick msghdr into raw_frag_vec
From: Al Viro @ 2014-12-05  5:58 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-kernel
In-Reply-To: <20141205055623.GQ29748@ZenIV.linux.org.uk>

From: Al Viro <viro@zeniv.linux.org.uk>

we'll want access to ->msg_iter

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 net/ipv4/raw.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 43385a9..5c901eb 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -82,7 +82,7 @@
 #include <linux/uio.h>
 
 struct raw_frag_vec {
-	struct iovec *iov;
+	struct msghdr *msg;
 	union {
 		struct icmphdr icmph;
 		char c[1];
@@ -440,7 +440,7 @@ static int raw_probe_proto_opt(struct raw_frag_vec *rfv, struct flowi4 *fl4)
 	/* We only need the first two bytes. */
 	rfv->hlen = 2;
 
-	err = memcpy_fromiovec(rfv->hdr.c, rfv->iov, rfv->hlen);
+	err = memcpy_from_msg(rfv->hdr.c, rfv->msg, rfv->hlen);
 	if (err)
 		return err;
 
@@ -478,7 +478,7 @@ static int raw_getfrag(void *from, char *to, int offset, int len, int odd,
 
 	offset -= rfv->hlen;
 
-	return ip_generic_getfrag(rfv->iov, to, offset, len, odd, skb);
+	return ip_generic_getfrag(rfv->msg->msg_iov, to, offset, len, odd, skb);
 }
 
 static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
@@ -600,7 +600,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 			   daddr, saddr, 0, 0);
 
 	if (!inet->hdrincl) {
-		rfv.iov = msg->msg_iov;
+		rfv.msg = msg;
 		rfv.hlen = 0;
 
 		err = raw_probe_proto_opt(&rfv, &fl4);
-- 
2.1.3

^ permalink raw reply related

* the next chunk of iov_iter-net stuff for review
From: Al Viro @ 2014-12-05  5:56 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-kernel
In-Reply-To: <20141126.122727.1791815907895724977.davem@davemloft.net>

	OK, here's the tentative next batch (covers most of the ->recvmsg()
side of the conversion).  It is on top of a merge of net-next#master with
vfs#iov_iter (the latter was posted earlier today, Cc'd to netdev among
other places).  This series corresponds to vfs#for-davem.  Review and comments
would be very welcome...

Shortlog:
Al Viro (12):
      raw.c: stick msghdr into raw_frag_vec
      ipv6 equivalent of "ipv4: Avoid reading user iov twice after raw_probe_proto_opt"
      ip_generic_getfrag, udplite_getfrag: switch to passing msghdr
      switch tcp_sock->ucopy from iovec (ucopy.iov) to msghdr (ucopy.msg)
      switch l2cap ->memcpy_fromiovec() to msghdr
      vmci: propagate msghdr all way down to __qp_memcpy_from_queue()
      put iov_iter into msghdr
      first fruits - kill l2cap ->memcpy_fromiovec()
      switch memcpy_to_msg() and skb_copy{,_and_csum}_datagram_msg() to primitives
      ppp_read(): switch to skb_copy_datagram_iter()
      skb_copy_datagram_iovec() can die
      bury memcpy_toiovec()
Diffstat:
 crypto/algif_hash.c                      |   4 +-
 crypto/algif_skcipher.c                  |   4 +-
 drivers/misc/vmw_vmci/vmci_queue_pair.c  |  17 ++--
 drivers/net/macvtap.c                    |   8 +-
 drivers/net/ppp/ppp_generic.c            |   4 +-
 drivers/net/tun.c                        |   8 +-
 drivers/target/iscsi/iscsi_target_util.c |  12 ++-
 drivers/vhost/net.c                      |   8 +-
 fs/afs/rxrpc.c                           |  14 ++--
 include/linux/skbuff.h                   |  22 ++---
 include/linux/socket.h                   |   3 +-
 include/linux/tcp.h                      |   2 +-
 include/linux/uio.h                      |   1 -
 include/linux/vmw_vmci_api.h             |   5 +-
 include/net/bluetooth/l2cap.h            |  29 -------
 include/net/udplite.h                    |   4 +-
 lib/iovec.c                              |  25 ------
 net/atm/common.c                         |   5 +-
 net/bluetooth/6lowpan.c                  |   7 +-
 net/bluetooth/a2mp.c                     |   4 +-
 net/bluetooth/l2cap_core.c               |   7 +-
 net/bluetooth/l2cap_sock.c               |   8 --
 net/bluetooth/smp.c                      |   5 +-
 net/caif/caif_socket.c                   |   2 +-
 net/compat.c                             |  10 ++-
 net/core/datagram.c                      | 138 +++++--------------------------
 net/ipv4/ip_output.c                     |   8 +-
 net/ipv4/ping.c                          |   3 +-
 net/ipv4/raw.c                           |  11 +--
 net/ipv4/tcp.c                           |   8 +-
 net/ipv4/tcp_input.c                     |   7 +-
 net/ipv4/tcp_output.c                    |   2 +-
 net/ipv4/udp.c                           |   4 +-
 net/ipv6/ping.c                          |   3 +-
 net/ipv6/raw.c                           | 115 +++++++++++++-------------
 net/ipv6/udp.c                           |   2 +-
 net/l2tp/l2tp_ip6.c                      |   2 +-
 net/netlink/af_netlink.c                 |   2 +-
 net/packet/af_packet.c                   |   7 +-
 net/rds/recv.c                           |   7 +-
 net/rds/send.c                           |   4 +-
 net/rxrpc/ar-output.c                    |   8 +-
 net/sctp/socket.c                        |   5 +-
 net/socket.c                             |  27 +++---
 net/tipc/msg.c                           |   4 +-
 net/tipc/socket.c                        |   4 +-
 net/unix/af_unix.c                       |  10 +--
 net/vmw_vsock/vmci_transport.c           |   7 +-
 48 files changed, 204 insertions(+), 402 deletions(-)

^ permalink raw reply

* Re: [PATCH 3/3] rocker: set feature NETIF_F_HW_SWITCH_OFFLOAD
From: Jianhua Xie @ 2014-12-05  4:44 UTC (permalink / raw)
  To: Roopa Prabhu
  Cc: jiri, sfeldma, jhs, bcrl, tgraf, john.fastabend, stephen,
	linville, nhorman, nicolas.dichtel, vyasevic, f.fainelli, buytenh,
	aviadr, netdev, davem, shm, gospo, jianhua Xie
In-Reply-To: <5481312D.6080708@cumulusnetworks.com>


在 2014年12月05日 12:14, Roopa Prabhu 写道:
> On 12/4/14, 7:25 PM, Jianhua Xie wrote:
>>
>> 在 2014年12月05日 10:26, roopa@cumulusnetworks.com 写道:
>>> From: Roopa Prabhu <roopa@cumulusnetworks.com>
>>>
>>> This patch just sets the feature flag on rocker ports
>>> ---
>>>   drivers/net/ethernet/rocker/rocker.c |    3 ++-
>>>   1 file changed, 2 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/net/ethernet/rocker/rocker.c 
>>> b/drivers/net/ethernet/rocker/rocker.c
>>> index fded127..3fe19b0 100644
>>> --- a/drivers/net/ethernet/rocker/rocker.c
>>> +++ b/drivers/net/ethernet/rocker/rocker.c
>>> @@ -4003,7 +4003,8 @@ static int rocker_probe_port(struct rocker 
>>> *rocker, unsigned int port_number)
>>>                  NAPI_POLL_WEIGHT);
>>>       rocker_carrier_init(rocker_port);
>>>   -    dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
>>> +    dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER |
>>> +                     NETIF_F_HW_SWITCH_OFFLOAD;
>> Do you plan to allow enabling/disabling this flag dynamically via 
>> ethtool?
>
> I have not thought about it yet. But if this gets accepted, then yes, 
> I will add an ethtool way for drivers that want it.
> If we have an IFF_ flag for this (which is also one of my alternate 
> proposals), there will be a netlink way to turn it on and off.
>
Got it with thanks.
Best Regards.
Jianhua

^ permalink raw reply

* Re: [PATCH 1/3] netdev: introduce new NETIF_F_HW_SWITCH_OFFLOAD feature flag for switch device offloads
From: Jianhua Xie @ 2014-12-05  4:43 UTC (permalink / raw)
  To: Roopa Prabhu
  Cc: jiri, sfeldma, jhs, bcrl, tgraf, john.fastabend, stephen,
	linville, nhorman, nicolas.dichtel, vyasevic, f.fainelli, buytenh,
	aviadr, netdev, davem, shm, gospo
In-Reply-To: <548131EB.3060502@cumulusnetworks.com>


在 2014年12月05日 12:17, Roopa Prabhu 写道:
> On 12/4/14, 7:21 PM, Jianhua Xie wrote:
>>
>> 在 2014年12月05日 10:26, roopa@cumulusnetworks.com 写道:
>>> From: Roopa Prabhu <roopa@cumulusnetworks.com>
>>>
>>> This is a generic high level feature flag for all switch asic 
>>> features today.
>>>
>>> switch drivers set this flag on switch ports. Logical devices like
>>> bridge, bonds, vxlans can inherit this flag from their slaves/ports.
>>>
>>> I had to use SWITCH in the name to avoid ambiguity with other feature
>>> flags. But since I have been harping about not calling it 'switch',
>>> I welcome any suggestions :)
>>>
>>> An alternative to using a feature flag is to use a IFF_HW_OFFLOAD
>>> in net_device_flags.
>>> ---
>>>   include/linux/netdev_features.h |    2 ++
>>>   1 file changed, 2 insertions(+)
>>>
>>> diff --git a/include/linux/netdev_features.h 
>>> b/include/linux/netdev_features.h
>>> index 8e30685..68db1de 100644
>>> --- a/include/linux/netdev_features.h
>>> +++ b/include/linux/netdev_features.h
>>> @@ -66,6 +66,7 @@ enum {
>>>       NETIF_F_HW_VLAN_STAG_FILTER_BIT,/* Receive filtering on VLAN 
>>> STAGs */
>>>       NETIF_F_HW_L2FW_DOFFLOAD_BIT,    /* Allow L2 Forwarding in 
>>> Hardware */
>>>       NETIF_F_BUSY_POLL_BIT,        /* Busy poll */
>>> +    NETIF_F_HW_SWITCH_OFFLOAD_BIT,  /* HW switch offload */
>> I am very interested in this flag, but it is not clear to me
>> how many offload capabilities this flag implies.  If this is
>> a general flag that can be accepted by all vendors,
>> I will reuse it to introduce another outgoing data traffic
>> distribution offload method to the bonding driver.
> Right now it's a single global offload flag for switch asics. Some may not 
> want to tune it more. But others may and so i do expect more flags in 
> this category.
>
Got it with thanks.
Best Regards.
> Thanks,
> Roopa
>

^ permalink raw reply

* Re: [PATCH] net/mlx4: Fix EEH recovery failure
From: Gavin Shan @ 2014-12-05  4:28 UTC (permalink / raw)
  To: Gavin Shan; +Cc: netdev, amirv, davem, yishaih
In-Reply-To: <1416653807-4859-1-git-send-email-gwshan@linux.vnet.ibm.com>

On Sat, Nov 22, 2014 at 09:56:47PM +1100, Gavin Shan wrote:

Yishai already had patches fixing the issue. So please ignore
this patch and drop it.

Thanks,
Gavin

>The patch fixes couple of EEH recovery failures on PPC PowerNV
>platform:
>
>   * Release reserved memory regions in mlx4_pci_err_detected().
>     Otherwise, __mlx4_init_one() fails because of reserving
>     same memory regions recursively.
>   * Disable PCI device in mlx4_pci_err_detected(). Otherwise,
>     pci_enable_device() in __mlx4_init_one() doesn't enable
>     the PCI device because it's already in enabled state indicated
>     by struct pci_dev::enable_cnt.
>   * Don't clear struct mlx4_priv instance in mlx4_pci_err_detected().
>     Otherwise, __mlx4_init_one() runs into kernel crash because
>     of dereferencing to NULL pointer.
>
>With the patch applied, EEH recovery for mlx4 adapter succeeds on PPC
>PowerNV platform.
>
>   # lspci
>   0003:0f:00.0 Network controller: Mellanox Technologies \
>   MT27500 Family [ConnectX-3]
>
>Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
>---
> drivers/net/ethernet/mellanox/mlx4/main.c | 3 ++-
> 1 file changed, 2 insertions(+), 1 deletion(-)
>
>diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
>index 90de6e1..e118ac9 100644
>--- a/drivers/net/ethernet/mellanox/mlx4/main.c
>+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
>@@ -2809,7 +2809,6 @@ static void mlx4_unload_one(struct pci_dev *pdev)
> 	kfree(dev->caps.qp1_proxy);
> 	kfree(dev->dev_vfs);
>
>-	memset(priv, 0, sizeof(*priv));
> 	priv->pci_dev_data = pci_dev_data;
> 	priv->removed = 1;
> }
>@@ -2900,6 +2899,8 @@ static pci_ers_result_t mlx4_pci_err_detected(struct pci_dev *pdev,
> 					      pci_channel_state_t state)
> {
> 	mlx4_unload_one(pdev);
>+	pci_release_regions(pdev);
>+	pci_disable_device(pdev);
>
> 	return state == pci_channel_io_perm_failure ?
> 		PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_NEED_RESET;
>-- 
>1.8.3.2
>

^ permalink raw reply

* Re: [PATCH] net: ethernet: rocker: Add select to CONFIG_BRIDGE in Kconfig
From: David Miller @ 2014-12-05  4:24 UTC (permalink / raw)
  To: mail; +Cc: jim.epost, sfr, linux-next, linux-kernel, jiri, sfeldma, netdev
In-Reply-To: <54809BA5.7030901@rupran.de>

From: Andreas Ruprecht <mail@rupran.de>
Date: Thu, 04 Dec 2014 18:36:37 +0100

> On 04.12.2014 17:34, Jim Davis wrote:
>> Building with the attached random configuration file,
>> 
>> drivers/built-in.o: In function `rocker_port_fdb_learn_work':
>> /home/jim/linux/drivers/net/ethernet/rocker/rocker.c:3014: undefined
>> reference to `br_fdb_external_learn_del'
>> /home/jim/linux/drivers/net/ethernet/rocker/rocker.c:3016: undefined
>> reference to `br_fdb_external_learn_add'
>> 
> 
> Hi,
> 
> the problem here is that CONFIG_BRIDGE is set to 'm' (leading to
> inclusion of the two functions above in the kernel module) while
> CONFIG_ROCKER is set to 'y', requiring the functions at link time.
> 
> Is the attached patch sufficient to fix this?

Do not use select, please.

You can only use select on leaf node Kconfig symbols, ie. those
which do not have any dependencies whatsoever.

Select does not recursively walk down the dependency chain turning
things on for you when you say "select X".

^ permalink raw reply

* Re: [PATCH 1/3] netdev: introduce new NETIF_F_HW_SWITCH_OFFLOAD feature flag for switch device offloads
From: Roopa Prabhu @ 2014-12-05  4:17 UTC (permalink / raw)
  To: Jianhua Xie
  Cc: jiri, sfeldma, jhs, bcrl, tgraf, john.fastabend, stephen,
	linville, nhorman, nicolas.dichtel, vyasevic, f.fainelli, buytenh,
	aviadr, netdev, davem, shm, gospo
In-Reply-To: <5481249E.7020902@freescale.com>

On 12/4/14, 7:21 PM, Jianhua Xie wrote:
>
> 在 2014年12月05日 10:26, roopa@cumulusnetworks.com 写道:
>> From: Roopa Prabhu <roopa@cumulusnetworks.com>
>>
>> This is a generic high level feature flag for all switch asic 
>> features today.
>>
>> switch drivers set this flag on switch ports. Logical devices like
>> bridge, bonds, vxlans can inherit this flag from their slaves/ports.
>>
>> I had to use SWITCH in the name to avoid ambiguity with other feature
>> flags. But, since i have been harping about not calling it 'switch',
>> I am welcome to any suggestions :)
>>
>> An alternative to using a feature flag is to use a IFF_HW_OFFLOAD
>> in net_device_flags.
>> ---
>>   include/linux/netdev_features.h |    2 ++
>>   1 file changed, 2 insertions(+)
>>
>> diff --git a/include/linux/netdev_features.h 
>> b/include/linux/netdev_features.h
>> index 8e30685..68db1de 100644
>> --- a/include/linux/netdev_features.h
>> +++ b/include/linux/netdev_features.h
>> @@ -66,6 +66,7 @@ enum {
>>       NETIF_F_HW_VLAN_STAG_FILTER_BIT,/* Receive filtering on VLAN 
>> STAGs */
>>       NETIF_F_HW_L2FW_DOFFLOAD_BIT,    /* Allow L2 Forwarding in 
>> Hardware */
>>       NETIF_F_BUSY_POLL_BIT,        /* Busy poll */
>> +    NETIF_F_HW_SWITCH_OFFLOAD_BIT,  /* HW switch offload */
> I am interested in this flag very much, but I am not very clear
> how many offload capabilities does this flag imply.  If this flag
> belongs to a general flag and can be accepted by all vendors,
> I will reuse this flag to introduce another out going data traffics
> distribution offload method to bonding driver.
Right now it's a single global offload flag for switch asics. Some may not 
want to tune it more. But others may and so i do expect more flags in 
this category.

Thanks,
Roopa

^ permalink raw reply

* Re: [PATCH 3/3] rocker: set feature NETIF_F_HW_SWITCH_OFFLOAD
From: Roopa Prabhu @ 2014-12-05  4:14 UTC (permalink / raw)
  To: Jianhua Xie
  Cc: jiri, sfeldma, jhs, bcrl, tgraf, john.fastabend, stephen,
	linville, nhorman, nicolas.dichtel, vyasevic, f.fainelli, buytenh,
	aviadr, netdev, davem, shm, gospo
In-Reply-To: <5481259C.3040707@freescale.com>

On 12/4/14, 7:25 PM, Jianhua Xie wrote:
>
> 在 2014年12月05日 10:26, roopa@cumulusnetworks.com 写道:
>> From: Roopa Prabhu <roopa@cumulusnetworks.com>
>>
>> This patch just sets the feature flag on rocker ports
>> ---
>>   drivers/net/ethernet/rocker/rocker.c |    3 ++-
>>   1 file changed, 2 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/net/ethernet/rocker/rocker.c 
>> b/drivers/net/ethernet/rocker/rocker.c
>> index fded127..3fe19b0 100644
>> --- a/drivers/net/ethernet/rocker/rocker.c
>> +++ b/drivers/net/ethernet/rocker/rocker.c
>> @@ -4003,7 +4003,8 @@ static int rocker_probe_port(struct rocker 
>> *rocker, unsigned int port_number)
>>                  NAPI_POLL_WEIGHT);
>>       rocker_carrier_init(rocker_port);
>>   -    dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
>> +    dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER |
>> +                     NETIF_F_HW_SWITCH_OFFLOAD;
> Do you have a plan on enabling/disabling  this flag dynamically by 
> ethtool?

I have not thought about it yet. But if this gets accepted, then yes, I 
will have an ethtool way for drivers who want it.
If we have an IFF_ flag for this (which is also one of my alternate 
proposals), there will be a netlink way to set it on and off.

^ permalink raw reply

* [RFC PATCH net-next] tun: support retrieving multiple packets in a single read with IFF_MULTI_READ
From: Alex Gartrell @ 2014-12-05  4:00 UTC (permalink / raw)
  To: jasonwang
  Cc: davem, netdev, linux-kernel, mst, herbert, kernel-team,
	Alex Gartrell

This patch adds the IFF_MULTI_READ flag.  This has the following behavior.

1) If a read is too short for a packet, a single stripped packet will be read

2) If a read is long enough for multiple packets, as many *full* packets
will be read as possible.  We will not return a stripped packet, so even if
there are many, many packets, we may get a short read.

In casual performance testing with a simple test program that simply reads
and counts packets, IFF_MULTI_READ conservatively yielded a 30% CPU win, as
measured by top.  Load was being driven by a bunch of hpings running on a
server on the same L2 network (single hop through a top-of-rack switch).

Signed-off-by: Alex Gartrell <agartrell@fb.com>
---
 drivers/net/tun.c           | 66 ++++++++++++++++++++++++++++++++++++++-------
 include/uapi/linux/if_tun.h |  3 +++
 2 files changed, 60 insertions(+), 9 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 6d44da1..f57d618 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1228,6 +1228,26 @@ static ssize_t tun_chr_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	return result;
 }
 
+static inline size_t tun_calc_max_put_len(const struct tun_struct *tun)
+{
+	size_t len = 0;
+
+	/* It's a pain to peek the skb, so let's assume the worst:
+	 * 1) That skb->len = mtu
+	 * 2) That there is a vlan_tx_tag present
+	 */
+
+	len += tun->dev->mtu + VLAN_HLEN;
+
+	if (tun->flags & TUN_VNET_HDR)
+		len += tun->vnet_hdr_sz;
+
+	if (!(tun->flags & TUN_NO_PI))
+		len += sizeof(struct tun_pi);
+
+	return len;
+}
+
 /* Put packet to the user space buffer */
 static ssize_t tun_put_user(struct tun_struct *tun,
 			    struct tun_file *tfile,
@@ -1343,8 +1363,10 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
 			   struct iov_iter *to,
 			   int noblock)
 {
+	const size_t max_put_len = tun_calc_max_put_len(tun);
 	struct sk_buff *skb;
-	ssize_t ret;
+	ssize_t ret = 0;
+	ssize_t put_ret = 0;
 	int peeked, err, off = 0;
 
 	tun_debug(KERN_INFO, tun, "tun_do_read\n");
@@ -1355,14 +1377,31 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
 	if (tun->dev->reg_state != NETREG_REGISTERED)
 		return -EIO;
 
-	/* Read frames from queue */
-	skb = __skb_recv_datagram(tfile->socket.sk, noblock ? MSG_DONTWAIT : 0,
-				  &peeked, &off, &err);
-	if (!skb)
-		return 0;
+	while (!ret || ((tun->flags & TUN_MULTI_READ) &&
+			iov_iter_count(to) >= max_put_len)) {
+		/* Read frames from queue */
+		skb = __skb_recv_datagram(tfile->socket.sk,
+					  noblock ? MSG_DONTWAIT : 0,
+					  &peeked, &off, &err);
+		if (skb) {
+			put_ret = tun_put_user(tun, tfile, skb, to);
+			kfree_skb(skb);
+			if (put_ret < 0) {
+				ret = put_ret;
+				break;
+			}
+			ret += put_ret;
+		} else {
+			if (!ret)
+				ret = err;
+			break;
+		}
 
-	ret = tun_put_user(tun, tfile, skb, to);
-	kfree_skb(skb);
+		/* Now that we've received a datagram, noblock for the
+		 * rest
+		 */
+		noblock = 1;
+	}
 
 	return ret;
 }
@@ -1537,6 +1576,9 @@ static int tun_flags(struct tun_struct *tun)
 	if (tun->flags & TUN_PERSIST)
 		flags |= IFF_PERSIST;
 
+	if (tun->flags & TUN_MULTI_READ)
+		flags |= IFF_MULTI_READ;
+
 	return flags;
 }
 
@@ -1720,6 +1762,11 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 	else
 		tun->flags &= ~TUN_TAP_MQ;
 
+	if (ifr->ifr_flags & IFF_MULTI_READ)
+		tun->flags |= TUN_MULTI_READ;
+	else
+		tun->flags &= ~TUN_MULTI_READ;
+
 	/* Make sure persistent devices do not get stuck in
 	 * xoff state.
 	 */
@@ -1883,7 +1930,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 		 * This is needed because we never checked for invalid flags on
 		 * TUNSETIFF. */
 		return put_user(IFF_TUN | IFF_TAP | IFF_NO_PI | IFF_ONE_QUEUE |
-				IFF_VNET_HDR | IFF_MULTI_QUEUE,
+				IFF_VNET_HDR | IFF_MULTI_QUEUE |
+				IFF_MULTI_READ,
 				(unsigned int __user*)argp);
 	} else if (cmd == TUNSETQUEUE)
 		return tun_set_queue(file, &ifr);
diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
index e9502dd..aaf9ddc 100644
--- a/include/uapi/linux/if_tun.h
+++ b/include/uapi/linux/if_tun.h
@@ -36,6 +36,7 @@
 #define TUN_PERSIST 	0x0100	
 #define TUN_VNET_HDR 	0x0200
 #define TUN_TAP_MQ      0x0400
+#define TUN_MULTI_READ	0x0800
 
 /* Ioctl defines */
 #define TUNSETNOCSUM  _IOW('T', 200, int) 
@@ -74,6 +75,8 @@
 #define IFF_PERSIST	0x0800
 #define IFF_NOFILTER	0x1000
 
+#define IFF_MULTI_READ	0x2000
+
 /* Socket options */
 #define TUN_TX_TIMESTAMP 1
 
-- 
Alex Gartrell <agartrell@fb.com>

^ permalink raw reply related

* Re: [PATCH 3/3] rocker: set feature NETIF_F_HW_SWITCH_OFFLOAD
From: Jianhua Xie @ 2014-12-05  3:25 UTC (permalink / raw)
  To: roopa, jiri, sfeldma, jhs, bcrl, tgraf, john.fastabend, stephen,
	linville, nhorman, nicolas.dichtel, vyasevic, f.fainelli, buytenh,
	aviadr
  Cc: netdev, davem, shm, gospo, jianhua Xie
In-Reply-To: <1417746401-8140-4-git-send-email-roopa@cumulusnetworks.com>


在 2014年12月05日 10:26, roopa@cumulusnetworks.com 写道:
> From: Roopa Prabhu <roopa@cumulusnetworks.com>
>
> This patch just sets the feature flag on rocker ports
> ---
>   drivers/net/ethernet/rocker/rocker.c |    3 ++-
>   1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c
> index fded127..3fe19b0 100644
> --- a/drivers/net/ethernet/rocker/rocker.c
> +++ b/drivers/net/ethernet/rocker/rocker.c
> @@ -4003,7 +4003,8 @@ static int rocker_probe_port(struct rocker *rocker, unsigned int port_number)
>   		       NAPI_POLL_WEIGHT);
>   	rocker_carrier_init(rocker_port);
>   
> -	dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
> +	dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER |
> +					 NETIF_F_HW_SWITCH_OFFLOAD;
Do you have a plan on enabling/disabling  this flag dynamically by ethtool?

Thanks & B.R
Jianhua
>   
>   	err = register_netdev(dev);
>   	if (err) {

^ permalink raw reply

* Re: [PATCH 1/3] netdev: introduce new NETIF_F_HW_SWITCH_OFFLOAD feature flag for switch device offloads
From: Jianhua Xie @ 2014-12-05  3:21 UTC (permalink / raw)
  To: roopa, jiri, sfeldma, jhs, bcrl, tgraf, john.fastabend, stephen,
	linville, nhorman, nicolas.dichtel, vyasevic, f.fainelli, buytenh,
	aviadr
  Cc: netdev, davem, shm, gospo, Jianhua Xie
In-Reply-To: <1417746401-8140-2-git-send-email-roopa@cumulusnetworks.com>


在 2014年12月05日 10:26, roopa@cumulusnetworks.com 写道:
> From: Roopa Prabhu <roopa@cumulusnetworks.com>
>
> This is a generic high level feature flag for all switch asic features today.
>
> switch drivers set this flag on switch ports. Logical devices like
> bridge, bonds, vxlans can inherit this flag from their slaves/ports.
>
> I had to use SWITCH in the name to avoid ambiguity with other feature
> flags. But, since i have been harping about not calling it 'switch',
> I am welcome to any suggestions :)
>
> An alternative to using a feature flag is to use a IFF_HW_OFFLOAD
> in net_device_flags.
> ---
>   include/linux/netdev_features.h |    2 ++
>   1 file changed, 2 insertions(+)
>
> diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
> index 8e30685..68db1de 100644
> --- a/include/linux/netdev_features.h
> +++ b/include/linux/netdev_features.h
> @@ -66,6 +66,7 @@ enum {
>   	NETIF_F_HW_VLAN_STAG_FILTER_BIT,/* Receive filtering on VLAN STAGs */
>   	NETIF_F_HW_L2FW_DOFFLOAD_BIT,	/* Allow L2 Forwarding in Hardware */
>   	NETIF_F_BUSY_POLL_BIT,		/* Busy poll */
> +	NETIF_F_HW_SWITCH_OFFLOAD_BIT,  /* HW switch offload */
I am interested in this flag very much, but I am not very clear
how many offload capabilities does this flag imply.  If this flag
belongs to a general flag and can be accepted by all vendors,
I will reuse this flag to introduce another out going data traffics
distribution offload method to bonding driver.

Thanks & B.R.
Jianhua
>   
>   	/*
>   	 * Add your fresh new feature above and remember to update
> @@ -124,6 +125,7 @@ enum {
>   #define NETIF_F_HW_VLAN_STAG_TX	__NETIF_F(HW_VLAN_STAG_TX)
>   #define NETIF_F_HW_L2FW_DOFFLOAD	__NETIF_F(HW_L2FW_DOFFLOAD)
>   #define NETIF_F_BUSY_POLL	__NETIF_F(BUSY_POLL)
> +#define NETIF_F_HW_SWITCH_OFFLOAD	__NETIF_F(HW_SWITCH_OFFLOAD)
>   
>   /* Features valid for ethtool to change */
>   /* = all defined minus driver/device-class-related */

^ permalink raw reply

* Re: [patch iproute2 0/6] iproute2: add changes for switchdev
From: Roopa Prabhu @ 2014-12-05  2:28 UTC (permalink / raw)
  To: Scott Feldman
  Cc: Jiri Pirko, Netdev, David S. Miller, nhorman@tuxdriver.com,
	Andy Gospodarek, Thomas Graf, dborkman@redhat.com,
	ogerlitz@mellanox.com, jesse@nicira.com, pshelar@nicira.com,
	azhou@nicira.com, ben@decadent.org.uk, stephen@networkplumber.org,
	Kirsher, Jeffrey T, vyasevic@redhat.com, Cong Wang,
	Fastabend, John R, Eric Dumazet, Jamal Hadi Salim,
	Florian Fainelli, John Linville
In-Reply-To: <CAE4R7bCyf7VBFWqEme37rf0YbTc3-s5Qka+1HMOsdMgVC3qZ-A@mail.gmail.com>

On 12/4/14, 12:49 PM, Scott Feldman wrote:
> On Thu, Dec 4, 2014 at 8:55 AM, Roopa Prabhu <roopa@cumulusnetworks.com> wrote:
>> On 12/4/14, 8:04 AM, Jiri Pirko wrote:
>>> Thu, Dec 04, 2014 at 03:45:44PM CET, roopa@cumulusnetworks.com wrote:
>>>> On 12/4/14, 6:34 AM, Jiri Pirko wrote:
>>>>> Thu, Dec 04, 2014 at 03:26:50PM CET, roopa@cumulusnetworks.com wrote:
>>>>>> On 12/4/14, 12:57 AM, Jiri Pirko wrote:
>>>>>>> Jiri Pirko (1):
>>>>>>>     iproute2: ipa: show switch id
>>>>>>>
>>>>>>> Scott Feldman (5):
>>>>>>>     bridge/fdb: fix statistics output spacing
>>>>>>>     bridge/fdb: add flag/indication for FDB entry synced from offload
>>>>>>>       device
>>>>>>>     bridge/link: add new offload hwmode swdev
>>>>>> Ack to most patches but nack on this one. The todo list still has a
>>>>>> note to
>>>>>> revisit the flag to indicate switchdev offloads.
>>>>>> Exposing this to userspace does not help that.
>>>>> Hmm, note that this is already exposed to userspace, this patchset is
>>>>> for iproute2 (userspace tool).
>>>> hmmm, all feedback on the switchdev patches seemed to indicate we can
>>>> change
>>>> this later.
>>>> I don't see swdev mode being used in the kernel anywhere today.
>>> Well, it is, in rocker:
>>> $ git grep BRIDGE_MODE_SWDEV
>>> drivers/net/ethernet/rocker/rocker.c:                   if (mode !=
>>> BRIDGE_MODE_SWDEV)
>>> drivers/net/ethernet/rocker/rocker.c:   u16 mode = BRIDGE_MODE_SWDEV;
>>> include/uapi/linux/if_bridge.h:#define BRIDGE_MODE_SWDEV        2       /*
>>> Full switch device offload */
>>
>> The problem is rocker is not the only one who is going to be using this. And
>> so, we need something that fits everybody.
>> And i am not going to make my user set a mode for him to enable offload to
>> hw.
>>
>>>> I will send a patch to remove it. Its still in net-next and so can be
>>>> changed
>>>> ?.
>>>> I was going to resend my patch to introduce a common offload flag for all
>>>> link objects.
>>>> It would be nice if all of them had a consistent flag to indicate hw
>>>> offload
>>>> and iproute2 could display the same flag for all.
>>>> Including bonds and vxlan's.
>>> I do not understand the connection with BRIDGE_MODE_SWDEV. We discussed
>>> this already. BRIDGE_MODE_SWDEV is a bridge mode, similar to for example
>>> BRIDGE_MODE_VEPA and makes perfect sense to have it.
>> I dont think everybody acked it. But it went in with a note saying that it
>> can be changed.
> I thought that was the plan: this new mode goes in now for net-next
> and iproute2, and you would supply follow up patch for each to move to
> your switch port flag.  That will give us time to review your work
> without have net-next and iproute2 out-of-sync.
>
>>>
>>> How vxlan and bonds come into the mixture, that is a puzzler for me.
>>> Maybe I have to see patches.
>>
>> I had posted a version of the patch previously:
>> http://www.spinics.net/lists/netdev/msg305472.html
>>
>> I have a v2 patch in my stack which does not touch the netlink header.
>> But in the past hour, i have been thinking about it some more. Do we really
>> need this set by the user ?. In my use case i don't need it.
> Look at how iproute2 figures out if SELF should be set or not.  It's
> only set if hwmode is set, otherwise it defaults to MASTER.  So with
> SWDEV a new hwmode, we can push settings (learning, learning_sync) to
> port with SELF set.  It's probably not an ideal arrangement having to
> set hwmode each time, but this was the low-touch change to iproute2 to
> push port settings.
Did not know about this. Thanks for the info.
>
> I'm hoping your new patch will kind of straighten this all out.  But
> you've got extra work to make sure backward compat with older iproute2
> still works, including this weirdness around hwmode.
>
>> We do need a feature flag (or net_device_flags), but it does not need to be
>> set by the user explicitly.
>> This flag can be set by the switch port driver on the switch ports. And the
>> logical device: bridge/bond/vxlan
>> can inherit it from the port. There was a need of a flag in some usecases,
>> to control offloading of specific bridge port flags
>> to hw/sw (example learning in hw or sw). example patch:
>> https://patchwork.ozlabs.org/patch/413211/
>>
>> I will post something today.
> Can you include matching iproute2 changes?  (Assuming you'll building
> on top of what's already in net-next and this iproute2 set Jiri sent
> out).  It's helpful to see the iproute2 changes to see what the new
> cmd structure is and how legacy is handled.
Just sent whatever I had. Take a look and let me know.

Thanks,
Roopa


>

^ permalink raw reply

* [PATCH iproute2] bridge link: add option 'self'
From: roopa @ 2014-12-05  2:27 UTC (permalink / raw)
  To: jiri, sfeldma, jhs, bcrl, tgraf, john.fastabend, stephen,
	linville, nhorman, nicolas.dichtel, vyasevic, f.fainelli, buytenh,
	aviadr
  Cc: netdev, davem, shm, gospo, Roopa Prabhu

From: Roopa Prabhu <roopa@cumulusnetworks.com>

Currently 'self' is set internally only if hwmode is set, which makes it
necessary for the hardware to have a mode. No hwmode is actually required
to go to hardware, so introduce an explicit 'self' option for anybody who
wants to target hardware.
---
 bridge/link.c |    3 +++
 1 file changed, 3 insertions(+)

diff --git a/bridge/link.c b/bridge/link.c
index 90d9e7f..2b86141 100644
--- a/bridge/link.c
+++ b/bridge/link.c
@@ -321,6 +321,9 @@ static int brlink_modify(int argc, char **argv)
 					"\"veb\".\n");
 				exit(-1);
 			}
+		} else if (strcmp(*argv, "self") == 0) {
+			NEXT_ARG();
+			flags = BRIDGE_FLAGS_SELF;
 		} else {
 			usage();
 		}
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH 3/3] rocker: set feature NETIF_F_HW_SWITCH_OFFLOAD
From: roopa @ 2014-12-05  2:26 UTC (permalink / raw)
  To: jiri, sfeldma, jhs, bcrl, tgraf, john.fastabend, stephen,
	linville, nhorman, nicolas.dichtel, vyasevic, f.fainelli, buytenh,
	aviadr
  Cc: netdev, davem, shm, gospo, Roopa Prabhu

From: Roopa Prabhu <roopa@cumulusnetworks.com>

This patch just sets the feature flag on rocker ports
---
 drivers/net/ethernet/rocker/rocker.c |    3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c
index fded127..3fe19b0 100644
--- a/drivers/net/ethernet/rocker/rocker.c
+++ b/drivers/net/ethernet/rocker/rocker.c
@@ -4003,7 +4003,8 @@ static int rocker_probe_port(struct rocker *rocker, unsigned int port_number)
 		       NAPI_POLL_WEIGHT);
 	rocker_carrier_init(rocker_port);
 
-	dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
+	dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER |
+					 NETIF_F_HW_SWITCH_OFFLOAD;
 
 	err = register_netdev(dev);
 	if (err) {
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH 2/3] bridge: offload bridge port attributes to switch asic if feature flag set
From: roopa @ 2014-12-05  2:26 UTC (permalink / raw)
  To: jiri, sfeldma, jhs, bcrl, tgraf, john.fastabend, stephen,
	linville, nhorman, nicolas.dichtel, vyasevic, f.fainelli, buytenh,
	aviadr
  Cc: netdev, davem, shm, gospo, Roopa Prabhu

From: Roopa Prabhu <roopa@cumulusnetworks.com>

This allows offloading to the switch asic without requiring the user to set
any flag. It is done in the bridge driver so that kernel settings can be
rolled back on hw offload failure if required in the future.

With this, it also makes sure a notification goes out only after the
attributes are set both in the kernel and hw.
---
 net/bridge/br_netlink.c |   27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 9f5eb55..ce173f0 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -407,9 +407,21 @@ int br_setlink(struct net_device *dev, struct nlmsghdr *nlh)
 				afspec, RTM_SETLINK);
 	}
 
+	if ((dev->features & NETIF_F_HW_SWITCH_OFFLOAD) &&
+			dev->netdev_ops->ndo_bridge_setlink) {
+		int ret = dev->netdev_ops->ndo_bridge_setlink(dev, nlh);
+		if (ret && ret != -EOPNOTSUPP) {
+			/* XXX Fix this in the future to rollback
+			 * kernel settings and return error
+			 */
+			br_warn(p->br, "error offloading bridge attributes " 
+					"on port %u(%s)\n", (unsigned int) p->port_no,
+					p->dev->name);
+		}
+	}
+
 	if (err == 0)
 		br_ifinfo_notify(RTM_NEWLINK, p);
-
 out:
 	return err;
 }
@@ -433,6 +445,19 @@ int br_dellink(struct net_device *dev, struct nlmsghdr *nlh)
 	err = br_afspec((struct net_bridge *)netdev_priv(dev), p,
 			afspec, RTM_DELLINK);
 
+	if (dev->features & NETIF_F_HW_SWITCH_OFFLOAD
+			&& dev->netdev_ops->ndo_bridge_setlink) {
+		int ret = dev->netdev_ops->ndo_bridge_dellink(dev, nlh);
+		if (ret && ret != -EOPNOTSUPP) {
+			/* XXX Fix this in the future to rollback
+			 * kernel settings and return error
+			 */
+			br_warn(p->br, "error offloading bridge attributes " 
+					"on port %u(%s)\n", (unsigned int) p->port_no,
+					p->dev->name);
+		}
+	}
+
 	return err;
 }
 static int br_validate(struct nlattr *tb[], struct nlattr *data[])
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH 1/3] netdev: introduce new NETIF_F_HW_SWITCH_OFFLOAD feature flag for switch device offloads
From: roopa @ 2014-12-05  2:26 UTC (permalink / raw)
  To: jiri, sfeldma, jhs, bcrl, tgraf, john.fastabend, stephen,
	linville, nhorman, nicolas.dichtel, vyasevic, f.fainelli, buytenh,
	aviadr
  Cc: netdev, davem, shm, gospo, Roopa Prabhu

From: Roopa Prabhu <roopa@cumulusnetworks.com>

This is a generic high level feature flag for all switch asic features today.

switch drivers set this flag on switch ports. Logical devices like
bridge, bonds, vxlans can inherit this flag from their slaves/ports.

I had to use SWITCH in the name to avoid ambiguity with other feature
flags. But, since i have been harping about not calling it 'switch',
I am welcome to any suggestions :)

An alternative to using a feature flag is to use a IFF_HW_OFFLOAD
in net_device_flags.
---
 include/linux/netdev_features.h |    2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index 8e30685..68db1de 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -66,6 +66,7 @@ enum {
 	NETIF_F_HW_VLAN_STAG_FILTER_BIT,/* Receive filtering on VLAN STAGs */
 	NETIF_F_HW_L2FW_DOFFLOAD_BIT,	/* Allow L2 Forwarding in Hardware */
 	NETIF_F_BUSY_POLL_BIT,		/* Busy poll */
+	NETIF_F_HW_SWITCH_OFFLOAD_BIT,  /* HW switch offload */
 
 	/*
 	 * Add your fresh new feature above and remember to update
@@ -124,6 +125,7 @@ enum {
 #define NETIF_F_HW_VLAN_STAG_TX	__NETIF_F(HW_VLAN_STAG_TX)
 #define NETIF_F_HW_L2FW_DOFFLOAD	__NETIF_F(HW_L2FW_DOFFLOAD)
 #define NETIF_F_BUSY_POLL	__NETIF_F(BUSY_POLL)
+#define NETIF_F_HW_SWITCH_OFFLOAD	__NETIF_F(HW_SWITCH_OFFLOAD)
 
 /* Features valid for ethtool to change */
 /* = all defined minus driver/device-class-related */
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH net-next 0/3] switchdev offload flags
From: roopa @ 2014-12-05  2:26 UTC (permalink / raw)
  To: jiri, sfeldma, jhs, bcrl, tgraf, john.fastabend, stephen,
	linville, nhorman, nicolas.dichtel, vyasevic, f.fainelli, buytenh,
	aviadr
  Cc: netdev, davem, shm, gospo, Roopa Prabhu

From: Roopa Prabhu <roopa@cumulusnetworks.com>

This patch series only addresses bridge link attribute offloads to hardware.

It is a continuation of my previous series on switchdev policy attributes:
http://www.spinics.net/lists/netdev/msg305469.html

Looking at the current state of bridge l2 offload in the kernel,
    - flag 'self' is the way to directly manage the bridge device in hw via
      the ndo_bridge_setlink/ndo_bridge_getlink calls
 
    - flag 'master' is always used to manage the in kernel bridge devices
      via the same ndo_bridge_setlink/ndo_bridge_getlink calls

Today these are used separately. The nic offloads use hwmode "vepa/veb" to go
directly to hw with the "self" flag.

At this point I am trying not to introduce any new user-facing flags/attributes.

In the model where we want the kernel bridge device to be offloaded to
hardware (In other words, sync kernel bridge state to hw),
we very much want the bridge driver to be involved.

With this patch series,
When the user sends a bridge setlink message, it will come in with 'master',
        - go to the bridge device,
        - set settings in the kernel
        - if offload mode is set on the port, also call the port driver
          offload ndo_bridge_setlink
        If you want to act on the hw alone, you can still use the self flag to
        go to the hw or port driver directly.
        (There is no need to specify a hardware mode to go to the port driver)

(To selectively offload bridge port attributes,
for example learning in hw only, we can introduce offload bits for
each per-bridge-port flag attribute, as in my previous patch
https://patchwork.ozlabs.org/patch/413211/. I have not included that in this
series)


Roopa Prabhu (3):
  netdev: introduce new NETIF_F_HW_SWITCH_OFFLOAD feature flag for
    switch device offloads
  bridge: offload bridge port attributes to switch asic if feature flag
    set
  rocker: set feature NETIF_F_HW_SWITCH_OFFLOAD

 drivers/net/ethernet/rocker/rocker.c |    3 ++-
 include/linux/netdev_features.h      |    2 ++
 net/bridge/br_netlink.c              |   27 ++++++++++++++++++++++++++-
 3 files changed, 30 insertions(+), 2 deletions(-)

-- 
1.7.10.4

^ permalink raw reply

* Re: 3.12.33 - BUG xfrm_selector_match+0x25/0x2f6
From: Smart Weblications GmbH - Florian Wiessner @ 2014-12-05  2:23 UTC (permalink / raw)
  To: Julian Anastasov, Steffen Klassert; +Cc: netdev, LKML, stable
In-Reply-To: <alpine.LFD.2.11.1412042338370.4841@ja.home.ssi.bg>

Hi,

Am 05.12.2014 00:15, schrieb Julian Anastasov:
> 
> 	Hello,
> 
> On Thu, 4 Dec 2014, Steffen Klassert wrote:
> 
>>> [16623.096721] Call Trace:
>>> [16623.096744]  <IRQ>
>>> [16623.096749]  [<ffffffff81547a7c>] ? xfrm_sk_policy_lookup+0x44/0x9b
>>> [16623.096802]  [<ffffffff81547ef7>] ? xfrm_lookup+0x91/0x446
>>> [16623.096832]  [<ffffffff81541316>] ? ip_route_me_harder+0x150/0x1b0
>>> [16623.096865]  [<ffffffffa01b6457>] ? ip_vs_route_me_harder+0x86/0x91 [ip_vs]
>>> [16623.096899]  [<ffffffffa01b797a>] ? ip_vs_out+0x2d3/0x5bc [ip_vs]
>>> [16623.096930]  [<ffffffff81501420>] ? ip_rcv_finish+0x2b8/0x2b8
>>
>> I really wonder why the xfrm_sk_policy_lookup codepath is taken here.
>> It looks like this is the processing of an inbound ipv4 packet that
>> is going to be rerouted to the output path by ipvs, so this packet
>> should not have socket context at all.
> 
> 	In above trace looks like IPVS-NAT is used between
> local client and some real server. IPVS handles this skb
> at LOCAL_IN and calls ip_vs_route_me_harder(). If we have
> skb->sk at LOCAL_IN, my first thought is about early demux.
> 
> 	If I remember correctly, looking at commit f5a41847acc535e2
> ("ipvs: move ip_route_me_harder for ICMP") that introduced
> this rerouting (2.6.37), it was needed because at that time TCP
> used rt_src from received skb to select daddr in ip_send_reply().
> As packets to server are DNAT-ed and packets to client are
> SNAT-ed we used rerouting to fill rt_src with correct IP
> after SNAT.
> 
> 	Now when routing cache is removed in 3.6 and
> tcp_v4_send_reset() is changed to provide ip_hdr(skb)->saddr
> instead of rt_src it should be safe to remove this rerouting,
> it is enough that ip_hdr(skb)->saddr was updated on IPVS-SNAT at
> LOCAL_IN. In fact, rt_src was removed early in 3.0 with
> commit 0a5ebb8000c5362 ("ipv4: Pass explicit daddr arg to 
> ip_send_reply().").
> 
> 	This is only to explain above stack. Not sure
> if problem is related somehow to early demux but such
> commits look interesting:
> 
> - commit 6b8dbcf2c44fd7a ("bridge: netfilter: orphan skb before invoking 
> ip netfilter hooks")
> 
> 	Also, it would be good to know which 3.x kernel between
> 3.13 and 3.17 fixes the problem, it will narrow the search.
> 


I tried with 3.12.33 without any XFRM and now got this one (which is reproducible):

[  233.956012] BUG: unable to handle kernel NULL pointer dereference at 00000000
                                   00000014
[  233.956218] IP: [<ffffffffa013a470>] nf_ct_seqadj_set+0x60/0x90 [nf_conntrack
                                   ]
[  233.956371] PGD 0
[  233.956493] Oops: 0000 [#1] SMP
[  233.956680] Modules linked in: netconsole xt_nat xt_multiport veth iptable_ma
                                   ngle xt_mark nf_conntrack_netlink nfnetlink
ip_vs_rr ipt_MASQUERADE iptable_nat
nf_nat_ipv4 nf_conntrack_ipv4 nf_defrag_ipv4 ipt_REJECT xt_tcpudp iptable_filter
                                    ip_tables cpufreq_ondemand cpufreq_powersave
cpufreq_conservative cpufreq_users                                    pace
ocfs2_stack_o2cb ocfs2_dlm bridge stp llc bonding fuse nf_conntrack_ftp 802
                               1q openvswitch gre vxlan xt_conntrack x_tables
ocfs2_dlmfs dlm sctp ocfs2 ocfs2_                                    nodemanager
ocfs2_stackglue configfs rbd kvm_intel kvm coretemp ip_vs_ftp ip_vs
                        nf_nat nf_conntrack psmouse i2c_i801 serio_raw lpc_ich
mfd_core evdev btrfs lzo_                                    decompress lzo_compress
[  233.960221] CPU: 2 PID: 29996 Comm: vsftpd Not tainted 3.12.33 #4
[  233.960298] Hardware name: Supermicro X9SCI/X9SCA/X9SCI/X9SCA, BIOS 1.1a 09/2
                                   8/2011
[  233.960395] task: ffff88075e87a2c0 ti: ffff8806a7444000 task.ti: ffff8806a744
                                   4000
[  233.960486] RIP: 0010:[<ffffffffa013a470>]  [<ffffffffa013a470>] nf_ct_seqadj
                                   _set+0x60/0x90 [nf_conntrack]
[  233.960632] RSP: 0018:ffff88083fc83998  EFLAGS: 00010206
[  233.960709] RAX: 000000000000000c RBX: ffff8806cab452cc RCX: 0000000000000003
[  233.960791] RDX: 0000000000000029 RSI: 0000000000000003 RDI: ffff8806cab452cc
[  233.960875] RBP: 00000000ee38035a R08: ffff8807e2b1edc0 R09: ffff88083fc839a8
[  233.960957] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000003
[  233.961041] R13: 0000000000000000 R14: 0000000000000003 R15: ffff8806a75a50bc
[  233.961124] FS:  00007ff22daec700(0000) GS:ffff88083fc80000(0000) knlGS:00000
                                   00000000000
[  233.961226] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  233.961303] CR2: 0000000000000014 CR3: 00000006b3259000 CR4: 00000000000407e0
[  233.961384] Stack:
[  233.961460]  ffff880815612b60 0000000000000012 0000000000000014 ffff8806cab45
                                   2c8
[  233.961776]  ffff8806a75a5001 ffffffffa014f681 0000000000000000 ffffffff00000
                                   045
[  233.962095]  ffff880800000048 0000001b00000003 ffff88083fc83a70 ffff880815612
                                   b60
[  233.962411] Call Trace:
[  233.962482]  <IRQ>
[  233.962538]  [<ffffffffa014f681>] ? __nf_nat_mangle_tcp_packet+0x109/0x120 [n
                                   f_nat]
[  233.962762]  [<ffffffffa017749e>] ? ip_vs_ftp_out.part.8+0x2b2/0x338 [ip_vs_f
                                   tp]
[  233.962866]  [<ffffffff814cb8c0>] ? __domain_mapping+0x25d/0x2a3
[  233.962949]  [<ffffffff8154140c>] ? fib_table_lookup+0xe4/0x255
[  233.963032]  [<ffffffffa015f858>] ? ip_vs_app_pkt_out+0x105/0x18b [ip_vs]
[  233.963110]  [<ffffffffa0162ffc>] ? tcp_snat_handler+0x6b/0x320 [ip_vs]
[  233.963189]  [<ffffffffa0155d3d>] ? ip_vs_conn_out_get_proto+0x1c/0x25 [ip_vs
                                   ]
[  233.963284]  [<ffffffffa0158937>] ? ip_vs_out+0x290/0x5bc [ip_vs]
[  233.963362]  [<ffffffff8150f544>] ? ip_frag_mem+0x2a/0x2a
[  233.963442]  [<ffffffff81508e1f>] ? nf_iterate+0x42/0x80
[  233.963519]  [<ffffffff81508ec6>] ? nf_hook_slow+0x69/0xff
[  233.963595]  [<ffffffff8150f544>] ? ip_frag_mem+0x2a/0x2a
[  233.963667]  [<ffffffff8150f8ae>] ? ip_forward+0x22d/0x2cf
[  233.963744]  [<ffffffff814e57ce>] ? __netif_receive_skb_core+0x5f0/0x66c
[  233.963826]  [<ffffffff814e59df>] ? process_backlog+0x13e/0x13e
[  233.963911]  [<ffffffffa0455e09>] ? br_handle_frame_finish+0x382/0x382 [bridg
                                   e]
[  233.964008]  [<ffffffff814e5a2b>] ? netif_receive_skb+0x4c/0x7d
[  233.964090]  [<ffffffffa0455d95>] ? br_handle_frame_finish+0x30e/0x382 [bridg
                                   e]
[  233.964186]  [<ffffffffa0455fda>] ? br_handle_frame+0x1d1/0x217 [bridge]
[  233.964267]  [<ffffffff814e567d>] ? __netif_receive_skb_core+0x49f/0x66c
[  233.964350]  [<ffffffff814e592b>] ? process_backlog+0x8a/0x13e
[  233.964429]  [<ffffffff814e5c31>] ? net_rx_action+0xa2/0x1c0
[  233.964508]  [<ffffffff81047e2e>] ? __do_softirq+0xf6/0x24f
[  233.964588]  [<ffffffff8106cbfd>] ? account_system_time+0x10f/0x169
[  233.964669]  [<ffffffff815ad7dc>] ? call_softirq+0x1c/0x30
[  233.964743]  <EOI>
[  233.964801]  [<ffffffff8100464d>] ? do_softirq+0x2c/0x5f
[  233.965013]  [<ffffffff81047ca1>] ? local_bh_enable+0x67/0x85
[  233.965088]  [<ffffffff81511689>] ? ip_finish_output+0x2c9/0x322
[  233.965165]  [<ffffffff8151240a>] ? ip_queue_xmit+0x2b7/0x2f0
[  233.965239]  [<ffffffff81524772>] ? tcp_transmit_skb+0x6ef/0x755
[  233.965316]  [<ffffffff815250e8>] ? tcp_write_xmit+0x886/0x9cb
[  233.965391]  [<ffffffff8152527a>] ? __tcp_push_pending_frames+0x24/0x7e
[  233.965473]  [<ffffffff8151a33c>] ? tcp_sendmsg+0xa4c/0xbfc
[  233.965550]  [<ffffffff814d3477>] ? sock_aio_write+0xe3/0xfd
[  233.965631]  [<ffffffff81122f4d>] ? do_sync_write+0x59/0x79
[  233.965709]  [<ffffffff811239e3>] ? vfs_write+0xc4/0x182
[  233.965786]  [<ffffffff81123daf>] ? SyS_write+0x45/0x7c
[  233.965864]  [<ffffffff815ac35b>] ? tracesys+0xdd/0xe2
[  233.965940] Code: 68 14 4d 01 c5 45 85 e4 74 46 f0 80 4f 78 40 48 8d 5f 04 48
                                    89 df e8 00 12 47 e1 31 c0 41 83 fe 02 0f 97
c0 48 6b c0 0c 4c 01 e8 <8b> 70 08                                     39 70 04
74 08 89 ea 0f ca 39 10 79 0d 89 70 04 44 01
[  233.969602] RIP  [<ffffffffa013a470>] nf_ct_seqadj_set+0x60/0x90 [nf_conntrac
                                   k]
[  233.969746]  RSP <ffff88083fc83998>
[  233.969816] CR2: 0000000000000014
[  233.969919] ---[ end trace c6faf7aa989b11c2 ]---
[  233.969999] Kernel panic - not syncing: Fatal exception in interrupt
[  233.970081] Rebooting in 10 seconds..
[  244.029931] ACPI MEMORY or I/O RESET_REG.


node01:/ocfs2/usr/src/linux-3.12.33/scripts# ./decodecode < /tmp/oops-ipvsftp.txt
[ 233.965940] Code: 68 14 4d 01 c5 45 85 e4 74 46 f0 80 4f 78 40 48 8d 5f 04 48
89 df e8 00 12 47 e1 31 c0 41 83 fe 02 0f 97 c0 48 6b c0 0c 4c 01 e8 <8b> 70 08
39 70 04 74 08 89 ea 0f ca 39 10 79 0d 89 70 04 44 01
All code
========
   0:   68 14 4d 01 c5          pushq  $0xffffffffc5014d14
   5:   45 85 e4                test   %r12d,%r12d
   8:   74 46                   je     0x50
   a:   f0 80 4f 78 40          lock orb $0x40,0x78(%rdi)
   f:   48 8d 5f 04             lea    0x4(%rdi),%rbx
  13:   48 89 df                mov    %rbx,%rdi
  16:   e8 00 12 47 e1          callq  0xffffffffe147121b
  1b:   31 c0                   xor    %eax,%eax
  1d:   41 83 fe 02             cmp    $0x2,%r14d
  21:   0f 97 c0                seta   %al
  24:   48 6b c0 0c             imul   $0xc,%rax,%rax
  28:   4c 01 e8                add    %r13,%rax
  2b:*  8b 70 08                mov    0x8(%rax),%esi           <-- trapping
instruction
  2e:   39 70 04                cmp    %esi,0x4(%rax)
  31:   74 08                   je     0x3b
  33:   89 ea                   mov    %ebp,%edx
  35:   0f ca                   bswap  %edx
  37:   39 10                   cmp    %edx,(%rax)
  39:   79 0d                   jns    0x48
  3b:   89 70 04                mov    %esi,0x4(%rax)
  3e:   44                      rex.R
  3f:   01                      .byte 0x1

Code starting with the faulting instruction
===========================================
   0:   8b 70 08                mov    0x8(%rax),%esi
   3:   39 70 04                cmp    %esi,0x4(%rax)
   6:   74 08                   je     0x10
   8:   89 ea                   mov    %ebp,%edx
   a:   0f ca                   bswap  %edx
   c:   39 10                   cmp    %edx,(%rax)
   e:   79 0d                   jns    0x1d
  10:   89 70 04                mov    %esi,0x4(%rax)
  13:   44                      rex.R
  14:   01                      .byte 0x1


setup is like this:


#virtual=<myVIP>:21
#       real=10.10.1.20:21 masq
#       real=10.10.1.21:21 masq
#       real=10.10.1.22:21 masq
#       real=10.10.1.23:21 masq
#       persistent=600
#       service=ftp
#       scheduler=rr
#       protocol=tcp
#       checktype=connect

(I commented it out to prevent further crashes...)

When ip_vs_ftp is loaded and someone tries to make an FTP connection, the system
panics instantly.

10.10.1.20 - 10.10.1.23 are lxc-containers using veth connected to the bridge
running on 4 different nodes. The node running ldirector/ipvsadm has also one of
those containers running (don't know if that matters)

brctl show
bridge name     bridge id               STP enabled     interfaces
br0             8000.00259052bbf4       no              bond0
                                                        vethMKELUc
                                                        vethXdWGqf
                                                        vethgJMmEb
                                                        vethmKNqFc


I disabled the ftp server lxc container on the node doing ip_vs, so that the
endpoint of the connection is not on the same node and tried again but with the
same result.

Unfortunately I cannot test with newer kernels than 3.12, because ocfs2 is
somehow broken in >= 3.14


-- 

Mit freundlichen Grüßen,

Florian Wiessner

Smart Weblications GmbH
Martinsberger Str. 1
D-95119 Naila

fon.: +49 9282 9638 200
fax.: +49 9282 9638 205
24/7: +49 900 144 000 00 - 0,99 EUR/Min*
http://www.smart-weblications.de

--
Sitz der Gesellschaft: Naila
Geschäftsführer: Florian Wiessner
HRB-Nr.: HRB 3840 Amtsgericht Hof
*aus dem dt. Festnetz, ggf. abweichende Preise aus dem Mobilfunknetz

^ permalink raw reply

* Re: Is this 32-bit NCM?y
From: Kevin Zhu @ 2014-12-05  2:20 UTC (permalink / raw)
  To: Enrico Mioso, Bjørn Mork
  Cc: Midge Shaojun  Tan, Eli Britstein, Alex Strizhevsky,
	youtux-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org,
	linux-usb-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
In-Reply-To: <alpine.LNX.2.03.1412041326160.9926-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset="utf-8", Size: 4821 bytes --]

Regarding the location of the NDP, it should be easy to fix. It can be added
to the end of the NTB only after it's ready to send. Regarding the
concern about other devices: as there's a particular driver for Huawei
devices in the kernel, huawei_cdc_ncm, maybe we can just fix the TX
function there to avoid breaking other devices.

Regards,
Kevin

On 12/04/2014 08:28 PM, Enrico Mioso wrote:
> ... DHCP will work with some DHCPNACKS in the meanwhile, but ping
> stops working at all.
> Otherwise, it works with the standard value:
>
> --- 8.8.8.8 ping statistics ---
> 48 packets transmitted, 48 received, 0% packet loss, time 47004ms
> rtt min/avg/max/mdev = 362.084/392.878/523.132/33.636 ms
>
> And I was expecting effectively to see some lost packets, but
> instead... no.
>
>
> On Thu, 4 Dec 2014, Bjørn Mork wrote:
>
>> Date: Thu, 4 Dec 2014 12:44:56
>> From: Bjørn Mork <bjorn@mork.no>
>> To: Midge Shaojun Tan <ShaojunMidge.Tan@audiocodes.com>
>> Cc: Enrico Mioso <mrkiko.rs@gmail.com>,
>>     Kevin Zhu <Mingying.Zhu@audiocodes.com>,
>>     Eli Britstein <Eli.Britstein@audiocodes.com>,
>>     Alex Strizhevsky <alexxst@gmail.com>,
>>     "youtux@gmail.com" <youtux@gmail.com>,
>>     "linux-usb@vger.kernel.org" <linux-usb@vger.kernel.org>,
>>     "netdev@vger.kernel.org" <netdev@vger.kernel.org>
>> Subject: Re: Is this 32-bit NCM?y
>>
>> "Midge Shaojun  Tan" <ShaojunMidge.Tan@audiocodes.com> writes:
>>
>>> Hi all,
>>>
>>> I tested OK with kernel 3.16.4.
>>> Need to disable the other Ethernet network, e.g. eth1. (Then the DNS
>>> and route are OK.)
>>> Also need to disable ARP (ifconfig wwan0 -arp up), because China
>>> UNICOM doesn't respond to the ARP message.
>>
>> The ARP functionality is independent of operator.  It is handled
>> internally by the modem firmware.  There are no MAC addresses or
>> ethernet headers transmitted over the radio link.  That's all faked by
>> the modem.  All MAC addresses and ethernet headers are local to the
>> modem<->host USB link.
>>
>>> With new mode switch string: /etc/usb_modeswitch.d/12d1:14fe
>>> Please see the patch and check whether it is correct?
>>
>> I see that you have two changes there:
>>
>> 1) the ETH_HLEN adjustment of ctx->tx_remainder is dropped
>> 2) the NDP is placed after the first frame.
>>
>> I haven't verified the effect of the tx_remainder change, but I assume
>> it fixes an alignment problem for this device.  I'd like to look more at
>> the effect of this for different values of wNdpOutPayloadRemainder and
>> wNdpOutDivisor.
>>
>> We can choose to put the NDP at the end of the NTB if we find that this
>> fixes some problem, but doing so by default for every NCM and MBIM
>> device is a bit risky. If we accept that some devices are so buggy that
>> the NDP cannot be placed anywhere (as required by the spec), then we
>> have to assume that this goes both ways.  Which means that moving the
>> NDP to the end of the NTB might break some other device.  We just don't
>> know that since we haven't ever tried it.
>>
>> And your fix doesn't really move it to the end either.  It just places
>> the NDP after the first ethernet packet.  Which happens to be the end if
>> there is only one packet in the NTB. But if we aggregate more packets
>> into this NTB then the result will look like this:
>>
>> NTH
>> eth packet 1
>> NDP
>> eth packet 2
>> ..
>> eth packet N
>>
>> I'm not convinced this modem will handle that if it cannot handle the
>> NDP being before the first packet...  This needs to be tested. Try
>> increasing /sys/class/net/wwan0/cdc_ncm/tx_timer_usecs to force the
>> driver to aggregate packets and see if everything still works.
>> Preferably while looking at the resulting NTB to verify that it does
>> contain more than one ethernet packet.
>>
>> I realize I sound a bit negative now.  This is absolutely not my
>> intention. This is great work, providing some real progress wrt figuring
>> out what goes on here.  Thanks a lot!  I am sure we can sort out the
>> remaining issues, which are really minor compared to what you have found
>> so far.
>>
>>
>>
>> Bjørn
>>
>>
This email and any files transmitted with it are confidential material. They are intended solely for the use of the designated individual or entity to whom they are addressed. If the reader of this message is not the intended recipient, you are hereby notified that any dissemination, use, distribution or copying of this communication is strictly prohibited and may be unlawful.

If you have received this email in error please immediately notify the sender and delete or destroy any copy of this message
N‹§²æìr¸›yúèšØb²X¬¶Ç§vØ^–)Þº{.nÇ+‰·¥Š{±ºÆâžØ^n‡r¡ö¦zË\x1aëh™¨è­Ú&¢îý»\x05ËÛÔØï¦v¬Îf\x1dp)¹¹br	šê+€Ê+zf£¢·hšˆ§~†­†Ûiÿûàz¹\x1e®w¥¢¸?™¨è­Ú&¢)ߢ^[f

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox