[PATCH net-next-2.6] macvtap: Add GSO/csum offload support

netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH net-next-2.6] macvtap: Add GSO/csum offload support
@ 2010-02-12 22:27 Sridhar Samudrala
  2010-02-13  6:58 ` Sridhar Samudrala
  2010-02-18 16:10 ` Arnd Bergmann
  0 siblings, 2 replies; 11+ messages in thread
From: Sridhar Samudrala @ 2010-02-12 22:27 UTC (permalink / raw)
  To: David Miller, Arnd Bergmann, Herbert Xu; +Cc: netdev

This patch adds GSO/checksum offload support to macvtap driver and applies
on top of Arnd's refcnt bugfix.
	http://patchwork.ozlabs.org/patch/45136/

Added flags field to macvtap_queue to enable/disable processing of
virtio_net_hdr via IFF_VNET_HDR. This flag is checked to prepend virtio_net_hdr
in the receive path and process/skip virtio_net_hdr in the send path.

Signed-off-by: Sridhar Samudrala <sri@us.ibm.com>

diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index fe7656b..5f70f13 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -17,6 +17,7 @@
 #include <net/net_namespace.h>
 #include <net/rtnetlink.h>
 #include <net/sock.h>
+#include <linux/virtio_net.h>
 
 /*
  * A macvtap queue is the central object of this driver, it connects
@@ -37,6 +38,7 @@ struct macvtap_queue {
 	struct socket sock;
 	struct macvlan_dev *vlan;
 	struct file *file;
+	unsigned int flags;
 };
 
 static struct proto macvtap_proto = {
@@ -286,6 +288,7 @@ static int macvtap_open(struct inode *inode, struct file *file)
 	sock_init_data(&q->sock, &q->sk);
 	q->sk.sk_allocation = GFP_ATOMIC; /* for now */
 	q->sk.sk_write_space = macvtap_sock_write_space;
+	q->flags = IFF_VNET_HDR;
 
 	err = macvtap_set_queue(dev, file, q);
 	if (err)
@@ -328,6 +331,29 @@ out:
 	return mask;
 }
 
+static inline struct sk_buff *macvtap_alloc_skb(struct sock *sk, size_t prepad,
+						size_t len, size_t linear,
+						int noblock, int *err)
+{
+	struct sk_buff *skb;
+
+	/* Under a page?  Don't bother with paged skb. */
+	if (prepad + len < PAGE_SIZE || !linear)
+		linear = len;
+
+	skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
+				   err);
+	if (!skb)
+		return NULL;
+
+	skb_reserve(skb, prepad);
+	skb_put(skb, linear);
+	skb->data_len = len - linear;
+	skb->len += len - linear;
+
+	return skb;
+}
+
 /* Get packet from user space buffer */
 static ssize_t macvtap_get_user(struct macvtap_queue *q,
 				const struct iovec *iv, size_t count,
@@ -336,31 +362,99 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q,
 	struct sk_buff *skb;
 	size_t len = count;
 	int err;
+	struct virtio_net_hdr vnet_hdr = { 0 };
+	int vnet_hdr_len = 0;
+	unsigned short gso_type = 0;
+
+	if (q->flags & IFF_VNET_HDR) {
+		vnet_hdr_len = sizeof(vnet_hdr); 
+
+		err = -EINVAL;
+		if ((len -= vnet_hdr_len) < 0)
+			goto out;
+
+		err = (memcpy_fromiovecend((void *)&vnet_hdr, iv, 0,
+					   vnet_hdr_len));
+		if (err < 0)
+			goto out;			
+
+		if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
+		     vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 > 
+							vnet_hdr.hdr_len)
+			vnet_hdr.hdr_len = vnet_hdr.csum_start +
+						vnet_hdr.csum_offset + 2;
+
+		err = -EINVAL;
+		if (vnet_hdr.hdr_len > len)
+			goto out;
+
+		if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
+			switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
+			case VIRTIO_NET_HDR_GSO_TCPV4:
+				gso_type = SKB_GSO_TCPV4;
+				break;
+			case VIRTIO_NET_HDR_GSO_TCPV6:
+				gso_type = SKB_GSO_TCPV6;
+				break;
+			case VIRTIO_NET_HDR_GSO_UDP:
+				gso_type = SKB_GSO_UDP;
+				break;
+			default:
+				goto out;
+			}
+
+			if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
+				gso_type |= SKB_GSO_TCP_ECN;
+               
+			if (vnet_hdr.gso_size == 0)
+				goto out;
+		}
+	}
 
 	if (unlikely(len < ETH_HLEN))
-		return -EINVAL;
+		goto out;
 
-	skb = sock_alloc_send_skb(&q->sk, NET_IP_ALIGN + len, noblock, &err);
+	skb = macvtap_alloc_skb(&q->sk, NET_IP_ALIGN, len, vnet_hdr.hdr_len,
+				noblock, &err);
+	if (!skb)
+		goto out;
 
-	if (!skb) {
-		macvlan_count_rx(q->vlan, 0, false, false);
-		return err;
-	}
+	err = -EFAULT;
+	if (skb_copy_datagram_from_iovec(skb, 0, iv, vnet_hdr_len, len))
+		goto out_free;
 
-	skb_reserve(skb, NET_IP_ALIGN);
-	skb_put(skb, count);
+	skb_set_network_header(skb, ETH_HLEN);
+	skb_reset_mac_header(skb);
+	skb->protocol = eth_hdr(skb)->h_proto;
+
+	if (vnet_hdr_len) {
+		if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
+			err = -EINVAL;
+			if (!skb_partial_csum_set(skb, vnet_hdr.csum_start,
+						  vnet_hdr.csum_offset))
+				goto out_free;
+		}
 
-	if (skb_copy_datagram_from_iovec(skb, 0, iv, 0, len)) {
-		macvlan_count_rx(q->vlan, 0, false, false);
-		kfree_skb(skb);
-		return -EFAULT;
-	}
+		if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
+			skb_shinfo(skb)->gso_size = vnet_hdr.gso_size;
+			skb_shinfo(skb)->gso_type = gso_type;
 
-	skb_set_network_header(skb, ETH_HLEN);
+			/* Header must be checked, and gso_segs computed. */
+			skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
+			skb_shinfo(skb)->gso_segs = 0;
+		}
+	}
 
 	macvlan_start_xmit(skb, q->vlan->dev);
 
+	macvlan_count_rx(q->vlan, skb->len, 1, 0);
 	return count;
+
+out_free:
+	kfree_skb(skb);
+out:
+	macvlan_count_rx(q->vlan, 0, false, false);
+	return -EINVAL;
 }
 
 static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv,
@@ -387,14 +481,54 @@ static ssize_t macvtap_put_user(struct macvtap_queue *q,
 {
 	struct macvlan_dev *vlan = q->vlan;
 	int ret;
+	int vnet_hdr_len = 0;
+
+	if (q->flags & IFF_VNET_HDR) {
+		struct virtio_net_hdr vnet_hdr = { 0 };
+
+		vnet_hdr_len = sizeof(vnet_hdr);
+		if ((len -= vnet_hdr_len) < 0)
+			return -EINVAL;
+
+		if (skb_is_gso(skb)) {
+			struct skb_shared_info *sinfo = skb_shinfo(skb);
+
+			/* This is a hint as to how much should be linear. */
+			vnet_hdr.hdr_len = skb_headlen(skb);
+			vnet_hdr.gso_size = sinfo->gso_size;
+			if (sinfo->gso_type & SKB_GSO_TCPV4)
+				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
+			else if (sinfo->gso_type & SKB_GSO_TCPV6)
+				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
+			else if (sinfo->gso_type & SKB_GSO_UDP)
+				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
+			else
+				BUG();
+			if (sinfo->gso_type & SKB_GSO_TCP_ECN)
+				vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
+		} else
+			vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
+
+		if (skb->ip_summed == CHECKSUM_PARTIAL) {
+			vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
+			vnet_hdr.csum_start = skb->csum_start -
+						skb_headroom(skb);
+			vnet_hdr.csum_offset = skb->csum_offset;
+		} /* else everything is zero */
+
+		if (unlikely(memcpy_toiovecend(iv, (void *)&vnet_hdr, 0,
+								vnet_hdr_len)))
+			return -EFAULT;
+	}
+
 
 	len = min_t(int, skb->len, len);
 
-	ret = skb_copy_datagram_const_iovec(skb, 0, iv, 0, len);
+	ret = skb_copy_datagram_const_iovec(skb, 0, iv, vnet_hdr_len, len);
 
 	macvlan_count_rx(vlan, len, ret == 0, 0);
 
-	return ret ? ret : len;
+	return ret ? ret : (len + vnet_hdr_len);
 }
 
 static ssize_t macvtap_aio_read(struct kiocb *iocb, const struct iovec *iv,
@@ -460,14 +594,23 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd,
 	unsigned int __user *up = argp;
 	unsigned int u;
 	char devname[IFNAMSIZ];
+	int ret;
 
 	switch (cmd) {
 	case TUNSETIFF:
 		/* ignore the name, just look at flags */
 		if (get_user(u, &ifr->ifr_flags))
 			return -EFAULT;
-		if (u != (IFF_TAP | IFF_NO_PI))
-			return -EINVAL;
+		q = macvtap_file_get_queue(file);
+		if (!q)
+			return -ENOLINK;
+
+		if (u & IFF_VNET_HDR)
+			q->flags |= IFF_VNET_HDR;
+		else
+			q->flags &= ~IFF_VNET_HDR;
+
+		macvtap_file_put_queue(q);
 		return 0;
 
 	case TUNGETIFF:
@@ -475,17 +618,23 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd,
 		if (!q)
 			return -ENOLINK;
 		memcpy(devname, q->vlan->dev->name, sizeof(devname));
-		macvtap_file_put_queue(q);
 
+		ret = 0;
 		if (copy_to_user(&ifr->ifr_name, q->vlan->dev->name, IFNAMSIZ) ||
-		    put_user((TUN_TAP_DEV | TUN_NO_PI), &ifr->ifr_flags))
-			return -EFAULT;
-		return 0;
+		    put_user(q->flags, &ifr->ifr_flags))
+			ret =  -EFAULT;
+		macvtap_file_put_queue(q);
+		return ret;
 
 	case TUNGETFEATURES:
-		if (put_user((IFF_TAP | IFF_NO_PI), up))
-			return -EFAULT;
-		return 0;
+		q = macvtap_file_get_queue(file);
+		if (!q)
+			return -ENOLINK;
+		ret = 0;
+		if (put_user(q->flags, up))
+			ret = -EFAULT;
+		macvtap_file_put_queue(q);
+		return ret;
 
 	case TUNSETSNDBUF:
 		if (get_user(u, up))
@@ -499,18 +648,14 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd,
 		return 0;
 
 	case TUNSETOFFLOAD:
-		/* let the user check for future flags */
-		if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
-			  TUN_F_TSO_ECN | TUN_F_UFO))
-			return -EINVAL;
-
-		/* TODO: add support for these, so far we don't
-			 support any offload */
-		if (arg & (TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
-			 TUN_F_TSO_ECN | TUN_F_UFO))
-			return -EINVAL;
-
-		return 0;
+		q = macvtap_file_get_queue(file);
+		if (!q)
+			return -ENOLINK;
+		ret = 0;
+		if (!(q->flags & IFF_VNET_HDR))
+			ret =  -EINVAL;
+		macvtap_file_put_queue(q);
+		return ret;
 
 	default:
 		return -EINVAL;





^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH net-next-2.6] macvtap: Add GSO/csum offload support
  2010-02-12 22:27 [PATCH net-next-2.6] macvtap: Add GSO/csum offload support Sridhar Samudrala
@ 2010-02-13  6:58 ` Sridhar Samudrala
  2010-02-13 17:34   ` Arnd Bergmann
  2010-02-18 16:10 ` Arnd Bergmann
  1 sibling, 1 reply; 11+ messages in thread
From: Sridhar Samudrala @ 2010-02-13  6:58 UTC (permalink / raw)
  To: David Miller; +Cc: Arnd Bergmann, Herbert Xu, netdev

Please ignore my original version of the patch. It has some trailing 
whitespace errors. Here is an updated version that should apply cleanly.

This patch adds GSO/checksum offload support to macvtap driver and applies
on top of Arnd's refcnt bugfix.
	http://patchwork.ozlabs.org/patch/45136/

Added flags field to macvtap_queue to enable/disable processing of
virtio_net_hdr via IFF_VNET_HDR. This flag is checked to prepend virtio_net_hdr
in the receive path and process/skip virtio_net_hdr in the send path.

Signed-off-by: Sridhar Samudrala <sri@us.ibm.com>

diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index fe7656b..5f70f13 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -17,6 +17,7 @@
 #include <net/net_namespace.h>
 #include <net/rtnetlink.h>
 #include <net/sock.h>
+#include <linux/virtio_net.h>
 
 /*
  * A macvtap queue is the central object of this driver, it connects
@@ -37,6 +38,7 @@ struct macvtap_queue {
 	struct socket sock;
 	struct macvlan_dev *vlan;
 	struct file *file;
+	unsigned int flags;
 };
 
 static struct proto macvtap_proto = {
@@ -286,6 +288,7 @@ static int macvtap_open(struct inode *inode, struct file *file)
 	sock_init_data(&q->sock, &q->sk);
 	q->sk.sk_allocation = GFP_ATOMIC; /* for now */
 	q->sk.sk_write_space = macvtap_sock_write_space;
+	q->flags = IFF_VNET_HDR;
 
 	err = macvtap_set_queue(dev, file, q);
 	if (err)
@@ -328,6 +331,29 @@ out:
 	return mask;
 }
 
+static inline struct sk_buff *macvtap_alloc_skb(struct sock *sk, size_t prepad,
+						size_t len, size_t linear,
+						int noblock, int *err)
+{
+	struct sk_buff *skb;
+
+	/* Under a page?  Don't bother with paged skb. */
+	if (prepad + len < PAGE_SIZE || !linear)
+		linear = len;
+
+	skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
+				   err);
+	if (!skb)
+		return NULL;
+
+	skb_reserve(skb, prepad);
+	skb_put(skb, linear);
+	skb->data_len = len - linear;
+	skb->len += len - linear;
+
+	return skb;
+}
+
 /* Get packet from user space buffer */
 static ssize_t macvtap_get_user(struct macvtap_queue *q,
 				const struct iovec *iv, size_t count,
@@ -336,31 +362,99 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q,
 	struct sk_buff *skb;
 	size_t len = count;
 	int err;
+	struct virtio_net_hdr vnet_hdr = { 0 };
+	int vnet_hdr_len = 0;
+	unsigned short gso_type = 0;
+
+	if (q->flags & IFF_VNET_HDR) {
+		vnet_hdr_len = sizeof(vnet_hdr);
+
+		err = -EINVAL;
+		if ((len -= vnet_hdr_len) < 0)
+			goto out;
+
+		err = (memcpy_fromiovecend((void *)&vnet_hdr, iv, 0,
+					   vnet_hdr_len));
+		if (err < 0)
+			goto out;
+
+		if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
+		     vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
+							vnet_hdr.hdr_len)
+			vnet_hdr.hdr_len = vnet_hdr.csum_start +
+						vnet_hdr.csum_offset + 2;
+
+		err = -EINVAL;
+		if (vnet_hdr.hdr_len > len)
+			goto out;
+
+		if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
+			switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
+			case VIRTIO_NET_HDR_GSO_TCPV4:
+				gso_type = SKB_GSO_TCPV4;
+				break;
+			case VIRTIO_NET_HDR_GSO_TCPV6:
+				gso_type = SKB_GSO_TCPV6;
+				break;
+			case VIRTIO_NET_HDR_GSO_UDP:
+				gso_type = SKB_GSO_UDP;
+				break;
+			default:
+				goto out;
+			}
+
+			if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
+				gso_type |= SKB_GSO_TCP_ECN;
+
+			if (vnet_hdr.gso_size == 0)
+				goto out;
+		}
+	}
 
 	if (unlikely(len < ETH_HLEN))
-		return -EINVAL;
+		goto out;
 
-	skb = sock_alloc_send_skb(&q->sk, NET_IP_ALIGN + len, noblock, &err);
+	skb = macvtap_alloc_skb(&q->sk, NET_IP_ALIGN, len, vnet_hdr.hdr_len,
+				noblock, &err);
+	if (!skb)
+		goto out;
 
-	if (!skb) {
-		macvlan_count_rx(q->vlan, 0, false, false);
-		return err;
-	}
+	err = -EFAULT;
+	if (skb_copy_datagram_from_iovec(skb, 0, iv, vnet_hdr_len, len))
+		goto out_free;
 
-	skb_reserve(skb, NET_IP_ALIGN);
-	skb_put(skb, count);
+	skb_set_network_header(skb, ETH_HLEN);
+	skb_reset_mac_header(skb);
+	skb->protocol = eth_hdr(skb)->h_proto;
+
+	if (vnet_hdr_len) {
+		if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
+			err = -EINVAL;
+			if (!skb_partial_csum_set(skb, vnet_hdr.csum_start,
+						  vnet_hdr.csum_offset))
+				goto out_free;
+		}
 
-	if (skb_copy_datagram_from_iovec(skb, 0, iv, 0, len)) {
-		macvlan_count_rx(q->vlan, 0, false, false);
-		kfree_skb(skb);
-		return -EFAULT;
-	}
+		if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
+			skb_shinfo(skb)->gso_size = vnet_hdr.gso_size;
+			skb_shinfo(skb)->gso_type = gso_type;
 
-	skb_set_network_header(skb, ETH_HLEN);
+			/* Header must be checked, and gso_segs computed. */
+			skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
+			skb_shinfo(skb)->gso_segs = 0;
+		}
+	}
 
 	macvlan_start_xmit(skb, q->vlan->dev);
 
+	macvlan_count_rx(q->vlan, skb->len, 1, 0);
 	return count;
+
+out_free:
+	kfree_skb(skb);
+out:
+	macvlan_count_rx(q->vlan, 0, false, false);
+	return -EINVAL;
 }
 
 static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv,
@@ -387,14 +481,54 @@ static ssize_t macvtap_put_user(struct macvtap_queue *q,
 {
 	struct macvlan_dev *vlan = q->vlan;
 	int ret;
+	int vnet_hdr_len = 0;
+
+	if (q->flags & IFF_VNET_HDR) {
+		struct virtio_net_hdr vnet_hdr = { 0 };
+
+		vnet_hdr_len = sizeof(vnet_hdr);
+		if ((len -= vnet_hdr_len) < 0)
+			return -EINVAL;
+
+		if (skb_is_gso(skb)) {
+			struct skb_shared_info *sinfo = skb_shinfo(skb);
+
+			/* This is a hint as to how much should be linear. */
+			vnet_hdr.hdr_len = skb_headlen(skb);
+			vnet_hdr.gso_size = sinfo->gso_size;
+			if (sinfo->gso_type & SKB_GSO_TCPV4)
+				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
+			else if (sinfo->gso_type & SKB_GSO_TCPV6)
+				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
+			else if (sinfo->gso_type & SKB_GSO_UDP)
+				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
+			else
+				BUG();
+			if (sinfo->gso_type & SKB_GSO_TCP_ECN)
+				vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
+		} else
+			vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
+
+		if (skb->ip_summed == CHECKSUM_PARTIAL) {
+			vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
+			vnet_hdr.csum_start = skb->csum_start -
+						skb_headroom(skb);
+			vnet_hdr.csum_offset = skb->csum_offset;
+		} /* else everything is zero */
+
+		if (unlikely(memcpy_toiovecend(iv, (void *)&vnet_hdr, 0,
+								vnet_hdr_len)))
+			return -EFAULT;
+	}
+
 
 	len = min_t(int, skb->len, len);
 
-	ret = skb_copy_datagram_const_iovec(skb, 0, iv, 0, len);
+	ret = skb_copy_datagram_const_iovec(skb, 0, iv, vnet_hdr_len, len);
 
 	macvlan_count_rx(vlan, len, ret == 0, 0);
 
-	return ret ? ret : len;
+	return ret ? ret : (len + vnet_hdr_len);
 }
 
 static ssize_t macvtap_aio_read(struct kiocb *iocb, const struct iovec *iv,
@@ -460,14 +594,23 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd,
 	unsigned int __user *up = argp;
 	unsigned int u;
 	char devname[IFNAMSIZ];
+	int ret;
 
 	switch (cmd) {
 	case TUNSETIFF:
 		/* ignore the name, just look at flags */
 		if (get_user(u, &ifr->ifr_flags))
 			return -EFAULT;
-		if (u != (IFF_TAP | IFF_NO_PI))
-			return -EINVAL;
+		q = macvtap_file_get_queue(file);
+		if (!q)
+			return -ENOLINK;
+
+		if (u & IFF_VNET_HDR)
+			q->flags |= IFF_VNET_HDR;
+		else
+			q->flags &= ~IFF_VNET_HDR;
+
+		macvtap_file_put_queue(q);
 		return 0;
 
 	case TUNGETIFF:
@@ -475,17 +618,23 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd,
 		if (!q)
 			return -ENOLINK;
 		memcpy(devname, q->vlan->dev->name, sizeof(devname));
-		macvtap_file_put_queue(q);
 
+		ret = 0;
 		if (copy_to_user(&ifr->ifr_name, q->vlan->dev->name, IFNAMSIZ) ||
-		    put_user((TUN_TAP_DEV | TUN_NO_PI), &ifr->ifr_flags))
-			return -EFAULT;
-		return 0;
+		    put_user(q->flags, &ifr->ifr_flags))
+			ret =  -EFAULT;
+		macvtap_file_put_queue(q);
+		return ret;
 
 	case TUNGETFEATURES:
-		if (put_user((IFF_TAP | IFF_NO_PI), up))
-			return -EFAULT;
-		return 0;
+		q = macvtap_file_get_queue(file);
+		if (!q)
+			return -ENOLINK;
+		ret = 0;
+		if (put_user(q->flags, up))
+			ret = -EFAULT;
+		macvtap_file_put_queue(q);
+		return ret;
 
 	case TUNSETSNDBUF:
 		if (get_user(u, up))
@@ -499,18 +648,14 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd,
 		return 0;
 
 	case TUNSETOFFLOAD:
-		/* let the user check for future flags */
-		if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
-			  TUN_F_TSO_ECN | TUN_F_UFO))
-			return -EINVAL;
-
-		/* TODO: add support for these, so far we don't
-			 support any offload */
-		if (arg & (TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
-			 TUN_F_TSO_ECN | TUN_F_UFO))
-			return -EINVAL;
-
-		return 0;
+		q = macvtap_file_get_queue(file);
+		if (!q)
+			return -ENOLINK;
+		ret = 0;
+		if (!(q->flags & IFF_VNET_HDR))
+			ret =  -EINVAL;
+		macvtap_file_put_queue(q);
+		return ret;
 
 	default:
 		return -EINVAL;



^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH net-next-2.6] macvtap: Add GSO/csum offload support
  2010-02-13  6:58 ` Sridhar Samudrala
@ 2010-02-13 17:34   ` Arnd Bergmann
  2010-02-13 20:55     ` Sridhar Samudrala
  0 siblings, 1 reply; 11+ messages in thread
From: Arnd Bergmann @ 2010-02-13 17:34 UTC (permalink / raw)
  To: Sridhar Samudrala; +Cc: David Miller, Herbert Xu, netdev

Hi Sridhar,

On Saturday 13 February 2010, Sridhar Samudrala wrote:

> This patch adds GSO/checksum offload support to macvtap driver and applies
> on top of Arnd's refcnt bugfix.
> 	http://patchwork.ozlabs.org/patch/45136/

Sorry for messing this up by replacing that patch with a different one.
It shouldn't be hard to rebase this one though, which I'll probably do on Monday.
Please tell me if you want to do it yourself instead.

> @@ -286,6 +288,7 @@ static int macvtap_open(struct inode *inode, struct file *file)
>  	sock_init_data(&q->sock, &q->sk);
>  	q->sk.sk_allocation = GFP_ATOMIC; /* for now */
>  	q->sk.sk_write_space = macvtap_sock_write_space;
> +	q->flags = IFF_VNET_HDR;
>  
>  	err = macvtap_set_queue(dev, file, q);
>  	if (err)

Making IFF_VNET_HDR the default probably prevents the driver from working
with applications that don't known about VNET_HDR, e.g. anything other
than qemu. I believe qemu always tries setting it though, which would make
a default value of !IFF_NET_HDR fine.

Also, what about IFF_TAP and IFF_NO_PI, should those be always set?

> @@ -499,18 +648,14 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd,
>  		return 0;
>  
>  	case TUNSETOFFLOAD:
> -		/* let the user check for future flags */
> -		if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
> -			  TUN_F_TSO_ECN | TUN_F_UFO))
> -			return -EINVAL;
> -
> -		/* TODO: add support for these, so far we don't
> -			 support any offload */
> -		if (arg & (TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
> -			 TUN_F_TSO_ECN | TUN_F_UFO))
> -			return -EINVAL;
> -
> -		return 0;
> +		q = macvtap_file_get_queue(file);
> +		if (!q)
> +			return -ENOLINK;
> +		ret = 0;
> +		if (!(q->flags & IFF_VNET_HDR))
> +			ret =  -EINVAL;
> +		macvtap_file_put_queue(q);
> +		return ret;
>  
>  	default:
>  		return -EINVAL;

At least the first check needs to be in there, in case we are running with
new user space that knows additional flags. Moreover, shouldn't we check
the flags against the capabilities of vlan->lowerdev? I though it would be
best to report the capabilities of the real hardware to the guest kernel
so it can do the right thing.

	Arnd

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH net-next-2.6] macvtap: Add GSO/csum offload support
  2010-02-13 17:34   ` Arnd Bergmann
@ 2010-02-13 20:55     ` Sridhar Samudrala
  2010-02-14 19:13       ` Arnd Bergmann
  0 siblings, 1 reply; 11+ messages in thread
From: Sridhar Samudrala @ 2010-02-13 20:55 UTC (permalink / raw)
  To: Arnd Bergmann; +Cc: David Miller, Herbert Xu, netdev

On 2/13/2010 9:34 AM, Arnd Bergmann wrote:
> Hi Sridhar,
>
> On Saturday 13 February 2010, Sridhar Samudrala wrote:
>
>    
>> This patch adds GSO/checksum offload support to macvtap driver and applies
>> on top of Arnd's refcnt bugfix.
>> 	http://patchwork.ozlabs.org/patch/45136/
>>      
> Sorry for messing this up by replacing that patch with a different one.
> It shouldn't be hard to rebase this one though, which I'll probably do on Monday.
> Please tell me if you want to do it yourself instead.
>    
No problem. If you get to it before, it is fine with me.
>> @@ -286,6 +288,7 @@ static int macvtap_open(struct inode *inode, struct file *file)
>>   	sock_init_data(&q->sock,&q->sk);
>>   	q->sk.sk_allocation = GFP_ATOMIC; /* for now */
>>   	q->sk.sk_write_space = macvtap_sock_write_space;
>> +	q->flags = IFF_VNET_HDR;
>>
>>   	err = macvtap_set_queue(dev, file, q);
>>   	if (err)
>>      
> Making IFF_VNET_HDR the default probably prevents the driver from working
> with applications that don't known about VNET_HDR, e.g. anything other
> than qemu. I believe qemu always tries setting it though, which would make
> a default value of !IFF_NET_HDR fine.
>    
Yes. It is better to make the default as !IFF_VNET_HDR
> Also, what about IFF_TAP and IFF_NO_PI, should those be always set?
>    
Atleast it is not required for qemu to have these flags set. If we are 
not doing anything different based on
these flags, i felt we don't need to have them.
>    
>> @@ -499,18 +648,14 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd,
>>   		return 0;
>>
>>   	case TUNSETOFFLOAD:
>> -		/* let the user check for future flags */
>> -		if (arg&  ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
>> -			  TUN_F_TSO_ECN | TUN_F_UFO))
>> -			return -EINVAL;
>> -
>> -		/* TODO: add support for these, so far we don't
>> -			 support any offload */
>> -		if (arg&  (TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
>> -			 TUN_F_TSO_ECN | TUN_F_UFO))
>> -			return -EINVAL;
>> -
>> -		return 0;
>> +		q = macvtap_file_get_queue(file);
>> +		if (!q)
>> +			return -ENOLINK;
>> +		ret = 0;
>> +		if (!(q->flags&  IFF_VNET_HDR))
>> +			ret =  -EINVAL;
>> +		macvtap_file_put_queue(q);
>> +		return ret;
>>
>>   	default:
>>   		return -EINVAL;
>>      
> At least the first check needs to be in there, in case we are running with
> new user space that knows additional flags. Moreover, shouldn't we check
> the flags against the capabilities of vlan->lowerdev? I though it would be
> best to report the capabilities of the real hardware to the guest kernel
> so it can do the right thing.
>    
Originally, i also thought we should check these based on the real 
device capabilities. But later i realized,
that it is not really required as we fall back to software offload via 
dev_gso_segment() call in dev_hard_start_xmit()
if the real device doesn't support any of the offloads. So we can 
advertise that all the offloads are supported to
the guest and let host deal with any offloads that are not supported by 
the real device.

Thanks
Sridhar


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH net-next-2.6] macvtap: Add GSO/csum offload support
  2010-02-13 20:55     ` Sridhar Samudrala
@ 2010-02-14 19:13       ` Arnd Bergmann
  2010-02-15  0:21         ` Herbert Xu
  2010-02-15 17:05         ` Sridhar Samudrala
  0 siblings, 2 replies; 11+ messages in thread
From: Arnd Bergmann @ 2010-02-14 19:13 UTC (permalink / raw)
  To: Sridhar Samudrala; +Cc: David Miller, Herbert Xu, netdev

On Saturday 13 February 2010 21:55:37 Sridhar Samudrala wrote:
> > Also, what about IFF_TAP and IFF_NO_PI, should those be always set?
> >    
> Atleast it is not required for qemu to have these flags set. If we are 
> not doing anything different based on
> these flags, i felt we don't need to have them.

The point is that other applications might depend on them. We only support
IFF_TAP operation (not IFF_TUN), and we do not understand the !IFF_NO_PI
frame format, so any program that tries to use the PI header or use cooked
IP packets will get incorrect data.

> >> -            /* TODO: add support for these, so far we don't
> >> -                     support any offload */
> >> -            if (arg&  (TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
> >> -                     TUN_F_TSO_ECN | TUN_F_UFO))
> >> -                    return -EINVAL;
> >> -
> >> -            return 0;
> >> +            q = macvtap_file_get_queue(file);
> >> +            if (!q)
> >> +                    return -ENOLINK;
> >> +            ret = 0;
> >> +            if (!(q->flags&  IFF_VNET_HDR))
> >> +                    ret =  -EINVAL;
> >> +            macvtap_file_put_queue(q);
> >> +            return ret;
> >>
> >>      default:
> >>              return -EINVAL;
> >>      
> > At least the first check needs to be in there, in case we are running with
> > new user space that knows additional flags. Moreover, shouldn't we check
> > the flags against the capabilities of vlan->lowerdev? I though it would be
> > best to report the capabilities of the real hardware to the guest kernel
> > so it can do the right thing.
> >    
> Originally, i also thought we should check these based on the real 
> device capabilities. But later i realized,
> that it is not really required as we fall back to software offload via 
> dev_gso_segment() call in dev_hard_start_xmit()
> if the real device doesn't support any of the offloads. So we can 
> advertise that all the offloads are supported to
> the guest and let host deal with any offloads that are not supported by 
> the real device.

Ah, good point. Will that also work for the checksumming? Also, should the
host really be doing segmentation and checksumming for the guest if the
hardware can't do it? So even if everything works correctly, we might
want to let the guest do the work in order to get accounting of the CPU
cycles right.
OTOH, for data transfers between guests, we can probably use all the offloads
independent of what the HW can do, so when optimizing for inter-guest
communication, we might want to ignore the accounting problems.

I'd like to hear other opinions on this.

	Arnd

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH net-next-2.6] macvtap: Add GSO/csum offload support
  2010-02-14 19:13       ` Arnd Bergmann
@ 2010-02-15  0:21         ` Herbert Xu
  2010-02-15  9:19           ` Arnd Bergmann
  2010-02-15 17:05         ` Sridhar Samudrala
  1 sibling, 1 reply; 11+ messages in thread
From: Herbert Xu @ 2010-02-15  0:21 UTC (permalink / raw)
  To: Arnd Bergmann; +Cc: Sridhar Samudrala, David Miller, netdev

On Sun, Feb 14, 2010 at 08:13:08PM +0100, Arnd Bergmann wrote:
>
> Ah, good point. Will that also work for the checksumming? Also, should the
> host really be doing segmentation and checksumming for the guest if the
> hardware can't do it? So even if everything works correctly, we might
> want to let the guest do the work in order to get accounting of the CPU
> cycles right.

Yes the amount you save by ensuring we postpone the processing until
as last as possible is tremendous.

As for accounting, it depends on how we structure the vhost backend.
If it could stay in the process context of the qemu process then all
should be fine.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH net-next-2.6] macvtap: Add GSO/csum offload support
  2010-02-15  0:21         ` Herbert Xu
@ 2010-02-15  9:19           ` Arnd Bergmann
  0 siblings, 0 replies; 11+ messages in thread
From: Arnd Bergmann @ 2010-02-15  9:19 UTC (permalink / raw)
  To: Herbert Xu; +Cc: Sridhar Samudrala, David Miller, netdev

On Monday 15 February 2010, Herbert Xu wrote:
> On Sun, Feb 14, 2010 at 08:13:08PM +0100, Arnd Bergmann wrote:
> >
> > Ah, good point. Will that also work for the checksumming? Also, should the
> > host really be doing segmentation and checksumming for the guest if the
> > hardware can't do it? So even if everything works correctly, we might
> > want to let the guest do the work in order to get accounting of the CPU
> > cycles right.
> 
> Yes the amount you save by ensuring we postpone the processing until
> as last as possible is tremendous.

Ok. If the host actually saves more time than it spends on segmenting
the data, the accounting problem doesn't exist.

> As for accounting, it depends on how we structure the vhost backend.
> If it could stay in the process context of the qemu process then all
> should be fine.

I believe the bulk of the work is currently done in a separate
kernel thread that is shared by all users of vhost. We can probably
try some measurements and see if any insane amounts of time are
spent in that thread.

	Arnd

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH net-next-2.6] macvtap: Add GSO/csum offload support
  2010-02-14 19:13       ` Arnd Bergmann
  2010-02-15  0:21         ` Herbert Xu
@ 2010-02-15 17:05         ` Sridhar Samudrala
  1 sibling, 0 replies; 11+ messages in thread
From: Sridhar Samudrala @ 2010-02-15 17:05 UTC (permalink / raw)
  To: Arnd Bergmann; +Cc: David Miller, Herbert Xu, netdev

On 2/14/2010 11:13 AM, Arnd Bergmann wrote:
> On Saturday 13 February 2010 21:55:37 Sridhar Samudrala wrote:
>    
>>> Also, what about IFF_TAP and IFF_NO_PI, should those be always set?
>>>
>>>        
>> Atleast it is not required for qemu to have these flags set. If we are
>> not doing anything different based on
>> these flags, i felt we don't need to have them.
>>      
> The point is that other applications might depend on them. We only support
> IFF_TAP operation (not IFF_TUN), and we do not understand the !IFF_NO_PI
> frame format, so any program that tries to use the PI header or use cooked
> IP packets will get incorrect data.
>
>    
OK. that is a good point. We should handle cases where a user is trying 
to set IFF_TUN and !IFF_NO_PI.

Thanks
Sridhar


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH net-next-2.6] macvtap: Add GSO/csum offload support
  2010-02-12 22:27 [PATCH net-next-2.6] macvtap: Add GSO/csum offload support Sridhar Samudrala
  2010-02-13  6:58 ` Sridhar Samudrala
@ 2010-02-18 16:10 ` Arnd Bergmann
  2010-02-18 20:03   ` Sridhar Samudrala
  1 sibling, 1 reply; 11+ messages in thread
From: Arnd Bergmann @ 2010-02-18 16:10 UTC (permalink / raw)
  To: Sridhar Samudrala; +Cc: David Miller, Herbert Xu, netdev

On Friday 12 February 2010, Sridhar Samudrala wrote:
> This patch adds GSO/checksum offload support to macvtap driver and applies
> on top of Arnd's refcnt bugfix.
> 	http://patchwork.ozlabs.org/patch/45136/
> 
> Added flags field to macvtap_queue to enable/disable processing of
> virtio_net_hdr via IFF_VNET_HDR. This flag is checked to prepend virtio_net_hdr
> in the receive path and process/skip virtio_net_hdr in the send path.
> 
> Signed-off-by: Sridhar Samudrala <sri@us.ibm.com>

I've just sent out the new version of this, with further changes from
myself and rebased on my vhost-net series. Here are some more comments
about stuff that I changed in the process and why:

>  /* Get packet from user space buffer */
>  static ssize_t macvtap_get_user(struct macvtap_queue *q,
>  				const struct iovec *iv, size_t count,
> @@ -336,31 +362,99 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q,
>  	struct sk_buff *skb;
>  	size_t len = count;
>  	int err;
> +	struct virtio_net_hdr vnet_hdr = { 0 };
> +	int vnet_hdr_len = 0;
> +	unsigned short gso_type = 0;
> +
> +	if (q->flags & IFF_VNET_HDR) {
> +		vnet_hdr_len = sizeof(vnet_hdr); 
> +
> +		err = -EINVAL;
> +		if ((len -= vnet_hdr_len) < 0)
> +			goto out;
> +
> +		err = (memcpy_fromiovecend((void *)&vnet_hdr, iv, 0,
> +					   vnet_hdr_len));
> +		if (err < 0)
> +			goto out;			
> +
> +		if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
> +		     vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 > 
> +							vnet_hdr.hdr_len)
> +			vnet_hdr.hdr_len = vnet_hdr.csum_start +
> +						vnet_hdr.csum_offset + 2;
> +
> +		err = -EINVAL;
> +		if (vnet_hdr.hdr_len > len)
> +			goto out;
> +
> +		if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
> +			switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
> +			case VIRTIO_NET_HDR_GSO_TCPV4:
> +				gso_type = SKB_GSO_TCPV4;
> +				break;
> +			case VIRTIO_NET_HDR_GSO_TCPV6:
> +				gso_type = SKB_GSO_TCPV6;
> +				break;
> +			case VIRTIO_NET_HDR_GSO_UDP:
> +				gso_type = SKB_GSO_UDP;
> +				break;
> +			default:
> +				goto out;
> +			}
> +
> +			if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
> +				gso_type |= SKB_GSO_TCP_ECN;
> +               
> +			if (vnet_hdr.gso_size == 0)
> +				goto out;
> +		}
> +	}

I've moved most of this to a separate function. The function was getting
far too complex to read, and splitting it out makes it possible to move
it into a common location later so it can also be used by the tun/tap
driver that does basically the same here.
>  
>  	macvlan_start_xmit(skb, q->vlan->dev);
>  
> +	macvlan_count_rx(q->vlan, skb->len, 1, 0);
>  	return count;
> +
> +out_free:
> +	kfree_skb(skb);
> +out:
> +	macvlan_count_rx(q->vlan, 0, false, false);
> +	return -EINVAL;
>  }

macvlan_count_rx() is really the wrong thing to do here, since we're
transmitting the data, not receiving it. On success, the accounting will
be done by macvlan_start_xmit, and I've added a corresponding tx_dropped
count in the failure path.
  
>  static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv,
> @@ -387,14 +481,54 @@ static ssize_t macvtap_put_user(struct macvtap_queue *q,
>  {
>  	struct macvlan_dev *vlan = q->vlan;
>  	int ret;
> +	int vnet_hdr_len = 0;
> +
> +	if (q->flags & IFF_VNET_HDR) {
> +		struct virtio_net_hdr vnet_hdr = { 0 };
> +
> +		vnet_hdr_len = sizeof(vnet_hdr);
> +		if ((len -= vnet_hdr_len) < 0)
> +			return -EINVAL;
> +
> +		if (skb_is_gso(skb)) {
> +			struct skb_shared_info *sinfo = skb_shinfo(skb);
> +
> +			/* This is a hint as to how much should be linear. */
> +			vnet_hdr.hdr_len = skb_headlen(skb);
> +			vnet_hdr.gso_size = sinfo->gso_size;
> +			if (sinfo->gso_type & SKB_GSO_TCPV4)
> +				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
> +			else if (sinfo->gso_type & SKB_GSO_TCPV6)
> +				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
> +			else if (sinfo->gso_type & SKB_GSO_UDP)
> +				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
> +			else
> +				BUG();
> +			if (sinfo->gso_type & SKB_GSO_TCP_ECN)
> +				vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
> +		} else
> +			vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
> +
> +		if (skb->ip_summed == CHECKSUM_PARTIAL) {
> +			vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
> +			vnet_hdr.csum_start = skb->csum_start -
> +						skb_headroom(skb);
> +			vnet_hdr.csum_offset = skb->csum_offset;
> +		} /* else everything is zero */
> +
> +		if (unlikely(memcpy_toiovecend(iv, (void *)&vnet_hdr, 0,
> +								vnet_hdr_len)))
> +			return -EFAULT;
> +	}
> +

This code is now also a separate function, same reason as above.

>  	case TUNSETOFFLOAD:
> -		/* let the user check for future flags */
> -		if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
> -			  TUN_F_TSO_ECN | TUN_F_UFO))
> -			return -EINVAL;
> -
> -		/* TODO: add support for these, so far we don't
> -			 support any offload */
> -		if (arg & (TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
> -			 TUN_F_TSO_ECN | TUN_F_UFO))
> -			return -EINVAL;
> -
> -		return 0;
> +		q = macvtap_file_get_queue(file);
> +		if (!q)
> +			return -ENOLINK;
> +		ret = 0;
> +		if (!(q->flags & IFF_VNET_HDR))
> +			ret =  -EINVAL;
> +		macvtap_file_put_queue(q);
> +		return ret;

I still feel uncomfortable about this, but partly because I don't fully
understand the GSO functionality. I've added a comment for now so we can
have another look.

macvtap is different from tun/tap here, because the data direction is the
opposite: reading from a macvtap chardev corresponds to receive, while
writing to it is a transmit in the network stack. In the tap driver, we
just set the GSO flags of the netdev and dev_queue_xmit will do the right
thing when forwarding data to the tap, but for macvtap, incoming frames
never go through dev_queue_xmit (they go through netif_rx), so if the
external device passes us GSO frames, we just pass them on unmodified
to the guest, even if that guest does not understand GSO.

In particular, when we have two guests using macvtap in bridge mode,
we don't even go through the network stack and just pass down the
SKB we got from the other side if the destination MAC address matches.
That means that a sender using virtio-net with GSO will send garbage
to another guest using a hardware emulated NIC that cannot receive
GSO (GRO?) frames.

I hope you have an idea how to do this right or can convince me that
everything is ok, otherwise we'd have to defer this patch.

	Arnd

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH net-next-2.6] macvtap: Add GSO/csum offload support
  2010-02-18 16:10 ` Arnd Bergmann
@ 2010-02-18 20:03   ` Sridhar Samudrala
  2010-02-18 20:47     ` Arnd Bergmann
  0 siblings, 1 reply; 11+ messages in thread
From: Sridhar Samudrala @ 2010-02-18 20:03 UTC (permalink / raw)
  To: Arnd Bergmann; +Cc: David Miller, Herbert Xu, netdev

On Thu, 2010-02-18 at 17:10 +0100, Arnd Bergmann wrote:
> On Friday 12 February 2010, Sridhar Samudrala wrote:
> > This patch adds GSO/checksum offload support to macvtap driver and applies
> > on top of Arnd's refcnt bugfix.
> > 	http://patchwork.ozlabs.org/patch/45136/
> > 
> > Added flags field to macvtap_queue to enable/disable processing of
> > virtio_net_hdr via IFF_VNET_HDR. This flag is checked to prepend virtio_net_hdr
> > in the receive path and process/skip virtio_net_hdr in the send path.
> > 
> > Signed-off-by: Sridhar Samudrala <sri@us.ibm.com>
> 
> I've just sent out the new version of this, with further changes from
> myself and rebased on my vhost-net series. Here are some more comments
> about stuff that I changed in the process and why:
> 
> >  /* Get packet from user space buffer */
> >  static ssize_t macvtap_get_user(struct macvtap_queue *q,
> >  				const struct iovec *iv, size_t count,
> > @@ -336,31 +362,99 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q,
> >  	struct sk_buff *skb;
> >  	size_t len = count;
> >  	int err;
> > +	struct virtio_net_hdr vnet_hdr = { 0 };
> > +	int vnet_hdr_len = 0;
> > +	unsigned short gso_type = 0;
> > +
> > +	if (q->flags & IFF_VNET_HDR) {
> > +		vnet_hdr_len = sizeof(vnet_hdr); 
> > +
> > +		err = -EINVAL;
> > +		if ((len -= vnet_hdr_len) < 0)
> > +			goto out;
> > +
> > +		err = (memcpy_fromiovecend((void *)&vnet_hdr, iv, 0,
> > +					   vnet_hdr_len));
> > +		if (err < 0)
> > +			goto out;			
> > +
> > +		if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
> > +		     vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 > 
> > +							vnet_hdr.hdr_len)
> > +			vnet_hdr.hdr_len = vnet_hdr.csum_start +
> > +						vnet_hdr.csum_offset + 2;
> > +
> > +		err = -EINVAL;
> > +		if (vnet_hdr.hdr_len > len)
> > +			goto out;
> > +
> > +		if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
> > +			switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
> > +			case VIRTIO_NET_HDR_GSO_TCPV4:
> > +				gso_type = SKB_GSO_TCPV4;
> > +				break;
> > +			case VIRTIO_NET_HDR_GSO_TCPV6:
> > +				gso_type = SKB_GSO_TCPV6;
> > +				break;
> > +			case VIRTIO_NET_HDR_GSO_UDP:
> > +				gso_type = SKB_GSO_UDP;
> > +				break;
> > +			default:
> > +				goto out;
> > +			}
> > +
> > +			if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
> > +				gso_type |= SKB_GSO_TCP_ECN;
> > +               
> > +			if (vnet_hdr.gso_size == 0)
> > +				goto out;
> > +		}
> > +	}
> 
> I've moved most of this to a separate function. The function was getting
> far too complex to read, and splitting it out makes it possible to move
> it into a common location later so it can also be used by the tun/tap
> driver that does basically the same here.

Yes. I agree. We could share this code with tap and af_packet socket
offload too.

> >  
> >  	macvlan_start_xmit(skb, q->vlan->dev);
> >  
> > +	macvlan_count_rx(q->vlan, skb->len, 1, 0);
> >  	return count;
> > +
> > +out_free:
> > +	kfree_skb(skb);
> > +out:
> > +	macvlan_count_rx(q->vlan, 0, false, false);
> > +	return -EINVAL;
> >  }
> 
> macvlan_count_rx() is really the wrong thing to do here, since we're
> transmitting the data, not receiving it. On success, the accounting will
> be done by macvlan_start_xmit, and I've added a corresponding tx_dropped
> count in the failure path.

OK.
>   
> >  static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv,
> > @@ -387,14 +481,54 @@ static ssize_t macvtap_put_user(struct macvtap_queue *q,
> >  {
> >  	struct macvlan_dev *vlan = q->vlan;
> >  	int ret;
> > +	int vnet_hdr_len = 0;
> > +
> > +	if (q->flags & IFF_VNET_HDR) {
> > +		struct virtio_net_hdr vnet_hdr = { 0 };
> > +
> > +		vnet_hdr_len = sizeof(vnet_hdr);
> > +		if ((len -= vnet_hdr_len) < 0)
> > +			return -EINVAL;
> > +
> > +		if (skb_is_gso(skb)) {
> > +			struct skb_shared_info *sinfo = skb_shinfo(skb);
> > +
> > +			/* This is a hint as to how much should be linear. */
> > +			vnet_hdr.hdr_len = skb_headlen(skb);
> > +			vnet_hdr.gso_size = sinfo->gso_size;
> > +			if (sinfo->gso_type & SKB_GSO_TCPV4)
> > +				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
> > +			else if (sinfo->gso_type & SKB_GSO_TCPV6)
> > +				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
> > +			else if (sinfo->gso_type & SKB_GSO_UDP)
> > +				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
> > +			else
> > +				BUG();
> > +			if (sinfo->gso_type & SKB_GSO_TCP_ECN)
> > +				vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
> > +		} else
> > +			vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
> > +
> > +		if (skb->ip_summed == CHECKSUM_PARTIAL) {
> > +			vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
> > +			vnet_hdr.csum_start = skb->csum_start -
> > +						skb_headroom(skb);
> > +			vnet_hdr.csum_offset = skb->csum_offset;
> > +		} /* else everything is zero */
> > +
> > +		if (unlikely(memcpy_toiovecend(iv, (void *)&vnet_hdr, 0,
> > +								vnet_hdr_len)))
> > +			return -EFAULT;
> > +	}
> > +
> 
> This code is now also a separate function, same reason as above.
> 
> >  	case TUNSETOFFLOAD:
> > -		/* let the user check for future flags */
> > -		if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
> > -			  TUN_F_TSO_ECN | TUN_F_UFO))
> > -			return -EINVAL;
> > -
> > -		/* TODO: add support for these, so far we don't
> > -			 support any offload */
> > -		if (arg & (TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
> > -			 TUN_F_TSO_ECN | TUN_F_UFO))
> > -			return -EINVAL;
> > -
> > -		return 0;
> > +		q = macvtap_file_get_queue(file);
> > +		if (!q)
> > +			return -ENOLINK;
> > +		ret = 0;
> > +		if (!(q->flags & IFF_VNET_HDR))
> > +			ret =  -EINVAL;
> > +		macvtap_file_put_queue(q);
> > +		return ret;
> 
> I still feel uncomfortable about this, but partly because I don't fully
> understand the GSO functionality. I've added a comment for now so we can
> have another look.
> 
> macvtap is different from tun/tap here, because the data direction is the
> opposite: reading from a macvtap chardev corresponds to receive, while
> writing to it is a transmit in the network stack. In the tap driver, we
> just set the GSO flags of the netdev and dev_queue_xmit will do the right
> thing when forwarding data to the tap, but for macvtap, incoming frames
> never go through dev_queue_xmit (they go through netif_rx), so if the
> external device passes us GSO frames, we just pass them on unmodified
> to the guest, even if that guest does not understand GSO.

If a guest is connected to a macvtap device attached to an underlying physical
device with GRO enabled, it is possible to receive large SKBs and we don't handle
them correctly. The current workaround is to disable GRO on the physical device.

> 
> In particular, when we have two guests using macvtap in bridge mode,
> we don't even go through the network stack and just pass down the
> SKB we got from the other side if the destination MAC address matches.
> That means that a sender using virtio-net with GSO will send garbage
> to another guest using a hardware emulated NIC that cannot receive
> GSO (GRO?) frames.

Yes. I think we need to do something similar to dev_gso_segment() in
macvtap_forward() if skb_is_gso() and IFF_VNET_HDR is not set in
q->flags.

> 
> I hope you have an idea how to do this right or can convince me that
> everything is ok, otherwise we'd have to defer this patch.

I would prefer getting this patch in as it helps peformance when both
the guest and the physical device support offloads and also we have 
workaround for other situations. In the meantime, I will start looking
into addressing this specific case in macvtap_forward().

Thanks
Sridhar


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH net-next-2.6] macvtap: Add GSO/csum offload support
  2010-02-18 20:03   ` Sridhar Samudrala
@ 2010-02-18 20:47     ` Arnd Bergmann
  0 siblings, 0 replies; 11+ messages in thread
From: Arnd Bergmann @ 2010-02-18 20:47 UTC (permalink / raw)
  To: Sridhar Samudrala; +Cc: David Miller, Herbert Xu, netdev

On Thursday 18 February 2010, Sridhar Samudrala wrote:
> > 
> > macvtap is different from tun/tap here, because the data direction is the
> > opposite: reading from a macvtap chardev corresponds to receive, while
> > writing to it is a transmit in the network stack. In the tap driver, we
> > just set the GSO flags of the netdev and dev_queue_xmit will do the right
> > thing when forwarding data to the tap, but for macvtap, incoming frames
> > never go through dev_queue_xmit (they go through netif_rx), so if the
> > external device passes us GSO frames, we just pass them on unmodified
> > to the guest, even if that guest does not understand GSO.
> 
> If a guest is connected to a macvtap device attached to an underlying physical
> device with GRO enabled, it is possible to receive large SKBs and we don't handle
> them correctly. The current workaround is to disable GRO on the physical device.

Ok. 

> > In particular, when we have two guests using macvtap in bridge mode,
> > we don't even go through the network stack and just pass down the
> > SKB we got from the other side if the destination MAC address matches.
> > That means that a sender using virtio-net with GSO will send garbage
> > to another guest using a hardware emulated NIC that cannot receive
> > GSO (GRO?) frames.
> 
> Yes. I think we need to do something similar to dev_gso_segment() in
> macvtap_forward() if skb_is_gso() and IFF_VNET_HDR is not set in
> q->flags.
 
I think it needs to be more fine-grained than that, and take into account
the specific offload capabilities of the receiving guest that were
negotiated in TUNSETOFFLOAD, but other than that, this looks fine.

Regarding TUNSETOFFLOAD, the part I don't get is what the negotiation
really means for TX and RX respectively. Is it safe to assume that what
gets set is the common subset of features between guest and host for
RX *and* TX, or is it just one of them?

> > I hope you have an idea how to do this right or can convince me that
> > everything is ok, otherwise we'd have to defer this patch.
> 
> I would prefer getting this patch in as it helps peformance when both
> the guest and the physical device support offloads and also we have 
> workaround for other situations. In the meantime, I will start looking
> into addressing this specific case in macvtap_forward().

Ok, fair enough.

For the forwarding between ports, simply refusing TUNSETOFFLOAD for
any bridge mode ports should be fine and still let us use offloading
for vepa mode.

	Arnd

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2010-02-18 20:48 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-02-12 22:27 [PATCH net-next-2.6] macvtap: Add GSO/csum offload support Sridhar Samudrala
2010-02-13  6:58 ` Sridhar Samudrala
2010-02-13 17:34   ` Arnd Bergmann
2010-02-13 20:55     ` Sridhar Samudrala
2010-02-14 19:13       ` Arnd Bergmann
2010-02-15  0:21         ` Herbert Xu
2010-02-15  9:19           ` Arnd Bergmann
2010-02-15 17:05         ` Sridhar Samudrala
2010-02-18 16:10 ` Arnd Bergmann
2010-02-18 20:03   ` Sridhar Samudrala
2010-02-18 20:47     ` Arnd Bergmann

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).