netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/3] Cleanup and simplify virtnet header
@ 2008-01-23 14:07 Rusty Russell
  2008-01-23 14:10 ` [PATCH 2/3] partial checksum and GSO support for tun/tap Rusty Russell
  0 siblings, 1 reply; 9+ messages in thread
From: Rusty Russell @ 2008-01-23 14:07 UTC (permalink / raw)
  To: netdev; +Cc: Herbert Xu, virtualization

1) Turn GSO on virtio net into an all-or-nothing (keep checksumming
   separate).  Having multiple bits is a pain: if you can't support something
   you should handle it in software, which is still a performance win.

2) Make VIRTIO_NET_HDR_GSO_ECN a flag in the header, so it can apply to
   IPv6 or v4.

3) Rename VIRTIO_NET_F_NO_CSUM to VIRTIO_NET_F_CSUM (ie. means we do
   checksumming).

4) Add csum and gso params to virtio_net to allow more testing.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/net/virtio_net.c   |   32 ++++++++++++++++----------------
 include/linux/virtio_net.h |   12 ++++--------
 2 files changed, 20 insertions(+), 24 deletions(-)

diff -r 4fb788b18cf8 drivers/net/virtio_net.c
--- a/drivers/net/virtio_net.c	Wed Jan 23 13:07:59 2008 +1100
+++ b/drivers/net/virtio_net.c	Wed Jan 23 18:46:05 2008 +1100
@@ -26,6 +26,10 @@
 
 static int napi_weight = 128;
 module_param(napi_weight, int, 0444);
+
+static int csum = 1, gso = 1;
+module_param(csum, int, 0444);
+module_param(gso, int, 0444);
 
 MODULE_LICENSE("GPL");
 
@@ -95,12 +99,9 @@ static void receive_skb(struct net_devic
 
 	if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
 		pr_debug("GSO!\n");
-		switch (hdr->gso_type) {
+		switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
 		case VIRTIO_NET_HDR_GSO_TCPV4:
 			skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
-			break;
-		case VIRTIO_NET_HDR_GSO_TCPV4_ECN:
-			skb_shinfo(skb)->gso_type = SKB_GSO_TCP_ECN;
 			break;
 		case VIRTIO_NET_HDR_GSO_UDP:
 			skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
@@ -114,6 +115,9 @@ static void receive_skb(struct net_devic
 				       dev->name, hdr->gso_type);
 			goto frame_err;
 		}
+
+		if (hdr->gso_type & VIRTIO_NET_HDR_GSO_ECN)
+			skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
 
 		skb_shinfo(skb)->gso_size = hdr->gso_size;
 		if (skb_shinfo(skb)->gso_size == 0) {
@@ -249,9 +253,7 @@ static int start_xmit(struct sk_buff *sk
 	if (skb_is_gso(skb)) {
 		hdr->hdr_len = skb_transport_header(skb) - skb->data;
 		hdr->gso_size = skb_shinfo(skb)->gso_size;
-		if (skb_shinfo(skb)->gso_type & SKB_GSO_TCP_ECN)
-			hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4_ECN;
-		else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4)
+		if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4)
 			hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
 		else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6)
 			hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
@@ -259,6 +261,8 @@ static int start_xmit(struct sk_buff *sk
 			hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP;
 		else
 			BUG();
+		if (skb_shinfo(skb)->gso_type & SKB_GSO_TCP_ECN)
+			hdr->gso_type |= VIRTIO_NET_HDR_GSO_ECN;
 	} else {
 		hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE;
 		hdr->gso_size = hdr->hdr_len = 0;
@@ -354,17 +358,13 @@ static int virtnet_probe(struct virtio_d
 	SET_NETDEV_DEV(dev, &vdev->dev);
 
 	/* Do we support "hardware" checksums? */
-	if (vdev->config->feature(vdev, VIRTIO_NET_F_NO_CSUM)) {
+	if (csum && vdev->config->feature(vdev, VIRTIO_NET_F_CSUM)) {
 		/* This opens up the world of extra features. */
 		dev->features |= NETIF_F_HW_CSUM|NETIF_F_SG|NETIF_F_FRAGLIST;
-		if (vdev->config->feature(vdev, VIRTIO_NET_F_TSO4))
-			dev->features |= NETIF_F_TSO;
-		if (vdev->config->feature(vdev, VIRTIO_NET_F_UFO))
-			dev->features |= NETIF_F_UFO;
-		if (vdev->config->feature(vdev, VIRTIO_NET_F_TSO4_ECN))
-			dev->features |= NETIF_F_TSO_ECN;
-		if (vdev->config->feature(vdev, VIRTIO_NET_F_TSO6))
-			dev->features |= NETIF_F_TSO6;
+		if (gso && vdev->config->feature(vdev, VIRTIO_NET_F_GSO)) {
+			dev->features |= NETIF_F_TSO | NETIF_F_UFO
+				| NETIF_F_TSO_ECN | NETIF_F_TSO6;
+		}
 	}
 
 	/* Configuration may specify what MAC to use.  Otherwise random. */
diff -r 4fb788b18cf8 include/linux/virtio_net.h
--- a/include/linux/virtio_net.h	Wed Jan 23 13:07:59 2008 +1100
+++ b/include/linux/virtio_net.h	Wed Jan 23 18:46:05 2008 +1100
@@ -6,12 +6,9 @@
 #define VIRTIO_ID_NET	1
 
 /* The feature bitmap for virtio net */
-#define VIRTIO_NET_F_NO_CSUM	0
-#define VIRTIO_NET_F_TSO4	1
-#define VIRTIO_NET_F_UFO	2
-#define VIRTIO_NET_F_TSO4_ECN	3
-#define VIRTIO_NET_F_TSO6	4
-#define VIRTIO_NET_F_MAC	5
+#define VIRTIO_NET_F_CSUM	0	/* Can handle pkts w/ partial csum */
+#define VIRTIO_NET_F_MAC	5	/* Host has given MAC address. */
+#define VIRTIO_NET_F_GSO	6	/* Can handle pkts w/ any GSO type */
 
 struct virtio_net_config
 {
@@ -27,10 +24,9 @@ struct virtio_net_hdr
 	__u8 flags;
 #define VIRTIO_NET_HDR_GSO_NONE		0	// Not a GSO frame
 #define VIRTIO_NET_HDR_GSO_TCPV4	1	// GSO frame, IPv4 TCP (TSO)
-/* FIXME: Do we need this?  If they said they can handle ECN, do they care? */
-#define VIRTIO_NET_HDR_GSO_TCPV4_ECN	2	// GSO frame, IPv4 TCP w/ ECN
 #define VIRTIO_NET_HDR_GSO_UDP		3	// GSO frame, IPv4 UDP (UFO)
 #define VIRTIO_NET_HDR_GSO_TCPV6	4	// GSO frame, IPv6 TCP
+#define VIRTIO_NET_HDR_GSO_ECN		0x80	// TCP has ECN set
 	__u8 gso_type;
 	__u16 hdr_len;		/* Ethernet + IP + tcp/udp hdrs */
 	__u16 gso_size;		/* Bytes to append to gso_hdr_len per frame */

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH 2/3] partial checksum and GSO support for tun/tap.
  2008-01-23 14:07 [PATCH 1/3] Cleanup and simplify virtnet header Rusty Russell
@ 2008-01-23 14:10 ` Rusty Russell
  2008-01-23 14:14   ` [PATCH 3/3] Interface to query tun/tap features Rusty Russell
  2008-02-08  5:39   ` [PATCH 2/3] partial checksum and GSO support for tun/tap Max Krasnyansky
  0 siblings, 2 replies; 9+ messages in thread
From: Rusty Russell @ 2008-01-23 14:10 UTC (permalink / raw)
  To: netdev; +Cc: Herbert Xu, virtualization

(Changes since last time: we how have explicit IFF_RECV_CSUM and 
IFF_RECV_GSO bits, and some renaming of virtio_net hdr)

We use the virtio_net_hdr: it is an ABI already and designed to
encapsulate such metadata as GSO and partial checksums.

IFF_VIRTIO_HDR means you will write and read a 'struct virtio_net_hdr'
at the start of each packet.  You can always write packets with
partial checksum and gso to the tap device using this header.

IFF_RECV_CSUM means you can handle reading packets with partial
checksums.  If IFF_RECV_GSO is also set, it means you can handle
reading (all types of) GSO packets.

Note that there is no easy way to detect if these flags are supported:
see next patch.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/net/tun.c      |  259 +++++++++++++++++++++++++++++++++++++++++++------
 include/linux/if_tun.h |    6 +
 2 files changed, 238 insertions(+), 27 deletions(-)

diff -r cb85fb035378 drivers/net/tun.c
--- a/drivers/net/tun.c	Wed Jan 23 20:06:56 2008 +1100
+++ b/drivers/net/tun.c	Wed Jan 23 20:12:51 2008 +1100
@@ -62,6 +62,7 @@
 #include <linux/if_ether.h>
 #include <linux/if_tun.h>
 #include <linux/crc32.h>
+#include <linux/virtio_net.h>
 #include <net/net_namespace.h>
 
 #include <asm/system.h>
@@ -238,35 +239,188 @@ static unsigned int tun_chr_poll(struct 
 	return mask;
 }
 
+static struct sk_buff *copy_user_skb(size_t align, struct iovec *iv, size_t len)
+{
+	struct sk_buff *skb;
+
+	if (!(skb = alloc_skb(len + align, GFP_KERNEL)))
+		return ERR_PTR(-ENOMEM);
+
+	if (align)
+		skb_reserve(skb, align);
+
+	if (memcpy_fromiovec(skb_put(skb, len), iv, len)) {
+		kfree_skb(skb);
+		return ERR_PTR(-EFAULT);
+	}
+	return skb;
+}
+
+/* This will fail if they give us a crazy iovec, but that's their own fault. */
+static int get_user_skb_frags(const struct iovec *iv, size_t count,
+			      struct skb_frag_struct *f)
+{
+	unsigned int i, j, num_pg = 0;
+	int err;
+	struct page *pages[MAX_SKB_FRAGS];
+
+	down_read(&current->mm->mmap_sem);
+	for (i = 0; i < count; i++) {
+		int n, npages;
+		unsigned long base, len;
+		base = (unsigned long)iv[i].iov_base;
+		len = (unsigned long)iv[i].iov_len;
+
+		if (len == 0)
+			continue;
+
+		/* How many pages will this take? */
+		npages = 1 + (base + len - 1)/PAGE_SIZE - base/PAGE_SIZE;
+		if (unlikely(num_pg + npages > MAX_SKB_FRAGS)) {
+			err = -ENOSPC;
+			goto fail;
+		}
+		n = get_user_pages(current, current->mm, base, npages,
+				   0, 0, pages, NULL);
+		if (unlikely(n < 0)) {
+			err = n;
+			goto fail;
+		}
+
+		/* Transfer pages to the frag array */
+		for (j = 0; j < n; j++) {
+			f[num_pg].page = pages[j];
+			if (j == 0) {
+				f[num_pg].page_offset = offset_in_page(base);
+				f[num_pg].size = min(len, PAGE_SIZE -
+						     f[num_pg].page_offset);
+			} else {
+				f[num_pg].page_offset = 0;
+				f[num_pg].size = min(len, PAGE_SIZE);
+			}
+			len -= f[num_pg].size;
+			base += f[num_pg].size;
+			num_pg++;
+		}
+
+		if (unlikely(n != npages)) {
+			err = -EFAULT;
+			goto fail;
+		}
+	}
+	up_read(&current->mm->mmap_sem);
+	return num_pg;
+
+fail:
+	for (i = 0; i < num_pg; i++)
+		put_page(f[i].page);
+	up_read(&current->mm->mmap_sem);
+	return err;
+}
+
+
+static struct sk_buff *map_user_skb(const struct virtio_net_hdr *gso,
+				    size_t align, struct iovec *iv,
+				    size_t count, size_t len)
+{
+	struct sk_buff *skb;
+	struct skb_shared_info *sinfo;
+	int err;
+
+	if (!(skb = alloc_skb(gso->hdr_len + align, GFP_KERNEL)))
+		return ERR_PTR(-ENOMEM);
+
+	if (align)
+		skb_reserve(skb, align);
+
+	sinfo = skb_shinfo(skb);
+	sinfo->gso_size = gso->gso_size;
+	sinfo->gso_type = SKB_GSO_DODGY;
+	switch (gso->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
+	case VIRTIO_NET_HDR_GSO_TCPV4:
+		sinfo->gso_type |= SKB_GSO_TCPV4;
+		break;
+	case VIRTIO_NET_HDR_GSO_TCPV6:
+		sinfo->gso_type |= SKB_GSO_TCPV6;
+		break;
+	case VIRTIO_NET_HDR_GSO_UDP:
+		sinfo->gso_type |= SKB_GSO_UDP;
+		break;
+	default:
+		err = -EINVAL;
+		goto fail;
+	}
+
+	if (gso->gso_type & VIRTIO_NET_HDR_GSO_ECN)
+		skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
+
+	/* Copy in the header. */
+	if (memcpy_fromiovec(skb_put(skb, gso->hdr_len), iv, gso->hdr_len)) {
+		err = -EFAULT;
+		goto fail;
+	}
+
+	err = get_user_skb_frags(iv, count, sinfo->frags);
+	if (err < 0)
+		goto fail;
+
+	sinfo->nr_frags = err;
+	skb->len += len;
+	skb->data_len += len;
+	
+	return skb;
+
+fail:
+	kfree_skb(skb);
+	return ERR_PTR(err);
+}
+
+static inline size_t iov_total(const struct iovec *iv, unsigned long count)
+{
+	unsigned long i;
+	size_t len;
+
+	for (i = 0, len = 0; i < count; i++)
+		len += iv[i].iov_len;
+
+	return len;
+}
+
 /* Get packet from user space buffer */
-static __inline__ ssize_t tun_get_user(struct tun_struct *tun, struct iovec *iv, size_t count)
+static __inline__ ssize_t tun_get_user(struct tun_struct *tun, struct iovec *iv, size_t num)
 {
 	struct tun_pi pi = { 0, __constant_htons(ETH_P_IP) };
+	struct virtio_net_hdr gso = { 0, VIRTIO_NET_HDR_GSO_NONE };
 	struct sk_buff *skb;
-	size_t len = count, align = 0;
+	size_t tot_len = iov_total(iv, num);
+	size_t len = tot_len, align = 0;
 
 	if (!(tun->flags & TUN_NO_PI)) {
-		if ((len -= sizeof(pi)) > count)
+		if ((len -= sizeof(pi)) > tot_len)
 			return -EINVAL;
 
 		if(memcpy_fromiovec((void *)&pi, iv, sizeof(pi)))
+			return -EFAULT;
+	}
+	if (tun->flags & TUN_VIRTIO_HDR) {
+		if ((len -= sizeof(gso)) > tot_len)
+			return -EINVAL;
+
+		if (memcpy_fromiovec((void *)&gso, iv, sizeof(gso)))
 			return -EFAULT;
 	}
 
 	if ((tun->flags & TUN_TYPE_MASK) == TUN_TAP_DEV)
 		align = NET_IP_ALIGN;
 
-	if (!(skb = alloc_skb(len + align, GFP_KERNEL))) {
+	if (gso.gso_type != VIRTIO_NET_HDR_GSO_NONE)
+		skb = map_user_skb(&gso, align, iv, num, len);
+	else
+		skb = copy_user_skb(align, iv, len);
+
+	if (IS_ERR(skb)) {
 		tun->dev->stats.rx_dropped++;
-		return -ENOMEM;
-	}
-
-	if (align)
-		skb_reserve(skb, align);
-	if (memcpy_fromiovec(skb_put(skb, len), iv, len)) {
-		tun->dev->stats.rx_dropped++;
-		kfree_skb(skb);
-		return -EFAULT;
+		return PTR_ERR(skb);
 	}
 
 	switch (tun->flags & TUN_TYPE_MASK) {
@@ -280,7 +434,13 @@ static __inline__ ssize_t tun_get_user(s
 		break;
 	};
 
-	if (tun->flags & TUN_NOCHECKSUM)
+	if (gso.flags & (1 << VIRTIO_NET_F_CSUM)) {
+		if (!skb_partial_csum_set(skb,gso.csum_start,gso.csum_offset)) {
+			tun->dev->stats.rx_dropped++;
+			kfree_skb(skb);
+			return -EINVAL;
+		}
+	} else if (tun->flags & TUN_NOCHECKSUM)
 		skb->ip_summed = CHECKSUM_UNNECESSARY;
 
 	netif_rx_ni(skb);
@@ -289,18 +449,7 @@ static __inline__ ssize_t tun_get_user(s
 	tun->dev->stats.rx_packets++;
 	tun->dev->stats.rx_bytes += len;
 
-	return count;
-}
-
-static inline size_t iov_total(const struct iovec *iv, unsigned long count)
-{
-	unsigned long i;
-	size_t len;
-
-	for (i = 0, len = 0; i < count; i++)
-		len += iv[i].iov_len;
-
-	return len;
+	return tot_len;
 }
 
 static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv,
@@ -313,7 +462,7 @@ static ssize_t tun_chr_aio_write(struct 
 
 	DBG(KERN_INFO "%s: tun_chr_write %ld\n", tun->dev->name, count);
 
-	return tun_get_user(tun, (struct iovec *) iv, iov_total(iv, count));
+	return tun_get_user(tun, (struct iovec *) iv, count);
 }
 
 /* Put packet to the user space buffer */
@@ -336,6 +485,42 @@ static __inline__ ssize_t tun_put_user(s
 		if (memcpy_toiovec(iv, (void *) &pi, sizeof(pi)))
 			return -EFAULT;
 		total += sizeof(pi);
+	}
+	if (tun->flags & TUN_VIRTIO_HDR) {
+		struct virtio_net_hdr gso;
+		struct skb_shared_info *sinfo = skb_shinfo(skb);
+
+		if (skb_is_gso(skb)) {
+			gso.hdr_len = skb_transport_header(skb) - skb->data;
+			gso.gso_size = sinfo->gso_size;
+			if (sinfo->gso_type & SKB_GSO_TCPV4)
+				gso.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
+			else if (sinfo->gso_type & SKB_GSO_TCPV6)
+				gso.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
+			else if (sinfo->gso_type & SKB_GSO_UDP)
+				gso.gso_type = VIRTIO_NET_HDR_GSO_UDP;
+			else
+				BUG();
+			if (sinfo->gso_type & SKB_GSO_TCP_ECN)
+				gso.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
+		} else
+			gso.gso_type = VIRTIO_NET_HDR_GSO_NONE;
+		
+		if (skb->ip_summed == CHECKSUM_PARTIAL) {
+			gso.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
+			gso.csum_start = skb->csum_start - skb_headroom(skb);
+			gso.csum_offset = skb->csum_offset;
+		} else {
+			gso.flags = 0;
+			gso.csum_offset = gso.csum_start = 0;
+		}
+
+		if ((len -= sizeof(gso)) < 0)
+			return -EINVAL;
+
+		if (memcpy_toiovec(iv, (void *)&gso, sizeof(gso)))
+			return -EFAULT;
+		total += sizeof(gso);
 	}
 
 	len = min_t(int, skb->len, len);
@@ -523,6 +708,17 @@ static int tun_set_iff(struct file *file
 
 		tun_net_init(dev);
 
+		/* Virtio header means we can handle csum & gso. */
+		if ((ifr->ifr_flags & (IFF_VIRTIO_HDR|IFF_RECV_CSUM)) ==
+		    (IFF_VIRTIO_HDR|IFF_RECV_CSUM)) {
+			dev->features = NETIF_F_SG | NETIF_F_HW_CSUM |
+					NETIF_F_HIGHDMA | NETIF_F_FRAGLIST;
+
+			if (ifr->ifr_flags & IFF_RECV_GSO)
+				dev->features |= NETIF_F_TSO | NETIF_F_UFO |
+						 NETIF_F_TSO_ECN | NETIF_F_TSO6;
+		}
+
 		if (strchr(dev->name, '%')) {
 			err = dev_alloc_name(dev, dev->name);
 			if (err < 0)
@@ -543,6 +739,15 @@ static int tun_set_iff(struct file *file
 
 	if (ifr->ifr_flags & IFF_ONE_QUEUE)
 		tun->flags |= TUN_ONE_QUEUE;
+
+	if (ifr->ifr_flags & IFF_VIRTIO_HDR)
+		tun->flags |= TUN_VIRTIO_HDR;
+
+	if (ifr->ifr_flags & IFF_RECV_CSUM)
+		tun->flags |= TUN_RECV_CSUM;
+
+	if (ifr->ifr_flags & IFF_RECV_GSO)
+		tun->flags |= TUN_RECV_GSO;
 
 	file->private_data = tun;
 	tun->attached = 1;
diff -r cb85fb035378 include/linux/if_tun.h
--- a/include/linux/if_tun.h	Wed Jan 23 20:06:56 2008 +1100
+++ b/include/linux/if_tun.h	Wed Jan 23 20:12:51 2008 +1100
@@ -70,6 +70,9 @@ struct tun_struct {
 #define TUN_NO_PI	0x0040
 #define TUN_ONE_QUEUE	0x0080
 #define TUN_PERSIST 	0x0100	
+#define TUN_VIRTIO_HDR	0x0200
+#define TUN_RECV_CSUM	0x0400
+#define TUN_RECV_GSO	0x0400
 
 /* Ioctl defines */
 #define TUNSETNOCSUM  _IOW('T', 200, int) 
@@ -85,6 +88,9 @@ struct tun_struct {
 #define IFF_TAP		0x0002
 #define IFF_NO_PI	0x1000
 #define IFF_ONE_QUEUE	0x2000
+#define IFF_VIRTIO_HDR	0x4000
+#define IFF_RECV_CSUM	0x8000
+#define IFF_RECV_GSO	0x0800
 
 struct tun_pi {
 	unsigned short flags;

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH 3/3] Interface to query tun/tap features.
  2008-01-23 14:10 ` [PATCH 2/3] partial checksum and GSO support for tun/tap Rusty Russell
@ 2008-01-23 14:14   ` Rusty Russell
  2008-02-08  5:07     ` Max Krasnyansky
  2008-02-08  5:39   ` [PATCH 2/3] partial checksum and GSO support for tun/tap Max Krasnyansky
  1 sibling, 1 reply; 9+ messages in thread
From: Rusty Russell @ 2008-01-23 14:14 UTC (permalink / raw)
  To: netdev; +Cc: Herbert Xu, virtualization

(No real change, just updated with new bits)

The problem with introducing IFF_RECV_CSUM and IFF_RECV_GSO is that
they need to set dev->features to enable GSO and/or checksumming,
which is supposed to be done before register_netdevice(), ie. as part
of TUNSETIFF.

Unfortunately, TUNSETIFF has always just ignored flags it doesn't understand,
so there's no good way of detecting whether the kernel supports IFF_GSO_HDR.

This patch implements a TUNGETFEATURES ioctl which returns all the valid IFF
flags.  It could be extended later to include other features.

Here's an example program which uses it:

#include <linux/if_tun.h>
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <err.h>
#include <stdio.h>

static struct {
	unsigned int flag;
	const char *name;
} known_flags[] = {
	{ IFF_TUN, "TUN" },
	{ IFF_TAP, "TAP" },
	{ IFF_NO_PI, "NO_PI" },
	{ IFF_ONE_QUEUE, "ONE_QUEUE" },
	{ IFF_VIRTIO_HDR, "VIRTIO_HDR" },
	{ IFF_RECV_CSUM, "RECV_CSUM" },
	{ IFF_RECV_GSO, "RECV_GSO" },
};

int main()
{
	unsigned int features, i;

	int netfd = open("/dev/net/tun", O_RDWR);
	if (netfd < 0)
		err(1, "Opening /dev/net/tun");

	if (ioctl(netfd, TUNGETFEATURES, &features) != 0) {
		printf("Kernel does not support TUNGETFEATURES, guessing\n");
		features = (IFF_TUN|IFF_TAP|IFF_NO_PI|IFF_ONE_QUEUE);
	}
	printf("Available features are: ");
	for (i = 0; i < sizeof(known_flags)/sizeof(known_flags[0]); i++) {
		if (features & known_flags[i].flag) {
			features &= ~known_flags[i].flag;
			printf("%s ", known_flags[i].name);
		}
	}
	if (features)
		printf("(UNKNOWN %#x)", features);
	printf("\n");
	return 0;
}

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/net/tun.c      |    9 +++++++++
 include/linux/if_tun.h |    3 +++
 2 files changed, 12 insertions(+)

diff -r c0e7a8b99325 drivers/net/tun.c
--- a/drivers/net/tun.c	Wed Jan 23 20:12:51 2008 +1100
+++ b/drivers/net/tun.c	Wed Jan 23 20:17:28 2008 +1100
@@ -790,6 +790,15 @@ static int tun_chr_ioctl(struct inode *i
 		return 0;
 	}
 
+	if (cmd == TUNGETFEATURES) {
+		/* Currently this just means: "what IFF flags are valid?".
+		 * This is needed because we never checked for invalid flags on
+		 * TUNSETIFF.  This was introduced with IFF_GSO_HDR, so if a
+		 * kernel doesn't have this ioctl, it doesn't have GSO header
+		 * support. */
+		return put_user(IFF_ALL_FLAGS, (unsigned int __user*)argp);
+	}
+
 	if (!tun)
 		return -EBADFD;
 
diff -r c0e7a8b99325 include/linux/if_tun.h
--- a/include/linux/if_tun.h	Wed Jan 23 20:12:51 2008 +1100
+++ b/include/linux/if_tun.h	Wed Jan 23 20:17:28 2008 +1100
@@ -82,6 +82,7 @@ struct tun_struct {
 #define TUNSETOWNER   _IOW('T', 204, int)
 #define TUNSETLINK    _IOW('T', 205, int)
 #define TUNSETGROUP   _IOW('T', 206, int)
+#define TUNGETFEATURES _IOR('T', 207, unsigned int)
 
 /* TUNSETIFF ifr flags */
 #define IFF_TUN		0x0001
@@ -91,6 +92,8 @@ struct tun_struct {
 #define IFF_VIRTIO_HDR	0x4000
 #define IFF_RECV_CSUM	0x8000
 #define IFF_RECV_GSO	0x0800
+#define IFF_ALL_FLAGS (IFF_TUN | IFF_TAP | IFF_NO_PI | IFF_ONE_QUEUE | \
+		       IFF_VIRTIO_HDR | IFF_RECV_CSUM | IFF_RECV_GSO)
 
 struct tun_pi {
 	unsigned short flags;

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 3/3] Interface to query tun/tap features.
  2008-01-23 14:14   ` [PATCH 3/3] Interface to query tun/tap features Rusty Russell
@ 2008-02-08  5:07     ` Max Krasnyansky
  0 siblings, 0 replies; 9+ messages in thread
From: Max Krasnyansky @ 2008-02-08  5:07 UTC (permalink / raw)
  To: Rusty Russell; +Cc: netdev, Herbert Xu, virtualization

Hi Rusty,

Sorry for delay in reply. I totally missed this one. Need to fix my mail filters. 
See below.

Rusty Russell wrote:
> (No real change, just updated with new bits)
> 
> The problem with introducing IFF_RECV_CSUM and IFF_RECV_GSO is that
> they need to set dev->features to enable GSO and/or checksumming,
> which is supposed to be done before register_netdevice(), ie. as part
> of TUNSETIFF.
> 
> Unfortunately, TUNSETIFF has always just ignored flags it doesn't understand,
> so there's no good way of detecting whether the kernel supports IFF_GSO_HDR.
> 
> This patch implements a TUNGETFEATURES ioctl which returns all the valid IFF
> flags.  It could be extended later to include other features.
> 
> Here's an example program which uses it:
> 
> #include <linux/if_tun.h>
> #include <sys/types.h>
> #include <sys/ioctl.h>
> #include <sys/stat.h>
> #include <fcntl.h>
> #include <err.h>
> #include <stdio.h>
> 
> static struct {
> 	unsigned int flag;
> 	const char *name;
> } known_flags[] = {
> 	{ IFF_TUN, "TUN" },
> 	{ IFF_TAP, "TAP" },
> 	{ IFF_NO_PI, "NO_PI" },
> 	{ IFF_ONE_QUEUE, "ONE_QUEUE" },
> 	{ IFF_VIRTIO_HDR, "VIRTIO_HDR" },
> 	{ IFF_RECV_CSUM, "RECV_CSUM" },
> 	{ IFF_RECV_GSO, "RECV_GSO" },
> };
> 
> int main()
> {
> 	unsigned int features, i;
> 
> 	int netfd = open("/dev/net/tun", O_RDWR);
> 	if (netfd < 0)
> 		err(1, "Opening /dev/net/tun");
> 
> 	if (ioctl(netfd, TUNGETFEATURES, &features) != 0) {
> 		printf("Kernel does not support TUNGETFEATURES, guessing\n");
> 		features = (IFF_TUN|IFF_TAP|IFF_NO_PI|IFF_ONE_QUEUE);
> 	}
> 	printf("Available features are: ");
> 	for (i = 0; i < sizeof(known_flags)/sizeof(known_flags[0]); i++) {
> 		if (features & known_flags[i].flag) {
> 			features &= ~known_flags[i].flag;
> 			printf("%s ", known_flags[i].name);
> 		}
> 	}
> 	if (features)
> 		printf("(UNKNOWN %#x)", features);
> 	printf("\n");
> 	return 0;
> }
> 
> Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
> ---
>  drivers/net/tun.c      |    9 +++++++++
>  include/linux/if_tun.h |    3 +++
>  2 files changed, 12 insertions(+)
> 
> diff -r c0e7a8b99325 drivers/net/tun.c
> --- a/drivers/net/tun.c	Wed Jan 23 20:12:51 2008 +1100
> +++ b/drivers/net/tun.c	Wed Jan 23 20:17:28 2008 +1100
> @@ -790,6 +790,15 @@ static int tun_chr_ioctl(struct inode *i
>  		return 0;
>  	}
>  
> +	if (cmd == TUNGETFEATURES) {
> +		/* Currently this just means: "what IFF flags are valid?".
> +		 * This is needed because we never checked for invalid flags on
> +		 * TUNSETIFF.  This was introduced with IFF_GSO_HDR, so if a
> +		 * kernel doesn't have this ioctl, it doesn't have GSO header
> +		 * support. */
> +		return put_user(IFF_ALL_FLAGS, (unsigned int __user*)argp);
> +	}
> +
>  	if (!tun)
>  		return -EBADFD;
>  
> diff -r c0e7a8b99325 include/linux/if_tun.h
> --- a/include/linux/if_tun.h	Wed Jan 23 20:12:51 2008 +1100
> +++ b/include/linux/if_tun.h	Wed Jan 23 20:17:28 2008 +1100
> @@ -82,6 +82,7 @@ struct tun_struct {
>  #define TUNSETOWNER   _IOW('T', 204, int)
>  #define TUNSETLINK    _IOW('T', 205, int)
>  #define TUNSETGROUP   _IOW('T', 206, int)
> +#define TUNGETFEATURES _IOR('T', 207, unsigned int)
>  
>  /* TUNSETIFF ifr flags */
>  #define IFF_TUN		0x0001
> @@ -91,6 +92,8 @@ struct tun_struct {
>  #define IFF_VIRTIO_HDR	0x4000
>  #define IFF_RECV_CSUM	0x8000
>  #define IFF_RECV_GSO	0x0800
> +#define IFF_ALL_FLAGS (IFF_TUN | IFF_TAP | IFF_NO_PI | IFF_ONE_QUEUE | \
> +		       IFF_VIRTIO_HDR | IFF_RECV_CSUM | IFF_RECV_GSO)
>  
>  struct tun_pi {
>  	unsigned short flags;

Definitely Ack this one. Query interface makes perfect sense.
I'll reply to the GSO itself shortly.

I was going to create git tree for tun changes for awhile now. Looks like the time has come.
I'll do that asap.

Thanx
Max

 



^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 2/3] partial checksum and GSO support for tun/tap.
  2008-01-23 14:10 ` [PATCH 2/3] partial checksum and GSO support for tun/tap Rusty Russell
  2008-01-23 14:14   ` [PATCH 3/3] Interface to query tun/tap features Rusty Russell
@ 2008-02-08  5:39   ` Max Krasnyansky
  2008-03-04  1:02     ` Rusty Russell
  1 sibling, 1 reply; 9+ messages in thread
From: Max Krasnyansky @ 2008-02-08  5:39 UTC (permalink / raw)
  To: Rusty Russell; +Cc: netdev, Herbert Xu, virtualization

Rusty Russell wrote:
> (Changes since last time: we how have explicit IFF_RECV_CSUM and 
> IFF_RECV_GSO bits, and some renaming of virtio_net hdr)
> 
> We use the virtio_net_hdr: it is an ABI already and designed to
> encapsulate such metadata as GSO and partial checksums.
> 
> IFF_VIRTIO_HDR means you will write and read a 'struct virtio_net_hdr'
> at the start of each packet.  You can always write packets with
> partial checksum and gso to the tap device using this header.
> 
> IFF_RECV_CSUM means you can handle reading packets with partial
> checksums.  If IFF_RECV_GSO is also set, it means you can handle
> reading (all types of) GSO packets.
>
> Note that there is no easy way to detect if these flags are supported:
> see next patch.

Again sorry for delay in replying. Here are my thoughts on this.

I like the approach in general. Certainly the part that creates skbs out of the user-space
pages looks good. And it's fits nicely into existing TUN driver model.
However I actually wanted to change the model :). In particular I'm talking about 
	"syscall per packet" 
After messing around with things like libe1000.sf.net I'd like to make TUN/TAP driver look 
more like modern nic's to the user-space. In other words I'm thinking about introducing RX and
TX rings that the user-space can then mmap() and write/read packets descriptors to/from.
That will saves the number of system calls that the user-space app needs to do. That by 
itself saves a lot of overhead, combined with the GSO it's be lightning fast.

I'm going to send you a version that I cooked up awhile ago in a private email. Do not want
to spam netdev :). It's not quite the RX/TX ring model but I'll give you an idea.
I did some profiling and PPS (packets per second) numbers that user-space can handle literally 
sky rocketed.

btw We had a long discussion with Eugeniy Polakov on mapping user-pages vs mmap()ing large
kernel buffer and doing normal memcpy() (ie instead of copy_to/fromuser()) in the kernel.
On small packets overhead of get_user_pages() eats up all the benefits. So we should think
of some scheme that nicely combines the two. Kind of like "copy break" that latest net 
drivers do these days.

Also btw why call it VIRTIO ? For example I'm actually interested in speeding up tunning
and general network apps. We have wireless basestation apps here that need to handle packets
in user-space. Those kind things have nothing to with virtualization.

Max

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 2/3] partial checksum and GSO support for tun/tap.
  2008-02-08  5:39   ` [PATCH 2/3] partial checksum and GSO support for tun/tap Max Krasnyansky
@ 2008-03-04  1:02     ` Rusty Russell
  2008-03-04  5:08       ` Max Krasnyansky
  0 siblings, 1 reply; 9+ messages in thread
From: Rusty Russell @ 2008-03-04  1:02 UTC (permalink / raw)
  To: Max Krasnyansky; +Cc: netdev, Herbert Xu, virtualization

On Friday 08 February 2008 16:39:03 Max Krasnyansky wrote:
> Rusty Russell wrote:
> > (Changes since last time: we how have explicit IFF_RECV_CSUM and
> > IFF_RECV_GSO bits, and some renaming of virtio_net hdr)
> >
> > We use the virtio_net_hdr: it is an ABI already and designed to
> > encapsulate such metadata as GSO and partial checksums.
> >
> > IFF_VIRTIO_HDR means you will write and read a 'struct virtio_net_hdr'
> > at the start of each packet.  You can always write packets with
> > partial checksum and gso to the tap device using this header.
> >
> > IFF_RECV_CSUM means you can handle reading packets with partial
> > checksums.  If IFF_RECV_GSO is also set, it means you can handle
> > reading (all types of) GSO packets.
> >
> > Note that there is no easy way to detect if these flags are supported:
> > see next patch.
>
> Again sorry for delay in replying. Here are my thoughts on this.
>
> I like the approach in general. Certainly the part that creates skbs out of
> the user-space pages looks good. And it's fits nicely into existing TUN
> driver model. However I actually wanted to change the model :). In
> particular I'm talking about "syscall per packet"
> After messing around with things like libe1000.sf.net I'd like to make
> TUN/TAP driver look more like modern nic's to the user-space. In other
> words I'm thinking about introducing RX and TX rings that the user-space
> can then mmap() and write/read packets descriptors to/from. That will saves
> the number of system calls that the user-space app needs to do. That by
> itself saves a lot of overhead, combined with the GSO it's be lightning
> fast.

The problem with this approach is that for what I'm doing, the packets aren't 
nicely arranged somewhere; they're in random process memory.

I thought about further abusing writev and readv to do multiple packets at 
once.  

> btw We had a long discussion with Eugeniy Polakov on mapping user-pages vs
> mmap()ing large kernel buffer and doing normal memcpy() (ie instead of
> copy_to/fromuser()) in the kernel. On small packets overhead of
> get_user_pages() eats up all the benefits. So we should think of some
> scheme that nicely combines the two. Kind of like "copy break" that latest
> net drivers do these days.

Yes, the threshold for copy should probably be set around 128 bytes.

> Also btw why call it VIRTIO ? For example I'm actually interested in
> speeding up tunning and general network apps. We have wireless basestation
> apps here that need to handle packets in user-space. Those kind things have
> nothing to with virtualization.

The structure is for virtio, I'm just borrowing it for tap because it's 
already there.  We could rename it and move it out to its own header, but if 
so we should do that before 2.6.25 is released.

Thanks!
Rusty.


>
> Max



^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 2/3] partial checksum and GSO support for tun/tap.
  2008-03-04  1:02     ` Rusty Russell
@ 2008-03-04  5:08       ` Max Krasnyansky
  2008-03-04  7:47         ` Rusty Russell
  0 siblings, 1 reply; 9+ messages in thread
From: Max Krasnyansky @ 2008-03-04  5:08 UTC (permalink / raw)
  To: Rusty Russell; +Cc: netdev, Herbert Xu, virtualization

Rusty Russell wrote:
> On Friday 08 February 2008 16:39:03 Max Krasnyansky wrote:
>> Rusty Russell wrote:
>>> (Changes since last time: we how have explicit IFF_RECV_CSUM and
>>> IFF_RECV_GSO bits, and some renaming of virtio_net hdr)
>>>
>>> We use the virtio_net_hdr: it is an ABI already and designed to
>>> encapsulate such metadata as GSO and partial checksums.
>>>
>>> IFF_VIRTIO_HDR means you will write and read a 'struct virtio_net_hdr'
>>> at the start of each packet.  You can always write packets with
>>> partial checksum and gso to the tap device using this header.
>>>
>>> IFF_RECV_CSUM means you can handle reading packets with partial
>>> checksums.  If IFF_RECV_GSO is also set, it means you can handle
>>> reading (all types of) GSO packets.
>>>
>>> Note that there is no easy way to detect if these flags are supported:
>>> see next patch.
>> Again sorry for delay in replying. Here are my thoughts on this.
>>
>> I like the approach in general. Certainly the part that creates skbs out of
>> the user-space pages looks good. And it's fits nicely into existing TUN
>> driver model. However I actually wanted to change the model :). In
>> particular I'm talking about "syscall per packet"
>> After messing around with things like libe1000.sf.net I'd like to make
>> TUN/TAP driver look more like modern nic's to the user-space. In other
>> words I'm thinking about introducing RX and TX rings that the user-space
>> can then mmap() and write/read packets descriptors to/from. That will saves
>> the number of system calls that the user-space app needs to do. That by
>> itself saves a lot of overhead, combined with the GSO it's be lightning
>> fast.
> 
> The problem with this approach is that for what I'm doing, the packets aren't 
> nicely arranged somewhere; they're in random process memory.
That's fine. RX/TX descriptors would not contain the data itself. They'd
contain pointers to actual packets (ie just like the NIC takes physical memory
address and DMAs data in/out).
The allows for sending/receiving packets without syscalls and fits nicely with
the async schemes like GSO.

btw The code that I sent you does indeed expect packets to be in a mmap()ed
buffer but I agree that it only works for certain cases. In general it's not
flexible. I was thinking of introducing some flags in the descriptor that tell
the kernel how to handle the packet. ie Whether it needs to be just copied
into a fresh SKB or remapped with get_user_pages().

> I thought about further abusing writev and readv to do multiple packets at 
> once.  
I actually was going to abuse them from day one. At that time Alex Kuznetsov
told me that I'm crazy and I gave up on it :)

>> Also btw why call it VIRTIO ? For example I'm actually interested in
>> speeding up tunning and general network apps. We have wireless basestation
>> apps here that need to handle packets in user-space. Those kind things have
>> nothing to with virtualization.
> 
> The structure is for virtio, I'm just borrowing it for tap because it's 
> already there.  We could rename it and move it out to its own header, but if 
> so we should do that before 2.6.25 is released.
If we do the whole enchilada with the RX/TX rings then we probably do not even
need it. I'm thinking that RX/TX descriptor would include everything you need
for the GSO and stuff.
I meant do not need it for the TUN/TAP driver that is. Is it used anywhere else ?

Max

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 2/3] partial checksum and GSO support for tun/tap.
  2008-03-04  5:08       ` Max Krasnyansky
@ 2008-03-04  7:47         ` Rusty Russell
  2008-03-04 20:08           ` Max Krasnyanskiy
  0 siblings, 1 reply; 9+ messages in thread
From: Rusty Russell @ 2008-03-04  7:47 UTC (permalink / raw)
  To: Max Krasnyansky; +Cc: netdev, Herbert Xu, virtualization

On Tuesday 04 March 2008 16:08:00 Max Krasnyansky wrote:
> > The problem with this approach is that for what I'm doing, the packets
> > aren't nicely arranged somewhere; they're in random process memory.
>
> That's fine. RX/TX descriptors would not contain the data itself. They'd
> contain pointers to actual packets (ie just like the NIC takes physical
> memory address and DMAs data in/out).
> The allows for sending/receiving packets without syscalls and fits nicely
> with the async schemes like GSO.

Yes, yes it does.  That would be a very nice extension (it's orthogonal to 
this patch though, so should we get Dave to take these for 2.6.25?).

And as it happens, virtio already has such a structure: virtio_ring.  See 
linux/virtio_ring.h.

> > The structure is for virtio, I'm just borrowing it for tap because it's
> > already there.  We could rename it and move it out to its own header, but
> > if so we should do that before 2.6.25 is released.
>
> If we do the whole enchilada with the RX/TX rings then we probably do not
> even need it. I'm thinking that RX/TX descriptor would include everything
> you need for the GSO and stuff.
> I meant do not need it for the TUN/TAP driver that is. Is it used anywhere
> else ?

Just for the linux virtio drivers.  Reusing it for tun/tap was an 
afterthought.  It just meant I could pass the same structure straight thru, 
though, which is nice.

The userspace->kernel problem is very similar to the guest->host problem, so 
it doesn't surprise me if we end up with very similar (identical?) 
interfaces.

Take a look at virtio_ring.h, virtio_ring.c and Documentation/lguest/lguest.c 
to see how we use it...

Cheers,
Rusty.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 2/3] partial checksum and GSO support for tun/tap.
  2008-03-04  7:47         ` Rusty Russell
@ 2008-03-04 20:08           ` Max Krasnyanskiy
  0 siblings, 0 replies; 9+ messages in thread
From: Max Krasnyanskiy @ 2008-03-04 20:08 UTC (permalink / raw)
  To: Rusty Russell; +Cc: netdev, Herbert Xu, virtualization

Rusty Russell wrote:
> On Tuesday 04 March 2008 16:08:00 Max Krasnyansky wrote:
>>> The problem with this approach is that for what I'm doing, the packets
>>> aren't nicely arranged somewhere; they're in random process memory.
>> That's fine. RX/TX descriptors would not contain the data itself. They'd
>> contain pointers to actual packets (ie just like the NIC takes physical
>> memory address and DMAs data in/out).
>> The allows for sending/receiving packets without syscalls and fits nicely
>> with the async schemes like GSO.
> 
> Yes, yes it does.  That would be a very nice extension (it's orthogonal to 
> this patch though, so should we get Dave to take these for 2.6.25?).
It's orthogonal in general I agree. The only concern is whether we should keep 
extending existing driver API (ie adding more flags like TUN_NO_PI, etc) or go 
straight to the new/better ring based API. I do not have a strong preference 
one way or the other. So I guess I'm saying I'd be ok with Dave taking them 
for 2.6.25, but that API may be obsolete when the ring based thing comes out.

> And as it happens, virtio already has such a structure: virtio_ring.  See 
> linux/virtio_ring.h.
I'll take a look.

>>> The structure is for virtio, I'm just borrowing it for tap because it's
>>> already there.  We could rename it and move it out to its own header, but
>>> if so we should do that before 2.6.25 is released.
>> If we do the whole enchilada with the RX/TX rings then we probably do not
>> even need it. I'm thinking that RX/TX descriptor would include everything
>> you need for the GSO and stuff.
>> I meant do not need it for the TUN/TAP driver that is. Is it used anywhere
>> else ?
> 
> Just for the linux virtio drivers.  Reusing it for tun/tap was an 
> afterthought.  It just meant I could pass the same structure straight thru, 
> though, which is nice.
> 
> The userspace->kernel problem is very similar to the guest->host problem, so 
> it doesn't surprise me if we end up with very similar (identical?) 
> interfaces.
> 
> Take a look at virtio_ring.h, virtio_ring.c and Documentation/lguest/lguest.c 
> to see how we use it...
Thanx for the pointers.

Max


^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2008-03-04 20:09 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-01-23 14:07 [PATCH 1/3] Cleanup and simplify virtnet header Rusty Russell
2008-01-23 14:10 ` [PATCH 2/3] partial checksum and GSO support for tun/tap Rusty Russell
2008-01-23 14:14   ` [PATCH 3/3] Interface to query tun/tap features Rusty Russell
2008-02-08  5:07     ` Max Krasnyansky
2008-02-08  5:39   ` [PATCH 2/3] partial checksum and GSO support for tun/tap Max Krasnyansky
2008-03-04  1:02     ` Rusty Russell
2008-03-04  5:08       ` Max Krasnyansky
2008-03-04  7:47         ` Rusty Russell
2008-03-04 20:08           ` Max Krasnyanskiy

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).