Netdev List

Netdev List
 help / color / mirror / Atom feed

* [net-next PATCH 4/5] GSO: Support partial segmentation offload
From: Alexander Duyck @ 2016-04-08 20:33 UTC (permalink / raw)
  To: herbert, tom, jesse, alexander.duyck, edumazet, netdev, davem
In-Reply-To: <20160408203013.12838.63429.stgit@ahduyck-xeon-server>

This patch adds support for something I am referring to as GSO partial.
The basic idea is that we can support a broader range of devices for
segmentation if we use fixed outer headers and have the hardware only
really deal with segmenting the inner header.  The idea behind the naming
is due to the fact that everything before csum_start will be fixed headers,
and everything after will be the region that is handled by hardware.

With the current implementation it allows us to add support for the
following GSO types with an inner TSO_MANGLEID or TSO6 offload:
NETIF_F_GSO_GRE
NETIF_F_GSO_GRE_CSUM
NETIF_F_GSO_IPIP
NETIF_F_GSO_SIT
NETIF_F_UDP_TUNNEL
NETIF_F_UDP_TUNNEL_CSUM

In the case of hardware that already supports tunneling we may be able to
extend this further to support TSO_TCPV4 without TSO_MANGLEID if the
hardware can support updating inner IPv4 headers.

Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
---
 include/linux/netdev_features.h |    5 +++++
 include/linux/netdevice.h       |    2 ++
 include/linux/skbuff.h          |    9 +++++++--
 net/core/dev.c                  |   31 ++++++++++++++++++++++++++++++-
 net/core/ethtool.c              |    1 +
 net/core/skbuff.c               |   29 ++++++++++++++++++++++++++++-
 net/ipv4/af_inet.c              |   20 ++++++++++++++++----
 net/ipv4/gre_offload.c          |   26 +++++++++++++++++++++-----
 net/ipv4/tcp_offload.c          |   10 ++++++++--
 net/ipv4/udp_offload.c          |   27 +++++++++++++++++++++------
 net/ipv6/ip6_offload.c          |   10 +++++++++-
 11 files changed, 148 insertions(+), 22 deletions(-)

diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index 7cf272a4b5c8..9fc79df0e561 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -48,6 +48,10 @@ enum {
 	NETIF_F_GSO_SIT_BIT,		/* ... SIT tunnel with TSO */
 	NETIF_F_GSO_UDP_TUNNEL_BIT,	/* ... UDP TUNNEL with TSO */
 	NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT,/* ... UDP TUNNEL with TSO & CSUM */
+	NETIF_F_GSO_PARTIAL_BIT,	/* ... Only segment inner-most L4
+					 *     in hardware and all other
+					 *     headers in software.
+					 */
 	NETIF_F_GSO_TUNNEL_REMCSUM_BIT, /* ... TUNNEL with TSO & REMCSUM */
 	/**/NETIF_F_GSO_LAST =		/* last bit, see GSO_MASK */
 		NETIF_F_GSO_TUNNEL_REMCSUM_BIT,
@@ -122,6 +126,7 @@ enum {
 #define NETIF_F_GSO_UDP_TUNNEL	__NETIF_F(GSO_UDP_TUNNEL)
 #define NETIF_F_GSO_UDP_TUNNEL_CSUM __NETIF_F(GSO_UDP_TUNNEL_CSUM)
 #define NETIF_F_TSO_MANGLEID	__NETIF_F(TSO_MANGLEID)
+#define NETIF_F_GSO_PARTIAL	 __NETIF_F(GSO_PARTIAL)
 #define NETIF_F_GSO_TUNNEL_REMCSUM __NETIF_F(GSO_TUNNEL_REMCSUM)
 #define NETIF_F_HW_VLAN_STAG_FILTER __NETIF_F(HW_VLAN_STAG_FILTER)
 #define NETIF_F_HW_VLAN_STAG_RX	__NETIF_F(HW_VLAN_STAG_RX)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index a3ac84ac8cb0..554efb93f0ed 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1656,6 +1656,7 @@ struct net_device {
 	netdev_features_t	vlan_features;
 	netdev_features_t	hw_enc_features;
 	netdev_features_t	mpls_features;
+	netdev_features_t	gso_partial_features;
 
 	int			ifindex;
 	int			group;
@@ -4006,6 +4007,7 @@ static inline bool net_gso_ok(netdev_features_t features, int gso_type)
 	BUILD_BUG_ON(SKB_GSO_SIT     != (NETIF_F_GSO_SIT >> NETIF_F_GSO_SHIFT));
 	BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL != (NETIF_F_GSO_UDP_TUNNEL >> NETIF_F_GSO_SHIFT));
 	BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL_CSUM != (NETIF_F_GSO_UDP_TUNNEL_CSUM >> NETIF_F_GSO_SHIFT));
+	BUILD_BUG_ON(SKB_GSO_PARTIAL != (NETIF_F_GSO_PARTIAL >> NETIF_F_GSO_SHIFT));
 	BUILD_BUG_ON(SKB_GSO_TUNNEL_REMCSUM != (NETIF_F_GSO_TUNNEL_REMCSUM >> NETIF_F_GSO_SHIFT));
 
 	return (features & feature) == feature;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 5fba16658f9d..da0ace389fec 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -483,7 +483,9 @@ enum {
 
 	SKB_GSO_UDP_TUNNEL_CSUM = 1 << 12,
 
-	SKB_GSO_TUNNEL_REMCSUM = 1 << 13,
+	SKB_GSO_PARTIAL = 1 << 13,
+
+	SKB_GSO_TUNNEL_REMCSUM = 1 << 14,
 };
 
 #if BITS_PER_LONG > 32
@@ -3591,7 +3593,10 @@ static inline struct sec_path *skb_sec_path(struct sk_buff *skb)
  * Keeps track of level of encapsulation of network headers.
  */
 struct skb_gso_cb {
-	int	mac_offset;
+	union {
+		int	mac_offset;
+		int	data_offset;
+	};
 	int	encap_level;
 	__wsum	csum;
 	__u16	csum_start;
diff --git a/net/core/dev.c b/net/core/dev.c
index 235e0f3e34f0..d80010b3828f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2711,6 +2711,19 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
 			return ERR_PTR(err);
 	}
 
+	/* Only report GSO partial support if it will enable us to
+	 * support segmentation on this frame without needing additional
+	 * work.
+	 */
+	if (features & NETIF_F_GSO_PARTIAL) {
+		netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
+		struct net_device *dev = skb->dev;
+
+		partial_features |= dev->features & dev->gso_partial_features;
+		if (!skb_gso_ok(skb, features | partial_features))
+			features &= ~NETIF_F_GSO_PARTIAL;
+	}
+
 	BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
 		     sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
 
@@ -2841,6 +2854,14 @@ netdev_features_t netif_skb_features(struct sk_buff *skb)
 	if (skb->encapsulation)
 		features &= dev->hw_enc_features;
 
+	/* Support for GSO partial features requires software intervention
+	 * before we can actually process the packets so we need to strip
+	 * support for any partial features now and we can pull them back
+	 * in after we have partially segmented the frame.
+	 */
+	if (skb_is_gso(skb) && !(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
+		features &= ~dev->gso_partial_features;
+
 	if (skb_vlan_tagged(skb))
 		features = netdev_intersect_features(features,
 						     dev->vlan_features |
@@ -6707,6 +6728,14 @@ static netdev_features_t netdev_fix_features(struct net_device *dev,
 		}
 	}
 
+	/* GSO partial features require GSO partial be set */
+	if ((features & dev->gso_partial_features) &&
+	    !(features & NETIF_F_GSO_PARTIAL)) {
+		netdev_dbg(dev,
+			   "Dropping partially supported GSO features since no GSO partial.\n");
+		features &= ~dev->gso_partial_features;
+	}
+
 #ifdef CONFIG_NET_RX_BUSY_POLL
 	if (dev->netdev_ops->ndo_busy_poll)
 		features |= NETIF_F_BUSY_POLL;
@@ -6989,7 +7018,7 @@ int register_netdevice(struct net_device *dev)
 
 	/* Make NETIF_F_SG inheritable to tunnel devices.
 	 */
-	dev->hw_enc_features |= NETIF_F_SG;
+	dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
 
 	/* Make NETIF_F_SG inheritable to MPLS.
 	 */
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 9494c41cc77c..e0cf20a3b3dd 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -88,6 +88,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
 	[NETIF_F_GSO_SIT_BIT] =		 "tx-sit-segmentation",
 	[NETIF_F_GSO_UDP_TUNNEL_BIT] =	 "tx-udp_tnl-segmentation",
 	[NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT] = "tx-udp_tnl-csum-segmentation",
+	[NETIF_F_GSO_PARTIAL_BIT] =	 "tx-gso-partial",
 
 	[NETIF_F_FCOE_CRC_BIT] =         "tx-checksum-fcoe-crc",
 	[NETIF_F_SCTP_CRC_BIT] =        "tx-checksum-sctp",
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index d04c2d1c8c87..4cc594cdaada 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3076,8 +3076,9 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
 	struct sk_buff *frag_skb = head_skb;
 	unsigned int offset = doffset;
 	unsigned int tnl_hlen = skb_tnl_header_len(head_skb);
+	unsigned int partial_segs = 0;
 	unsigned int headroom;
-	unsigned int len;
+	unsigned int len = head_skb->len;
 	__be16 proto;
 	bool csum;
 	int sg = !!(features & NETIF_F_SG);
@@ -3094,6 +3095,15 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
 
 	csum = !!can_checksum_protocol(features, proto);
 
+	/* GSO partial only requires that we trim off any excess that
+	 * doesn't fit into an MSS sized block, so take care of that
+	 * now.
+	 */
+	if (features & NETIF_F_GSO_PARTIAL) {
+		partial_segs = len / mss;
+		mss *= partial_segs;
+	}
+
 	headroom = skb_headroom(head_skb);
 	pos = skb_headlen(head_skb);
 
@@ -3281,6 +3291,23 @@ perform_csum_check:
 	 */
 	segs->prev = tail;
 
+	/* Update GSO info on first skb in partial sequence. */
+	if (partial_segs) {
+		int type = skb_shinfo(head_skb)->gso_type;
+
+		/* Update type to add partial and then remove dodgy if set */
+		type |= SKB_GSO_PARTIAL;
+		type &= ~SKB_GSO_DODGY;
+
+		/* Update GSO info and prepare to start updating headers on
+		 * our way back down the stack of protocols.
+		 */
+		skb_shinfo(segs)->gso_size = skb_shinfo(head_skb)->gso_size;
+		skb_shinfo(segs)->gso_segs = partial_segs;
+		skb_shinfo(segs)->gso_type = type;
+		SKB_GSO_CB(segs)->data_offset = skb_headroom(segs) + doffset;
+	}
+
 	/* Following permits correct backpressure, for protocols
 	 * using skb_set_owner_w().
 	 * Idea is to tranfert ownership from head_skb to last segment.
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 8564cab96189..2e6e65fc4d20 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1200,7 +1200,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 	const struct net_offload *ops;
 	unsigned int offset = 0;
 	struct iphdr *iph;
-	int proto;
+	int proto, tot_len;
 	int nhoff;
 	int ihl;
 	int id;
@@ -1219,6 +1219,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 		       SKB_GSO_UDP_TUNNEL_CSUM |
 		       SKB_GSO_TCP_FIXEDID |
 		       SKB_GSO_TUNNEL_REMCSUM |
+		       SKB_GSO_PARTIAL |
 		       0)))
 		goto out;
 
@@ -1273,10 +1274,21 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 			if (skb->next)
 				iph->frag_off |= htons(IP_MF);
 			offset += skb->len - nhoff - ihl;
-		} else if (!fixedid) {
-			iph->id = htons(id++);
+			tot_len = skb->len - nhoff;
+		} else if (skb_is_gso(skb)) {
+			if (!fixedid) {
+				iph->id = htons(id);
+				id += skb_shinfo(skb)->gso_segs;
+			}
+			tot_len = skb_shinfo(skb)->gso_size +
+				  SKB_GSO_CB(skb)->data_offset +
+				  skb->head - (unsigned char *)iph;
+		} else {
+			if (!fixedid)
+				iph->id = htons(id++);
+			tot_len = skb->len - nhoff;
 		}
-		iph->tot_len = htons(skb->len - nhoff);
+		iph->tot_len = htons(tot_len);
 		ip_send_check(iph);
 		if (encap)
 			skb_reset_inner_headers(skb);
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index 6376b0cdf693..20557f211408 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -36,7 +36,8 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
 				  SKB_GSO_GRE |
 				  SKB_GSO_GRE_CSUM |
 				  SKB_GSO_IPIP |
-				  SKB_GSO_SIT)))
+				  SKB_GSO_SIT |
+				  SKB_GSO_PARTIAL)))
 		goto out;
 
 	if (!skb->encapsulation)
@@ -87,7 +88,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
 	skb = segs;
 	do {
 		struct gre_base_hdr *greh;
-		__be32 *pcsum;
+		__sum16 *pcsum;
 
 		/* Set up inner headers if we are offloading inner checksum */
 		if (skb->ip_summed == CHECKSUM_PARTIAL) {
@@ -107,10 +108,25 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
 			continue;
 
 		greh = (struct gre_base_hdr *)skb_transport_header(skb);
-		pcsum = (__be32 *)(greh + 1);
+		pcsum = (__sum16 *)(greh + 1);
+
+		if (skb_is_gso(skb)) {
+			unsigned int partial_adj;
+
+			/* Adjust checksum to account for the fact that
+			 * the partial checksum is based on actual size
+			 * whereas headers should be based on MSS size.
+			 */
+			partial_adj = skb->len + skb_headroom(skb) -
+				      SKB_GSO_CB(skb)->data_offset -
+				      skb_shinfo(skb)->gso_size;
+			*pcsum = ~csum_fold((__force __wsum)htonl(partial_adj));
+		} else {
+			*pcsum = 0;
+		}
 
-		*pcsum = 0;
-		*(__sum16 *)pcsum = gso_make_checksum(skb, 0);
+		*(pcsum + 1) = 0;
+		*pcsum = gso_make_checksum(skb, 0);
 	} while ((skb = skb->next));
 out:
 	return segs;
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index d1ffd55289bd..02737b607aa7 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -109,6 +109,12 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
 		goto out;
 	}
 
+	/* GSO partial only requires splitting the frame into an MSS
+	 * multiple and possibly a remainder.  So update the mss now.
+	 */
+	if (features & NETIF_F_GSO_PARTIAL)
+		mss = skb->len - (skb->len % mss);
+
 	copy_destructor = gso_skb->destructor == tcp_wfree;
 	ooo_okay = gso_skb->ooo_okay;
 	/* All segments but the first should have ooo_okay cleared */
@@ -133,7 +139,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
 	newcheck = ~csum_fold((__force __wsum)((__force u32)th->check +
 					       (__force u32)delta));
 
-	do {
+	while (skb->next) {
 		th->fin = th->psh = 0;
 		th->check = newcheck;
 
@@ -153,7 +159,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
 
 		th->seq = htonl(seq);
 		th->cwr = 0;
-	} while (skb->next);
+	}
 
 	/* Following permits TCP Small Queues to work well with GSO :
 	 * The callback to TCP stack will be called at the time last frag
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 6230cf4b0d2d..097060def7f0 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -39,8 +39,11 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
 	 * 16 bit length field due to the header being added outside of an
 	 * IP or IPv6 frame that was already limited to 64K - 1.
 	 */
-	partial = csum_sub(csum_unfold(uh->check),
-			   (__force __wsum)htonl(skb->len));
+	if (skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL)
+		partial = (__force __wsum)uh->len;
+	else
+		partial = (__force __wsum)htonl(skb->len);
+	partial = csum_sub(csum_unfold(uh->check), partial);
 
 	/* setup inner skb. */
 	skb->encapsulation = 0;
@@ -89,7 +92,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
 	udp_offset = outer_hlen - tnl_hlen;
 	skb = segs;
 	do {
-		__be16 len;
+		unsigned int len;
 
 		if (remcsum)
 			skb->ip_summed = CHECKSUM_NONE;
@@ -107,14 +110,26 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
 		skb_reset_mac_header(skb);
 		skb_set_network_header(skb, mac_len);
 		skb_set_transport_header(skb, udp_offset);
-		len = htons(skb->len - udp_offset);
+		len = skb->len - udp_offset;
 		uh = udp_hdr(skb);
-		uh->len = len;
+
+		/* If we are only performing partial GSO the inner header
+		 * will be using a length value equal to only one MSS sized
+		 * segment instead of the entire frame.
+		 */
+		if (skb_is_gso(skb)) {
+			uh->len = htons(skb_shinfo(skb)->gso_size +
+					SKB_GSO_CB(skb)->data_offset +
+					skb->head - (unsigned char *)uh);
+		} else {
+			uh->len = htons(len);
+		}
 
 		if (!need_csum)
 			continue;
 
-		uh->check = ~csum_fold(csum_add(partial, (__force __wsum)len));
+		uh->check = ~csum_fold(csum_add(partial,
+				       (__force __wsum)htonl(len)));
 
 		if (skb->encapsulation || !offload_csum) {
 			uh->check = gso_make_checksum(skb, ~uh->check);
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index 061adcda65f3..f5eb184e1093 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -63,6 +63,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
 	int proto;
 	struct frag_hdr *fptr;
 	unsigned int unfrag_ip6hlen;
+	unsigned int payload_len;
 	u8 *prevhdr;
 	int offset = 0;
 	bool encap, udpfrag;
@@ -82,6 +83,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
 		       SKB_GSO_UDP_TUNNEL |
 		       SKB_GSO_UDP_TUNNEL_CSUM |
 		       SKB_GSO_TUNNEL_REMCSUM |
+		       SKB_GSO_PARTIAL |
 		       0)))
 		goto out;
 
@@ -118,7 +120,13 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
 
 	for (skb = segs; skb; skb = skb->next) {
 		ipv6h = (struct ipv6hdr *)(skb_mac_header(skb) + nhoff);
-		ipv6h->payload_len = htons(skb->len - nhoff - sizeof(*ipv6h));
+		if (skb_is_gso(skb))
+			payload_len = skb_shinfo(skb)->gso_size +
+				      SKB_GSO_CB(skb)->data_offset +
+				      skb->head - (unsigned char *)(ipv6h + 1);
+		else
+			payload_len = skb->len - nhoff - sizeof(*ipv6h);
+		ipv6h->payload_len = htons(payload_len);
 		skb->network_header = (u8 *)ipv6h - skb->head;
 
 		if (udpfrag) {

^ permalink raw reply related

* [net-next PATCH 5/5] Documentation: Add documentation for TSO and GSO features
From: Alexander Duyck @ 2016-04-08 20:33 UTC (permalink / raw)
  To: herbert, tom, jesse, alexander.duyck, edumazet, netdev, davem
In-Reply-To: <20160408203013.12838.63429.stgit@ahduyck-xeon-server>

This document is a starting point for defining the TSO and GSO features.
The whole thing is starting to get a bit messy so I wanted to make sure we
have notes somwhere to start describing what does and doesn't work.

Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
---
 Documentation/networking/segmentation-offloads.txt |  130 ++++++++++++++++++++
 1 file changed, 130 insertions(+)
 create mode 100644 Documentation/networking/segmentation-offloads.txt

diff --git a/Documentation/networking/segmentation-offloads.txt b/Documentation/networking/segmentation-offloads.txt
new file mode 100644
index 000000000000..f200467ade38
--- /dev/null
+++ b/Documentation/networking/segmentation-offloads.txt
@@ -0,0 +1,130 @@
+Segmentation Offloads in the Linux Networking Stack
+
+Introduction
+============
+
+This document describes a set of techniques in the Linux networking stack
+to take advantage of segmentation offload capabilities of various NICs.
+
+The following technologies are described:
+ * TCP Segmentation Offload - TSO
+ * UDP Fragmentation Offload - UFO
+ * IPIP, SIT, GRE, and UDP Tunnel Offloads
+ * Generic Segmentation Offload - GSO
+ * Generic Receive Offload - GRO
+ * Partial Generic Segmentation Offload - GSO_PARTIAL
+
+TCP Segmentation Offload
+========================
+
+TCP segmentation allows a device to segment a single frame into multiple
+frames with a data payload size specified in skb_shinfo()->gso_size.
+When TCP segmentation requested the bit for either SKB_GSO_TCP or
+SKB_GSO_TCP6 should be set in skb_shinfo()->gso_type and
+skb_shinfo()->gso_size should be set to a non-zero value.
+
+TCP segmentation is dependent on support for the use of partial checksum
+offload.  For this reason TSO is normally disabled if the Tx checksum
+offload for a given device is disabled.
+
+In order to support TCP segmentation offload it is necessary to populate
+the network and transport header offsets of the skbuff so that the device
+drivers will be able determine the offsets of the IP or IPv6 header and the
+TCP header.  In addition as CHECKSUM_PARTIAL is required csum_start should
+also point to the TCP header of the packet.
+
+For IPv4 segmentation we support one of two types in terms of the IP ID.
+The default behavior is to increment the IP ID with every segment.  If the
+GSO type SKB_GSO_TCP_FIXEDID is specified then we will not increment the IP
+ID and all segments will use the same IP ID.  If a device has
+NETIF_F_TSO_MANGLEID set then the IP ID can be ignored when performing TSO
+and we will either increment the IP ID for all frames, or leave it at a
+static value based on driver preference.
+
+UDP Fragmentation Offload
+=========================
+
+UDP fragmentation offload allows a device to fragment an oversized UDP
+datagram into multiple IPv4 fragments.  Many of the requirements for UDP
+fragmentation offload are the same as TSO.  However the IPv4 ID for
+fragments should not increment as a single IPv4 datagram is fragmented.
+
+IPIP, SIT, GRE, UDP Tunnel, and Remote Checksum Offloads
+========================================================
+
+In addition to the offloads described above it is possible for a frame to
+contain additional headers such as an outer tunnel.  In order to account
+for such instances an additional set of segmentation offload types were
+introduced including SKB_GSO_IPIP, SKB_GSO_SIT, SKB_GSO_GRE, and
+SKB_GSO_UDP_TUNNEL.  These extra segmentation types are used to identify
+cases where there are more than just 1 set of headers.  For example in the
+case of IPIP and SIT we should have the network and transport headers moved
+from the standard list of headers to "inner" header offsets.
+
+Currently only two levels of headers are supported.  The convention is to
+refer to the tunnel headers as the outer headers, while the encapsulated
+data is normally referred to as the inner headers.  Below is the list of
+calls to access the given headers:
+
+IPIP/SIT Tunnel:
+		Outer			Inner
+MAC		skb_mac_header
+Network		skb_network_header	skb_inner_network_header
+Transport	skb_transport_header
+
+UDP/GRE Tunnel:
+		Outer			Inner
+MAC		skb_mac_header		skb_inner_mac_header
+Network		skb_network_header	skb_inner_network_header
+Transport	skb_transport_header	skb_inner_transport_header
+
+In addition to the above tunnel types there are also SKB_GSO_GRE_CSUM and
+SKB_GSO_UDP_TUNNEL_CSUM.  These two additional tunnel types reflect the
+fact that the outer header also requests to have a non-zero checksum
+included in the outer header.
+
+Finally there is SKB_GSO_REMCSUM which indicates that a given tunnel header
+has requested a remote checksum offload.  In this case the inner headers
+will be left with a partial checksum and only the outer header checksum
+will be computed.
+
+Generic Segmentation Offload
+============================
+
+Generic segmentation offload is a pure software offload that is meant to
+deal with cases where device drivers cannot perform the offloads described
+above.  What occurs in GSO is that a given skbuff will have its data broken
+out over multiple skbuffs that have been resized to match the MSS provided
+via skb_shinfo()->gso_size.
+
+Before enabling any hardware segmentation offload a corresponding software
+offload is required in GSO.  Otherwise it becomes possible for a frame to
+be re-routed between devices and end up being unable to be transmitted.
+
+Generic Receive Offload
+=======================
+
+Generic receive offload is the complement to GSO.  Ideally any frame
+assembled by GRO should be segmented to create an identical sequence of
+frames using GSO, and any sequence of frames segmented by GSO should be
+able to be reassembled back to the original by GRO.  The only exception to
+this is IPv4 ID in the case that the DF bit is set for a given IP header.
+If the value of the IPv4 ID is not sequentially incrementing it will be
+altered so that it is when a frame assembled via GRO is segmented via GSO.
+
+Partial Generic Segmentation Offload
+====================================
+
+Partial generic segmentation offload is a hybrid between TSO and GSO.  What
+it effectively does is take advantage of certain traits of TCP and tunnels
+so that instead of having to rewrite the packet headers for each segment
+only the inner-most transport header and possibly the outer-most network
+header need to be updated.  This allows devices that do not support tunnel
+offloads or tunnel offloads with checksum to still make use of segmentation.
+
+With the partial offload what occurs is that all headers excluding the
+inner transport header are updated such that they will contain the correct
+values for if the header was simply duplicated.  The one exception to this
+is the outer IPv4 ID field.  It is up to the device drivers to guarantee
+that the IPv4 ID field is incremented in the case that a given header does
+not have the DF bit set.

^ permalink raw reply related

* Re: [PATCH] net: thunderx: Fix broken of_node_put() code.
From: David Daney @ 2016-04-08 20:34 UTC (permalink / raw)
  To: David Miller
  Cc: ddaney.cavm, netdev, linux-kernel, linux-arm-kernel, rric,
	sgoutham, david.daney
In-Reply-To: <20160408.161557.850174112269792582.davem@davemloft.net>

On 04/08/2016 01:15 PM, David Miller wrote:
> From: David Daney <ddaney@caviumnetworks.com>
> Date: Fri, 8 Apr 2016 09:41:35 -0700
>
>> Due to mail server malfunction, this patch was sent twice.  Please
>> ignore this duplicate.
>
> This submission had another problem too.
>
> Do not use the date of your commit as the date that gets put into
> your email headers.

I don't.  This is standard git-send-email 1.7.11.7.

>
> This makes all of your patch submissions look like they occurred in
> the past, and this mixes up the ordering of patches in patchwork.

They did occur in the past.  Just like all e-mail you read, they were 
sent before you read them.

I ran git-send-email for this on  Thu, 31 Mar 2016 18:01:57 -0700.  I 
observed that the patch didn't seem to make it to the public lists, so I 
figured I screwed something up and I sent it again, with the same results.

Then I went on vacation, and came back today to sort everything out.  My 
MTA had died, so I restarted it, and ... the backlog of messages was 
sent and you read it.

>
> So please resubmit this properly with a normal, current, date in your
> email headers.

OK, I will resend the identical patch for the third time...

^ permalink raw reply

* Re: [PATCH v3] route: do not cache fib route info on local routes with oif
From: Julian Anastasov @ 2016-04-08 20:35 UTC (permalink / raw)
  To: Chris Friesen; +Cc: netdev
In-Reply-To: <57080F7E.809@windriver.com>


	Hello,

On Fri, 8 Apr 2016, Chris Friesen wrote:

> For local routes that require a particular output interface we do not want to
> cache the result.  Caching the result causes incorrect behaviour when there are
> multiple source addresses on the interface.  The end result being that if the
> intended recipient is waiting on that interface for the packet he won't receive
> it because it will be delivered on the loopback interface and the IP_PKTINFO
> ipi_ifindex will be set to the loopback interface as well.
> 
> This can be tested by running a program such as "dhcp_release" which attempts
> to inject a packet on a particular interface so that it is received by another
> program on the same board.  The receiving process should see an IP_PKTINFO
> ipi_ifndex value of the source interface (e.g., eth1) instead of the loopback
> interface (e.g., lo).  The packet will still appear on the loopback interface
> in tcpdump but the important aspect is that the CMSG info is correct.
> 
> Sample dhcp_release command line:
> 
>    dhcp_release eth1 192.168.204.222 02:11:33:22:44:66
> 
> Signed-off-by: Allain Legacy <allain.legacy@windriver.com>
> Signed off-by: Chris Friesen <chris.friesen@windriver.com>

	Sorry, forgot to mention that patch has other errors:

scripts/checkpatch.pl --strict /tmp/file3.patch

	Its recommendations:

- prefer a maximum 75 chars per line
- code indent should use tabs where possible

> ---
>  net/ipv4/route.c | 12 ++++++++++++
>  1 file changed, 12 insertions(+)
> 
> diff --git a/net/ipv4/route.c b/net/ipv4/route.c
> index 02c6229..437a377 100644
> --- a/net/ipv4/route.c
> +++ b/net/ipv4/route.c
> @@ -2045,6 +2045,18 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
>  		 */
>  		if (fi && res->prefixlen < 4)
>  			fi = NULL;
> +	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
> +		   (orig_oif != dev_out->ifindex)) {
> +		/* For local routes that require a particular output interface
> +                 * we do not want to cache the result.  Caching the result
> +                 * causes incorrect behaviour when there are multiple source
> +                 * addresses on the interface, the end result being that if the
> +                 * intended recipient is waiting on that interface for the
> +                 * packet he won't receive it because it will be delivered on
> +                 * the loopback interface and the IP_PKTINFO ipi_ifindex will
> +                 * be set to the loopback interface as well.
> +		 */
> +		fi = NULL;
>  	}
>  
>  	fnhe = NULL;

Regards

^ permalink raw reply

* Re: [PATCH net-next] net: bcmgenet: add BQL support
From: David Miller @ 2016-04-08 20:36 UTC (permalink / raw)
  To: pgynther; +Cc: netdev, f.fainelli, opendmb, jaedon.shin
In-Reply-To: <1459903801-83727-1-git-send-email-pgynther@google.com>

From: Petri Gynther <pgynther@google.com>
Date: Tue,  5 Apr 2016 17:50:01 -0700

> Add Byte Queue Limits (BQL) support to bcmgenet driver.
> 
> Signed-off-by: Petri Gynther <pgynther@google.com>

As Eric Dumazet indicated, your ->ndo_init() code to reset the queues is
probably not necessary at all.

^ permalink raw reply

* [PATCH] net: thunderx: Fix broken of_node_put() code.
From: David Daney @ 2016-04-08 20:37 UTC (permalink / raw)
  To: David S. Miller, netdev
  Cc: linux-kernel, linux-arm-kernel, Robert Richter, Sunil Goutham,
	David Daney

From: David Daney <david.daney@cavium.com>

commit b7d3e3d3d21a ("net: thunderx: Don't leak phy device references
on -EPROBE_DEFER condition.") incorrectly moved the call to
of_node_put() outside of the loop.  Under normal loop exit, the node
has already had of_node_put() called, so the extra call results in:

[    8.228020] ERROR: Bad of_node_put() on /soc@0/pci@848000000000/mrml-bridge0@1,0/bgx0/xlaui00
[    8.239433] CPU: 16 PID: 608 Comm: systemd-udevd Not tainted 4.6.0-rc1-numa+ #157
[    8.247380] Hardware name: www.cavium.com EBB8800/EBB8800, BIOS 0.3 Mar  2 2016
[    8.273541] Call trace:
[    8.273550] [<fffffc0008097364>] dump_backtrace+0x0/0x210
[    8.273557] [<fffffc0008097598>] show_stack+0x24/0x2c
[    8.273560] [<fffffc0008399ed0>] dump_stack+0x8c/0xb4
[    8.273566] [<fffffc00085aa828>] of_node_release+0xa8/0xac
[    8.273570] [<fffffc000839cad8>] kobject_cleanup+0x8c/0x194
[    8.273573] [<fffffc000839c97c>] kobject_put+0x44/0x6c
[    8.273576] [<fffffc00085a9ab0>] of_node_put+0x24/0x30
[    8.273587] [<fffffc0000bd0f74>] bgx_probe+0x17c/0xcd8 [thunder_bgx]
[    8.273591] [<fffffc00083ed220>] pci_device_probe+0xa0/0x114
[    8.273596] [<fffffc0008473fbc>] driver_probe_device+0x178/0x418
[    8.273599] [<fffffc000847435c>] __driver_attach+0x100/0x118
[    8.273602] [<fffffc0008471b58>] bus_for_each_dev+0x6c/0xac
[    8.273605] [<fffffc0008473884>] driver_attach+0x30/0x38
[    8.273608] [<fffffc00084732f4>] bus_add_driver+0x1f8/0x29c
[    8.273611] [<fffffc0008475028>] driver_register+0x70/0x110
[    8.273617] [<fffffc00083ebf08>] __pci_register_driver+0x60/0x6c
[    8.273623] [<fffffc0000bf0040>] bgx_init_module+0x40/0x48 [thunder_bgx]
[    8.273626] [<fffffc0008090d04>] do_one_initcall+0xcc/0x1c0
[    8.273631] [<fffffc0008198abc>] do_init_module+0x68/0x1c8
[    8.273635] [<fffffc0008125668>] load_module+0xf44/0x11f4
[    8.273638] [<fffffc0008125b64>] SyS_finit_module+0xb8/0xe0
[    8.273641] [<fffffc0008093b30>] el0_svc_naked+0x24/0x28

Go back to the previous (correct) code that only did the extra
of_node_put() call on early exit from the loop.

Signed-off-by: David Daney <david.daney@cavium.com>
---
 drivers/net/ethernet/cavium/thunder/thunder_bgx.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/cavium/thunder/thunder_bgx.c b/drivers/net/ethernet/cavium/thunder/thunder_bgx.c
index 9679515..d20539a 100644
--- a/drivers/net/ethernet/cavium/thunder/thunder_bgx.c
+++ b/drivers/net/ethernet/cavium/thunder/thunder_bgx.c
@@ -1011,10 +1011,11 @@ static int bgx_init_of_phy(struct bgx *bgx)
 		}
 
 		lmac++;
-		if (lmac == MAX_LMAC_PER_BGX)
+		if (lmac == MAX_LMAC_PER_BGX) {
+			of_node_put(node);
 			break;
+		}
 	}
-	of_node_put(node);
 	return 0;
 
 defer:
-- 
1.8.3.1

^ permalink raw reply related

* Re: [PATCH net] vxlan: synchronously and race-free destruction of vxlan sockets
From: Eric Dumazet @ 2016-04-08 20:40 UTC (permalink / raw)
  To: Hannes Frederic Sowa; +Cc: Marcelo Ricardo Leitner, netdev, Jiri Benc
In-Reply-To: <570814EF.8060308@stressinduktion.org>

On Fri, 2016-04-08 at 22:30 +0200, Hannes Frederic Sowa wrote:
> Hi Marcelo,
> ng rtnl?
> 
> I thought about that and try not to use synchronize_rcu, but I don't see 
> any other way. Anyway, ndo_stop isn't really fast path and is used to 
> shut the interface down. Also since we have lwtunnels we don't really 
> need a lot of interfaces created and torn down.
> 
> But I could switch to synchronize_rcu_expedited here.
> 
> Also we have another synchronize_rcu during device dismantling, maybe we 
> can split ndo_stop into two callbacks, one preparing for stopping and 
> the other one after the synchronize_rcu when we safely can free resources.
> 
> I will investigate this but for the mean time I think this patch is 
> already improving things as user space can bind the socket again when 
> the dellink command returned.

Of course, we have synchronize_net() which specifically put in a single
point the knowledge (rtnl being held or not)

^ permalink raw reply

* Re: pull-request: mac80211 2016-04-06
From: David Miller @ 2016-04-08 20:41 UTC (permalink / raw)
  To: johannes; +Cc: netdev, linux-wireless
In-Reply-To: <1459948799-22032-1-git-send-email-johannes@sipsolutions.net>

From: Johannes Berg <johannes@sipsolutions.net>
Date: Wed,  6 Apr 2016 15:19:58 +0200

> First set of fixes for 4.6. Nothing really stands out.
> 
> Let me know if there's any problem.

Pulled, thanks Johannes.

^ permalink raw reply

* Re: pull-request: mac80211-next 2016-04-06
From: David Miller @ 2016-04-08 20:43 UTC (permalink / raw)
  To: johannes; +Cc: netdev, linux-wireless
In-Reply-To: <1459949210.17504.77.camel@sipsolutions.net>

From: Johannes Berg <johannes@sipsolutions.net>
Date: Wed, 06 Apr 2016 15:26:50 +0200

> On Wed, 2016-04-06 at 15:25 +0200, Johannes Berg wrote:
>> Hi Dave,
>> 
>> For the 4.6 cycle, there's of course much more. The few things that
>> 
> 
> Err, -next, so that's 4.7.

Pulled, and I fixed the version number in the merge commit message.

Thanks.

^ permalink raw reply

* Re: [PATCH net-next v2 0/3] net: dsa: voidify STP setter and FDB/VLAN add ops
From: David Miller @ 2016-04-08 20:51 UTC (permalink / raw)
  To: vivien.didelot
  Cc: netdev, linux-kernel, kernel, andrew, f.fainelli, jiri, sfeldma
In-Reply-To: <1459958105-1090-1-git-send-email-vivien.didelot@savoirfairelinux.com>

From: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Date: Wed,  6 Apr 2016 11:55:02 -0400

> Neither the DSA layer nor the bridge code (see br_set_state) really care
> about eventual errors from STP state setters, so make it void.
> 
> The DSA layer separates the prepare and commit phases of switchdev in
> two different functions. Logical errors must not happen in commit
> routines, so make them void.
> 
> Changes v1 -> v2:
>   - rename port_stp_update to port_stp_state_set
>   - don't change code flow of bcm_sf2_sw_br_set_stp_state
>   - prefer netdev_err over netdev_warn

Series applied.

^ permalink raw reply

* Re: [PATCH net-next v2] net: dsa: document missing functions
From: David Miller @ 2016-04-08 20:51 UTC (permalink / raw)
  To: vivien.didelot; +Cc: netdev, linux-kernel, kernel, andrew, f.fainelli
In-Reply-To: <1459955180-12083-1-git-send-email-vivien.didelot@savoirfairelinux.com>

From: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Date: Wed,  6 Apr 2016 11:06:20 -0400

> Add description for the missing port_vlan_prepare, port_fdb_prepare,
> port_fdb_dump functions in the DSA documentation.
> 
> Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>

Applied.

^ permalink raw reply

* [PATCH net-next v2] vxlan: synchronously and race-free destruction of vxlan sockets
From: Hannes Frederic Sowa @ 2016-04-08 20:55 UTC (permalink / raw)
  To: netdev; +Cc: Eric Dumazet, Jiri Benc, Marcelo Ricardo Leitner

Due to the fact that the udp socket is destructed asynchronously in a
work queue, we have some nondeterministic behavior during shutdown of
vxlan tunnels and creating new ones. Fix this by keeping the destruction
process synchronous in regards to the user space process so IFF_UP can
be reliably set.

udp_tunnel_sock_release destroys vs->sock->sk if reference counter
indicates so. We expect to have the same lifetime of vxlan_sock and
vxlan_sock->sock->sk even in fast paths with only rcu locks held. So
only destruct the whole socket after we can be sure it cannot be found
by searching vxlan_net->sock_list.

Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Jiri Benc <jbenc@redhat.com>
Cc: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
---
v2) synchronize_rcu -> synchronize_net (proposed by Eric, thanks!)
    also rebased on net-next to apply without conflicts
 drivers/net/vxlan.c | 20 +++-----------------
 include/net/vxlan.h |  2 --
 2 files changed, 3 insertions(+), 19 deletions(-)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 9f3634064c921f..77ba31a0e44f97 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -98,7 +98,6 @@ struct vxlan_fdb {
 
 /* salt for hash table */
 static u32 vxlan_salt __read_mostly;
-static struct workqueue_struct *vxlan_wq;
 
 static inline bool vxlan_collect_metadata(struct vxlan_sock *vs)
 {
@@ -1053,7 +1052,9 @@ static void __vxlan_sock_release(struct vxlan_sock *vs)
 	vxlan_notify_del_rx_port(vs);
 	spin_unlock(&vn->sock_lock);
 
-	queue_work(vxlan_wq, &vs->del_work);
+	synchronize_net();
+	udp_tunnel_sock_release(vs->sock);
+	kfree(vs);
 }
 
 static void vxlan_sock_release(struct vxlan_dev *vxlan)
@@ -2674,13 +2675,6 @@ static const struct ethtool_ops vxlan_ethtool_ops = {
 	.get_link	= ethtool_op_get_link,
 };
 
-static void vxlan_del_work(struct work_struct *work)
-{
-	struct vxlan_sock *vs = container_of(work, struct vxlan_sock, del_work);
-	udp_tunnel_sock_release(vs->sock);
-	kfree_rcu(vs, rcu);
-}
-
 static struct socket *vxlan_create_sock(struct net *net, bool ipv6,
 					__be16 port, u32 flags)
 {
@@ -2726,8 +2720,6 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, bool ipv6,
 	for (h = 0; h < VNI_HASH_SIZE; ++h)
 		INIT_HLIST_HEAD(&vs->vni_list[h]);
 
-	INIT_WORK(&vs->del_work, vxlan_del_work);
-
 	sock = vxlan_create_sock(net, ipv6, port, flags);
 	if (IS_ERR(sock)) {
 		pr_info("Cannot bind port %d, err=%ld\n", ntohs(port),
@@ -3346,10 +3338,6 @@ static int __init vxlan_init_module(void)
 {
 	int rc;
 
-	vxlan_wq = alloc_workqueue("vxlan", 0, 0);
-	if (!vxlan_wq)
-		return -ENOMEM;
-
 	get_random_bytes(&vxlan_salt, sizeof(vxlan_salt));
 
 	rc = register_pernet_subsys(&vxlan_net_ops);
@@ -3370,7 +3358,6 @@ out3:
 out2:
 	unregister_pernet_subsys(&vxlan_net_ops);
 out1:
-	destroy_workqueue(vxlan_wq);
 	return rc;
 }
 late_initcall(vxlan_init_module);
@@ -3379,7 +3366,6 @@ static void __exit vxlan_cleanup_module(void)
 {
 	rtnl_link_unregister(&vxlan_link_ops);
 	unregister_netdevice_notifier(&vxlan_notifier_block);
-	destroy_workqueue(vxlan_wq);
 	unregister_pernet_subsys(&vxlan_net_ops);
 	/* rcu_barrier() is called by netns */
 }
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 2f168f0ea32c39..d442eb3129cde4 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -184,9 +184,7 @@ struct vxlan_metadata {
 /* per UDP socket information */
 struct vxlan_sock {
 	struct hlist_node hlist;
-	struct work_struct del_work;
 	struct socket	 *sock;
-	struct rcu_head	  rcu;
 	struct hlist_head vni_list[VNI_HASH_SIZE];
 	atomic_t	  refcnt;
 	u32		  flags;
-- 
2.5.5

^ permalink raw reply related

* Re: [PATCH v4 1/2] RDS: memory allocated must be align to 8
From: santosh shilimkar @ 2016-04-08 20:55 UTC (permalink / raw)
  To: David Miller; +Cc: shamir.rabinovitch, rds-devel, netdev
In-Reply-To: <20160408.161055.212162741537870508.davem@davemloft.net>

On 4/8/2016 1:10 PM, David Miller wrote:
> From: santosh shilimkar <santosh.shilimkar@oracle.com>
> Date: Fri, 8 Apr 2016 12:44:39 -0700
>
>> On 4/7/2016 4:57 AM, Shamir Rabinovitch wrote:
>>> Fix issue in 'rds_ib_cong_recv' when accessing unaligned memory
>>> allocated by 'rds_page_remainder_alloc' using uint64_t pointer.
>>>
>> Sorry I still didn't follow this change still. What exactly is the
>> problem.
>
> You can't stop the offset at non-8byte intervals, because the chunks
> being used in these arenas can have 64-bit values in it, which must be
> 8-byte aligned.
>
I see. Thanks for explaining it Dave.
Its fine to apply then.
Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>

Regards,
Santosh

^ permalink raw reply

* [next-queue PATCH 0/3] Add support for GSO partial to Intel NIC drivers
From: Alexander Duyck @ 2016-04-08 21:06 UTC (permalink / raw)
  To: herbert, tom, jesse, alexander.duyck, edumazet, intel-wired-lan,
	jeffrey.t.kirsher, netdev, davem
In-Reply-To: <20160408203013.12838.63429.stgit@ahduyck-xeon-server>

So these are the patches needed to enable tunnel segmentation offloads on
the igb, igbvf, ixgbe, and ixgbevf drivers.  In addition this patch extends
the i40e and i40evf drivers to include segmentation support for tunnels
with outer checksums.

The net performance gain for these patches are pretty significant.  In the
case of i40e a tunnel with outer checksums showed the following
improvement:
Throughput Throughput  Local Local   Result 
           Units       CPU   Service Tag    
                       Util  Demand         
                       %                    
14066.29   10^6bits/s  3.49  0.651   "before" 
20618.16   10^6bits/s  3.09  0.393   "after"

For ixgbe similar results were seen:
Throughput Throughput  Local  Local   Result 
           Units       CPU    Service Tag    
                       Util   Demand         
                       %               
12879.89   10^6bits/s  10.00  0.763   "before"
14286.77   10^6bits/s  5.74   0.395   "after" 

These patches all rely on the TSO_MANGLEID and GSO_PARTIAL patches so I
would not recommend applying them until those patches have first been
applied.

---

Alexander Duyck (3):
      i40e/i40evf: Add support for GSO partial with UDP_TUNNEL_CSUM and GRE_CSUM
      ixgbe/ixgbevf: Add support for GSO partial
      igb/igbvf: Add support for GSO partial


 drivers/net/ethernet/intel/i40e/i40e_main.c       |   10 +
 drivers/net/ethernet/intel/i40e/i40e_txrx.c       |    7 +
 drivers/net/ethernet/intel/i40evf/i40e_txrx.c     |    7 +
 drivers/net/ethernet/intel/i40evf/i40evf_main.c   |   10 +
 drivers/net/ethernet/intel/igb/igb_main.c         |  112 ++++++++++----
 drivers/net/ethernet/intel/igbvf/netdev.c         |  173 +++++++++++++--------
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c     |  105 +++++++++----
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c |  123 ++++++++++++---
 8 files changed, 391 insertions(+), 156 deletions(-)

^ permalink raw reply

* [next-queue PATCH 1/3] i40e/i40evf: Add support for GSO partial with UDP_TUNNEL_CSUM and GRE_CSUM
From: Alexander Duyck @ 2016-04-08 21:06 UTC (permalink / raw)
  To: herbert, tom, jesse, alexander.duyck, edumazet, intel-wired-lan,
	jeffrey.t.kirsher, netdev, davem
In-Reply-To: <20160408210103.13096.77973.stgit@ahduyck-xeon-server>

This patch makes it so that i40e and i40evf can use GSO_PARTIAL to support
segmentation for frames with checksums enabled in outer headers.  As a
result we can now send data over these types of tunnels at over 20Gb/s
versus the 12Gb/s that was previously possible on my system.

The advantage with the i40e parts is that this offload is mostly
transparent as the hardware still deals with the inner and/or outer IPv4
headers so the IP ID is still incrementing for both when this offload is
performed.

Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
---
 drivers/net/ethernet/intel/i40e/i40e_main.c     |   10 ++++++++--
 drivers/net/ethernet/intel/i40e/i40e_txrx.c     |    7 ++++++-
 drivers/net/ethernet/intel/i40evf/i40e_txrx.c   |    7 ++++++-
 drivers/net/ethernet/intel/i40evf/i40evf_main.c |   10 ++++++++--
 4 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 07a70c4ac49f..6342fab4d177 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -9119,20 +9119,25 @@ static int i40e_config_netdev(struct i40e_vsi *vsi)
 				   NETIF_F_TSO_ECN		|
 				   NETIF_F_TSO6			|
 				   NETIF_F_GSO_GRE		|
+				   NETIF_F_GSO_GRE_CSUM		|
 				   NETIF_F_GSO_IPIP		|
 				   NETIF_F_GSO_SIT		|
 				   NETIF_F_GSO_UDP_TUNNEL	|
 				   NETIF_F_GSO_UDP_TUNNEL_CSUM	|
+				   NETIF_F_GSO_PARTIAL		|
 				   NETIF_F_SCTP_CRC		|
 				   NETIF_F_RXHASH		|
 				   NETIF_F_RXCSUM		|
 				   0;
 
 	if (!(pf->flags & I40E_FLAG_OUTER_UDP_CSUM_CAPABLE))
-		netdev->hw_enc_features ^= NETIF_F_GSO_UDP_TUNNEL_CSUM;
+		netdev->gso_partial_features |= NETIF_F_GSO_UDP_TUNNEL_CSUM;
+
+	netdev->gso_partial_features |= NETIF_F_GSO_GRE_CSUM;
 
 	/* record features VLANs can make use of */
-	netdev->vlan_features |= netdev->hw_enc_features;
+	netdev->vlan_features |= netdev->hw_enc_features |
+				 NETIF_F_TSO_MANGLEID;
 
 	if (!(pf->flags & I40E_FLAG_MFP_ENABLED))
 		netdev->hw_features |= NETIF_F_NTUPLE;
@@ -9142,6 +9147,7 @@ static int i40e_config_netdev(struct i40e_vsi *vsi)
 			       NETIF_F_HW_VLAN_CTAG_RX;
 
 	netdev->features |= netdev->hw_features | NETIF_F_HW_VLAN_CTAG_FILTER;
+	netdev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
 
 	if (vsi->type == I40E_VSI_MAIN) {
 		SET_NETDEV_DEV(netdev, &pf->pdev->dev);
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 6e44cf118843..ede4183468b9 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -2300,11 +2300,15 @@ static int i40e_tso(struct sk_buff *skb, u8 *hdr_len, u64 *cd_type_cmd_tso_mss)
 	}
 
 	if (skb_shinfo(skb)->gso_type & (SKB_GSO_GRE |
+					 SKB_GSO_GRE_CSUM |
 					 SKB_GSO_IPIP |
 					 SKB_GSO_SIT |
 					 SKB_GSO_UDP_TUNNEL |
 					 SKB_GSO_UDP_TUNNEL_CSUM)) {
-		if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM) {
+		if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL) &&
+		    (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM)) {
+			l4.udp->len = 0;
+
 			/* determine offset of outer transport header */
 			l4_offset = l4.hdr - skb->data;
 
@@ -2481,6 +2485,7 @@ static int i40e_tx_enable_csum(struct sk_buff *skb, u32 *tx_flags,
 
 		/* indicate if we need to offload outer UDP header */
 		if ((*tx_flags & I40E_TX_FLAGS_TSO) &&
+		    !(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL) &&
 		    (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM))
 			tunnel |= I40E_TXD_CTX_QW0_L4T_CS_MASK;
 
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
index f101895ecf4a..6ce00547c13e 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
@@ -1565,11 +1565,15 @@ static int i40e_tso(struct sk_buff *skb, u8 *hdr_len, u64 *cd_type_cmd_tso_mss)
 	}
 
 	if (skb_shinfo(skb)->gso_type & (SKB_GSO_GRE |
+					 SKB_GSO_GRE_CSUM |
 					 SKB_GSO_IPIP |
 					 SKB_GSO_SIT |
 					 SKB_GSO_UDP_TUNNEL |
 					 SKB_GSO_UDP_TUNNEL_CSUM)) {
-		if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM) {
+		if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL) &&
+		    (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM)) {
+			l4.udp->len = 0;
+
 			/* determine offset of outer transport header */
 			l4_offset = l4.hdr - skb->data;
 
@@ -1704,6 +1708,7 @@ static int i40e_tx_enable_csum(struct sk_buff *skb, u32 *tx_flags,
 
 		/* indicate if we need to offload outer UDP header */
 		if ((*tx_flags & I40E_TX_FLAGS_TSO) &&
+		    !(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL) &&
 		    (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM))
 			tunnel |= I40E_TXD_CTX_QW0_L4T_CS_MASK;
 
diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_main.c b/drivers/net/ethernet/intel/i40evf/i40evf_main.c
index 806da2686623..42be838ce77e 100644
--- a/drivers/net/ethernet/intel/i40evf/i40evf_main.c
+++ b/drivers/net/ethernet/intel/i40evf/i40evf_main.c
@@ -2346,20 +2346,25 @@ int i40evf_process_config(struct i40evf_adapter *adapter)
 				   NETIF_F_TSO_ECN		|
 				   NETIF_F_TSO6			|
 				   NETIF_F_GSO_GRE		|
+				   NETIF_F_GSO_GRE_CSUM		|
 				   NETIF_F_GSO_IPIP		|
 				   NETIF_F_GSO_SIT		|
 				   NETIF_F_GSO_UDP_TUNNEL	|
 				   NETIF_F_GSO_UDP_TUNNEL_CSUM	|
+				   NETIF_F_GSO_PARTIAL		|
 				   NETIF_F_SCTP_CRC		|
 				   NETIF_F_RXHASH		|
 				   NETIF_F_RXCSUM		|
 				   0;
 
 	if (!(adapter->flags & I40EVF_FLAG_OUTER_UDP_CSUM_CAPABLE))
-		netdev->hw_enc_features ^= NETIF_F_GSO_UDP_TUNNEL_CSUM;
+		netdev->gso_partial_features |= NETIF_F_GSO_UDP_TUNNEL_CSUM;
+
+	netdev->gso_partial_features |= NETIF_F_GSO_GRE_CSUM;
 
 	/* record features VLANs can make use of */
-	netdev->vlan_features |= netdev->hw_enc_features;
+	netdev->vlan_features |= netdev->hw_enc_features |
+				 NETIF_F_TSO_MANGLEID;
 
 	/* Write features and hw_features separately to avoid polluting
 	 * with, or dropping, features that are set when we registgered.
@@ -2367,6 +2372,7 @@ int i40evf_process_config(struct i40evf_adapter *adapter)
 	netdev->hw_features |= netdev->hw_enc_features;
 
 	netdev->features |= netdev->hw_enc_features | I40EVF_VLAN_FEATURES;
+	netdev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
 
 	/* disable VLAN features if not supported */
 	if (!(vfres->vf_offload_flags & I40E_VIRTCHNL_VF_OFFLOAD_VLAN))

^ permalink raw reply related

* [next-queue PATCH 2/3] ixgbe/ixgbevf: Add support for GSO partial
From: Alexander Duyck @ 2016-04-08 21:06 UTC (permalink / raw)
  To: herbert, tom, jesse, alexander.duyck, edumazet, intel-wired-lan,
	jeffrey.t.kirsher, netdev, davem
In-Reply-To: <20160408210103.13096.77973.stgit@ahduyck-xeon-server>

This patch adds support for partial GSO segmentation in the case of
tunnels.  Specifically with this change the driver an perform segmenation
as long as the frame either has IPv6 inner headers, or we are allowed to
mangle the IP IDs on the inner header.  This is needed because we will not
be modifying any fields from the start of the start of the outer transport
header to the start of the inner transport header as we are treating them
like they are just a block of IP options.

Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c     |  105 +++++++++++++-----
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c |  123 ++++++++++++++++-----
 2 files changed, 172 insertions(+), 56 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 974aa7ca7a12..5134cb97f33c 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -7216,9 +7216,18 @@ static int ixgbe_tso(struct ixgbe_ring *tx_ring,
 		     struct ixgbe_tx_buffer *first,
 		     u8 *hdr_len)
 {
+	u32 vlan_macip_lens, type_tucmd, mss_l4len_idx;
 	struct sk_buff *skb = first->skb;
-	u32 vlan_macip_lens, type_tucmd;
-	u32 mss_l4len_idx, l4len;
+	union {
+		struct iphdr *v4;
+		struct ipv6hdr *v6;
+		unsigned char *hdr;
+	} ip;
+	union {
+		struct tcphdr *tcp;
+		unsigned char *hdr;
+	} l4;
+	u32 paylen, l4_offset;
 	int err;
 
 	if (skb->ip_summed != CHECKSUM_PARTIAL)
@@ -7231,46 +7240,52 @@ static int ixgbe_tso(struct ixgbe_ring *tx_ring,
 	if (err < 0)
 		return err;
 
+	ip.hdr = skb_network_header(skb);
+	l4.hdr = skb_checksum_start(skb);
+
 	/* ADV DTYP TUCMD MKRLOC/ISCSIHEDLEN */
 	type_tucmd = IXGBE_ADVTXD_TUCMD_L4T_TCP;
 
-	if (first->protocol == htons(ETH_P_IP)) {
-		struct iphdr *iph = ip_hdr(skb);
-		iph->tot_len = 0;
-		iph->check = 0;
-		tcp_hdr(skb)->check = ~csum_tcpudp_magic(iph->saddr,
-							 iph->daddr, 0,
-							 IPPROTO_TCP,
-							 0);
+	/* initialize outer IP header fields */
+	if (ip.v4->version == 4) {
+		/* IP header will have to cancel out any data that
+		 * is not a part of the outer IP header
+		 */
+		ip.v4->check = csum_fold(csum_add(lco_csum(skb),
+						  csum_unfold(l4.tcp->check)));
 		type_tucmd |= IXGBE_ADVTXD_TUCMD_IPV4;
+
+		ip.v4->tot_len = 0;
 		first->tx_flags |= IXGBE_TX_FLAGS_TSO |
 				   IXGBE_TX_FLAGS_CSUM |
 				   IXGBE_TX_FLAGS_IPV4;
-	} else if (skb_is_gso_v6(skb)) {
-		ipv6_hdr(skb)->payload_len = 0;
-		tcp_hdr(skb)->check =
-		    ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
-				     &ipv6_hdr(skb)->daddr,
-				     0, IPPROTO_TCP, 0);
+	} else {
+		ip.v6->payload_len = 0;
 		first->tx_flags |= IXGBE_TX_FLAGS_TSO |
 				   IXGBE_TX_FLAGS_CSUM;
 	}
 
-	/* compute header lengths */
-	l4len = tcp_hdrlen(skb);
-	*hdr_len = skb_transport_offset(skb) + l4len;
+	/* determine offset of inner transport header */
+	l4_offset = l4.hdr - skb->data;
+
+	/* compute length of segmentation header */
+	*hdr_len = (l4.tcp->doff * 4) + l4_offset;
+
+	/* remove payload length from inner checksum */
+	paylen = skb->len - l4_offset;
+	csum_replace_by_diff(&l4.tcp->check, htonl(paylen));
 
 	/* update gso size and bytecount with header size */
 	first->gso_segs = skb_shinfo(skb)->gso_segs;
 	first->bytecount += (first->gso_segs - 1) * *hdr_len;
 
 	/* mss_l4len_id: use 0 as index for TSO */
-	mss_l4len_idx = l4len << IXGBE_ADVTXD_L4LEN_SHIFT;
+	mss_l4len_idx = (*hdr_len - l4_offset) << IXGBE_ADVTXD_L4LEN_SHIFT;
 	mss_l4len_idx |= skb_shinfo(skb)->gso_size << IXGBE_ADVTXD_MSS_SHIFT;
 
 	/* vlan_macip_lens: HEADLEN, MACLEN, VLAN tag */
-	vlan_macip_lens = skb_network_header_len(skb);
-	vlan_macip_lens |= skb_network_offset(skb) << IXGBE_ADVTXD_MACLEN_SHIFT;
+	vlan_macip_lens = l4.hdr - ip.hdr;
+	vlan_macip_lens |= (ip.hdr - skb->data) << IXGBE_ADVTXD_MACLEN_SHIFT;
 	vlan_macip_lens |= first->tx_flags & IXGBE_TX_FLAGS_VLAN_MASK;
 
 	ixgbe_tx_ctxtdesc(tx_ring, vlan_macip_lens, 0, type_tucmd,
@@ -8614,6 +8629,14 @@ static int ixgbe_set_features(struct net_device *netdev,
 	if (changed & NETIF_F_RXALL)
 		need_reset = true;
 
+	/* We can only support IPV4 TSO in tunnels if we can mangle the
+	 * inner IP ID field, so strip TSO if MANGLEID is not supported.
+	 */
+	if (features & NETIF_F_TSO_MANGLEID)
+		netdev->hw_enc_features |= NETIF_F_TSO;
+	else
+		netdev->hw_enc_features &= ~NETIF_F_TSO;
+
 	netdev->features = features;
 
 #ifdef CONFIG_IXGBE_VXLAN
@@ -8927,17 +8950,25 @@ static void ixgbe_fwd_del(struct net_device *pdev, void *priv)
 	kfree(fwd_adapter);
 }
 
-#define IXGBE_MAX_TUNNEL_HDR_LEN 80
+#define IXGBE_MAX_MAC_HDR_LEN		127
+#define IXGBE_MAX_NETWORK_HDR_LEN	511
+
 static netdev_features_t
 ixgbe_features_check(struct sk_buff *skb, struct net_device *dev,
 		     netdev_features_t features)
 {
-	if (!skb->encapsulation)
-		return features;
+	unsigned int network_hdr_len, mac_hdr_len;
 
-	if (unlikely(skb_inner_mac_header(skb) - skb_transport_header(skb) >
-		     IXGBE_MAX_TUNNEL_HDR_LEN))
-		return features & ~NETIF_F_CSUM_MASK;
+	/* Make certain the headers can be described by a context descriptor */
+	mac_hdr_len = skb_network_header(skb) - skb->data;
+	network_hdr_len = skb_checksum_start(skb) - skb_network_header(skb);
+	if (unlikely((mac_hdr_len > IXGBE_MAX_MAC_HDR_LEN) ||
+		     (network_hdr_len >  IXGBE_MAX_NETWORK_HDR_LEN)))
+		return features & ~(NETIF_F_HW_CSUM |
+				    NETIF_F_SCTP_CRC |
+				    NETIF_F_HW_VLAN_CTAG_TX |
+				    NETIF_F_TSO |
+				    NETIF_F_TSO6);
 
 	return features;
 }
@@ -9309,6 +9340,17 @@ skip_sriov:
 			   NETIF_F_HW_VLAN_CTAG_RX |
 			   NETIF_F_HW_VLAN_CTAG_FILTER;
 
+#define IXGBE_GSO_PARTIAL_FEATURES (NETIF_F_GSO_GRE | \
+				    NETIF_F_GSO_GRE_CSUM | \
+				    NETIF_F_GSO_IPIP | \
+				    NETIF_F_GSO_SIT | \
+				    NETIF_F_GSO_UDP_TUNNEL | \
+				    NETIF_F_GSO_UDP_TUNNEL_CSUM)
+
+	netdev->gso_partial_features = IXGBE_GSO_PARTIAL_FEATURES;
+	netdev->features |= NETIF_F_GSO_PARTIAL |
+			    IXGBE_GSO_PARTIAL_FEATURES;
+
 	if (hw->mac.type >= ixgbe_mac_82599EB)
 		netdev->features |= NETIF_F_SCTP_CRC;
 
@@ -9323,12 +9365,17 @@ skip_sriov:
 
 	netdev->vlan_features |= NETIF_F_SG |
 				 NETIF_F_TSO |
+				 NETIF_F_TSO_MANGLEID |
 				 NETIF_F_TSO6 |
 				 NETIF_F_HW_CSUM |
 				 NETIF_F_SCTP_CRC;
 
 	netdev->mpls_features |= NETIF_F_HW_CSUM;
-	netdev->hw_enc_features |= NETIF_F_HW_CSUM;
+	netdev->hw_enc_features |= NETIF_F_HW_CSUM |
+				   NETIF_F_TSO_MANGLEID |
+				   NETIF_F_TSO6 |
+				   NETIF_F_GSO_PARTIAL |
+				   IXGBE_GSO_PARTIAL_FEATURES;
 
 	netdev->priv_flags |= IFF_UNICAST_FLT;
 	netdev->priv_flags |= IFF_SUPP_NOFCS;
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index 007cbe094990..e4f471787dd4 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -3272,9 +3272,18 @@ static int ixgbevf_tso(struct ixgbevf_ring *tx_ring,
 		       struct ixgbevf_tx_buffer *first,
 		       u8 *hdr_len)
 {
+	u32 vlan_macip_lens, type_tucmd, mss_l4len_idx;
 	struct sk_buff *skb = first->skb;
-	u32 vlan_macip_lens, type_tucmd;
-	u32 mss_l4len_idx, l4len;
+	union {
+		struct iphdr *v4;
+		struct ipv6hdr *v6;
+		unsigned char *hdr;
+	} ip;
+	union {
+		struct tcphdr *tcp;
+		unsigned char *hdr;
+	} l4;
+	u32 paylen, l4_offset;
 	int err;
 
 	if (skb->ip_summed != CHECKSUM_PARTIAL)
@@ -3287,49 +3296,53 @@ static int ixgbevf_tso(struct ixgbevf_ring *tx_ring,
 	if (err < 0)
 		return err;
 
+	ip.hdr = skb_network_header(skb);
+	l4.hdr = skb_checksum_start(skb);
+
 	/* ADV DTYP TUCMD MKRLOC/ISCSIHEDLEN */
 	type_tucmd = IXGBE_ADVTXD_TUCMD_L4T_TCP;
 
-	if (first->protocol == htons(ETH_P_IP)) {
-		struct iphdr *iph = ip_hdr(skb);
-
-		iph->tot_len = 0;
-		iph->check = 0;
-		tcp_hdr(skb)->check = ~csum_tcpudp_magic(iph->saddr,
-							 iph->daddr, 0,
-							 IPPROTO_TCP,
-							 0);
+	/* initialize outer IP header fields */
+	if (ip.v4->version == 4) {
+		/* IP header will have to cancel out any data that
+		 * is not a part of the outer IP header
+		 */
+		ip.v4->check = csum_fold(csum_add(lco_csum(skb),
+						  csum_unfold(l4.tcp->check)));
 		type_tucmd |= IXGBE_ADVTXD_TUCMD_IPV4;
+
+		ip.v4->tot_len = 0;
 		first->tx_flags |= IXGBE_TX_FLAGS_TSO |
 				   IXGBE_TX_FLAGS_CSUM |
 				   IXGBE_TX_FLAGS_IPV4;
-	} else if (skb_is_gso_v6(skb)) {
-		ipv6_hdr(skb)->payload_len = 0;
-		tcp_hdr(skb)->check =
-		    ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
-				     &ipv6_hdr(skb)->daddr,
-				     0, IPPROTO_TCP, 0);
+	} else {
+		ip.v6->payload_len = 0;
 		first->tx_flags |= IXGBE_TX_FLAGS_TSO |
 				   IXGBE_TX_FLAGS_CSUM;
 	}
 
-	/* compute header lengths */
-	l4len = tcp_hdrlen(skb);
-	*hdr_len += l4len;
-	*hdr_len = skb_transport_offset(skb) + l4len;
+	/* determine offset of inner transport header */
+	l4_offset = l4.hdr - skb->data;
+
+	/* compute length of segmentation header */
+	*hdr_len = (l4.tcp->doff * 4) + l4_offset;
 
-	/* update GSO size and bytecount with header size */
+	/* remove payload length from inner checksum */
+	paylen = skb->len - l4_offset;
+	csum_replace_by_diff(&l4.tcp->check, htonl(paylen));
+
+	/* update gso size and bytecount with header size */
 	first->gso_segs = skb_shinfo(skb)->gso_segs;
 	first->bytecount += (first->gso_segs - 1) * *hdr_len;
 
 	/* mss_l4len_id: use 1 as index for TSO */
-	mss_l4len_idx = l4len << IXGBE_ADVTXD_L4LEN_SHIFT;
+	mss_l4len_idx = (*hdr_len - l4_offset) << IXGBE_ADVTXD_L4LEN_SHIFT;
 	mss_l4len_idx |= skb_shinfo(skb)->gso_size << IXGBE_ADVTXD_MSS_SHIFT;
 	mss_l4len_idx |= 1 << IXGBE_ADVTXD_IDX_SHIFT;
 
 	/* vlan_macip_lens: HEADLEN, MACLEN, VLAN tag */
-	vlan_macip_lens = skb_network_header_len(skb);
-	vlan_macip_lens |= skb_network_offset(skb) << IXGBE_ADVTXD_MACLEN_SHIFT;
+	vlan_macip_lens = l4.hdr - ip.hdr;
+	vlan_macip_lens |= (ip.hdr - skb->data) << IXGBE_ADVTXD_MACLEN_SHIFT;
 	vlan_macip_lens |= first->tx_flags & IXGBE_TX_FLAGS_VLAN_MASK;
 
 	ixgbevf_tx_ctxtdesc(tx_ring, vlan_macip_lens,
@@ -3870,6 +3883,45 @@ static struct rtnl_link_stats64 *ixgbevf_get_stats(struct net_device *netdev,
 	return stats;
 }
 
+static int ixgbevf_set_features(struct net_device *netdev,
+				netdev_features_t features)
+{
+	/* We can only support IPV4 TSO in tunnels if we can mangle the
+	 * inner IP ID field, so strip TSO if MANGLEID is not supported.
+	 */
+	if (features & NETIF_F_TSO_MANGLEID)
+		netdev->hw_enc_features |= NETIF_F_TSO;
+	else
+		netdev->hw_enc_features &= ~NETIF_F_TSO;
+
+	netdev->features = features;
+
+	return 0;
+}
+
+#define IXGBEVF_MAX_MAC_HDR_LEN		127
+#define IXGBEVF_MAX_NETWORK_HDR_LEN	511
+
+static netdev_features_t
+ixgbevf_features_check(struct sk_buff *skb, struct net_device *dev,
+		       netdev_features_t features)
+{
+	unsigned int network_hdr_len, mac_hdr_len;
+
+	/* Make certain the headers can be described by a context descriptor */
+	mac_hdr_len = skb_network_header(skb) - skb->data;
+	network_hdr_len = skb_checksum_start(skb) - skb_network_header(skb);
+	if (unlikely((mac_hdr_len > IXGBEVF_MAX_MAC_HDR_LEN) ||
+		     (network_hdr_len >  IXGBEVF_MAX_NETWORK_HDR_LEN)))
+		return features & ~(NETIF_F_HW_CSUM |
+				    NETIF_F_SCTP_CRC |
+				    NETIF_F_HW_VLAN_CTAG_TX |
+				    NETIF_F_TSO |
+				    NETIF_F_TSO6);
+
+	return features;
+}
+
 static const struct net_device_ops ixgbevf_netdev_ops = {
 	.ndo_open		= ixgbevf_open,
 	.ndo_stop		= ixgbevf_close,
@@ -3888,7 +3940,8 @@ static const struct net_device_ops ixgbevf_netdev_ops = {
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller	= ixgbevf_netpoll,
 #endif
-	.ndo_features_check	= passthru_features_check,
+	.ndo_set_features	= ixgbevf_set_features,
+	.ndo_features_check	= ixgbevf_features_check,
 };
 
 static void ixgbevf_assign_netdev_ops(struct net_device *dev)
@@ -3999,6 +4052,17 @@ static int ixgbevf_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 			      NETIF_F_HW_CSUM |
 			      NETIF_F_SCTP_CRC;
 
+#define IXGBEVF_GSO_PARTIAL_FEATURES (NETIF_F_GSO_GRE | \
+				      NETIF_F_GSO_GRE_CSUM | \
+				      NETIF_F_GSO_IPIP | \
+				      NETIF_F_GSO_SIT | \
+				      NETIF_F_GSO_UDP_TUNNEL | \
+				      NETIF_F_GSO_UDP_TUNNEL_CSUM)
+
+	netdev->gso_partial_features = IXGBEVF_GSO_PARTIAL_FEATURES;
+	netdev->hw_features |= NETIF_F_GSO_PARTIAL |
+			       IXGBEVF_GSO_PARTIAL_FEATURES;
+
 	netdev->features = netdev->hw_features |
 			   NETIF_F_HW_VLAN_CTAG_TX |
 			   NETIF_F_HW_VLAN_CTAG_RX |
@@ -4006,12 +4070,17 @@ static int ixgbevf_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 
 	netdev->vlan_features |= NETIF_F_SG |
 				 NETIF_F_TSO |
+				 NETIF_F_TSO_MANGLEID |
 				 NETIF_F_TSO6 |
 				 NETIF_F_HW_CSUM |
 				 NETIF_F_SCTP_CRC;
 
 	netdev->mpls_features |= NETIF_F_HW_CSUM;
-	netdev->hw_enc_features |= NETIF_F_HW_CSUM;
+	netdev->hw_enc_features |= NETIF_F_HW_CSUM |
+				   NETIF_F_TSO_MANGLEID |
+				   NETIF_F_TSO6 |
+				   NETIF_F_GSO_PARTIAL |
+				   IXGBEVF_GSO_PARTIAL_FEATURES;
 
 	if (pci_using_dac)
 		netdev->features |= NETIF_F_HIGHDMA;

^ permalink raw reply related

* [next-queue PATCH 3/3] igb/igbvf: Add support for GSO partial
From: Alexander Duyck @ 2016-04-08 21:06 UTC (permalink / raw)
  To: herbert, tom, jesse, alexander.duyck, edumazet, intel-wired-lan,
	jeffrey.t.kirsher, netdev, davem
In-Reply-To: <20160408210103.13096.77973.stgit@ahduyck-xeon-server>

This patch adds support for partial GSO segmentation in the case of
tunnels.  Specifically with this change the driver an perform segmenation
as long as the frame either has IPv6 inner headers, or we are allowed to
mangle the IP IDs on the inner header.  This is needed because we will not
be modifying any fields from the start of the start of the outer transport
header to the start of the inner transport header as we are treating them
like they are just a block of IP options.

Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
---
 drivers/net/ethernet/intel/igb/igb_main.c |  112 ++++++++++++++-----
 drivers/net/ethernet/intel/igbvf/netdev.c |  173 ++++++++++++++++++-----------
 2 files changed, 191 insertions(+), 94 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 8e96c35307fb..998a24611246 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -2055,6 +2055,14 @@ static int igb_set_features(struct net_device *netdev,
 	if (changed & NETIF_F_HW_VLAN_CTAG_RX)
 		igb_vlan_mode(netdev, features);
 
+	/* We can only support IPV4 TSO in tunnels if we can mangle the
+	 * inner IP ID field, so strip TSO if MANGLEID is not supported.
+	 */
+	if (features & NETIF_F_TSO_MANGLEID)
+		netdev->hw_enc_features |= NETIF_F_TSO;
+	else
+		netdev->hw_enc_features &= ~NETIF_F_TSO;
+
 	if (!(changed & (NETIF_F_RXALL | NETIF_F_NTUPLE)))
 		return 0;
 
@@ -2087,6 +2095,29 @@ static int igb_ndo_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 	return ndo_dflt_fdb_add(ndm, tb, dev, addr, vid, flags);
 }
 
+#define IGB_MAX_MAC_HDR_LEN	127
+#define IGB_MAX_NETWORK_HDR_LEN	511
+
+static netdev_features_t
+igb_features_check(struct sk_buff *skb, struct net_device *dev,
+		   netdev_features_t features)
+{
+	unsigned int network_hdr_len, mac_hdr_len;
+
+	/* Make certain the headers can be described by a context descriptor */
+	mac_hdr_len = skb_network_header(skb) - skb->data;
+	network_hdr_len = skb_checksum_start(skb) - skb_network_header(skb);
+	if (unlikely((mac_hdr_len > IGB_MAX_MAC_HDR_LEN) ||
+		     (network_hdr_len >  IGB_MAX_NETWORK_HDR_LEN)))
+		return features & ~(NETIF_F_HW_CSUM |
+				    NETIF_F_SCTP_CRC |
+				    NETIF_F_HW_VLAN_CTAG_TX |
+				    NETIF_F_TSO |
+				    NETIF_F_TSO6);
+
+	return features;
+}
+
 static const struct net_device_ops igb_netdev_ops = {
 	.ndo_open		= igb_open,
 	.ndo_stop		= igb_close,
@@ -2111,7 +2142,7 @@ static const struct net_device_ops igb_netdev_ops = {
 	.ndo_fix_features	= igb_fix_features,
 	.ndo_set_features	= igb_set_features,
 	.ndo_fdb_add		= igb_ndo_fdb_add,
-	.ndo_features_check	= passthru_features_check,
+	.ndo_features_check	= igb_features_check,
 };
 
 /**
@@ -2384,6 +2415,16 @@ static int igb_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	if (hw->mac.type >= e1000_82576)
 		netdev->features |= NETIF_F_SCTP_CRC;
 
+#define IGB_GSO_PARTIAL_FEATURES (NETIF_F_GSO_GRE | \
+				  NETIF_F_GSO_GRE_CSUM | \
+				  NETIF_F_GSO_IPIP | \
+				  NETIF_F_GSO_SIT | \
+				  NETIF_F_GSO_UDP_TUNNEL | \
+				  NETIF_F_GSO_UDP_TUNNEL_CSUM)
+
+	netdev->gso_partial_features = IGB_GSO_PARTIAL_FEATURES;
+	netdev->features |= NETIF_F_GSO_PARTIAL | IGB_GSO_PARTIAL_FEATURES;
+
 	/* copy netdev features into list of user selectable features */
 	netdev->hw_features |= netdev->features;
 	netdev->hw_features |= NETIF_F_RXALL;
@@ -2396,19 +2437,22 @@ static int igb_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 
 	netdev->vlan_features |= NETIF_F_SG |
 				 NETIF_F_TSO |
+				 NETIF_F_TSO_MANGLEID |
 				 NETIF_F_TSO6 |
 				 NETIF_F_HW_CSUM |
 				 NETIF_F_SCTP_CRC;
 
 	netdev->mpls_features |= NETIF_F_HW_CSUM;
-	netdev->hw_enc_features |= NETIF_F_HW_CSUM;
+	netdev->hw_enc_features |= NETIF_F_HW_CSUM |
+				   NETIF_F_TSO_MANGLEID |
+				   NETIF_F_TSO6 |
+				   NETIF_F_GSO_PARTIAL |
+				   IGB_GSO_PARTIAL_FEATURES;
 
 	netdev->priv_flags |= IFF_SUPP_NOFCS;
 
-	if (pci_using_dac) {
+	if (pci_using_dac)
 		netdev->features |= NETIF_F_HIGHDMA;
-		netdev->vlan_features |= NETIF_F_HIGHDMA;
-	}
 
 	netdev->priv_flags |= IFF_UNICAST_FLT;
 
@@ -4842,9 +4886,18 @@ static int igb_tso(struct igb_ring *tx_ring,
 		   struct igb_tx_buffer *first,
 		   u8 *hdr_len)
 {
+	u32 vlan_macip_lens, type_tucmd, mss_l4len_idx;
 	struct sk_buff *skb = first->skb;
-	u32 vlan_macip_lens, type_tucmd;
-	u32 mss_l4len_idx, l4len;
+	union {
+		struct iphdr *v4;
+		struct ipv6hdr *v6;
+		unsigned char *hdr;
+	} ip;
+	union {
+		struct tcphdr *tcp;
+		unsigned char *hdr;
+	} l4;
+	u32 paylen, l4_offset;
 	int err;
 
 	if (skb->ip_summed != CHECKSUM_PARTIAL)
@@ -4857,45 +4910,52 @@ static int igb_tso(struct igb_ring *tx_ring,
 	if (err < 0)
 		return err;
 
+	ip.hdr = skb_network_header(skb);
+	l4.hdr = skb_checksum_start(skb);
+
 	/* ADV DTYP TUCMD MKRLOC/ISCSIHEDLEN */
 	type_tucmd = E1000_ADVTXD_TUCMD_L4T_TCP;
 
-	if (first->protocol == htons(ETH_P_IP)) {
-		struct iphdr *iph = ip_hdr(skb);
-		iph->tot_len = 0;
-		iph->check = 0;
-		tcp_hdr(skb)->check = ~csum_tcpudp_magic(iph->saddr,
-							 iph->daddr, 0,
-							 IPPROTO_TCP,
-							 0);
+	/* initialize outer IP header fields */
+	if (ip.v4->version == 4) {
+		/* IP header will have to cancel out any data that
+		 * is not a part of the outer IP header
+		 */
+		ip.v4->check = csum_fold(csum_add(lco_csum(skb),
+						  csum_unfold(l4.tcp->check)));
 		type_tucmd |= E1000_ADVTXD_TUCMD_IPV4;
+
+		ip.v4->tot_len = 0;
 		first->tx_flags |= IGB_TX_FLAGS_TSO |
 				   IGB_TX_FLAGS_CSUM |
 				   IGB_TX_FLAGS_IPV4;
-	} else if (skb_is_gso_v6(skb)) {
-		ipv6_hdr(skb)->payload_len = 0;
-		tcp_hdr(skb)->check = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
-						       &ipv6_hdr(skb)->daddr,
-						       0, IPPROTO_TCP, 0);
+	} else {
+		ip.v6->payload_len = 0;
 		first->tx_flags |= IGB_TX_FLAGS_TSO |
 				   IGB_TX_FLAGS_CSUM;
 	}
 
-	/* compute header lengths */
-	l4len = tcp_hdrlen(skb);
-	*hdr_len = skb_transport_offset(skb) + l4len;
+	/* determine offset of inner transport header */
+	l4_offset = l4.hdr - skb->data;
+
+	/* compute length of segmentation header */
+	*hdr_len = (l4.tcp->doff * 4) + l4_offset;
+
+	/* remove payload length from inner checksum */
+	paylen = skb->len - l4_offset;
+	csum_replace_by_diff(&l4.tcp->check, htonl(paylen));
 
 	/* update gso size and bytecount with header size */
 	first->gso_segs = skb_shinfo(skb)->gso_segs;
 	first->bytecount += (first->gso_segs - 1) * *hdr_len;
 
 	/* MSS L4LEN IDX */
-	mss_l4len_idx = l4len << E1000_ADVTXD_L4LEN_SHIFT;
+	mss_l4len_idx = (*hdr_len - l4_offset) << E1000_ADVTXD_L4LEN_SHIFT;
 	mss_l4len_idx |= skb_shinfo(skb)->gso_size << E1000_ADVTXD_MSS_SHIFT;
 
 	/* VLAN MACLEN IPLEN */
-	vlan_macip_lens = skb_network_header_len(skb);
-	vlan_macip_lens |= skb_network_offset(skb) << E1000_ADVTXD_MACLEN_SHIFT;
+	vlan_macip_lens = l4.hdr - ip.hdr;
+	vlan_macip_lens |= (ip.hdr - skb->data) << E1000_ADVTXD_MACLEN_SHIFT;
 	vlan_macip_lens |= first->tx_flags & IGB_TX_FLAGS_VLAN_MASK;
 
 	igb_tx_ctxtdesc(tx_ring, vlan_macip_lens, type_tucmd, mss_l4len_idx);
diff --git a/drivers/net/ethernet/intel/igbvf/netdev.c b/drivers/net/ethernet/intel/igbvf/netdev.c
index c12442252adb..cd759c5e09f8 100644
--- a/drivers/net/ethernet/intel/igbvf/netdev.c
+++ b/drivers/net/ethernet/intel/igbvf/netdev.c
@@ -1933,83 +1933,74 @@ static void igbvf_tx_ctxtdesc(struct igbvf_ring *tx_ring, u32 vlan_macip_lens,
 	buffer_info->dma = 0;
 }
 
-static int igbvf_tso(struct igbvf_adapter *adapter,
-		     struct igbvf_ring *tx_ring,
-		     struct sk_buff *skb, u32 tx_flags, u8 *hdr_len,
-		     __be16 protocol)
-{
-	struct e1000_adv_tx_context_desc *context_desc;
-	struct igbvf_buffer *buffer_info;
-	u32 info = 0, tu_cmd = 0;
-	u32 mss_l4len_idx, l4len;
-	unsigned int i;
+static int igbvf_tso(struct igbvf_ring *tx_ring,
+		     struct sk_buff *skb, u32 tx_flags, u8 *hdr_len)
+{
+	u32 vlan_macip_lens, type_tucmd, mss_l4len_idx;
+	union {
+		struct iphdr *v4;
+		struct ipv6hdr *v6;
+		unsigned char *hdr;
+	} ip;
+	union {
+		struct tcphdr *tcp;
+		unsigned char *hdr;
+	} l4;
+	u32 paylen, l4_offset;
 	int err;
 
-	*hdr_len = 0;
+	if (skb->ip_summed != CHECKSUM_PARTIAL)
+		return 0;
+
+	if (!skb_is_gso(skb))
+		return 0;
 
 	err = skb_cow_head(skb, 0);
-	if (err < 0) {
-		dev_err(&adapter->pdev->dev, "igbvf_tso returning an error\n");
+	if (err < 0)
 		return err;
-	}
 
-	l4len = tcp_hdrlen(skb);
-	*hdr_len += l4len;
-
-	if (protocol == htons(ETH_P_IP)) {
-		struct iphdr *iph = ip_hdr(skb);
-
-		iph->tot_len = 0;
-		iph->check = 0;
-		tcp_hdr(skb)->check = ~csum_tcpudp_magic(iph->saddr,
-							 iph->daddr, 0,
-							 IPPROTO_TCP,
-							 0);
-	} else if (skb_is_gso_v6(skb)) {
-		ipv6_hdr(skb)->payload_len = 0;
-		tcp_hdr(skb)->check = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
-						       &ipv6_hdr(skb)->daddr,
-						       0, IPPROTO_TCP, 0);
-	}
+	ip.hdr = skb_network_header(skb);
+	l4.hdr = skb_checksum_start(skb);
 
-	i = tx_ring->next_to_use;
+	/* ADV DTYP TUCMD MKRLOC/ISCSIHEDLEN */
+	type_tucmd = E1000_ADVTXD_TUCMD_L4T_TCP;
 
-	buffer_info = &tx_ring->buffer_info[i];
-	context_desc = IGBVF_TX_CTXTDESC_ADV(*tx_ring, i);
-	/* VLAN MACLEN IPLEN */
-	if (tx_flags & IGBVF_TX_FLAGS_VLAN)
-		info |= (tx_flags & IGBVF_TX_FLAGS_VLAN_MASK);
-	info |= (skb_network_offset(skb) << E1000_ADVTXD_MACLEN_SHIFT);
-	*hdr_len += skb_network_offset(skb);
-	info |= (skb_transport_header(skb) - skb_network_header(skb));
-	*hdr_len += (skb_transport_header(skb) - skb_network_header(skb));
-	context_desc->vlan_macip_lens = cpu_to_le32(info);
+	/* initialize outer IP header fields */
+	if (ip.v4->version == 4) {
+		/* IP header will have to cancel out any data that
+		 * is not a part of the outer IP header
+		 */
+		ip.v4->check = csum_fold(csum_add(lco_csum(skb),
+						  csum_unfold(l4.tcp->check)));
+		type_tucmd |= E1000_ADVTXD_TUCMD_IPV4;
 
-	/* ADV DTYP TUCMD MKRLOC/ISCSIHEDLEN */
-	tu_cmd |= (E1000_TXD_CMD_DEXT | E1000_ADVTXD_DTYP_CTXT);
+		ip.v4->tot_len = 0;
+	} else {
+		ip.v6->payload_len = 0;
+	}
 
-	if (protocol == htons(ETH_P_IP))
-		tu_cmd |= E1000_ADVTXD_TUCMD_IPV4;
-	tu_cmd |= E1000_ADVTXD_TUCMD_L4T_TCP;
+	/* determine offset of inner transport header */
+	l4_offset = l4.hdr - skb->data;
 
-	context_desc->type_tucmd_mlhl = cpu_to_le32(tu_cmd);
+	/* compute length of segmentation header */
+	*hdr_len = (l4.tcp->doff * 4) + l4_offset;
 
-	/* MSS L4LEN IDX */
-	mss_l4len_idx = (skb_shinfo(skb)->gso_size << E1000_ADVTXD_MSS_SHIFT);
-	mss_l4len_idx |= (l4len << E1000_ADVTXD_L4LEN_SHIFT);
+	/* remove payload length from inner checksum */
+	paylen = skb->len - l4_offset;
+	csum_replace_by_diff(&l4.tcp->check, htonl(paylen));
 
-	context_desc->mss_l4len_idx = cpu_to_le32(mss_l4len_idx);
-	context_desc->seqnum_seed = 0;
+	/* MSS L4LEN IDX */
+	mss_l4len_idx = (*hdr_len - l4_offset) << E1000_ADVTXD_L4LEN_SHIFT;
+	mss_l4len_idx |= skb_shinfo(skb)->gso_size << E1000_ADVTXD_MSS_SHIFT;
 
-	buffer_info->time_stamp = jiffies;
-	buffer_info->dma = 0;
-	i++;
-	if (i == tx_ring->count)
-		i = 0;
+	/* VLAN MACLEN IPLEN */
+	vlan_macip_lens = l4.hdr - ip.hdr;
+	vlan_macip_lens |= (ip.hdr - skb->data) << E1000_ADVTXD_MACLEN_SHIFT;
+	vlan_macip_lens |= tx_flags & IGBVF_TX_FLAGS_VLAN_MASK;
 
-	tx_ring->next_to_use = i;
+	igbvf_tx_ctxtdesc(tx_ring, vlan_macip_lens, type_tucmd, mss_l4len_idx);
 
-	return true;
+	return 1;
 }
 
 static inline bool igbvf_ipv6_csum_is_sctp(struct sk_buff *skb)
@@ -2271,8 +2262,7 @@ static netdev_tx_t igbvf_xmit_frame_ring_adv(struct sk_buff *skb,
 
 	first = tx_ring->next_to_use;
 
-	tso = skb_is_gso(skb) ?
-		igbvf_tso(adapter, tx_ring, skb, tx_flags, &hdr_len, protocol) : 0;
+	tso = igbvf_tso(tx_ring, skb, tx_flags, &hdr_len);
 	if (unlikely(tso < 0)) {
 		dev_kfree_skb_any(skb);
 		return NETDEV_TX_OK;
@@ -2612,9 +2602,40 @@ static int igbvf_set_features(struct net_device *netdev,
 	else
 		adapter->flags |= IGBVF_FLAG_RX_CSUM_DISABLED;
 
+	/* We can only support IPV4 TSO in tunnels if we can mangle the
+	 * inner IP ID field, so strip TSO if MANGLEID is not supported.
+	 */
+	if (features & NETIF_F_TSO_MANGLEID)
+		netdev->hw_enc_features |= NETIF_F_TSO;
+	else
+		netdev->hw_enc_features &= ~NETIF_F_TSO;
+
 	return 0;
 }
 
+#define IGBVF_MAX_MAC_HDR_LEN		127
+#define IGBVF_MAX_NETWORK_HDR_LEN	511
+
+static netdev_features_t
+igbvf_features_check(struct sk_buff *skb, struct net_device *dev,
+		     netdev_features_t features)
+{
+	unsigned int network_hdr_len, mac_hdr_len;
+
+	/* Make certain the headers can be described by a context descriptor */
+	mac_hdr_len = skb_network_header(skb) - skb->data;
+	network_hdr_len = skb_checksum_start(skb) - skb_network_header(skb);
+	if (unlikely((mac_hdr_len > IGBVF_MAX_MAC_HDR_LEN) ||
+		     (network_hdr_len >  IGBVF_MAX_NETWORK_HDR_LEN)))
+		return features & ~(NETIF_F_HW_CSUM |
+				    NETIF_F_SCTP_CRC |
+				    NETIF_F_HW_VLAN_CTAG_TX |
+				    NETIF_F_TSO |
+				    NETIF_F_TSO6);
+
+	return features;
+}
+
 static const struct net_device_ops igbvf_netdev_ops = {
 	.ndo_open		= igbvf_open,
 	.ndo_stop		= igbvf_close,
@@ -2631,7 +2652,7 @@ static const struct net_device_ops igbvf_netdev_ops = {
 	.ndo_poll_controller	= igbvf_netpoll,
 #endif
 	.ndo_set_features	= igbvf_set_features,
-	.ndo_features_check	= passthru_features_check,
+	.ndo_features_check	= igbvf_features_check,
 };
 
 /**
@@ -2739,22 +2760,38 @@ static int igbvf_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 			      NETIF_F_HW_CSUM |
 			      NETIF_F_SCTP_CRC;
 
+#define IGBVF_GSO_PARTIAL_FEATURES (NETIF_F_GSO_GRE | \
+				    NETIF_F_GSO_GRE_CSUM | \
+				    NETIF_F_GSO_IPIP | \
+				    NETIF_F_GSO_SIT | \
+				    NETIF_F_GSO_UDP_TUNNEL | \
+				    NETIF_F_GSO_UDP_TUNNEL_CSUM)
+
+	netdev->gso_partial_features = IGBVF_GSO_PARTIAL_FEATURES;
+	netdev->hw_features |= NETIF_F_GSO_PARTIAL |
+			       IGBVF_GSO_PARTIAL_FEATURES;
+
 	netdev->features = netdev->hw_features |
 			   NETIF_F_HW_VLAN_CTAG_TX |
 			   NETIF_F_HW_VLAN_CTAG_RX |
 			   NETIF_F_HW_VLAN_CTAG_FILTER;
 
-	if (pci_using_dac)
-		netdev->features |= NETIF_F_HIGHDMA;
-
 	netdev->vlan_features |= NETIF_F_SG |
 				 NETIF_F_TSO |
+				 NETIF_F_TSO_MANGLEID |
 				 NETIF_F_TSO6 |
 				 NETIF_F_HW_CSUM |
 				 NETIF_F_SCTP_CRC;
 
 	netdev->mpls_features |= NETIF_F_HW_CSUM;
-	netdev->hw_enc_features |= NETIF_F_HW_CSUM;
+	netdev->hw_enc_features |= NETIF_F_HW_CSUM |
+				   NETIF_F_TSO_MANGLEID |
+				   NETIF_F_TSO6 |
+				   NETIF_F_GSO_PARTIAL |
+				   IGBVF_GSO_PARTIAL_FEATURES;
+
+	if (pci_using_dac)
+		netdev->features |= NETIF_F_HIGHDMA;
 
 	/*reset the controller to put the device in a known good state */
 	err = hw->mac.ops.reset_hw(hw);

^ permalink raw reply related

* Re: [PATCH V3] net: emac: emac gigabit ethernet controller driver
From: Vikram Sethi @ 2016-04-08 21:07 UTC (permalink / raw)
  To: Timur Tabi, Andrew Lunn
  Cc: Rob Herring, Gilad Avidov, netdev, linux-kernel@vger.kernel.org,
	devicetree@vger.kernel.org, linux-arm-msm, Sagar Dharia, shankerd,
	Greg Kroah-Hartman, Christopher Covington
In-Reply-To: <5708013F.90207@codeaurora.org>

On 04/08/2016 02:06 PM, Timur Tabi wrote:
> Andrew Lunn wrote:
>
>> There are two different things here. One is configuring the pin to be
>> a GPIO. The second is using the GPIO as a GPIO. In this case,
>> bit-banging the MDIO bus.
>>
>> The firmware could be doing the configuration, setting the pin as a
>> GPIO. However, the firmware cannot be doing the MDIO bit-banging to
>> make an MDIO bus available. Linux has to do that.
>>
>> Or it could be we have all completely misunderstood the hardware, and
>> we are not doing bit-banging GPIO MDIO. There is a real MDIO
>> controller there, we don't use these pins as GPIOs, etc....
>
> Actually, I think there is a misunderstanding.
>
> On the FSM9900 SOC (which uses device-tree), the two pins that connect to the external PHY are gpio pins.  However, the driver needs to reprogram the pinmux so that those pins are wired to the Emac controller.  That's what the the gpio code in this driver is doing: it's just configuring the pins so that they connect directly between the Emac and the external PHY.  After that, they are no longer GPIO pins, and you cannot use the "GPIO controlled MDIO bus".  There is no MDIO controller on the SOC.  The external PHY is controlled directly from the Emac and also from the internal PHY.  It is screwy, I know, but that's what Gilad was trying to explain.
It is incorrect to say there's no MDIO controller on the SoC. The EMAC Core on the SoC itself has a MDIO controller which talks to the external PHY. The internal SGMII is not on MDIO however.
Please see the EMAC specification.
>
> On the QDF2432 (which uses ACPI), those two wires are now dedicated. There are not muxed GPIOs any more -- they are hard wired between Emac and the external PHY.
>
> In both cases, you need to use Emac registers to communicate with the external PHY.  Stuff like link detect and link speed are configured by programming the Emac and/or the internal phy.
You need to use EMAC *MDIO* registers to communicate with external PHY.
>
> And the internal phy isn't really an internal phy.  It's an SGMII-like device that's connected to the Emac and handles various phy-related tasks.  It has its own register block, but you still have to program it in concert with the Emac.  You can't really treat it separately.
>
> So I'm beginning to believe that Gilad's driver is actually correct as-is.  There are a few minor bug fixes, but in general it's correct.  I would like to post a V4 soon that has those minor fixes.
>


-- 
Vikram Sethi
Qualcomm Technologies Inc, on behalf of Qualcomm Innovation Center, Inc.
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, a Linux Foundation Collaborative Project

^ permalink raw reply

* [PATCH v4] route: do not cache fib route info on local routes with oif
From: Chris Friesen @ 2016-04-08 21:21 UTC (permalink / raw)
  To: Julian Anastasov; +Cc: netdev
In-Reply-To: <alpine.LFD.2.11.1604082328460.2124@ja.home.ssi.bg>

For local routes that require a particular output interface we do not want
to cache the result.  Caching the result causes incorrect behaviour when
there are multiple source addresses on the interface.  The end result
being that if the intended recipient is waiting on that interface for the
packet he won't receive it because it will be delivered on the loopback
interface and the IP_PKTINFO ipi_ifindex will be set to the loopback
interface as well.

This can be tested by running a program such as "dhcp_release" which
attempts to inject a packet on a particular interface so that it is
received by another program on the same board.  The receiving process
should see an IP_PKTINFO ipi_ifndex value of the source interface
(e.g., eth1) instead of the loopback interface (e.g., lo).  The packet
will still appear on the loopback interface in tcpdump but the important
aspect is that the CMSG info is correct.

Sample dhcp_release command line:

   dhcp_release eth1 192.168.204.222 02:11:33:22:44:66

Signed-off-by: Allain Legacy <allain.legacy@windriver.com>
Signed off-by: Chris Friesen <chris.friesen@windriver.com>
---
 net/ipv4/route.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 02c6229..b050cf9 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2045,6 +2045,18 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 		 */
 		if (fi && res->prefixlen < 4)
 			fi = NULL;
+	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
+		   (orig_oif != dev_out->ifindex)) {
+		/* For local routes that require a particular output interface
+		 * we do not want to cache the result.  Caching the result
+		 * causes incorrect behaviour when there are multiple source
+		 * addresses on the interface, the end result being that if the
+		 * intended recipient is waiting on that interface for the
+		 * packet he won't receive it because it will be delivered on
+		 * the loopback interface and the IP_PKTINFO ipi_ifindex will
+		 * be set to the loopback interface as well.
+		 */
+		fi = NULL;
 	}

 	fnhe = NULL;

^ permalink raw reply related

* Re: [RFC PATCH v2 1/5] bpf: add PHYS_DEV prog type for early driver filter
From: Alexei Starovoitov @ 2016-04-08 21:34 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: Brenden Blanco, davem, netdev, tom, ogerlitz, daniel,
	eric.dumazet, ecree, john.fastabend, tgraf, johannes,
	eranlinuxmellanox, lorenzo, linux-mm
In-Reply-To: <20160408220808.682630d7@redhat.com>

On Fri, Apr 08, 2016 at 10:08:08PM +0200, Jesper Dangaard Brouer wrote:
> On Fri, 8 Apr 2016 10:26:53 -0700
> Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> 
> > On Fri, Apr 08, 2016 at 02:33:40PM +0200, Jesper Dangaard Brouer wrote:
> > > 
> > > On Fri, 8 Apr 2016 12:36:14 +0200 Jesper Dangaard Brouer <brouer@redhat.com> wrote:
> > >   
> > > > > +/* user return codes for PHYS_DEV prog type */
> > > > > +enum bpf_phys_dev_action {
> > > > > +	BPF_PHYS_DEV_DROP,
> > > > > +	BPF_PHYS_DEV_OK,
> > > > > +};    
> > > > 
> > > > I can imagine these extra return codes:
> > > > 
> > > >  BPF_PHYS_DEV_MODIFIED,   /* Packet page/payload modified */
> > > >  BPF_PHYS_DEV_STOLEN,     /* E.g. forward use-case */
> > > >  BPF_PHYS_DEV_SHARED,     /* Queue for async processing, e.g. tcpdump use-case */
> > > > 
> > > > The "STOLEN" and "SHARED" use-cases require some refcnt manipulations,
> > > > which we can look at when we get that far...  
> > > 
> > > I want to point out something which is quite FUNDAMENTAL, for
> > > understanding these return codes (and network stack).
> > > 
> > > 
> > > At driver RX time, the network stack basically have two ways of
> > > building an SKB, which is send up the stack.
> > > 
> > > Option-A (fastest): The packet page is writable. The SKB can be
> > > allocated and skb->data/head can point directly to the page.  And
> > > we place/write skb_shared_info in the end/tail-room. (This is done by
> > > calling build_skb()).
> > > 
> > > Option-B (slower): The packet page is read-only.  The SKB cannot point
> > > skb->data/head directly to the page, because skb_shared_info need to be
> > > written into skb->end (slightly hidden via skb_shinfo() casting).  To
> > > get around this, a separate piece of memory is allocated (speedup by
> > > __alloc_page_frag) for pointing skb->data/head, so skb_shared_info can
> > > be written. (This is done when calling netdev/napi_alloc_skb()).
> > >   Drivers then need to copy over packet headers, and assign + adjust
> > > skb_shinfo(skb)->frags[0] offset to skip copied headers.
> > > 
> > > 
> > > Unfortunately most drivers use option-B.  Due to cost of calling the
> > > page allocator.  It is only slightly most expensive to get a larger
> > > compound page from the page allocator, which then can be partitioned into
> > > page-fragments, thus amortizing the page alloc cost.  Unfortunately the
> > > cost is added later, when constructing the SKB.
> > >  Another reason for option-B, is that archs with expensive IOMMU
> > > requirements (like PowerPC), don't need to dma_unmap on every packet,
> > > but only on the compound page level.
> > > 
> > > Side-note: Most drivers have a "copy-break" optimization.  Especially
> > > for option-B, when copying header data anyhow. For small packet, one
> > > might as well free (or recycle) the RX page, if header size fits into
> > > the newly allocated memory (for skb_shared_info).  
> > 
> > I think you guys are going into overdesign territory, so
> > . nack on read-only pages
> 
> Unfortunately you cannot just ignore or nack read-only pages. They are
> a fact in the current drivers.
> 
> Most drivers today (at-least the ones we care about) only deliver
> read-only pages.  If you don't accept read-only pages day-1, then you
> first have to rewrite a lot of drivers... and that will stall the
> project!  How will you deal with this fact?
> 
> The early drop filter use-case in this patchset, can ignore read-only
> pages.  But ABI wise we need to deal with the future case where we do
> need/require writeable pages.  A simple need-writable pages in the API
> could help us move forward.

the program should never need to worry about whether dma buffer is
writeable or not. Complicating drivers, api, abi, usability
for the single use case of fast packet drop is not acceptable.
XDP is not going to be a fit for all drivers and all architectures.
That is cruicial 'performance vs generality' aspect of the design.
All kernel-bypasses are taking advantage of specific architecture.
We have to take advantage of it as well. If it doesn't fit
powerpc with iommu, so be it. XDP will return -enotsupp.
That is fundamental point. We have to cut such corners and avoid
all cases where unnecessary generality hurts performance.
Read-only pages is clearly such thing.

> > The whole thing must be dead simple to use. Above is not simple by any means.
> 
> Maybe you missed that the above was a description of how the current
> network stack handles this, which is not simple... which is root of the
> hole performance issue.

Disagree. The stack has copy-break, gro, gso and everything else because
it's serving _host_ use case. XDP is packet forwarder use case.
The requirements are completely different. Ex. the host needs gso
in the core and drivers. It needs to deliver data all the way
to user space and back. That is hard and that's where complexity
comes from. For packet forwarder none of it is needed. So saying,
look we have this complexity, so XDP needs it too, is flawed argument.
The kernel is serving host and applications.
XDP is pure packet-in/packet-out framework to achieve better
performance than kernel-bypass, since kernel is the right
place to do it. It has clean access to interrupts, per-cpu,
scheduler, device registers and so on.
Though there are only two broad use cases packet drop and forward,
they cover a ton of real cases: firewalls, dos prevention,
load balancer, nat, etc. In other words mostly stateless.
As soon as packet needs to be queued somewhere we have to
instantiate skb and pass it to the stack.
So no queues in XDP and no 'stolen' and 'shared' return codes.
The program always runs to completion with single packet.
There is no header vs payload split. There is no header
from program point of view. It's raw bytes in dma buffer.

> I do like the idea of rejecting XDP eBPF programs based on the DMA
> setup is not compatible, or if the driver does not implement e.g.
> writable DMA pages.

exactly.

> Customers wanting this feature will then go buy the NIC which support
> this feature.  There is nothing more motivating for NIC vendors seeing
> customers buying the competitors hardware. And it only require a driver
> change to get this market...

exactly.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [RFC PATCH 07/11] GENEVE: Add option to mangle IP IDs on inner headers when using TSO
From: Jesse Gross @ 2016-04-08 21:40 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: Alexander Duyck, Herbert Xu, Tom Herbert, Eric Dumazet,
	Linux Kernel Network Developers, David Miller
In-Reply-To: <CAKgT0Ufc8EiqCR5opSFUCE2v3hFCmnS641o+epyi+G92bD+yww@mail.gmail.com>

On Thu, Apr 7, 2016 at 8:52 PM, Alexander Duyck
<alexander.duyck@gmail.com> wrote:
> Just a thought.  What if I replaced NETIF_F_TSO_FIXEDID with something
> that meant we could mange the IP ID like a NETIF_F_TSO_IPID_MANGLE
> (advice for better name welcome).  Instead of the feature flag meaning
> we are going to transmit packets with a fixed ID it would mean we
> don't care about the ID and are free to mangle it as we see fit.  The
> GSO type can retain the same meaning as far as that requiring the same
> ID for all, but the feature would mean we will take fixed and convert
> it to incrementing, or incrementing and convert it to fixed.

I saw the new version of the code that you posted with this idea and
now that I understand it better, it seems like a reasonable choice to
me - it's nice that it is consistent with GRO and not tunnel specific.
It also makes behavior consistent across drivers in regards to
incrementing IDs in the default case, which was one of my concerns
from before.

Maybe I missed it but I didn't see any checks for the DF bit being set
when we transmit a packet with NETIF_F_TSO_MANGLEID. Even if I am
comfortable mangling my IDs in the DF case, I don't think this would
ever extend to non-DF packets. In the documentation you noted that it
is the driver's responsibility to do this check but I couldn't find it
in either ixgbe or igb. It would also be nice if the core stack could
enforce it somehow as well rather than each driver.

^ permalink raw reply

* Re: [net-next PATCH 2/5] GSO: Add GSO type for fixed IPv4 ID
From: Jesse Gross @ 2016-04-08 21:41 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: Herbert Xu, Tom Herbert, Alexander Duyck, Eric Dumazet,
	Linux Kernel Network Developers, David Miller
In-Reply-To: <20160408203318.12838.59720.stgit@ahduyck-xeon-server>

On Fri, Apr 8, 2016 at 5:33 PM, Alexander Duyck <aduyck@mirantis.com> wrote:
> This patch adds support for TSO using IPv4 headers with a fixed IP ID
> field.  This is meant to allow us to do a lossless GRO in the case of TCP
> flows that use a fixed IP ID such as those that convert IPv6 header to IPv4
> headers.
>
> In addition I am adding a feature that for now I am referring to TSO with
> IP ID mangling.  Basically when this flag is enabled the device has the
> option to either output the flow with incrementing IP IDs or with a fixed
> IP ID regardless of what the original IP ID ordering was.  This is useful
> in cases where the DF bit is set and we do not care if the original IP ID
> value is maintained.
>
> Signed-off-by: Alexander Duyck <aduyck@mirantis.com>

I think SKB_GSO_TCP_FIXEDID would also need to be added to the list of
enumerated OK GSO types for MPLS GSO.

^ permalink raw reply

* Re: [PATCH V3] net: emac: emac gigabit ethernet controller driver
From: Timur Tabi @ 2016-04-08 21:45 UTC (permalink / raw)
  To: Vikram Sethi, Andrew Lunn
  Cc: Rob Herring, Gilad Avidov, netdev,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	devicetree-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, linux-arm-msm,
	Sagar Dharia, shankerd-sgV2jX0FEOL9JmXXK+q4OQ, Greg Kroah-Hartman,
	Christopher Covington
In-Reply-To: <57081D96.902-sgV2jX0FEOL9JmXXK+q4OQ@public.gmane.org>

Vikram Sethi wrote:

>> On the FSM9900 SOC (which uses device-tree), the two pins that connect to the external PHY are gpio pins.  However, the driver needs to reprogram the pinmux so that those pins are wired to the Emac controller.  That's what the the gpio code in this driver is doing: it's just configuring the pins so that they connect directly between the Emac and the external PHY.  After that, they are no longer GPIO pins, and you cannot use the "GPIO controlled MDIO bus".  There is no MDIO controller on the SOC.  The external PHY is controlled directly from the Emac and also from the internal PHY.  It is screwy, I know, but that's what Gilad was trying to explain.
> It is incorrect to say there's no MDIO controller on the SoC. The EMAC Core on the SoC itself has a MDIO controller which talks to the external PHY. The internal SGMII is not on MDIO however.
> Please see the EMAC specification.

Sorry, I should have said that there is no *independent* MDIO controller 
(one that has its own driver).  As you said, you can only talk to the 
external PHY through the Emac.

-- 
Qualcomm Innovation Center, Inc.
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora
Forum, a Linux Foundation collaborative project.
--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [RFC PATCH 07/11] GENEVE: Add option to mangle IP IDs on inner headers when using TSO
From: Alexander Duyck @ 2016-04-08 22:04 UTC (permalink / raw)
  To: Jesse Gross
  Cc: Alexander Duyck, Herbert Xu, Tom Herbert, Eric Dumazet,
	Linux Kernel Network Developers, David Miller
In-Reply-To: <CAEh+42gjqeGK94xvVQFuiUF=c3YKY2SCWOL=y=2gBsHyUxQ+GA@mail.gmail.com>

On Fri, Apr 8, 2016 at 2:40 PM, Jesse Gross <jesse@kernel.org> wrote:
> On Thu, Apr 7, 2016 at 8:52 PM, Alexander Duyck
> <alexander.duyck@gmail.com> wrote:
>> Just a thought.  What if I replaced NETIF_F_TSO_FIXEDID with something
>> that meant we could mange the IP ID like a NETIF_F_TSO_IPID_MANGLE
>> (advice for better name welcome).  Instead of the feature flag meaning
>> we are going to transmit packets with a fixed ID it would mean we
>> don't care about the ID and are free to mangle it as we see fit.  The
>> GSO type can retain the same meaning as far as that requiring the same
>> ID for all, but the feature would mean we will take fixed and convert
>> it to incrementing, or incrementing and convert it to fixed.
>
> I saw the new version of the code that you posted with this idea and
> now that I understand it better, it seems like a reasonable choice to
> me - it's nice that it is consistent with GRO and not tunnel specific.
> It also makes behavior consistent across drivers in regards to
> incrementing IDs in the default case, which was one of my concerns
> from before.
>
> Maybe I missed it but I didn't see any checks for the DF bit being set
> when we transmit a packet with NETIF_F_TSO_MANGLEID. Even if I am
> comfortable mangling my IDs in the DF case, I don't think this would
> ever extend to non-DF packets. In the documentation you noted that it
> is the driver's responsibility to do this check but I couldn't find it
> in either ixgbe or igb. It would also be nice if the core stack could
> enforce it somehow as well rather than each driver.

Yeah I had glossed over that in the igb and ixgbe patches.  A check is
only really needed for the incrementing to non-incrementing case and I
wasn't sure how common it was to have TCP with an IP header that
didn't set the DF bit.  In the case of the outer headers igb and ixgbe
will increment the IP ID always so we don't have to worry about if DF
is set of not there.  For the inner headers I had fudged it a bit and
didn't add the validation.  If needed I can see about adding that
shortly.

- Alex

^ permalink raw reply

* Re: [PATCH v4] route: do not cache fib route info on local routes with oif
From: Julian Anastasov @ 2016-04-08 22:08 UTC (permalink / raw)
  To: Chris Friesen; +Cc: netdev
In-Reply-To: <570820DA.2070007@windriver.com>


	Hello,

On Fri, 8 Apr 2016, Chris Friesen wrote:

> For local routes that require a particular output interface we do not want
> to cache the result.  Caching the result causes incorrect behaviour when
> there are multiple source addresses on the interface.  The end result
> being that if the intended recipient is waiting on that interface for the
> packet he won't receive it because it will be delivered on the loopback
> interface and the IP_PKTINFO ipi_ifindex will be set to the loopback
> interface as well.
> 
> This can be tested by running a program such as "dhcp_release" which
> attempts to inject a packet on a particular interface so that it is
> received by another program on the same board.  The receiving process
> should see an IP_PKTINFO ipi_ifndex value of the source interface
> (e.g., eth1) instead of the loopback interface (e.g., lo).  The packet
> will still appear on the loopback interface in tcpdump but the important
> aspect is that the CMSG info is correct.
> 
> Sample dhcp_release command line:
> 
>    dhcp_release eth1 192.168.204.222 02:11:33:22:44:66
> 
> Signed-off-by: Allain Legacy <allain.legacy@windriver.com>
> Signed off-by: Chris Friesen <chris.friesen@windriver.com>

	Looks good to me.

Reviewed-by: Julian Anastasov <ja@ssi.bg>

> ---
>  net/ipv4/route.c | 12 ++++++++++++
>  1 file changed, 12 insertions(+)
> 
> diff --git a/net/ipv4/route.c b/net/ipv4/route.c
> index 02c6229..b050cf9 100644
> --- a/net/ipv4/route.c
> +++ b/net/ipv4/route.c
> @@ -2045,6 +2045,18 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
>  		 */
>  		if (fi && res->prefixlen < 4)
>  			fi = NULL;
> +	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
> +		   (orig_oif != dev_out->ifindex)) {
> +		/* For local routes that require a particular output interface
> +		 * we do not want to cache the result.  Caching the result
> +		 * causes incorrect behaviour when there are multiple source
> +		 * addresses on the interface, the end result being that if the
> +		 * intended recipient is waiting on that interface for the
> +		 * packet he won't receive it because it will be delivered on
> +		 * the loopback interface and the IP_PKTINFO ipi_ifindex will
> +		 * be set to the loopback interface as well.
> +		 */
> +		fi = NULL;
>  	}
>  
>  	fnhe = NULL;

Regards

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox