Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next 4/7] udp: Changes to udp_offload to support remote checksum offload
From: Tom Herbert @ 2014-11-01 22:58 UTC (permalink / raw)
  To: davem, netdev
In-Reply-To: <1414882683-25484-1-git-send-email-therbert@google.com>

Add a new GSO type, SKB_GSO_TUNNEL_REMCSUM, which indicates remote
checksum offload being done (in this case inner checksum must not
be offloaded to the NIC).

Added logic in __skb_udp_tunnel_segment to handle remote checksum
offload case.

Signed-off-by: Tom Herbert <therbert@google.com>
---
 include/linux/netdev_features.h |  4 +++-
 include/linux/netdevice.h       |  1 +
 include/linux/skbuff.h          |  4 +++-
 net/core/skbuff.c               |  4 ++--
 net/ipv4/af_inet.c              |  1 +
 net/ipv4/tcp_offload.c          |  1 +
 net/ipv4/udp_offload.c          | 18 ++++++++++++++++--
 net/ipv6/ip6_offload.c          |  1 +
 net/ipv6/udp_offload.c          |  1 +
 9 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index dcfdecb..8c94b07 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -48,8 +48,9 @@ enum {
 	NETIF_F_GSO_UDP_TUNNEL_BIT,	/* ... UDP TUNNEL with TSO */
 	NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT,/* ... UDP TUNNEL with TSO & CSUM */
 	NETIF_F_GSO_MPLS_BIT,		/* ... MPLS segmentation */
+	NETIF_F_GSO_TUNNEL_REMCSUM_BIT, /* ... TUNNEL with TSO & REMCSUM */
 	/**/NETIF_F_GSO_LAST =		/* last bit, see GSO_MASK */
-		NETIF_F_GSO_MPLS_BIT,
+		NETIF_F_GSO_TUNNEL_REMCSUM_BIT,
 
 	NETIF_F_FCOE_CRC_BIT,		/* FCoE CRC32 */
 	NETIF_F_SCTP_CSUM_BIT,		/* SCTP checksum offload */
@@ -119,6 +120,7 @@ enum {
 #define NETIF_F_GSO_UDP_TUNNEL	__NETIF_F(GSO_UDP_TUNNEL)
 #define NETIF_F_GSO_UDP_TUNNEL_CSUM __NETIF_F(GSO_UDP_TUNNEL_CSUM)
 #define NETIF_F_GSO_MPLS	__NETIF_F(GSO_MPLS)
+#define NETIF_F_GSO_TUNNEL_REMCSUM __NETIF_F(GSO_TUNNEL_REMCSUM)
 #define NETIF_F_HW_VLAN_STAG_FILTER __NETIF_F(HW_VLAN_STAG_FILTER)
 #define NETIF_F_HW_VLAN_STAG_RX	__NETIF_F(HW_VLAN_STAG_RX)
 #define NETIF_F_HW_VLAN_STAG_TX	__NETIF_F(HW_VLAN_STAG_TX)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c85e065..b2364f0 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3583,6 +3583,7 @@ static inline bool net_gso_ok(netdev_features_t features, int gso_type)
 	BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL != (NETIF_F_GSO_UDP_TUNNEL >> NETIF_F_GSO_SHIFT));
 	BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL_CSUM != (NETIF_F_GSO_UDP_TUNNEL_CSUM >> NETIF_F_GSO_SHIFT));
 	BUILD_BUG_ON(SKB_GSO_MPLS    != (NETIF_F_GSO_MPLS >> NETIF_F_GSO_SHIFT));
+	BUILD_BUG_ON(SKB_GSO_TUNNEL_REMCSUM != (NETIF_F_GSO_TUNNEL_REMCSUM >> NETIF_F_GSO_SHIFT));
 
 	return (features & feature) == feature;
 }
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index a59d934..a41e101 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -372,6 +372,7 @@ enum {
 
 	SKB_GSO_MPLS = 1 << 12,
 
+	SKB_GSO_TUNNEL_REMCSUM = 1 << 13,
 };
 
 #if BITS_PER_LONG > 32
@@ -595,7 +596,8 @@ struct sk_buff {
 #endif
 	__u8			ipvs_property:1;
 	__u8			inner_protocol_type:1;
-	/* 4 or 6 bit hole */
+	__u8			remcsum_offload:1;
+	/* 3 or 5 bit hole */
 
 #ifdef CONFIG_NET_SCHED
 	__u16			tc_index;	/* traffic control index */
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index e48e5c0..7001896 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3013,7 +3013,7 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
 		if (nskb->len == len + doffset)
 			goto perform_csum_check;
 
-		if (!sg) {
+		if (!sg && !nskb->remcsum_offload) {
 			nskb->ip_summed = CHECKSUM_NONE;
 			nskb->csum = skb_copy_and_csum_bits(head_skb, offset,
 							    skb_put(nskb, len),
@@ -3085,7 +3085,7 @@ skip_fraglist:
 		nskb->truesize += nskb->data_len;
 
 perform_csum_check:
-		if (!csum) {
+		if (!csum && !nskb->remcsum_offload) {
 			nskb->csum = skb_checksum(nskb, doffset,
 						  nskb->len - doffset, 0);
 			nskb->ip_summed = CHECKSUM_NONE;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 8b7fe5b..ed2c672 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1222,6 +1222,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 		       SKB_GSO_TCPV6 |
 		       SKB_GSO_UDP_TUNNEL |
 		       SKB_GSO_UDP_TUNNEL_CSUM |
+		       SKB_GSO_TUNNEL_REMCSUM |
 		       SKB_GSO_MPLS |
 		       0)))
 		goto out;
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 5b90f2f..a1b2a56 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -97,6 +97,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
 			       SKB_GSO_MPLS |
 			       SKB_GSO_UDP_TUNNEL |
 			       SKB_GSO_UDP_TUNNEL_CSUM |
+			       SKB_GSO_TUNNEL_REMCSUM |
 			       0) ||
 			     !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))))
 			goto out;
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index a774711..0a5a70d 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -41,7 +41,8 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
 	unsigned int oldlen;
 	bool need_csum = !!(skb_shinfo(skb)->gso_type &
 			    SKB_GSO_UDP_TUNNEL_CSUM);
-	bool offload_csum = false, dont_encap = need_csum;
+	bool remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM);
+	bool offload_csum = false, dont_encap = (need_csum || remcsum);
 
 	oldlen = (u16)~skb->len;
 
@@ -55,6 +56,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
 	skb->mac_len = skb_inner_network_offset(skb);
 	skb->protocol = new_protocol;
 	skb->encap_hdr_csum = need_csum;
+	skb->remcsum_offload = remcsum;
 
 	/* Try to offload checksum if possible */
 	offload_csum = !!(need_csum &&
@@ -108,11 +110,22 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
 		uh->check = ~csum_fold((__force __wsum)
 				       ((__force u32)uh->check +
 					(__force u32)delta));
-
 		if (offload_csum) {
 			skb->ip_summed = CHECKSUM_PARTIAL;
 			skb->csum_start = skb_transport_header(skb) - skb->head;
 			skb->csum_offset = offsetof(struct udphdr, check);
+		} else if (remcsum) {
+			/* Need to calculate checksum from scratch,
+			 * inner checksums are never when doing
+			 * remote_checksum_offload.
+			 */
+
+			skb->csum = skb_checksum(skb, udp_offset,
+						 skb->len - udp_offset,
+						 0);
+			uh->check = csum_fold(skb->csum);
+			if (uh->check == 0)
+				uh->check = CSUM_MANGLED_0;
 		} else {
 			uh->check = gso_make_checksum(skb, ~uh->check);
 
@@ -192,6 +205,7 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
 		if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY |
 				      SKB_GSO_UDP_TUNNEL |
 				      SKB_GSO_UDP_TUNNEL_CSUM |
+				      SKB_GSO_TUNNEL_REMCSUM |
 				      SKB_GSO_IPIP |
 				      SKB_GSO_GRE | SKB_GSO_GRE_CSUM |
 				      SKB_GSO_MPLS) ||
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index a071563..e976707 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -78,6 +78,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
 		       SKB_GSO_SIT |
 		       SKB_GSO_UDP_TUNNEL |
 		       SKB_GSO_UDP_TUNNEL_CSUM |
+		       SKB_GSO_TUNNEL_REMCSUM |
 		       SKB_GSO_MPLS |
 		       SKB_GSO_TCPV6 |
 		       0)))
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 6b8f543..637ba2e 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -42,6 +42,7 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
 				      SKB_GSO_DODGY |
 				      SKB_GSO_UDP_TUNNEL |
 				      SKB_GSO_UDP_TUNNEL_CSUM |
+				      SKB_GSO_TUNNEL_REMCSUM |
 				      SKB_GSO_GRE |
 				      SKB_GSO_GRE_CSUM |
 				      SKB_GSO_IPIP |
-- 
2.1.0.rc2.206.gedb03e5

^ permalink raw reply related

* [PATCH net-next 5/7] gue: Protocol constants for remote checksum offload
From: Tom Herbert @ 2014-11-01 22:58 UTC (permalink / raw)
  To: davem, netdev
In-Reply-To: <1414882683-25484-1-git-send-email-therbert@google.com>

Define a private flag for remote checksum offload as well as a length
for the option.

Signed-off-by: Tom Herbert <therbert@google.com>
---
 include/net/gue.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/include/net/gue.h b/include/net/gue.h
index cb68ae8..3f28ec7 100644
--- a/include/net/gue.h
+++ b/include/net/gue.h
@@ -59,7 +59,10 @@ struct guehdr {
 
 /* Private flags in the private option extension */
 
-#define GUE_PFLAGS_ALL	(0)
+#define GUE_PFLAG_REMCSUM	htonl(1 << 31)
+#define GUE_PLEN_REMCSUM	4
+
+#define GUE_PFLAGS_ALL	(GUE_PFLAG_REMCSUM)
 
 /* Functions to compute options length corresponding to flags.
  * If we ever have a lot of flags this can be potentially be
-- 
2.1.0.rc2.206.gedb03e5

^ permalink raw reply related

* [PATCH net-next 6/7] gue: TX support for using remote checksum offload option
From: Tom Herbert @ 2014-11-01 22:58 UTC (permalink / raw)
  To: davem, netdev
In-Reply-To: <1414882683-25484-1-git-send-email-therbert@google.com>

Add if_tunnel flag TUNNEL_ENCAP_FLAG_REMCSUM to configure
remote checksum offload on an IP tunnel. Add logic in gue_build_header
to insert remote checksum offload option.

Signed-off-by: Tom Herbert <therbert@google.com>
---
 include/net/fou.h              |  5 ++++-
 include/uapi/linux/if_tunnel.h |  1 +
 net/ipv4/fou.c                 | 35 ++++++++++++++++++++++++++++++++---
 3 files changed, 37 insertions(+), 4 deletions(-)

diff --git a/include/net/fou.h b/include/net/fou.h
index d2d8055..25b26ff 100644
--- a/include/net/fou.h
+++ b/include/net/fou.h
@@ -25,7 +25,10 @@ static size_t gue_encap_hlen(struct ip_tunnel_encap *e)
 
 	len = sizeof(struct udphdr) + sizeof(struct guehdr);
 
-	/* Add in lengths flags */
+	if (e->flags & TUNNEL_ENCAP_FLAG_REMCSUM) {
+		len += GUE_PLEN_REMCSUM;
+		need_priv = true;
+	}
 
 	len += need_priv ? GUE_LEN_PRIV : 0;
 
diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h
index 280d9e0..bd3cc11 100644
--- a/include/uapi/linux/if_tunnel.h
+++ b/include/uapi/linux/if_tunnel.h
@@ -69,6 +69,7 @@ enum tunnel_encap_types {
 
 #define TUNNEL_ENCAP_FLAG_CSUM		(1<<0)
 #define TUNNEL_ENCAP_FLAG_CSUM6		(1<<1)
+#define TUNNEL_ENCAP_FLAG_REMCSUM	(1<<2)
 
 /* SIT-mode i_flags */
 #define	SIT_ISATAP	0x0001
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index a3b8c5b..fb0db99 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -562,11 +562,19 @@ int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
 	bool csum = !!(e->flags & TUNNEL_ENCAP_FLAG_CSUM);
 	int type = csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;
 	struct guehdr *guehdr;
-	size_t optlen = 0;
+	size_t hdrlen, optlen = 0;
 	__be16 sport;
 	void *data;
 	bool need_priv = false;
 
+	if ((e->flags & TUNNEL_ENCAP_FLAG_REMCSUM) &&
+	    skb->ip_summed == CHECKSUM_PARTIAL) {
+		csum = false;
+		optlen += GUE_PLEN_REMCSUM;
+		type |= SKB_GSO_TUNNEL_REMCSUM;
+		need_priv = true;
+	}
+
 	optlen += need_priv ? GUE_LEN_PRIV : 0;
 
 	skb = iptunnel_handle_offloads(skb, csum, type);
@@ -578,7 +586,9 @@ int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
 	sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev),
 					       skb, 0, 0, false);
 
-	skb_push(skb, sizeof(struct guehdr) + optlen);
+	hdrlen = sizeof(struct guehdr) + optlen;
+
+	skb_push(skb, hdrlen);
 
 	guehdr = (struct guehdr *)skb->data;
 
@@ -597,7 +607,26 @@ int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
 		*flags = 0;
 		data += GUE_LEN_PRIV;
 
-		/* Add private flags */
+		if (type & SKB_GSO_TUNNEL_REMCSUM) {
+			u16 csum_start = skb_checksum_start_offset(skb);
+			__be16 *pd = data;
+
+			if (csum_start < hdrlen)
+				return -EINVAL;
+
+			csum_start -= hdrlen;
+			pd[0] = htons(csum_start);
+			pd[1] = htons(csum_start + skb->csum_offset);
+
+			if (!skb_is_gso(skb)) {
+				skb->ip_summed = CHECKSUM_NONE;
+				skb->encapsulation = 0;
+			}
+
+			*flags |= GUE_PFLAG_REMCSUM;
+			data += GUE_PLEN_REMCSUM;
+		}
+
 	}
 
 	fou_build_udp(skb, e, fl4, protocol, sport);
-- 
2.1.0.rc2.206.gedb03e5

^ permalink raw reply related

* [PATCH net-next 7/7] gue: Receive side of remote checksum offload
From: Tom Herbert @ 2014-11-01 22:58 UTC (permalink / raw)
  To: davem, netdev
In-Reply-To: <1414882683-25484-1-git-send-email-therbert@google.com>

Add processing of the remote checksum offload option in both the normal
path as well as the GRO path. This implements patching the affected
checksum to derive the offloaded checksum.

Signed-off-by: Tom Herbert <therbert@google.com>
---
 net/ipv4/fou.c | 170 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 161 insertions(+), 9 deletions(-)

diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index fb0db99..740ae09 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -63,6 +63,59 @@ static int fou_udp_recv(struct sock *sk, struct sk_buff *skb)
 	return -fou->protocol;
 }
 
+static struct guehdr *gue_remcsum(struct sk_buff *skb, struct guehdr *guehdr,
+				  void *data, int hdrlen, u8 ipproto)
+{
+	__be16 *pd = data;
+	u16 start = ntohs(pd[0]);
+	u16 offset = ntohs(pd[1]);
+	u16 poffset = 0;
+	u16 plen;
+	__wsum csum, delta;
+	__sum16 *psum;
+
+	if (skb->remcsum_offload) {
+		/* Already processed in GRO path */
+		skb->remcsum_offload = 0;
+		return guehdr;
+	}
+
+	if (start > skb->len - hdrlen ||
+	    offset > skb->len - hdrlen - sizeof(u16))
+		return NULL;
+
+	if (unlikely(skb->ip_summed != CHECKSUM_COMPLETE))
+		__skb_checksum_complete(skb);
+
+	plen = hdrlen + offset + sizeof(u16);
+	if (!pskb_may_pull(skb, plen))
+		return NULL;
+	guehdr = (struct guehdr *)&udp_hdr(skb)[1];
+
+	if (ipproto == IPPROTO_IP && sizeof(struct iphdr) < plen) {
+		struct iphdr *ip = (struct iphdr *)(skb->data + hdrlen);
+
+		/* If next header happens to be IP we can skip that for the
+		 * checksum calculation since the IP header checksum is zero
+		 * if correct.
+		 */
+		poffset = ip->ihl * 4;
+	}
+
+	csum = csum_sub(skb->csum, skb_checksum(skb, poffset + hdrlen,
+						start - poffset - hdrlen, 0));
+
+	/* Set derived checksum in packet */
+	psum = (__sum16 *)(skb->data + hdrlen + offset);
+	delta = csum_sub(csum_fold(csum), *psum);
+	*psum = csum_fold(csum);
+
+	/* Adjust skb->csum since we changed the packet */
+	skb->csum = csum_add(skb->csum, delta);
+
+	return guehdr;
+}
+
 static int gue_control_message(struct sk_buff *skb, struct guehdr *guehdr)
 {
 	/* No support yet */
@@ -76,6 +129,7 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb)
 	size_t len, optlen, hdrlen;
 	struct guehdr *guehdr;
 	void *data;
+	u16 doffset = 0;
 
 	if (!fou)
 		return 1;
@@ -100,20 +154,43 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb)
 	if (guehdr->version != 0 || validate_gue_flags(guehdr, optlen))
 		goto drop;
 
-	/* Pull UDP and GUE headers */
-	fou_recv_pull(skb, len);
+	hdrlen = sizeof(struct guehdr) + optlen;
+
+	ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(skb)->tot_len) - len);
+
+	/* Pull UDP header now, skb->data points to guehdr */
+	__skb_pull(skb, sizeof(struct udphdr));
+
+	/* Pull csum through the guehdr now . This can be used if
+	 * there is a remote checksum offload.
+	 */
+	skb_postpull_rcsum(skb, udp_hdr(skb), len);
 
 	data = &guehdr[1];
 
 	if (guehdr->flags & GUE_FLAG_PRIV) {
-		data += GUE_LEN_PRIV;
+		__be32 flags = *(__be32 *)(data + doffset);
+
+		doffset += GUE_LEN_PRIV;
 
-		/* Process private flags */
+		if (flags & GUE_PFLAG_REMCSUM) {
+			guehdr = gue_remcsum(skb, guehdr, data + doffset,
+					     hdrlen, guehdr->proto_ctype);
+			if (!guehdr)
+				goto drop;
+
+			data = &guehdr[1];
+
+			doffset += GUE_PLEN_REMCSUM;
+		}
 	}
 
 	if (unlikely(guehdr->control))
 		return gue_control_message(skb, guehdr);
 
+	__skb_pull(skb, hdrlen);
+	skb_reset_transport_header(skb);
+
 	return -guehdr->proto_ctype;
 
 drop:
@@ -164,6 +241,66 @@ out_unlock:
 	return err;
 }
 
+static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off,
+				      struct guehdr *guehdr, void *data,
+				      size_t hdrlen, u8 ipproto)
+{
+	__be16 *pd = data;
+	u16 start = ntohs(pd[0]);
+	u16 offset = ntohs(pd[1]);
+	u16 poffset = 0;
+	u16 plen;
+	void *ptr;
+	__wsum csum, delta;
+	__sum16 *psum;
+
+	if (skb->remcsum_offload)
+		return guehdr;
+
+	if (start > skb_gro_len(skb) - hdrlen ||
+	    offset > skb_gro_len(skb) - hdrlen - sizeof(u16) ||
+	    !NAPI_GRO_CB(skb)->csum_valid || skb->remcsum_offload)
+		return NULL;
+
+	plen = hdrlen + offset + sizeof(u16);
+
+	/* Pull checksum that will be written */
+	if (skb_gro_header_hard(skb, off + plen)) {
+		guehdr = skb_gro_header_slow(skb, off + plen, off);
+		if (!guehdr)
+			return NULL;
+	}
+
+	ptr = (void *)guehdr + hdrlen;
+
+	if (ipproto == IPPROTO_IP &&
+	    (hdrlen + sizeof(struct iphdr) < plen)) {
+		struct iphdr *ip = (struct iphdr *)(ptr + hdrlen);
+
+		/* If next header happens to be IP we can skip
+		 * that for the checksum calculation since the
+		 * IP header checksum is zero if correct.
+		 */
+		poffset = ip->ihl * 4;
+	}
+
+	csum = csum_sub(NAPI_GRO_CB(skb)->csum,
+			csum_partial(ptr + poffset, start - poffset, 0));
+
+	/* Set derived checksum in packet */
+	psum = (__sum16 *)(ptr + offset);
+	delta = csum_sub(csum_fold(csum), *psum);
+	*psum = csum_fold(csum);
+
+	/* Adjust skb->csum since we changed the packet */
+	skb->csum = csum_add(skb->csum, delta);
+	NAPI_GRO_CB(skb)->csum = csum_add(NAPI_GRO_CB(skb)->csum, delta);
+
+	skb->remcsum_offload = 1;
+
+	return guehdr;
+}
+
 static struct sk_buff **gue_gro_receive(struct sk_buff **head,
 					struct sk_buff *skb)
 {
@@ -174,6 +311,7 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head,
 	struct guehdr *guehdr;
 	size_t len, optlen, hdrlen, off;
 	void *data;
+	u16 doffset = 0;
 	int flush = 1;
 
 	off = skb_gro_offset(skb);
@@ -201,19 +339,33 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head,
 
 	hdrlen = sizeof(*guehdr) + optlen;
 
-	skb_gro_pull(skb, hdrlen);
-
-	/* Adjusted NAPI_GRO_CB(skb)->csum after skb_gro_pull()*/
+	/* Adjust NAPI_GRO_CB(skb)->csum to account for guehdr,
+	 * this is needed if there is a remote checkcsum offload.
+	 */
 	skb_gro_postpull_rcsum(skb, guehdr, hdrlen);
 
 	data = &guehdr[1];
 
 	if (guehdr->flags & GUE_FLAG_PRIV) {
-		data += GUE_LEN_PRIV;
+		__be32 flags = *(__be32 *)(data + doffset);
 
-		/* Process private flags */
+		doffset += GUE_LEN_PRIV;
+
+		if (flags & GUE_PFLAG_REMCSUM) {
+			guehdr = gue_gro_remcsum(skb, off, guehdr,
+						 data + doffset, hdrlen,
+						 guehdr->proto_ctype);
+			if (!guehdr)
+				goto out;
+
+			data = &guehdr[1];
+
+			doffset += GUE_PLEN_REMCSUM;
+		}
 	}
 
+	skb_gro_pull(skb, hdrlen);
+
 	flush = 0;
 
 	for (p = *head; p; p = p->next) {
-- 
2.1.0.rc2.206.gedb03e5

^ permalink raw reply related

* [PATCH bluetooth-next] netdevice: add ieee802154_ptr to net_device
From: Alexander Aring @ 2014-11-02  5:44 UTC (permalink / raw)
  To: linux-wpan
  Cc: kernel, netdev, linux-wireless, Alexander Aring, David S. Miller

This patch adds an ieee802154_ptr to the net_device structure.
Furthermore the 802.15.4 subsystem will introduce an nl802154 framework,
which is similar to the nl80211 framework, and a wpan_dev structure.
The wpan_dev structure will hold additional net_device attributes like
address options which are 802.15.4 specific. In the upcoming nl802154
implementation we will introduce a NL802154_FLAG_NEED_WPAN_DEV like
NL80211_FLAG_NEED_WDEV. For this flag an ieee802154_ptr in net_device is
needed. Additionally, we can then access the wpan_dev attributes easily in
upper layers like IEEE 802.15.4 6LoWPAN. The current solution is a
complicated callback interface that gets these values via the subif data
structure in mac802154.

Signed-off-by: Alexander Aring <alex.aring@gmail.com>
Cc: David S. Miller <davem@davemloft.net>
---
Another option would be to combine ieee80211_ptr and ieee802154_ptr in
a union. These pointers can't both be in use at the same time, and the
union solution would not make struct net_device bigger.

My working repository is bluetooth-next. Marcel will apply all 802.15.4
changes. That's why this patch should go into bluetooth-next. Then I can
send new patches which depend on this patch for introducing wpan_dev and
nl802154.

 include/linux/netdevice.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 74fd5d3..c9bcf33 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -57,6 +57,8 @@ struct device;
 struct phy_device;
 /* 802.11 specific */
 struct wireless_dev;
+/* 802.15.4 specific */
+struct wpan_dev;
 
 void netdev_set_default_ethtool_ops(struct net_device *dev,
 				    const struct ethtool_ops *ops);
@@ -1572,6 +1574,7 @@ struct net_device {
 	struct inet6_dev __rcu	*ip6_ptr;
 	void			*ax25_ptr;
 	struct wireless_dev	*ieee80211_ptr;
+	struct wpan_dev		*ieee802154_ptr;
 
 /*
  * Cache lines mostly used on receive path (including eth_type_trans())
-- 
2.1.3

^ permalink raw reply related

* Re: [PATCH]  net: mvpp2: fix possible memory leak
From: Sudip Mukherjee @ 2014-11-02  6:19 UTC (permalink / raw)
  To: Thomas Petazzoni; +Cc: David S. Miller, netdev, linux-kernel, Marcin Wojtas
In-Reply-To: <20141101232445.1a3fe27e@free-electrons.com>

On Sat, Nov 01, 2014 at 11:24:45PM +0100, Thomas Petazzoni wrote:
> Dear Sudip Mukherjee,
> 
> On Sat,  1 Nov 2014 16:59:34 +0530, Sudip Mukherjee wrote:
> > we are allocating memory using kzalloc for struct mvpp2_prs_entry,
> > but later when we are getting error we were just returning the error
> > value without releasing the memory.
> > 
> > Signed-off-by: Sudip Mukherjee <sudip@vectorindia.org>
> > ---
> > 
> > hi,
> > i could not build test after modifying it. I tried to compile using
> > multi_v7_defconfig , but the cross compiler i have is not able to
> > compile it and giving sevaral warnings from the assembler.
> 
> That seems weird. Which compiler are you using, and which errors were
> you getting?
> 
> In any case, it would have been good to Cc the authors of the driver.
yes, i should have. Ccing now. better late than never.
i am using gcc version 4.3.2 (Sourcery G++ Lite 2008q3-72).

thanks
sudip

> 
> Thanks!
> 
> Thomas
> -- 
> Thomas Petazzoni, CTO, Free Electrons
> Embedded Linux, Kernel and Android engineering
> http://free-electrons.com

^ permalink raw reply

* IEEE 802.15.4 6LoWPAN need to change netdev type UAPI - How we can do it right now?
From: Alexander Aring @ 2014-11-02 12:41 UTC (permalink / raw)
  To: netdev; +Cc: linux-wpan, linux-bluetooth

Hi,

The IEEE 802.15.4 with 6LoWPAN has a big problem. We have two interfaces:
one "wpan" interface which belongs to the IEEE 802.15.4 subsystem and the
"lowpan" interface for the IEEE 802.15.4 6LoWPAN layer.

The big problem is that "wpan" and "lowpan" interfaces use the same
ARPHRD type. This is "ARPHRD_IEEE802154".

In kernelspace we can't decide whether we are handling a "wpan" interface
or a "lowpan" interface, and there are two problems that I know of.

These are:

1. Freeing resources

If we create a "wpan" interface we allocate some private data, etc.
which the "lowpan" doesn't allocate. If we free/unregister a wpan
interface over netlink we will free these private resources. Now if
we call the netlink with a "lowpan" interface we try to free the
same resources. Of course this will fail because on allocation we didn't
allocate the "wpan" resources. We can't decide at the netlink interface if
it's a "wpan" or "lowpan" interface. On a "lowpan" interface we could
return -EINVAL then, but we can't decide that.

A possible hack would be to remember the ifindex of all registered "wpan"
interfaces and check whether it matches. I don't know if this
solution would be 100% safe.

2. Confusing userspace applications

Userspace applications also can't distinguish between "wpan" and "lowpan"
interfaces. Currently applications like "wireshark" will decode all
"packets" on "lowpan" as IEEE 802.15.4 frames by default. The correct
behaviour would be decoding them as IPv6 packets. Changing wireshark to decode
ARPHRD_IEEE802154 as IPv6 by default would mean that a "wpan" interface
gets a wrong default decoding.

A possible hack here would be to try to create an IPv6 socket: if
it fails it's a wpan interface, if successful we have a lowpan interface.


In my opinion we need to change this behaviour, but it's a UAPI change
and I want to get it right the first time. One possible type we could change to
is "ARPHRD_6LOWPAN", which is also used by bluetooth.


Two solutions:

- Changing type to "ARPHRD_6LOWPAN":

Furthermore we need to make "small" runtime decisions in IPv6. The
ARPHRD_6LOWPAN is used by bluetooth and possibly also by IEEE 802.15.4.
These "small" runtime decisions need L2 information from bluetooth or
IEEE 802.15.4. If we change now to "ARPHRD_6LOWPAN" we have a very
similar issue: we can't distinguish between a 6LoWPAN bluetooth interface
and a 6LoWPAN 802.15.4 interface. For userspace this should make no
difference. It only matters for distinguishing interface types in upper layers
inside kernelspace.

Possible solution could be to introduce a ARPHRD_SUBTYPE and place it to
the beginning of netdev_priv(dev) structure. This structure could look
like:

struct lowpan_netdev_priv {
	/* subtype of ARPHRD_6LOWPAN */
	enum lowpan_subtype subtype;
	/* private data of L2 subtype */
	void *priv;
};

In upper layers like IPv6 we could first check if it's an ARPHRD_6LOWPAN
type. After that, if we really need different L2 handling, we can check
the lowpan_subtype subtype which is placed in netdev_priv.

This solution requires that all ARPHRD_6LOWPAN interfaces use the
lowpan_netdev_priv structure in netdev_priv. I am also not 100% sure if
we need such information in userspace; maybe we can also introduce a
netlink attribute to get the ARPHRD_6LOWPAN subtype.


- introduce new ARPHRD "ARPHRD_6LOWPAN_IEEE802154"

Just add another type without complicated subtype mechanism.



I hope it's clear that I run into several issues because "wpan" and
"lowpan" use the same ARPHRD. This code is already 4 years old and
there already exists userspace software which checks for
ARPHRD_IEEE802154 on lowpan interfaces. I need some help on how we should
deal with this now: do we just change the dev->type? If yes, to which type?

- Alex

^ permalink raw reply

* [PATCH] net: shrink struct softnet_data
From: Eric Dumazet @ 2014-11-02 14:00 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

From: Eric Dumazet <edumazet@google.com>

flow_limit in struct softnet_data is only read from local cpu
and can be moved to fill a hole, reducing softnet_data size by
64 bytes on x86_64

While we are at it, move output_queue, output_queue_tailp and
completion_queue, so that rx / tx paths touch a single cache line.

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 include/linux/netdevice.h |   15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c85e065122460e9f077bcb6788018be38e1d7ddf..5ed05bd764dcf3699afdd9a7b17600246de22d1d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2329,10 +2329,7 @@ extern int netdev_flow_limit_table_len;
  * Incoming packets are placed on per-cpu queues
  */
 struct softnet_data {
-	struct Qdisc		*output_queue;
-	struct Qdisc		**output_queue_tailp;
 	struct list_head	poll_list;
-	struct sk_buff		*completion_queue;
 	struct sk_buff_head	process_queue;
 
 	/* stats */
@@ -2340,10 +2337,17 @@ struct softnet_data {
 	unsigned int		time_squeeze;
 	unsigned int		cpu_collision;
 	unsigned int		received_rps;
-
 #ifdef CONFIG_RPS
 	struct softnet_data	*rps_ipi_list;
+#endif
+#ifdef CONFIG_NET_FLOW_LIMIT
+	struct sd_flow_limit __rcu *flow_limit;
+#endif
+	struct Qdisc		*output_queue;
+	struct Qdisc		**output_queue_tailp;
+	struct sk_buff		*completion_queue;
 
+#ifdef CONFIG_RPS
 	/* Elements below can be accessed between CPUs for RPS */
 	struct call_single_data	csd ____cacheline_aligned_in_smp;
 	struct softnet_data	*rps_ipi_next;
@@ -2355,9 +2359,6 @@ struct softnet_data {
 	struct sk_buff_head	input_pkt_queue;
 	struct napi_struct	backlog;
 
-#ifdef CONFIG_NET_FLOW_LIMIT
-	struct sd_flow_limit __rcu *flow_limit;
-#endif
 };
 
 static inline void input_queue_head_incr(struct softnet_data *sd)

^ permalink raw reply related

* Re: [PATCH net-next 5/8] net/mlx4_en: Remove redundant code from RX/GRO path
From: Or Gerlitz @ 2014-11-02 14:09 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David S. Miller, Linux Netdev List, Matan Barak, Amir Vadai,
	Saeed Mahameed, Shani Michaeli, Ido Shamay
In-Reply-To: <1414770362.27538.7.camel@edumazet-glaptop2.roam.corp.google.com>

On 10/31/2014 5:46 PM, Eric Dumazet wrote:
> On Fri, 2014-10-31 at 16:00 +0200, Or Gerlitz wrote:
>> On Fri, Oct 31, 2014 at 5:19 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>>> On Fri, 2014-10-31 at 01:25 +0200, Or Gerlitz wrote:
>>>> On Thu, Oct 30, 2014 at 9:00 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>>>>> On Thu, 2014-10-30 at 18:06 +0200, Or Gerlitz wrote:
>>>>>> Remove the code which goes through napi_gro_frags() on the RX path,
>>>>>> use only napi_gro_receive().
>>>>> Hmpff... napi_gro_frags() should be faster. Have you benchmarked this ?
>>>>
>>>> yep we did, napi_gro_frags() was somehow better for single stream. Do
>>>> you think we need to do it the other way around, e.g converge to use napi_gro_frags()?
>>> napi_gro_frags() is faster because the napi->skb is reused fast (not
>>> going through kfree_skb()/alloc_skb() for every fragment)
>> I see. Is this a strong vote to convert the code to use napi_gro_frags
>> on it's usual track?
> I don't know yet. In some cases, actually slowing down the rx path can
> help by building bigger GRO packets. But instead of inserting delays,
> we can simply force napi to be run another time, with a nanosec based
> timer.
>
> I've tested this kind of heuristic :
>
>         /* If some packets are waiting in GRO engine and timeout is not expired,
>          * reschedule a NAPI poll. We allow servicing other softirqs
>          * before repoll, we do not rearm CQ.
>          */
>         if (rx_nsecs && napi->gro_list && !need_resched()) {
>                 u64 now = local_clock();
>                 unsigned long flags;
>
>                 /* If we got packets in this round, restart timeout */
>                 if (done)
>                         cq->tstart = now;
>                 else if (now - cq->tstart >= (u64)rx_nsecs)
>                         goto complete;
>
>                 /* Since we might need one skb very soon, build it now */
>                 napi_get_frags(napi);
>
>                 local_irq_save(flags);
>                 list_del(&napi->poll_list);
>                 __napi_schedule_irqoff(napi);
>                 local_irq_restore(flags);
>
>          } else {
> complete:
>                  napi_complete(napi);
>                  mlx4_en_arm_cq(priv, cq);
>          }
> 	return done;

Hi Eric,

For the time being, I'll drop from this series this change and the
following ones which depend on it. So the earlier patches of the series
can be picked up, while the various options w.r.t. GRO here are investigated in parallel.

Or.

^ permalink raw reply

* [PATCH] net: less interrupt masking in NAPI
From: Eric Dumazet @ 2014-11-02 14:19 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Willem de Bruijn

From: Eric Dumazet <edumazet@google.com>

net_rx_action() can mask irqs a single time to transfer sd->poll_list
into a private list, for a very short duration.

Then, napi_complete() can avoid masking irqs again,
and net_rx_action() only needs to mask irq again in slow path.

This patch removes 2 couples of irq mask/unmask per typical NAPI run,
more if multiple napi were triggered.

Note this also allows giving control back to the caller (do_softirq())
more often, so that other softirq handlers can be called a bit earlier,
or ksoftirqd can be woken up earlier under pressure.

This was developed while testing an alternative to RX interrupt
mitigation to reduce latencies while keeping or improving GRO
aggregation on fast NIC.

Idea is to test napi->gro_list at the end of a napi->poll() and
reschedule one NAPI poll, but after servicing a full round of
softirqs (timers, TX, rcu, ...). This will be allowed only if softirq
is currently serviced by idle task or ksoftirqd, and resched not needed.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Willem de Bruijn <willemb@google.com>
---
 net/core/dev.c |   68 +++++++++++++++++++++++++++++------------------
 1 file changed, 43 insertions(+), 25 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index ebf778df58cd..40be481268de 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4316,20 +4316,28 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)
 		local_irq_enable();
 }
 
+static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
+{
+#ifdef CONFIG_RPS
+	return sd->rps_ipi_list != NULL;
+#else
+	return false;
+#endif
+}
+
 static int process_backlog(struct napi_struct *napi, int quota)
 {
 	int work = 0;
 	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
 
-#ifdef CONFIG_RPS
 	/* Check if we have pending ipi, its better to send them now,
 	 * not waiting net_rx_action() end.
 	 */
-	if (sd->rps_ipi_list) {
+	if (sd_has_rps_ipi_waiting(sd)) {
 		local_irq_disable();
 		net_rps_action_and_irq_enable(sd);
 	}
-#endif
+
 	napi->weight = weight_p;
 	local_irq_disable();
 	while (1) {
@@ -4356,7 +4364,6 @@ static int process_backlog(struct napi_struct *napi, int quota)
 			 * We can use a plain write instead of clear_bit(),
 			 * and we dont need an smp_mb() memory barrier.
 			 */
-			list_del(&napi->poll_list);
 			napi->state = 0;
 			rps_unlock(sd);
 
@@ -4406,7 +4413,7 @@ void __napi_complete(struct napi_struct *n)
 	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
 	BUG_ON(n->gro_list);
 
-	list_del(&n->poll_list);
+	list_del_init(&n->poll_list);
 	smp_mb__before_atomic();
 	clear_bit(NAPI_STATE_SCHED, &n->state);
 }
@@ -4424,9 +4431,15 @@ void napi_complete(struct napi_struct *n)
 		return;
 
 	napi_gro_flush(n, false);
-	local_irq_save(flags);
-	__napi_complete(n);
-	local_irq_restore(flags);
+
+	if (likely(list_empty(&n->poll_list))) {
+		WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
+	} else {
+		/* If n->poll_list is not empty, we need to mask irqs */
+		local_irq_save(flags);
+		__napi_complete(n);
+		local_irq_restore(flags);
+	}
 }
 EXPORT_SYMBOL(napi_complete);
 
@@ -4520,29 +4533,28 @@ static void net_rx_action(struct softirq_action *h)
 	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
 	unsigned long time_limit = jiffies + 2;
 	int budget = netdev_budget;
+	LIST_HEAD(list);
+	LIST_HEAD(repoll);
 	void *have;
 
 	local_irq_disable();
+	list_splice_init(&sd->poll_list, &list);
+	local_irq_enable();
 
-	while (!list_empty(&sd->poll_list)) {
+	while (!list_empty(&list)) {
 		struct napi_struct *n;
 		int work, weight;
 
-		/* If softirq window is exhuasted then punt.
+		/* If softirq window is exhausted then punt.
 		 * Allow this to run for 2 jiffies since which will allow
 		 * an average latency of 1.5/HZ.
 		 */
 		if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
 			goto softnet_break;
 
-		local_irq_enable();
 
-		/* Even though interrupts have been re-enabled, this
-		 * access is safe because interrupts can only add new
-		 * entries to the tail of this list, and only ->poll()
-		 * calls can remove this head entry from the list.
-		 */
-		n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
+		n = list_first_entry(&list, struct napi_struct, poll_list);
+		list_del_init(&n->poll_list);
 
 		have = netpoll_poll_lock(n);
 
@@ -4564,8 +4576,6 @@ static void net_rx_action(struct softirq_action *h)
 
 		budget -= work;
 
-		local_irq_disable();
-
 		/* Drivers must not modify the NAPI state if they
 		 * consume the entire weight.  In such cases this code
 		 * still "owns" the NAPI instance and therefore can
@@ -4573,32 +4583,40 @@ static void net_rx_action(struct softirq_action *h)
 		 */
 		if (unlikely(work == weight)) {
 			if (unlikely(napi_disable_pending(n))) {
-				local_irq_enable();
 				napi_complete(n);
-				local_irq_disable();
 			} else {
 				if (n->gro_list) {
 					/* flush too old packets
 					 * If HZ < 1000, flush all packets.
 					 */
-					local_irq_enable();
 					napi_gro_flush(n, HZ >= 1000);
-					local_irq_disable();
 				}
-				list_move_tail(&n->poll_list, &sd->poll_list);
+				list_add_tail(&n->poll_list, &repoll);
 			}
 		}
 
 		netpoll_poll_unlock(have);
 	}
+
+	if (!sd_has_rps_ipi_waiting(sd) &&
+	    list_empty(&list) &&
+	    list_empty(&repoll))
+		return;
 out:
+	local_irq_disable();
+
+	list_splice_tail_init(&sd->poll_list, &list);
+	list_splice_tail(&repoll, &list);
+	list_splice(&list, &sd->poll_list);
+	if (!list_empty(&sd->poll_list))
+		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
+
 	net_rps_action_and_irq_enable(sd);
 
 	return;
 
 softnet_break:
 	sd->time_squeeze++;
-	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 	goto out;
 }
 

^ permalink raw reply related

* [PATCH V1 net-next 0/5] Mellanox ethernet driver update Oct-30-2014
From: Or Gerlitz @ 2014-11-02 14:26 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Matan Barak, Amir Vadai, Saeed Mahameed, Shani Michaeli,
	Ido Shamay, Or Gerlitz

Hi Dave,

The 1st patch from Saeed fixes a bug in the last net-next batch where
a VF could get access to set port configuration, the next patch from Amir
fixes a race in the port VPI logic. Next are two performance patches from Ido.

The patch to add checksum complete status on GRE and such packets was
preceded by a patch that converted the driver to only use napi_gro_receive
vs. the current code which goes through napi_gro_frags on its usual track.
Eric D. has some thoughts and suggestions on that change which we
want to take the time to consider, so for the time being we dropped that
patch and the ones that depend on it.

Or.

Changes from V0:
  - have the caller to provide the __GFP_COLD hint to the service function
  - dropped the patch that changes the GRO logic and the subsequent dependent
    patches. 

Amir Vadai (1):
  net/mlx4_core: Protect port type setting by mutex

Ido Shamay (2):
  net/mlx4_en: Remove RX buffers alignment to IP_ALIGN
  net/mlx4_en: Add __GFP_COLD gfp flags in alloc_pages

Matan Barak (1):
  net/mlx4_core: Add retrieval of CONFIG_DEV parameters

Saeed Mahameed (1):
  net/mlx4_core: Prevent VF from changing port configuration

 drivers/net/ethernet/mellanox/mlx4/cmd.c           |    6 +-
 drivers/net/ethernet/mellanox/mlx4/en_rx.c         |   23 ++---
 drivers/net/ethernet/mellanox/mlx4/fw.c            |  118 +++++++++++++++++++-
 drivers/net/ethernet/mellanox/mlx4/main.c          |    9 ++-
 drivers/net/ethernet/mellanox/mlx4/mlx4.h          |   10 ++
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h       |    1 -
 .../net/ethernet/mellanox/mlx4/resource_tracker.c  |   17 +++
 include/linux/mlx4/cmd.h                           |   29 +++++
 include/linux/mlx4/device.h                        |    3 +-
 9 files changed, 190 insertions(+), 26 deletions(-)

^ permalink raw reply

* [PATCH V1 net-next 2/5] net/mlx4_core: Protect port type setting by mutex
From: Or Gerlitz @ 2014-11-02 14:26 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Matan Barak, Amir Vadai, Saeed Mahameed, Shani Michaeli,
	Ido Shamay
In-Reply-To: <1414938377-421-1-git-send-email-ogerlitz@mellanox.com>

From: Amir Vadai <amirv@mellanox.com>

We need to protect set_port_type() against concurrency, as the sysfs code could
call it from multiple contexts in parallel.

The port_mutex is not enough because we need to protect from concurrent
modification of 'info' and stopping of the port sensing work.

Signed-off-by: Amir Vadai <amirv@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx4/main.c |    9 ++++++++-
 1 files changed, 8 insertions(+), 1 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index 90de6e1..9f82196 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -901,9 +901,12 @@ static ssize_t set_port_type(struct device *dev,
 	struct mlx4_priv *priv = mlx4_priv(mdev);
 	enum mlx4_port_type types[MLX4_MAX_PORTS];
 	enum mlx4_port_type new_types[MLX4_MAX_PORTS];
+	static DEFINE_MUTEX(set_port_type_mutex);
 	int i;
 	int err = 0;
 
+	mutex_lock(&set_port_type_mutex);
+
 	if (!strcmp(buf, "ib\n"))
 		info->tmp_type = MLX4_PORT_TYPE_IB;
 	else if (!strcmp(buf, "eth\n"))
@@ -912,7 +915,8 @@ static ssize_t set_port_type(struct device *dev,
 		info->tmp_type = MLX4_PORT_TYPE_AUTO;
 	else {
 		mlx4_err(mdev, "%s is not supported port type\n", buf);
-		return -EINVAL;
+		err = -EINVAL;
+		goto err_out;
 	}
 
 	mlx4_stop_sense(mdev);
@@ -958,6 +962,9 @@ static ssize_t set_port_type(struct device *dev,
 out:
 	mlx4_start_sense(mdev);
 	mutex_unlock(&priv->port_mutex);
+err_out:
+	mutex_unlock(&set_port_type_mutex);
+
 	return err ? err : count;
 }
 
-- 
1.7.1

^ permalink raw reply related

* [PATCH V1 net-next 5/5] net/mlx4_core: Add retrieval of CONFIG_DEV parameters
From: Or Gerlitz @ 2014-11-02 14:26 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Matan Barak, Amir Vadai, Saeed Mahameed, Shani Michaeli,
	Ido Shamay, Or Gerlitz
In-Reply-To: <1414938377-421-1-git-send-email-ogerlitz@mellanox.com>

From: Matan Barak <matanb@mellanox.com>

Add code to issue CONFIG_DEV "get" firmware command.

This command is used in order to obtain certain parameters used for
supporting various RX checksumming options and vxlan UDP port.

The GET operation is allowed for VFs too.

Signed-off-by: Matan Barak <matanb@mellanox.com>
Signed-off-by: Shani Michaeli <shanim@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx4/cmd.c           |    4 +-
 drivers/net/ethernet/mellanox/mlx4/fw.c            |   88 +++++++++++++++++++-
 drivers/net/ethernet/mellanox/mlx4/mlx4.h          |    5 +
 .../net/ethernet/mellanox/mlx4/resource_tracker.c  |   17 ++++
 include/linux/mlx4/cmd.h                           |   29 +++++++
 include/linux/mlx4/device.h                        |    3 +-
 6 files changed, 139 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c
index 1312ccf..3c05e58 100644
--- a/drivers/net/ethernet/mellanox/mlx4/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c
@@ -990,11 +990,11 @@ static struct mlx4_cmd_info cmd_info[] = {
 	{
 		.opcode = MLX4_CMD_CONFIG_DEV,
 		.has_inbox = false,
-		.has_outbox = false,
+		.has_outbox = true,
 		.out_is_imm = false,
 		.encode_slave_id = false,
 		.verify = NULL,
-		.wrapper = mlx4_CMD_EPERM_wrapper
+		.wrapper = mlx4_CONFIG_DEV_wrapper
 	},
 	{
 		.opcode = MLX4_CMD_ALLOC_RES,
diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c
index e7639e3..d6dba77 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.c
@@ -141,7 +141,8 @@ static void dump_dev_cap_flags2(struct mlx4_dev *dev, u64 flags)
 		[12] = "Large cache line (>64B) CQE stride support",
 		[13] = "Large cache line (>64B) EQE stride support",
 		[14] = "Ethernet protocol control support",
-		[15] = "Ethernet Backplane autoneg support"
+		[15] = "Ethernet Backplane autoneg support",
+		[16] = "CONFIG DEV support"
 	};
 	int i;
 
@@ -574,6 +575,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 #define QUERY_DEV_CAP_MTT_ENTRY_SZ_OFFSET	0x90
 #define QUERY_DEV_CAP_D_MPT_ENTRY_SZ_OFFSET	0x92
 #define QUERY_DEV_CAP_BMME_FLAGS_OFFSET		0x94
+#define QUERY_DEV_CAP_CONFIG_DEV_OFFSET		0x94
 #define QUERY_DEV_CAP_RSVD_LKEY_OFFSET		0x98
 #define QUERY_DEV_CAP_MAX_ICM_SZ_OFFSET		0xa0
 #define QUERY_DEV_CAP_ETH_BACKPL_OFFSET		0x9c
@@ -749,6 +751,9 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_EQE_STRIDE;
 	MLX4_GET(dev_cap->bmme_flags, outbox,
 		 QUERY_DEV_CAP_BMME_FLAGS_OFFSET);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_CONFIG_DEV_OFFSET);
+	if (field & 0x20)
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_CONFIG_DEV;
 	MLX4_GET(dev_cap->reserved_lkey, outbox,
 		 QUERY_DEV_CAP_RSVD_LKEY_OFFSET);
 	MLX4_GET(field32, outbox, QUERY_DEV_CAP_ETH_BACKPL_OFFSET);
@@ -1849,14 +1854,18 @@ int mlx4_CLOSE_HCA(struct mlx4_dev *dev, int panic)
 
 struct mlx4_config_dev {
 	__be32	update_flags;
-	__be32	rsdv1[3];
+	__be32	rsvd1[3];
 	__be16	vxlan_udp_dport;
 	__be16	rsvd2;
+	__be32	rsvd3[27];
+	__be16	rsvd4;
+	u8	rsvd5;
+	u8	rx_checksum_val;
 };
 
 #define MLX4_VXLAN_UDP_DPORT (1 << 0)
 
-static int mlx4_CONFIG_DEV(struct mlx4_dev *dev, struct mlx4_config_dev *config_dev)
+static int mlx4_CONFIG_DEV_set(struct mlx4_dev *dev, struct mlx4_config_dev *config_dev)
 {
 	int err;
 	struct mlx4_cmd_mailbox *mailbox;
@@ -1874,6 +1883,77 @@ static int mlx4_CONFIG_DEV(struct mlx4_dev *dev, struct mlx4_config_dev *config_
 	return err;
 }
 
+static int mlx4_CONFIG_DEV_get(struct mlx4_dev *dev, struct mlx4_config_dev *config_dev)
+{
+	int err;
+	struct mlx4_cmd_mailbox *mailbox;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 1, MLX4_CMD_CONFIG_DEV,
+			   MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
+
+	if (!err)
+		memcpy(config_dev, mailbox->buf, sizeof(*config_dev));
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+/* Conversion between the HW values and the actual functionality.
+ * The value represented by the array index,
+ * and the functionality determined by the flags.
+ */
+static const u8 config_dev_csum_flags[] = {
+	[0] =	0,
+	[1] =	MLX4_RX_CSUM_MODE_VAL_NON_TCP_UDP,
+	[2] =	MLX4_RX_CSUM_MODE_VAL_NON_TCP_UDP	|
+		MLX4_RX_CSUM_MODE_L4,
+	[3] =	MLX4_RX_CSUM_MODE_L4			|
+		MLX4_RX_CSUM_MODE_IP_OK_IP_NON_TCP_UDP	|
+		MLX4_RX_CSUM_MODE_MULTI_VLAN
+};
+
+int mlx4_config_dev_retrieval(struct mlx4_dev *dev,
+			      struct mlx4_config_dev_params *params)
+{
+	struct mlx4_config_dev config_dev;
+	int err;
+	u8 csum_mask;
+
+#define CONFIG_DEV_RX_CSUM_MODE_MASK			0x7
+#define CONFIG_DEV_RX_CSUM_MODE_PORT1_BIT_OFFSET	0
+#define CONFIG_DEV_RX_CSUM_MODE_PORT2_BIT_OFFSET	4
+
+	if (!(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_CONFIG_DEV))
+		return -ENOTSUPP;
+
+	err = mlx4_CONFIG_DEV_get(dev, &config_dev);
+	if (err)
+		return err;
+
+	csum_mask = (config_dev.rx_checksum_val >> CONFIG_DEV_RX_CSUM_MODE_PORT1_BIT_OFFSET) &
+			CONFIG_DEV_RX_CSUM_MODE_MASK;
+
+	if (csum_mask >= sizeof(config_dev_csum_flags)/sizeof(config_dev_csum_flags[0]))
+		return -EINVAL;
+	params->rx_csum_flags_port_1 = config_dev_csum_flags[csum_mask];
+
+	csum_mask = (config_dev.rx_checksum_val >> CONFIG_DEV_RX_CSUM_MODE_PORT2_BIT_OFFSET) &
+			CONFIG_DEV_RX_CSUM_MODE_MASK;
+
+	if (csum_mask >= sizeof(config_dev_csum_flags)/sizeof(config_dev_csum_flags[0]))
+		return -EINVAL;
+	params->rx_csum_flags_port_2 = config_dev_csum_flags[csum_mask];
+
+	params->vxlan_udp_dport = be16_to_cpu(config_dev.vxlan_udp_dport);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_config_dev_retrieval);
+
 int mlx4_config_vxlan_port(struct mlx4_dev *dev, __be16 udp_port)
 {
 	struct mlx4_config_dev config_dev;
@@ -1882,7 +1962,7 @@ int mlx4_config_vxlan_port(struct mlx4_dev *dev, __be16 udp_port)
 	config_dev.update_flags    = cpu_to_be32(MLX4_VXLAN_UDP_DPORT);
 	config_dev.vxlan_udp_dport = udp_port;
 
-	return mlx4_CONFIG_DEV(dev, &config_dev);
+	return mlx4_CONFIG_DEV_set(dev, &config_dev);
 }
 EXPORT_SYMBOL_GPL(mlx4_config_vxlan_port);
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
index 254ec7b..f8fc7bd 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
@@ -947,6 +947,11 @@ int mlx4_SW2HW_EQ_wrapper(struct mlx4_dev *dev, int slave,
 			  struct mlx4_cmd_mailbox *inbox,
 			  struct mlx4_cmd_mailbox *outbox,
 			  struct mlx4_cmd_info *cmd);
+int mlx4_CONFIG_DEV_wrapper(struct mlx4_dev *dev, int slave,
+			    struct mlx4_vhcr *vhcr,
+			    struct mlx4_cmd_mailbox *inbox,
+			    struct mlx4_cmd_mailbox *outbox,
+			    struct mlx4_cmd_info *cmd);
 int mlx4_DMA_wrapper(struct mlx4_dev *dev, int slave,
 		     struct mlx4_vhcr *vhcr,
 		     struct mlx4_cmd_mailbox *inbox,
diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
index 5d2498d..d718ca0 100644
--- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
+++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
@@ -2872,6 +2872,23 @@ out_add:
 	return err;
 }
 
+int mlx4_CONFIG_DEV_wrapper(struct mlx4_dev *dev, int slave,
+			    struct mlx4_vhcr *vhcr,
+			    struct mlx4_cmd_mailbox *inbox,
+			    struct mlx4_cmd_mailbox *outbox,
+			    struct mlx4_cmd_info *cmd)
+{
+	int err;
+	u8 get = vhcr->op_modifier;
+
+	if (get != 1)
+		return -EPERM;
+
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+
+	return err;
+}
+
 static int get_containing_mtt(struct mlx4_dev *dev, int slave, int start,
 			      int len, struct res_mtt **res)
 {
diff --git a/include/linux/mlx4/cmd.h b/include/linux/mlx4/cmd.h
index ff5f5de..64d2594 100644
--- a/include/linux/mlx4/cmd.h
+++ b/include/linux/mlx4/cmd.h
@@ -199,6 +199,33 @@ enum {
 	MLX4_CMD_NATIVE
 };
 
+/*
+ * MLX4_RX_CSUM_MODE_VAL_NON_TCP_UDP -
+ * Receive checksum value is reported in CQE also for non TCP/UDP packets.
+ *
+ * MLX4_RX_CSUM_MODE_L4 -
+ * L4_CSUM bit in CQE, which indicates whether or not L4 checksum
+ * was validated correctly, is supported.
+ *
+ * MLX4_RX_CSUM_MODE_IP_OK_IP_NON_TCP_UDP -
+ * IP_OK CQE's field is supported also for non TCP/UDP IP packets.
+ *
+ * MLX4_RX_CSUM_MODE_MULTI_VLAN -
+ * Receive Checksum offload is supported for packets with more than 2 vlan headers.
+ */
+enum mlx4_rx_csum_mode {
+	MLX4_RX_CSUM_MODE_VAL_NON_TCP_UDP		= 1UL << 0,
+	MLX4_RX_CSUM_MODE_L4				= 1UL << 1,
+	MLX4_RX_CSUM_MODE_IP_OK_IP_NON_TCP_UDP		= 1UL << 2,
+	MLX4_RX_CSUM_MODE_MULTI_VLAN			= 1UL << 3
+};
+
+struct mlx4_config_dev_params {
+	u16	vxlan_udp_dport;
+	u8	rx_csum_flags_port_1;
+	u8	rx_csum_flags_port_2;
+};
+
 struct mlx4_dev;
 
 struct mlx4_cmd_mailbox {
@@ -250,6 +277,8 @@ int mlx4_set_vf_vlan(struct mlx4_dev *dev, int port, int vf, u16 vlan, u8 qos);
 int mlx4_set_vf_spoofchk(struct mlx4_dev *dev, int port, int vf, bool setting);
 int mlx4_get_vf_config(struct mlx4_dev *dev, int port, int vf, struct ifla_vf_info *ivf);
 int mlx4_set_vf_link_state(struct mlx4_dev *dev, int port, int vf, int link_state);
+int mlx4_config_dev_retrieval(struct mlx4_dev *dev,
+			      struct mlx4_config_dev_params *params);
 /*
  * mlx4_get_slave_default_vlan -
  * return true if VST ( default vlan)
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index e4c136e..5cc5eac 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -188,7 +188,8 @@ enum {
 	MLX4_DEV_CAP_FLAG2_CQE_STRIDE		= 1LL <<  12,
 	MLX4_DEV_CAP_FLAG2_EQE_STRIDE		= 1LL <<  13,
 	MLX4_DEV_CAP_FLAG2_ETH_PROT_CTRL        = 1LL <<  14,
-	MLX4_DEV_CAP_FLAG2_ETH_BACKPL_AN_REP	= 1LL <<  15
+	MLX4_DEV_CAP_FLAG2_ETH_BACKPL_AN_REP	= 1LL <<  15,
+	MLX4_DEV_CAP_FLAG2_CONFIG_DEV		= 1LL <<  16
 };
 
 enum {
-- 
1.7.1

^ permalink raw reply related

* [PATCH V1 net-next 4/5] net/mlx4_en: Add __GFP_COLD gfp flags in alloc_pages
From: Or Gerlitz @ 2014-11-02 14:26 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Matan Barak, Amir Vadai, Saeed Mahameed, Shani Michaeli,
	Ido Shamay
In-Reply-To: <1414938377-421-1-git-send-email-ogerlitz@mellanox.com>

From: Ido Shamay <idos@mellanox.com>

Needed in order to get cache cold pages (L3 flushed) for HW scatter.

Otherwise memory may flush those entries when the packet comes from
PCI, causing back pressure resulting in BW decrease.

Signed-off-by: Ido Shamay <idos@mellanox.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx4/en_rx.c |    7 ++++---
 1 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 4cb716f..317abc9 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -156,7 +156,7 @@ static int mlx4_en_init_allocator(struct mlx4_en_priv *priv,
 		const struct mlx4_en_frag_info *frag_info = &priv->frag_info[i];
 
 		if (mlx4_alloc_pages(priv, &ring->page_alloc[i],
-				     frag_info, GFP_KERNEL))
+				     frag_info, GFP_KERNEL | __GFP_COLD))
 			goto out;
 	}
 	return 0;
@@ -268,7 +268,7 @@ static int mlx4_en_fill_rx_buffers(struct mlx4_en_priv *priv)
 
 			if (mlx4_en_prepare_rx_desc(priv, ring,
 						    ring->actual_size,
-						    GFP_KERNEL)) {
+						    GFP_KERNEL | __GFP_COLD)) {
 				if (ring->actual_size < MLX4_EN_MIN_RX_SIZE) {
 					en_err(priv, "Failed to allocate enough rx buffers\n");
 					return -ENOMEM;
@@ -635,7 +635,8 @@ static void mlx4_en_refill_rx_buffers(struct mlx4_en_priv *priv,
 	int index = ring->prod & ring->size_mask;
 
 	while ((u32) (ring->prod - ring->cons) < ring->actual_size) {
-		if (mlx4_en_prepare_rx_desc(priv, ring, index, GFP_ATOMIC))
+		if (mlx4_en_prepare_rx_desc(priv, ring, index,
+					    GFP_ATOMIC | __GFP_COLD))
 			break;
 		ring->prod++;
 		index = ring->prod & ring->size_mask;
-- 
1.7.1

^ permalink raw reply related

* [PATCH V1 net-next 3/5] net/mlx4_en: Remove RX buffers alignment to IP_ALIGN
From: Or Gerlitz @ 2014-11-02 14:26 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Matan Barak, Amir Vadai, Saeed Mahameed, Shani Michaeli,
	Ido Shamay
In-Reply-To: <1414938377-421-1-git-send-email-ogerlitz@mellanox.com>

From: Ido Shamay <idos@mellanox.com>

When IP_ALIGN has a non zero value, hardware will write to a non aligned
address. The only reader from this address is when copying the header
from the first frag into the linear buffer (further access to the IP
address will be from the linear buffer, in which the headers are
aligned). Since the penalty of non align access by the hardware is
greater than the software memcpy, changing the frag_align to always be 0.

Signed-off-by: Ido Shamay <idos@mellanox.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx4/en_rx.c   |   16 ++++------------
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h |    1 -
 2 files changed, 4 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index c8e75da..4cb716f 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -74,7 +74,7 @@ static int mlx4_alloc_pages(struct mlx4_en_priv *priv,
 	page_alloc->page_size = PAGE_SIZE << order;
 	page_alloc->page = page;
 	page_alloc->dma = dma;
-	page_alloc->page_offset = frag_info->frag_align;
+	page_alloc->page_offset = 0;
 	/* Not doing get_page() for each frag is a big win
 	 * on asymetric workloads. Note we can not use atomic_set().
 	 */
@@ -945,15 +945,8 @@ void mlx4_en_calc_rx_buf(struct net_device *dev)
 			(eff_mtu > buf_size + frag_sizes[i]) ?
 				frag_sizes[i] : eff_mtu - buf_size;
 		priv->frag_info[i].frag_prefix_size = buf_size;
-		if (!i)	{
-			priv->frag_info[i].frag_align = NET_IP_ALIGN;
-			priv->frag_info[i].frag_stride =
-				ALIGN(frag_sizes[i] + NET_IP_ALIGN, SMP_CACHE_BYTES);
-		} else {
-			priv->frag_info[i].frag_align = 0;
-			priv->frag_info[i].frag_stride =
-				ALIGN(frag_sizes[i], SMP_CACHE_BYTES);
-		}
+		priv->frag_info[i].frag_stride = ALIGN(frag_sizes[i],
+						       SMP_CACHE_BYTES);
 		buf_size += priv->frag_info[i].frag_size;
 		i++;
 	}
@@ -966,11 +959,10 @@ void mlx4_en_calc_rx_buf(struct net_device *dev)
 	       eff_mtu, priv->num_frags);
 	for (i = 0; i < priv->num_frags; i++) {
 		en_err(priv,
-		       "  frag:%d - size:%d prefix:%d align:%d stride:%d\n",
+		       "  frag:%d - size:%d prefix:%d stride:%d\n",
 		       i,
 		       priv->frag_info[i].frag_size,
 		       priv->frag_info[i].frag_prefix_size,
-		       priv->frag_info[i].frag_align,
 		       priv->frag_info[i].frag_stride);
 	}
 }
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
index 6beb4d3..ef83d12 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -481,7 +481,6 @@ struct mlx4_en_frag_info {
 	u16 frag_size;
 	u16 frag_prefix_size;
 	u16 frag_stride;
-	u16 frag_align;
 };
 
 #ifdef CONFIG_MLX4_EN_DCB
-- 
1.7.1

^ permalink raw reply related

* [PATCH V1 net-next 1/5] net/mlx4_core: Prevent VF from changing port configuration
From: Or Gerlitz @ 2014-11-02 14:26 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Matan Barak, Amir Vadai, Saeed Mahameed, Shani Michaeli,
	Ido Shamay
In-Reply-To: <1414938377-421-1-git-send-email-ogerlitz@mellanox.com>

From: Saeed Mahameed <saeedm@mellanox.com>

Added a wrapper to the ACCESS_REG command for handling guest HW
register access, preventing write operations but allowing reads.

This will prevent SRIOV guests from changing port PTYS configuration,
such as speed/advertised link modes.

Fixes: adbc7ac5c15e ('net/mlx4_core: Introduce ACCESS_REG CMD [...]')
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx4/cmd.c  |    2 +-
 drivers/net/ethernet/mellanox/mlx4/fw.c   |   30 ++++++++++++++++++++++++++++-
 drivers/net/ethernet/mellanox/mlx4/mlx4.h |    5 ++++
 3 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c
index 916459e..1312ccf 100644
--- a/drivers/net/ethernet/mellanox/mlx4/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c
@@ -1345,7 +1345,7 @@ static struct mlx4_cmd_info cmd_info[] = {
 		.out_is_imm = false,
 		.encode_slave_id = false,
 		.verify = NULL,
-		.wrapper = NULL,
+		.wrapper = mlx4_ACCESS_REG_wrapper,
 	},
 	/* Native multicast commands are not available for guests */
 	{
diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c
index 72289ef..e7639e3 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.c
@@ -2220,7 +2220,7 @@ static int mlx4_ACCESS_REG(struct mlx4_dev *dev, u16 reg_id,
 	memcpy(inbuf->reg_data, reg_data, reg_len);
 	err = mlx4_cmd_box(dev, inbox->dma, outbox->dma, 0, 0,
 			   MLX4_CMD_ACCESS_REG, MLX4_CMD_TIME_CLASS_C,
-			   MLX4_CMD_NATIVE);
+			   MLX4_CMD_WRAPPED);
 	if (err)
 		goto out;
 
@@ -2263,3 +2263,31 @@ int mlx4_ACCESS_PTYS_REG(struct mlx4_dev *dev,
 			       method, sizeof(*ptys_reg), ptys_reg);
 }
 EXPORT_SYMBOL_GPL(mlx4_ACCESS_PTYS_REG);
+
+int mlx4_ACCESS_REG_wrapper(struct mlx4_dev *dev, int slave,
+			    struct mlx4_vhcr *vhcr,
+			    struct mlx4_cmd_mailbox *inbox,
+			    struct mlx4_cmd_mailbox *outbox,
+			    struct mlx4_cmd_info *cmd)
+{
+	struct mlx4_access_reg *inbuf = inbox->buf;
+	u8 method = inbuf->method & MLX4_ACCESS_REG_METHOD_MASK;
+	u16 reg_id = be16_to_cpu(inbuf->reg_id);
+
+	if (slave != mlx4_master_func_num(dev) &&
+	    method == MLX4_ACCESS_REG_WRITE)
+		return -EPERM;
+
+	if (reg_id == MLX4_REG_ID_PTYS) {
+		struct mlx4_ptys_reg *ptys_reg =
+			(struct mlx4_ptys_reg *)inbuf->reg_data;
+
+		ptys_reg->local_port =
+			mlx4_slave_convert_port(dev, slave,
+						ptys_reg->local_port);
+	}
+
+	return mlx4_cmd_box(dev, inbox->dma, outbox->dma, vhcr->in_modifier,
+			    0, MLX4_CMD_ACCESS_REG, MLX4_CMD_TIME_CLASS_C,
+			    MLX4_CMD_NATIVE);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
index de10dbb..254ec7b 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
@@ -1273,6 +1273,11 @@ int mlx4_QP_FLOW_STEERING_DETACH_wrapper(struct mlx4_dev *dev, int slave,
 					 struct mlx4_cmd_mailbox *inbox,
 					 struct mlx4_cmd_mailbox *outbox,
 					 struct mlx4_cmd_info *cmd);
+int mlx4_ACCESS_REG_wrapper(struct mlx4_dev *dev, int slave,
+			    struct mlx4_vhcr *vhcr,
+			    struct mlx4_cmd_mailbox *inbox,
+			    struct mlx4_cmd_mailbox *outbox,
+			    struct mlx4_cmd_info *cmd);
 
 int mlx4_get_mgm_entry_size(struct mlx4_dev *dev);
 int mlx4_get_qp_per_mgm(struct mlx4_dev *dev);
-- 
1.7.1

^ permalink raw reply related

* Re: [PATCH net-next 5/8] net/mlx4_en: Remove redundant code from RX/GRO path
From: Eric Dumazet @ 2014-11-02 14:28 UTC (permalink / raw)
  To: Or Gerlitz
  Cc: David S. Miller, Linux Netdev List, Matan Barak, Amir Vadai,
	Saeed Mahameed, Shani Michaeli, Ido Shamay
In-Reply-To: <54563B12.1040905@mellanox.com>

On Sun, 2014-11-02 at 16:09 +0200, Or Gerlitz wrote:

> Hi Eric,
> 
> For the time being, I'll drop this change, and the following ones which
> depend on it, from this series. So the earlier patches of the series can be
> picked up, and we can investigate in parallel the various options w.r.t. GRO here.

Thanks Or !

I posted first patch to allow implementing this better GRO strategy
(https://patchwork.ozlabs.org/patch/405892/ )

^ permalink raw reply

* IEEE 802.15.4 - Realization interframe spacing time after each ndo_start_xmit
From: Alexander Aring @ 2014-11-02 14:33 UTC (permalink / raw)
  To: netdev; +Cc: linux-wpan

Hi,

the IEEE 802.15.4 standard describes an interframe spacing time.

This spacing time means that after each transmit we need to wait
some microseconds before doing the next transmit.

The current workaround is a udelay in driver layer for the at86rf230
driver. [0]

This is a terrible solution and I need a better one. If I don't
wait for this time, I get fragmentation issues at the 6LoWPAN layer.

Which interframe spacing time applies depends on the payload. If the payload is
below 18 bytes we need to wait the "sifs - short interframe spacing time".
If the payload is above or equal to 18 bytes we need to wait the "lifs - long
interframe spacing time".

Also, some transceivers handle the interframe spacing time at the transceiver
level. For example, the at86rf230 handles the interframe spacing time on its own
when the max_frame_retries parameter is above 1. (Then automatic retransmission
is activated.)

I need a solution which I can turn on/off at runtime while running
"ieee802154_xmit_complete". The function "ieee802154_xmit_complete" will
consume the skb and wake the netdev queue again.



Possible better solution would be:

A better solution would be to take a timestamp after each transmit
completes and wait, "if necessary", the calculated lifs/sifs delta time
inside of "ndo_start_xmit" - that is, before doing the next transmit. If we see
in "ndo_start_xmit" that we need more time because the next transmit falls
inside the lifs/sifs time of the last transmit, we wait for the delta between
the last completed transmit timestamp and now, in order to hold the lifs/sifs
timing constraints. But I don't feel comfortable doing a "udelay" inside of
ndo_start_xmit.



I would be grateful for any suggestion on how we can deal with such an
interframe spacing time. Maybe there already exists some other L2 layer
which has a solution for something like that.

- Alex

[0] http://git.kernel.org/cgit/linux/kernel/git/bluetooth/bluetooth-next.git/tree/drivers/net/ieee802154/at86rf230.c?id=fe58d016e396fc685364b5a1743faf83c1fb8103#n722

^ permalink raw reply

* Re: [PATCH] bridge: fix netfilter/NF_BR_LOCAL_OUT for own, locally generated queries
From: Linus Lüssing @ 2014-11-02 15:37 UTC (permalink / raw)
  To: netdev; +Cc: bridge, Stephen Hemminger, David S. Miller, Herbert Xu,
	linux-kernel
In-Reply-To: <1411342364-4791-1-git-send-email-linus.luessing@web.de>

On Mon, Sep 22, 2014 at 01:32:44AM +0200, Linus Lüssing wrote:
> Ebtables on the OUTPUT chain (NF_BR_LOCAL_OUT) would not work as expected
> for both locally generated IGMP and MLD queries. The IP header specific
> filter options are off by 14 Bytes for netfilter (actual output on
> interfaces is fine).
> 
> NF_HOOK() expects the skb->data to point to the IP header, not the
> ethernet one (while dev_queue_xmit() does not). Luckily there is an
> br_dev_queue_push_xmit() helper function already - let's just use that.

bump

^ permalink raw reply

* Arndale ethernet regression
From: Charles Keepax @ 2014-11-02 16:12 UTC (permalink / raw)
  To: m.stam; +Cc: davem, linux-usb, netdev, linux-kernel

Hi Guys,

I have been having an issue with the ethernet on Arndale and it
bisects down to this commit:

commit 3cc81d85ee01e5a0b7ea2f4190e2ed1165f53c31
Author: Michel Stam <m.stam@fugro.nl>
asix: Don't reset PHY on if_up for ASIX 88772

I am afraid I am not overly familiar with the USB or networking
subsystems. But are we sure this patch is good? It seems to set
the reset callback to the link_reset function which feels a
little odd from my layman perspective. It doesn't look like there
could be any config settings or such that I am missing relating
to this patch.

Thanks,
Charles

^ permalink raw reply

* RE: [PATCH net-next v2 2/3] r8152: clear the flag of SCHEDULE_TASKLET in tasklet
From: Hayes Wang @ 2014-11-02 17:57 UTC (permalink / raw)
  To: David Miller
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, nic_swsd,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	linux-usb-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
In-Reply-To: <20141031.161520.3547230591227504.davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>

 David Miller [davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org]

> This is racey.

> If another thread of control sets the bit between the test and the
> clear, you will lose an event.

It is fine. The flag is used to schedule a tasklet, so if the tasklet has
started running, all the other plans for scheduling a tasklet can
be cleared.

Besides, the flag is only set when a transmission occurs and the
device is in autosuspend mode. Then the workqueue could wake
up the device and schedule the tasklet to make sure the tasklet
wouldn't be run when the device is in suspend mode. Therefore,
if the tasklet is running, it means something happens and the
device is waked up. And the queue for scheduling the tasklet is
unnecessary. We don't need the tasklet runs again after current
one except that the relative tx/rx flows schedule it.

> It really never makes sense to work with atomic bitops in a non-atomic
> test-and-whatever manner like this, it's always a red flag and
> indicates you're doing something very wrong.

I use atomic because I don't want the different threads which
set the different flags to change the other bits which they
are not interested in.

Best Regards,
Hayes
--
To unsubscribe from this list: send the line "unsubscribe linux-usb" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: Connection with Smart Z hangs
From: Linus Torvalds @ 2014-11-02 18:20 UTC (permalink / raw)
  To: Martin Lang, David Miller; +Cc: devel, Network Development
In-Reply-To: <54566950.5080206@gmail.com>

On Sun, Nov 2, 2014 at 9:26 AM, Martin Lang <mlg.hessigheim@gmail.com> wrote:
>
> I debugged a little bit and got the impression that the problem is with the
> IRDA driver and not libdivecomputer.

Looks that way.

> [  767.319321] BUG: unable to handle kernel NULL pointer dereference at (null)
> [  767.320541] CPU: 0 PID: 3093 Comm: smart Not tainted 3.13.0-37-generic #64-Ubuntu

Ugh. 3.13. I guess you can't get anything newer through Ubuntu without
going through some big upgrade of everything.

Not that I see any likely fixes since 3.13. Irda is basically dead
technology, the only user we ever see any more are a very few dive
computers. So nobody maintains it any more.

In fact, we had this very bug reported last *year*, and I debugged it,
and sent my

> [  767.322008]  [<ffffffffa02c75c6>] ? irda_connect+0x156/0x480 [irda]
> [  767.322540] Code:  Bad RIP value.

Ok, it's a call to a NULL pointer, which is bad. The only such call is the

    sk->sk_prot->disconnect()

call, and the "disconnect" function for irda is NULL, always has been
and probably always will be. And nobody has ever fixed this. There was
a thread in late december last year (and early January this year about
this particular oops and another one) where I reported this, and
people agreed that it was all bogus. There was a separate locking
issue too, which wasn't as simple.

David, I'm just going to remove that whole bogus disconnect call. It
won't make things *work* for Martin (because this is all in the
"connect failed" path), but that call as-is is just wrong, wrong,
wrong. And apparently nobody cares about irda any more.

If anybody is at all interested in helping maintain irda code, holler
to David and to the netdev mailing list. The position is up for grabs.

                          Linus

^ permalink raw reply

* [PATCH net] uapi: add missing network related headers to kbuild
From: Stephen Hemminger @ 2014-11-02 19:31 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

The makefile for sanitizing kernel headers uses the kbuild file
to determine which files to do. Several networking related headers
were missing. Without these headers iproute2 build would break.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>


--- a/include/uapi/linux/Kbuild	2014-11-02 11:18:59.918650081 -0800
+++ b/include/uapi/linux/Kbuild	2014-11-02 11:24:32.304658688 -0800
@@ -125,6 +125,7 @@ header-y += filter.h
 header-y += firewire-cdev.h
 header-y += firewire-constants.h
 header-y += flat.h
+header-y += fou.h
 header-y += fs.h
 header-y += fsl_hypervisor.h
 header-y += fuse.h
@@ -141,6 +142,7 @@ header-y += hid.h
 header-y += hiddev.h
 header-y += hidraw.h
 header-y += hpet.h
+header-y += hsr_netlink.h
 header-y += hyperv.h
 header-y += hysdn_if.h
 header-y += i2c-dev.h
@@ -251,6 +253,7 @@ header-y += mii.h
 header-y += minix_fs.h
 header-y += mman.h
 header-y += mmtimer.h
+header-y += mpls.h
 header-y += mqueue.h
 header-y += mroute.h
 header-y += mroute6.h
@@ -424,6 +427,7 @@ header-y += virtio_net.h
 header-y += virtio_pci.h
 header-y += virtio_ring.h
 header-y += virtio_rng.h
+header-y += vm_sockets.h
 header-y += vt.h
 header-y += wait.h
 header-y += wanrouter.h

^ permalink raw reply

* Re: [PATCH iproute2 2/5] ip fou: Support to configure foo-over-udp RX
From: Stephen Hemminger @ 2014-11-02 19:36 UTC (permalink / raw)
  To: Tom Herbert; +Cc: davem, netdev
In-Reply-To: <1412351718-22921-3-git-send-email-therbert@google.com>

On Fri,  3 Oct 2014 08:55:15 -0700
Tom Herbert <therbert@google.com> wrote:

> Added 'ip fou...' commands to enable/disable UDP ports for doing
> foo-over-udp and Generic UDP Encapsulation variant. Arguments are port
> number to bind to and IP protocol to map to port (for direct FOU).
> 
> Examples:
> 
> ip fou add port 7777 gue
> ip fou add port 8888 ipproto 4
> 
> The first command creates a GUE port, the second creates a direct FOU
> port for IPIP (receive payload is assumed to be an IPv4 packet).
> 
> Signed-off-by: Tom Herbert <therbert@google.com>

Accepted.
Also discovered that fou.h was missing from kernel Kbuild uapi.

^ permalink raw reply

* Re: [PATCH iproute2 2/5] ip fou: Support to configure foo-over-udp RX
From: Stephen Hemminger @ 2014-11-02 19:45 UTC (permalink / raw)
  To: Tom Herbert; +Cc: davem, netdev
In-Reply-To: <20141102113610.785543ff@urahara>

On Sun, 2 Nov 2014 11:36:10 -0800
Stephen Hemminger <stephen@networkplumber.org> wrote:

> On Fri,  3 Oct 2014 08:55:15 -0700
> Tom Herbert <therbert@google.com> wrote:
> 
> > Added 'ip fou...' commands to enable/disable UDP ports for doing
> > foo-over-udp and Generic UDP Encapsulation variant. Arguments are port
> > number to bind to and IP protocol to map to port (for direct FOU).
> > 
> > Examples:
> > 
> > ip fou add port 7777 gue
> > ip fou add port 8888 ipproto 4
> > 
> > The first command creates a GUE port, the second creates a direct FOU
> > port for IPIP (receive payload is assumed to be an IPv4 packet).
> > 
> > Signed-off-by: Tom Herbert <therbert@google.com>
> 
> Accepted.
> Also discovered that fou.h was missing from kernel Kbuild uapi.

I backed out the change since the rest of the patch series has
issues. Please fix and resubmit

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox