Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next RFC 4/8] ipv6: remove offload exception for hopopts
From: Willem de Bruijn @ 2018-09-14 17:59 UTC (permalink / raw)
  To: netdev; +Cc: pabeni, steffen.klassert, davem, Willem de Bruijn
In-Reply-To: <20180914175941.213950-1-willemdebruijn.kernel@gmail.com>

From: Willem de Bruijn <willemb@google.com>

Extension headers in ipv6 are pulled without calling a callback
function. An inet6_offload signals this feature with flag
INET6_PROTO_GSO_EXTHDR.

Add net_offload_has_flag helper to hide implementation details and in
preparation for configurable gro.

Convert NEXTHDR_HOP from a special case branch to a standard
extension header offload.

Signed-off-by: Willem de Bruijn <willemb@google.com>
---
 include/linux/netdevice.h  |  9 +++++++++
 net/ipv6/exthdrs_offload.c | 17 ++++++++++++++---
 net/ipv6/ip6_offload.c     | 36 +++++++++++++-----------------------
 3 files changed, 36 insertions(+), 26 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 0be594f8d1ce..1c97a048506f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3567,6 +3567,15 @@ static inline u8 net_offload_from_type(u16 type)
 	return type & 0xFF;
 }
 
+static inline bool net_offload_has_flag(const struct net_offload __rcu **offs,
+					u16 type, u16 flag)
+{
+	const struct net_offload *off;
+
+	off = offs ? rcu_dereference(offs[net_offload_from_type(type)]) : NULL;
+	return off && off->flags & flag;
+}
+
 static inline const struct net_offload *
 net_gro_receive(const struct net_offload __rcu **offs, u16 type)
 {
diff --git a/net/ipv6/exthdrs_offload.c b/net/ipv6/exthdrs_offload.c
index f5e2ba1c18bf..2230331c6012 100644
--- a/net/ipv6/exthdrs_offload.c
+++ b/net/ipv6/exthdrs_offload.c
@@ -12,11 +12,15 @@
 #include <net/protocol.h>
 #include "ip6_offload.h"
 
-static const struct net_offload rthdr_offload = {
+static struct net_offload hophdr_offload = {
 	.flags		=	INET6_PROTO_GSO_EXTHDR,
 };
 
-static const struct net_offload dstopt_offload = {
+static struct net_offload rthdr_offload = {
+	.flags		=	INET6_PROTO_GSO_EXTHDR,
+};
+
+static struct net_offload dstopt_offload = {
 	.flags		=	INET6_PROTO_GSO_EXTHDR,
 };
 
@@ -24,10 +28,14 @@ int __init ipv6_exthdrs_offload_init(void)
 {
 	int ret;
 
-	ret = inet6_add_offload(&rthdr_offload, IPPROTO_ROUTING);
+	ret = inet6_add_offload(&hophdr_offload, IPPROTO_HOPOPTS);
 	if (ret)
 		goto out;
 
+	ret = inet6_add_offload(&rthdr_offload, IPPROTO_ROUTING);
+	if (ret)
+		goto out_hop;
+
 	ret = inet6_add_offload(&dstopt_offload, IPPROTO_DSTOPTS);
 	if (ret)
 		goto out_rt;
@@ -37,5 +45,8 @@ int __init ipv6_exthdrs_offload_init(void)
 
 out_rt:
 	inet6_del_offload(&rthdr_offload, IPPROTO_ROUTING);
+
+out_hop:
+	inet6_del_offload(&rthdr_offload, IPPROTO_HOPOPTS);
 	goto out;
 }
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index 9d301bef0e23..4854509a2c5d 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -22,21 +22,13 @@
 
 static int ipv6_gso_pull_exthdrs(struct sk_buff *skb, int proto)
 {
-	const struct net_offload *ops = NULL;
-
 	for (;;) {
 		struct ipv6_opt_hdr *opth;
 		int len;
 
-		if (proto != NEXTHDR_HOP) {
-			ops = rcu_dereference(inet6_offloads[proto]);
-
-			if (unlikely(!ops))
-				break;
-
-			if (!(ops->flags & INET6_PROTO_GSO_EXTHDR))
-				break;
-		}
+		if (!net_offload_has_flag(inet6_offloads, proto,
+					  INET6_PROTO_GSO_EXTHDR))
+			break;
 
 		if (unlikely(!pskb_may_pull(skb, 8)))
 			break;
@@ -141,26 +133,24 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
 /* Return the total length of all the extension hdrs, following the same
  * logic in ipv6_gso_pull_exthdrs() when parsing ext-hdrs.
  */
-static int ipv6_exthdrs_len(struct ipv6hdr *iph,
-			    const struct net_offload **opps)
+static int ipv6_exthdrs_len(struct ipv6hdr *iph, u8 *pproto)
 {
 	struct ipv6_opt_hdr *opth = (void *)iph;
 	int len = 0, proto, optlen = sizeof(*iph);
 
 	proto = iph->nexthdr;
 	for (;;) {
-		if (proto != NEXTHDR_HOP) {
-			*opps = rcu_dereference(inet6_offloads[proto]);
-			if (unlikely(!(*opps)))
-				break;
-			if (!((*opps)->flags & INET6_PROTO_GSO_EXTHDR))
-				break;
-		}
+		if (!net_offload_has_flag(inet6_offloads, proto,
+					  INET6_PROTO_GSO_EXTHDR))
+			break;
+
 		opth = (void *)opth + optlen;
 		optlen = ipv6_optlen(opth);
 		len += optlen;
 		proto = opth->nexthdr;
 	}
+
+	*pproto = proto;
 	return len;
 }
 
@@ -296,8 +286,8 @@ static struct sk_buff *ip4ip6_gro_receive(struct list_head *head,
 
 static int ipv6_gro_complete(struct sk_buff *skb, int nhoff)
 {
-	const struct net_offload *ops;
 	struct ipv6hdr *iph = (struct ipv6hdr *)(skb->data + nhoff);
+	u8 proto;
 
 	if (skb->encapsulation) {
 		skb_set_inner_protocol(skb, cpu_to_be16(ETH_P_IPV6));
@@ -306,8 +296,8 @@ static int ipv6_gro_complete(struct sk_buff *skb, int nhoff)
 
 	iph->payload_len = htons(skb->len - nhoff - sizeof(*iph));
 
-	nhoff += sizeof(*iph) + ipv6_exthdrs_len(iph, &ops);
-	return net_gro_complete(inet6_offloads, ops->type, skb, nhoff);
+	nhoff += sizeof(*iph) + ipv6_exthdrs_len(iph, &proto);
+	return net_gro_complete(inet6_offloads, proto, skb, nhoff);
 }
 
 static int sit_gro_complete(struct sk_buff *skb, int nhoff)
-- 
2.19.0.397.gdd90340f6a-goog

^ permalink raw reply related

* [PATCH net-next RFC 5/8] net: deconstify net_offload
From: Willem de Bruijn @ 2018-09-14 17:59 UTC (permalink / raw)
  To: netdev; +Cc: pabeni, steffen.klassert, davem, Willem de Bruijn
In-Reply-To: <20180914175941.213950-1-willemdebruijn.kernel@gmail.com>

From: Willem de Bruijn <willemb@google.com>

With configurable gro, the flags field in net_offloads may be changed.

Remove the const keyword. This is a noop otherwise.

Signed-off-by: Willem de Bruijn <willemb@google.com>
---
 include/linux/netdevice.h | 14 +++++++-------
 include/net/protocol.h    | 12 ++++++------
 net/core/dev.c            |  8 +++-----
 net/ipv4/af_inet.c        |  2 +-
 net/ipv4/esp4_offload.c   |  2 +-
 net/ipv4/fou.c            |  8 ++++----
 net/ipv4/gre_offload.c    |  2 +-
 net/ipv4/protocol.c       | 10 +++++-----
 net/ipv4/tcp_offload.c    |  2 +-
 net/ipv4/udp_offload.c    |  6 +++---
 net/ipv6/esp6_offload.c   |  2 +-
 net/ipv6/ip6_offload.c    |  6 +++---
 net/ipv6/protocol.c       | 10 +++++-----
 net/ipv6/tcpv6_offload.c  |  2 +-
 net/ipv6/udp_offload.c    |  2 +-
 net/sctp/offload.c        |  2 +-
 16 files changed, 44 insertions(+), 46 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 1c97a048506f..b9e671887fc2 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3557,7 +3557,7 @@ void napi_gro_flush(struct napi_struct *napi, bool flush_old);
 struct sk_buff *napi_get_frags(struct napi_struct *napi);
 gro_result_t napi_gro_frags(struct napi_struct *napi);
 
-extern const struct net_offload __rcu *dev_offloads[256];
+extern struct net_offload __rcu *dev_offloads[256];
 
 static inline u8 net_offload_from_type(u16 type)
 {
@@ -3567,19 +3567,19 @@ static inline u8 net_offload_from_type(u16 type)
 	return type & 0xFF;
 }
 
-static inline bool net_offload_has_flag(const struct net_offload __rcu **offs,
+static inline bool net_offload_has_flag(struct net_offload __rcu **offs,
 					u16 type, u16 flag)
 {
-	const struct net_offload *off;
+	struct net_offload *off;
 
 	off = offs ? rcu_dereference(offs[net_offload_from_type(type)]) : NULL;
 	return off && off->flags & flag;
 }
 
 static inline const struct net_offload *
-net_gro_receive(const struct net_offload __rcu **offs, u16 type)
+net_gro_receive(struct net_offload __rcu **offs, u16 type)
 {
-	const struct net_offload *off;
+	struct net_offload *off;
 
 	off = rcu_dereference(offs[net_offload_from_type(type)]);
 	if (off && off->callbacks.gro_receive &&
@@ -3589,10 +3589,10 @@ net_gro_receive(const struct net_offload __rcu **offs, u16 type)
 		return NULL;
 }
 
-static inline int net_gro_complete(const struct net_offload __rcu **offs,
+static inline int net_gro_complete(struct net_offload __rcu **offs,
 				   u16 type, struct sk_buff *skb, int nhoff)
 {
-	const struct net_offload *off;
+	struct net_offload *off;
 	int ret = -ENOENT;
 
 	rcu_read_lock();
diff --git a/include/net/protocol.h b/include/net/protocol.h
index 53a0322ee545..5e2c20b662d1 100644
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -87,8 +87,8 @@ struct inet_protosw {
 #define INET_PROTOSW_ICSK      0x04  /* Is this an inet_connection_sock? */
 
 extern struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS];
-extern const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS];
-extern const struct net_offload __rcu *inet6_offloads[MAX_INET_PROTOS];
+extern struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS];
+extern struct net_offload __rcu *inet6_offloads[MAX_INET_PROTOS];
 
 #if IS_ENABLED(CONFIG_IPV6)
 extern struct inet6_protocol __rcu *inet6_protos[MAX_INET_PROTOS];
@@ -96,8 +96,8 @@ extern struct inet6_protocol __rcu *inet6_protos[MAX_INET_PROTOS];
 
 int inet_add_protocol(const struct net_protocol *prot, unsigned char num);
 int inet_del_protocol(const struct net_protocol *prot, unsigned char num);
-int inet_add_offload(const struct net_offload *prot, unsigned char num);
-int inet_del_offload(const struct net_offload *prot, unsigned char num);
+int inet_add_offload(struct net_offload *prot, unsigned char num);
+int inet_del_offload(struct net_offload *prot, unsigned char num);
 void inet_register_protosw(struct inet_protosw *p);
 void inet_unregister_protosw(struct inet_protosw *p);
 
@@ -107,7 +107,7 @@ int inet6_del_protocol(const struct inet6_protocol *prot, unsigned char num);
 int inet6_register_protosw(struct inet_protosw *p);
 void inet6_unregister_protosw(struct inet_protosw *p);
 #endif
-int inet6_add_offload(const struct net_offload *prot, unsigned char num);
-int inet6_del_offload(const struct net_offload *prot, unsigned char num);
+int inet6_add_offload(struct net_offload *prot, unsigned char num);
+int inet6_del_offload(struct net_offload *prot, unsigned char num);
 
 #endif	/* _PROTOCOL_H */
diff --git a/net/core/dev.c b/net/core/dev.c
index ae5fbd4114d2..20d9552afd38 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -466,7 +466,7 @@ void dev_remove_pack(struct packet_type *pt)
 EXPORT_SYMBOL(dev_remove_pack);
 
 
-const struct net_offload __rcu *dev_offloads[256] __read_mostly;
+struct net_offload __rcu *dev_offloads[256] __read_mostly;
 EXPORT_SYMBOL(dev_offloads);
 
 /**
@@ -483,8 +483,7 @@ EXPORT_SYMBOL(dev_offloads);
  */
 void dev_add_offload(struct packet_offload *po)
 {
-	cmpxchg((const struct net_offload **)
-		&dev_offloads[net_offload_from_type(po->type)],
+	cmpxchg(&dev_offloads[net_offload_from_type(po->type)],
 			NULL, po);
 }
 EXPORT_SYMBOL(dev_add_offload);
@@ -504,8 +503,7 @@ EXPORT_SYMBOL(dev_add_offload);
  */
 static int __dev_remove_offload(struct packet_offload *po)
 {
-	return (cmpxchg((const struct net_offload **)
-			&dev_offloads[net_offload_from_type(po->type)],
+	return (cmpxchg(&dev_offloads[net_offload_from_type(po->type)],
 		       po, NULL) == po) ? 0 : -1;
 }
 
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 28b7c7671789..f3ee6f4dfc0f 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1839,7 +1839,7 @@ static struct packet_offload ip_packet_offload __read_mostly = {
 	},
 };
 
-static const struct net_offload ipip_offload = {
+static struct net_offload ipip_offload = {
 	.callbacks = {
 		.gso_segment	= inet_gso_segment,
 		.gro_receive	= ipip_gro_receive,
diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c
index 58834a10c0be..e6d7a9be9244 100644
--- a/net/ipv4/esp4_offload.c
+++ b/net/ipv4/esp4_offload.c
@@ -240,7 +240,7 @@ static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb,  netdev_features_
 	return 0;
 }
 
-static const struct net_offload esp4_offload = {
+static struct net_offload esp4_offload = {
 	.callbacks = {
 		.gro_receive = esp4_gro_receive,
 		.gso_segment = esp4_gso_segment,
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index 13401cb2e7a4..52e01dcaa417 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -229,7 +229,7 @@ static struct sk_buff *fou_gro_receive(struct sock *sk,
 				       struct sk_buff *skb)
 {
 	u8 proto = fou_from_sock(sk)->protocol;
-	const struct net_offload **offloads;
+	struct net_offload **offloads;
 	const struct net_offload *ops;
 	struct sk_buff *pp = NULL;
 
@@ -262,7 +262,7 @@ static int fou_gro_complete(struct sock *sk, struct sk_buff *skb,
 			    int nhoff)
 {
 	u8 proto = fou_from_sock(sk)->protocol;
-	const struct net_offload **offloads;
+	struct net_offload **offloads;
 	int err;
 
 	offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
@@ -299,7 +299,7 @@ static struct sk_buff *gue_gro_receive(struct sock *sk,
 				       struct list_head *head,
 				       struct sk_buff *skb)
 {
-	const struct net_offload **offloads;
+	struct net_offload **offloads;
 	const struct net_offload *ops;
 	struct sk_buff *pp = NULL;
 	struct sk_buff *p;
@@ -445,7 +445,7 @@ static struct sk_buff *gue_gro_receive(struct sock *sk,
 
 static int gue_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff)
 {
-	const struct net_offload **offloads;
+	struct net_offload **offloads;
 	struct guehdr *guehdr = (struct guehdr *)(skb->data + nhoff);
 	unsigned int guehlen = 0;
 	u8 proto;
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index 4f9237a4bea1..70910650d322 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -252,7 +252,7 @@ static int gre_gro_complete(struct sk_buff *skb, int nhoff)
 	return err;
 }
 
-static const struct net_offload gre_offload = {
+static struct net_offload gre_offload = {
 	.callbacks = {
 		.gso_segment = gre_gso_segment,
 		.gro_receive = gre_gro_receive,
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
index 32a691b7ce2c..66948d77672e 100644
--- a/net/ipv4/protocol.c
+++ b/net/ipv4/protocol.c
@@ -29,7 +29,7 @@
 #include <net/protocol.h>
 
 struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly;
-const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly;
+struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly;
 EXPORT_SYMBOL(inet_offloads);
 
 int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
@@ -45,9 +45,9 @@ int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
 }
 EXPORT_SYMBOL(inet_add_protocol);
 
-int inet_add_offload(const struct net_offload *prot, unsigned char protocol)
+int inet_add_offload(struct net_offload *prot, unsigned char protocol)
 {
-	return !cmpxchg((const struct net_offload **)&inet_offloads[protocol],
+	return !cmpxchg((struct net_offload **)&inet_offloads[protocol],
 			NULL, prot) ? 0 : -1;
 }
 EXPORT_SYMBOL(inet_add_offload);
@@ -65,11 +65,11 @@ int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol)
 }
 EXPORT_SYMBOL(inet_del_protocol);
 
-int inet_del_offload(const struct net_offload *prot, unsigned char protocol)
+int inet_del_offload(struct net_offload *prot, unsigned char protocol)
 {
 	int ret;
 
-	ret = (cmpxchg((const struct net_offload **)&inet_offloads[protocol],
+	ret = (cmpxchg((struct net_offload **)&inet_offloads[protocol],
 		       prot, NULL) == prot) ? 0 : -1;
 
 	synchronize_net();
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 870b0a335061..d670f2d008bc 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -333,7 +333,7 @@ static int tcp4_gro_complete(struct sk_buff *skb, int thoff)
 	return tcp_gro_complete(skb);
 }
 
-static const struct net_offload tcpv4_offload = {
+static struct net_offload tcpv4_offload = {
 	.callbacks = {
 		.gso_segment	=	tcp4_gso_segment,
 		.gro_receive	=	tcp4_gro_receive,
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 0c0522b79b43..4f6aa95a9b12 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -153,8 +153,8 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
 				       bool is_ipv6)
 {
 	__be16 protocol = skb->protocol;
-	const struct net_offload **offloads;
-	const struct net_offload *ops;
+	struct net_offload **offloads;
+	struct net_offload *ops;
 	struct sk_buff *segs = ERR_PTR(-EINVAL);
 	struct sk_buff *(*gso_inner_segment)(struct sk_buff *skb,
 					     netdev_features_t features);
@@ -472,7 +472,7 @@ static int udp4_gro_complete(struct sk_buff *skb, int nhoff)
 	return udp_gro_complete(skb, nhoff, udp4_lib_lookup_skb);
 }
 
-static const struct net_offload udpv4_offload = {
+static struct net_offload udpv4_offload = {
 	.callbacks = {
 		.gso_segment = udp4_ufo_fragment,
 		.gro_receive  =	udp4_gro_receive,
diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c
index 6177e2171171..169dcd5c7135 100644
--- a/net/ipv6/esp6_offload.c
+++ b/net/ipv6/esp6_offload.c
@@ -268,7 +268,7 @@ static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb,  netdev_features
 	return 0;
 }
 
-static const struct net_offload esp6_offload = {
+static struct net_offload esp6_offload = {
 	.callbacks = {
 		.gro_receive = esp6_gro_receive,
 		.gso_segment = esp6_gso_segment,
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index 4854509a2c5d..2d0ea3f453f2 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -330,7 +330,7 @@ static struct packet_offload ipv6_packet_offload __read_mostly = {
 	},
 };
 
-static const struct net_offload sit_offload = {
+static struct net_offload sit_offload = {
 	.callbacks = {
 		.gso_segment	= ipv6_gso_segment,
 		.gro_receive    = sit_ip6ip6_gro_receive,
@@ -338,7 +338,7 @@ static const struct net_offload sit_offload = {
 	},
 };
 
-static const struct net_offload ip4ip6_offload = {
+static struct net_offload ip4ip6_offload = {
 	.callbacks = {
 		.gso_segment	= inet_gso_segment,
 		.gro_receive    = ip4ip6_gro_receive,
@@ -346,7 +346,7 @@ static const struct net_offload ip4ip6_offload = {
 	},
 };
 
-static const struct net_offload ip6ip6_offload = {
+static struct net_offload ip6ip6_offload = {
 	.callbacks = {
 		.gso_segment	= ipv6_gso_segment,
 		.gro_receive    = sit_ip6ip6_gro_receive,
diff --git a/net/ipv6/protocol.c b/net/ipv6/protocol.c
index b5d54d4f995c..06efcfc6d02b 100644
--- a/net/ipv6/protocol.c
+++ b/net/ipv6/protocol.c
@@ -50,21 +50,21 @@ int inet6_del_protocol(const struct inet6_protocol *prot, unsigned char protocol
 EXPORT_SYMBOL(inet6_del_protocol);
 #endif
 
-const struct net_offload __rcu *inet6_offloads[MAX_INET_PROTOS] __read_mostly;
+struct net_offload __rcu *inet6_offloads[MAX_INET_PROTOS] __read_mostly;
 EXPORT_SYMBOL(inet6_offloads);
 
-int inet6_add_offload(const struct net_offload *prot, unsigned char protocol)
+int inet6_add_offload(struct net_offload *prot, unsigned char protocol)
 {
-	return !cmpxchg((const struct net_offload **)&inet6_offloads[protocol],
+	return !cmpxchg((struct net_offload **)&inet6_offloads[protocol],
 			NULL, prot) ? 0 : -1;
 }
 EXPORT_SYMBOL(inet6_add_offload);
 
-int inet6_del_offload(const struct net_offload *prot, unsigned char protocol)
+int inet6_del_offload(struct net_offload *prot, unsigned char protocol)
 {
 	int ret;
 
-	ret = (cmpxchg((const struct net_offload **)&inet6_offloads[protocol],
+	ret = (cmpxchg((struct net_offload **)&inet6_offloads[protocol],
 		       prot, NULL) == prot) ? 0 : -1;
 
 	synchronize_net();
diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c
index e72947c99454..a3c5010e1361 100644
--- a/net/ipv6/tcpv6_offload.c
+++ b/net/ipv6/tcpv6_offload.c
@@ -67,7 +67,7 @@ static struct sk_buff *tcp6_gso_segment(struct sk_buff *skb,
 
 	return tcp_gso_segment(skb, features);
 }
-static const struct net_offload tcpv6_offload = {
+static struct net_offload tcpv6_offload = {
 	.callbacks = {
 		.gso_segment	=	tcp6_gso_segment,
 		.gro_receive	=	tcp6_gro_receive,
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 95dee9ca8d22..2a41da0dd33f 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -158,7 +158,7 @@ static int udp6_gro_complete(struct sk_buff *skb, int nhoff)
 	return udp_gro_complete(skb, nhoff, udp6_lib_lookup_skb);
 }
 
-static const struct net_offload udpv6_offload = {
+static struct net_offload udpv6_offload = {
 	.callbacks = {
 		.gso_segment	=	udp6_ufo_fragment,
 		.gro_receive	=	udp6_gro_receive,
diff --git a/net/sctp/offload.c b/net/sctp/offload.c
index 123e9f2dc226..ad504b83245d 100644
--- a/net/sctp/offload.c
+++ b/net/sctp/offload.c
@@ -90,7 +90,7 @@ static struct sk_buff *sctp_gso_segment(struct sk_buff *skb,
 	return segs;
 }
 
-static const struct net_offload sctp_offload = {
+static struct net_offload sctp_offload = {
 	.callbacks = {
 		.gso_segment = sctp_gso_segment,
 	},
-- 
2.19.0.397.gdd90340f6a-goog

^ permalink raw reply related

* [PATCH net-next RFC 6/8] net: make gro configurable
From: Willem de Bruijn @ 2018-09-14 17:59 UTC (permalink / raw)
  To: netdev; +Cc: pabeni, steffen.klassert, davem, Willem de Bruijn
In-Reply-To: <20180914175941.213950-1-willemdebruijn.kernel@gmail.com>

From: Willem de Bruijn <willemb@google.com>

Add net_offload flag NET_OFF_FLAG_GRO_OFF. If set, a net_offload will
not be used for gro receive processing.

Also add sysctl helper proc_do_net_offload that toggles this flag, and
register sysctls net.{core,ipv4,ipv6}.gro.

Signed-off-by: Willem de Bruijn <willemb@google.com>
---
 drivers/net/vxlan.c        |  8 +++++
 include/linux/netdevice.h  |  7 ++++-
 net/core/dev.c             |  1 +
 net/core/sysctl_net_core.c | 60 ++++++++++++++++++++++++++++++++++++++
 net/ipv4/sysctl_net_ipv4.c |  7 +++++
 net/ipv6/ip6_offload.c     | 10 +++++--
 net/ipv6/sysctl_net_ipv6.c |  8 +++++
 7 files changed, 97 insertions(+), 4 deletions(-)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index e5d236595206..8cb8e02c8ab6 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -572,6 +572,7 @@ static struct sk_buff *vxlan_gro_receive(struct sock *sk,
 					 struct list_head *head,
 					 struct sk_buff *skb)
 {
+	const struct net_offload *ops;
 	struct sk_buff *pp = NULL;
 	struct sk_buff *p;
 	struct vxlanhdr *vh, *vh2;
@@ -606,6 +607,12 @@ static struct sk_buff *vxlan_gro_receive(struct sock *sk,
 			goto out;
 	}
 
+	rcu_read_lock();
+	ops = net_gro_receive(dev_offloads, ETH_P_TEB);
+	rcu_read_unlock();
+	if (!ops)
+		goto out;
+
 	skb_gro_pull(skb, sizeof(struct vxlanhdr)); /* pull vxlan header */
 
 	list_for_each_entry(p, head, list) {
@@ -621,6 +628,7 @@ static struct sk_buff *vxlan_gro_receive(struct sock *sk,
 	}
 
 	pp = call_gro_receive(eth_gro_receive, head, skb);
+
 	flush = 0;
 
 out:
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index b9e671887fc2..93e8c9ade593 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2377,6 +2377,10 @@ struct net_offload {
 
 /* This should be set for any extension header which is compatible with GSO. */
 #define INET6_PROTO_GSO_EXTHDR	0x1
+#define NET_OFF_FLAG_GRO_OFF	0x2
+
+int proc_do_net_offload(struct ctl_table *ctl, int write, void __user *buffer,
+			size_t *lenp, loff_t *ppos);
 
 /* often modified stats are per-CPU, other are shared (netdev->stats) */
 struct pcpu_sw_netstats {
@@ -3583,7 +3587,8 @@ net_gro_receive(struct net_offload __rcu **offs, u16 type)
 
 	off = rcu_dereference(offs[net_offload_from_type(type)]);
 	if (off && off->callbacks.gro_receive &&
-	    (!off->type || off->type == type))
+	    (!off->type || off->type == type) &&
+	    !(off->flags & NET_OFF_FLAG_GRO_OFF))
 		return off;
 	else
 		return NULL;
diff --git a/net/core/dev.c b/net/core/dev.c
index 20d9552afd38..0fd5273bc931 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -154,6 +154,7 @@
 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 
 static DEFINE_SPINLOCK(ptype_lock);
+DEFINE_SPINLOCK(offload_lock);
 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 struct list_head ptype_all __read_mostly;	/* Taps */
 static struct list_head offload_base __read_mostly;
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index b1a2c5e38530..d2d72afdd9eb 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -15,6 +15,7 @@
 #include <linux/vmalloc.h>
 #include <linux/init.h>
 #include <linux/slab.h>
+#include <linux/bitmap.h>
 
 #include <net/ip.h>
 #include <net/sock.h>
@@ -34,6 +35,58 @@ static int net_msg_warn;	/* Unused, but still a sysctl */
 int sysctl_fb_tunnels_only_for_init_net __read_mostly = 0;
 EXPORT_SYMBOL(sysctl_fb_tunnels_only_for_init_net);
 
+extern spinlock_t offload_lock;
+
+#define NET_OFF_TBL_LEN	256
+
+int proc_do_net_offload(struct ctl_table *ctl, int write, void __user *buffer,
+			size_t *lenp, loff_t *ppos)
+{
+	unsigned long bitmap[NET_OFF_TBL_LEN / (sizeof(unsigned long) << 3)];
+	struct ctl_table tbl = { .maxlen = NET_OFF_TBL_LEN, .data = bitmap };
+	unsigned long flag = (unsigned long) ctl->extra2;
+	struct net_offload __rcu **offs = ctl->extra1;
+	struct net_offload *off;
+	int i, ret;
+
+	memset(bitmap, 0, sizeof(bitmap));
+
+	spin_lock(&offload_lock);
+
+	for (i = 0; i < tbl.maxlen; i++) {
+		off = rcu_dereference_protected(offs[i], lockdep_is_held(&offload_lock));
+		if (off && off->flags & flag) {
+			/* flag specific constraints */
+			if (flag == NET_OFF_FLAG_GRO_OFF) {
+				/* gro disable bit: only if can gro */
+				if (!off->callbacks.gro_receive &&
+				    !(off->flags & INET6_PROTO_GSO_EXTHDR))
+					continue;
+			}
+			set_bit(i, bitmap);
+		}
+	}
+
+	ret = proc_do_large_bitmap(&tbl, write, buffer, lenp, ppos);
+
+	if (write && !ret) {
+		for (i = 0; i < tbl.maxlen; i++) {
+			bool isset = test_bit(i, bitmap);
+
+			off = rcu_dereference_protected(offs[i], lockdep_is_held(&offload_lock));
+			if (!isset && (off->flags & flag))
+				off->flags &= ~flag;
+			else if (isset && !(off->flags & flag))
+				off->flags |= flag;
+		}
+	}
+
+	spin_unlock(&offload_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL(proc_do_net_offload);
+
 #ifdef CONFIG_RPS
 static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
 				void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -435,6 +488,13 @@ static struct ctl_table net_core_table[] = {
 		.extra1		= &zero,
 		.extra2		= &one
 	},
+	{
+		.procname	= "gro",
+		.mode		= 0644,
+		.proc_handler	= proc_do_net_offload,
+		.extra1		= dev_offloads,
+		.extra2		= (void *) NET_OFF_FLAG_GRO_OFF,
+	},
 #ifdef CONFIG_RPS
 	{
 		.procname	= "rps_sock_flow_entries",
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index b92f422f2fa8..7a525039afb2 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -477,6 +477,13 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+	{
+		.procname	= "gro",
+		.mode		= 0644,
+		.proc_handler	= proc_do_net_offload,
+		.extra1		= inet_offloads,
+		.extra2		= (void *) NET_OFF_FLAG_GRO_OFF,
+	},
 #ifdef CONFIG_NETLABEL
 	{
 		.procname	= "cipso_cache_enable",
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index 2d0ea3f453f2..6be5adbd2ce7 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -20,7 +20,7 @@
 
 #include "ip6_offload.h"
 
-static int ipv6_gso_pull_exthdrs(struct sk_buff *skb, int proto)
+static int ipv6_gso_pull_exthdrs(struct sk_buff *skb, int proto, bool is_gro)
 {
 	for (;;) {
 		struct ipv6_opt_hdr *opth;
@@ -30,6 +30,10 @@ static int ipv6_gso_pull_exthdrs(struct sk_buff *skb, int proto)
 					  INET6_PROTO_GSO_EXTHDR))
 			break;
 
+		if (is_gro && !net_offload_has_flag(inet6_offloads, proto,
+						    NET_OFF_FLAG_GRO_OFF))
+			break;
+
 		if (unlikely(!pskb_may_pull(skb, 8)))
 			break;
 
@@ -76,7 +80,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
 	__skb_pull(skb, sizeof(*ipv6h));
 	segs = ERR_PTR(-EPROTONOSUPPORT);
 
-	proto = ipv6_gso_pull_exthdrs(skb, ipv6h->nexthdr);
+	proto = ipv6_gso_pull_exthdrs(skb, ipv6h->nexthdr, false);
 
 	if (skb->encapsulation &&
 	    skb_shinfo(skb)->gso_type & (SKB_GSO_IPXIP4 | SKB_GSO_IPXIP6))
@@ -188,7 +192,7 @@ static struct sk_buff *ipv6_gro_receive(struct list_head *head,
 	if (!ops) {
 		__pskb_pull(skb, skb_gro_offset(skb));
 		skb_gro_frag0_invalidate(skb);
-		proto = ipv6_gso_pull_exthdrs(skb, proto);
+		proto = ipv6_gso_pull_exthdrs(skb, proto, true);
 		skb_gro_pull(skb, -skb_transport_offset(skb));
 		skb_reset_transport_header(skb);
 		__skb_push(skb, skb_gro_offset(skb));
diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c
index e15cd37024fd..83f14962a909 100644
--- a/net/ipv6/sysctl_net_ipv6.c
+++ b/net/ipv6/sysctl_net_ipv6.c
@@ -20,6 +20,7 @@
 #ifdef CONFIG_NETLABEL
 #include <net/calipso.h>
 #endif
+#include <net/protocol.h>
 
 static int zero;
 static int one = 1;
@@ -178,6 +179,13 @@ static struct ctl_table ipv6_rotable[] = {
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &one
 	},
+	{
+		.procname	= "gro",
+		.mode		= 0644,
+		.proc_handler	= proc_do_net_offload,
+		.extra1		= inet6_offloads,
+		.extra2		= (void *) NET_OFF_FLAG_GRO_OFF,
+	},
 #ifdef CONFIG_NETLABEL
 	{
 		.procname	= "calipso_cache_enable",
-- 
2.19.0.397.gdd90340f6a-goog

^ permalink raw reply related

* [PATCH net-next RFC 7/8] udp: gro behind static key
From: Willem de Bruijn @ 2018-09-14 17:59 UTC (permalink / raw)
  To: netdev; +Cc: pabeni, steffen.klassert, davem, Willem de Bruijn
In-Reply-To: <20180914175941.213950-1-willemdebruijn.kernel@gmail.com>

From: Willem de Bruijn <willemb@google.com>

Avoid the socket lookup cost in udp_gro_receive if no socket has a
gro callback configured.

Signed-off-by: Willem de Bruijn <willemb@google.com>
---
 include/net/udp.h      | 2 ++
 net/ipv4/udp.c         | 2 +-
 net/ipv4/udp_offload.c | 2 +-
 net/ipv6/udp.c         | 2 +-
 net/ipv6/udp_offload.c | 2 +-
 5 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/include/net/udp.h b/include/net/udp.h
index 8482a990b0bb..9e82cb391dea 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -443,8 +443,10 @@ int udpv4_offload_init(void);
 
 void udp_init(void);
 
+DECLARE_STATIC_KEY_FALSE(udp_encap_needed_key);
 void udp_encap_enable(void);
 #if IS_ENABLED(CONFIG_IPV6)
+DECLARE_STATIC_KEY_FALSE(udpv6_encap_needed_key);
 void udpv6_encap_enable(void);
 #endif
 
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index f4e35b2ff8b8..bd873a5b8a86 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1889,7 +1889,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 	return 0;
 }
 
-static DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key);
+DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key);
 void udp_encap_enable(void)
 {
 	static_branch_enable(&udp_encap_needed_key);
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 4f6aa95a9b12..f44fe328aa0f 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -405,7 +405,7 @@ static struct sk_buff *udp4_gro_receive(struct list_head *head,
 {
 	struct udphdr *uh = udp_gro_udphdr(skb);
 
-	if (unlikely(!uh))
+	if (unlikely(!uh) || !static_branch_unlikely(&udp_encap_needed_key))
 		goto flush;
 
 	/* Don't bother verifying checksum if we're going to flush anyway. */
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 83f4c77c79d8..d84672959f10 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -548,7 +548,7 @@ static __inline__ void udpv6_err(struct sk_buff *skb,
 	__udp6_lib_err(skb, opt, type, code, offset, info, &udp_table);
 }
 
-static DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key);
+DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key);
 void udpv6_encap_enable(void)
 {
 	static_branch_enable(&udpv6_encap_needed_key);
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 2a41da0dd33f..e00f19c4a939 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -119,7 +119,7 @@ static struct sk_buff *udp6_gro_receive(struct list_head *head,
 {
 	struct udphdr *uh = udp_gro_udphdr(skb);
 
-	if (unlikely(!uh))
+	if (unlikely(!uh) || !static_branch_unlikely(&udpv6_encap_needed_key))
 		goto flush;
 
 	/* Don't bother verifying checksum if we're going to flush anyway. */
-- 
2.19.0.397.gdd90340f6a-goog

^ permalink raw reply related

* [PATCH net-next RFC 8/8] udp: add gro
From: Willem de Bruijn @ 2018-09-14 17:59 UTC (permalink / raw)
  To: netdev; +Cc: pabeni, steffen.klassert, davem, Willem de Bruijn
In-Reply-To: <20180914175941.213950-1-willemdebruijn.kernel@gmail.com>

From: Willem de Bruijn <willemb@google.com>

Very rough initial version of udp gro, for discussion purpose only at
this point.

Among others it
- lacks the cmsg UDP_SEGMENT to return gso_size
- probably breaks udp tunnels
- hard breaks at 40 segments
- does not allow a last segment of unequal size

Signed-off-by: Willem de Bruijn <willemb@google.com>
---
 include/uapi/linux/udp.h |  1 +
 net/ipv4/udp.c           | 71 ++++++++++++++++++++++++++++++++++++++++
 net/ipv4/udp_offload.c   | 11 +++----
 3 files changed, 76 insertions(+), 7 deletions(-)

diff --git a/include/uapi/linux/udp.h b/include/uapi/linux/udp.h
index 09d00f8c442b..7fda3e8c7fcf 100644
--- a/include/uapi/linux/udp.h
+++ b/include/uapi/linux/udp.h
@@ -33,6 +33,7 @@ struct udphdr {
 #define UDP_NO_CHECK6_TX 101	/* Disable sending checksum for UDP6X */
 #define UDP_NO_CHECK6_RX 102	/* Disable accpeting checksum for UDP6 */
 #define UDP_SEGMENT	103	/* Set GSO segmentation size */
+#define UDP_GRO		104	/* Enable GRO */
 
 /* UDP encapsulation types */
 #define UDP_ENCAP_ESPINUDP_NON_IKE	1 /* draft-ietf-ipsec-nat-t-ike-00/01 */
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index bd873a5b8a86..ae49c08e6225 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2387,6 +2387,51 @@ void udp_destroy_sock(struct sock *sk)
 	}
 }
 
+static struct sk_buff *udp_gro_receive_cb(struct sock *sk,
+					  struct list_head *head,
+					  struct sk_buff *skb)
+{
+	struct sk_buff *p;
+	unsigned int off;
+
+	off = skb_gro_offset(skb) - sizeof(struct udphdr);
+
+	list_for_each_entry(p, head, list) {
+		if (!NAPI_GRO_CB(p)->same_flow)
+			continue;
+
+		/* TODO: for UDP_GRO: match size unless last segment */
+		if (NAPI_GRO_CB(p)->flush)
+			break;
+
+		/* TODO: look into ip id check */
+		if (skb_gro_receive(p, skb)) {
+			NAPI_GRO_CB(skb)->flush = 1;
+			break;
+		}
+
+		if (NAPI_GRO_CB(skb)->count >= 40) {
+			return p;
+		}
+
+		return NULL;
+	}
+
+	return NULL;
+}
+
+static int udp_gro_complete_cb(struct sock *sk, struct sk_buff *skb,
+			       int nhoff)
+{
+	skb->csum_start = (unsigned char *)udp_hdr(skb) - skb->head;
+	skb->csum_offset = offsetof(struct udphdr, check);
+	skb->ip_summed = CHECKSUM_PARTIAL;
+
+	skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
+
+	return 0;
+}
+
 /*
  *	Socket option code for UDP
  */
@@ -2450,6 +2495,32 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
 		up->gso_size = val;
 		break;
 
+	case UDP_GRO:
+	{
+		if (val < 0 || val > 1)
+			return -EINVAL;
+
+		lock_sock(sk);
+		if (val) {
+
+			if (!udp_sk(sk)->gro_receive) {
+				udp_sk(sk)->gro_complete = udp_gro_complete_cb;
+				udp_sk(sk)->gro_receive = udp_gro_receive_cb;
+			} else {
+				err = -EALREADY;
+			}
+		} else {
+			if (udp_sk(sk)->gro_receive) {
+				udp_sk(sk)->gro_receive = NULL;
+				udp_sk(sk)->gro_complete = NULL;
+			} else {
+				err = -ENOENT;
+			}
+		}
+		release_sock(sk);
+		break;
+	}
+
 	/*
 	 * 	UDP-Lite's partial checksum coverage (RFC 3828).
 	 */
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index f44fe328aa0f..6dd3f0a28b5e 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -386,6 +386,8 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
 			NAPI_GRO_CB(p)->same_flow = 0;
 			continue;
 		}
+
+		/* TODO: for UDP_GRO: match size */
 	}
 
 	skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */
@@ -437,11 +439,6 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff,
 
 	uh->len = newlen;
 
-	/* Set encapsulation before calling into inner gro_complete() functions
-	 * to make them set up the inner offsets.
-	 */
-	skb->encapsulation = 1;
-
 	rcu_read_lock();
 	sk = (*lookup)(skb, uh->source, uh->dest);
 	if (sk && udp_sk(sk)->gro_complete)
@@ -462,11 +459,11 @@ static int udp4_gro_complete(struct sk_buff *skb, int nhoff)
 	struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
 
 	if (uh->check) {
-		skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
+		skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_L4;
 		uh->check = ~udp_v4_check(skb->len - nhoff, iph->saddr,
 					  iph->daddr, 0);
 	} else {
-		skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL;
+		skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_L4;
 	}
 
 	return udp_gro_complete(skb, nhoff, udp4_lib_lookup_skb);
-- 
2.19.0.397.gdd90340f6a-goog

^ permalink raw reply related

* Re: [PATCH] net: caif: remove redundant null check on frontpkt
From: Colin Ian King @ 2018-09-14 18:18 UTC (permalink / raw)
  To: Sergei Shtylyov, Dmitry Tarnyagin, David S . Miller
  Cc: kernel-janitors, netdev
In-Reply-To: <47b7f5a0-8b13-ead7-33b7-6e9c6ada8e61@cogentembedded.com>

On 14/09/18 18:54, Sergei Shtylyov wrote:
> Hello!
> 
> On 09/14/2018 08:19 PM, Colin King wrote:
> 
>> From: Colin Ian King <colin.king@canonical.com>
>>
>> It is impossible for frontpkt to be null at the point of the null
>> check because it has been assigned from rearpkt and there is no
>> way realpkt can be null at the point of the assignment because
> 
>    rearpkt?

Good spot. Can this be fixed up when the patch is applied?

> 
>> of the sanity checking and exit paths taken previously. Remove
>> the redundant null check.
>>
>> Detected by CoverityScan, CID#114434 ("Logically dead code")
>>
>> Signed-off-by: Colin Ian King <colin.king@canonical.com>
> [...]
> 
> MBR, Sergei
> 

^ permalink raw reply

* KMSAN: uninit-value in strlcpy (2)
From: syzbot @ 2018-09-14 18:23 UTC (permalink / raw)
  To: coreteam, davem, fw, horms, ja, kadlec, linux-kernel, lvs-devel,
	netdev, netfilter-devel, pablo, syzkaller-bugs, wensong

Hello,

syzbot found the following crash on:

HEAD commit:    9822946c7fee kmsan: update .config.example to v4.17-rc5
git tree:       https://github.com/google/kmsan.git/master
console output: https://syzkaller.appspot.com/x/log.txt?x=169a5197800000
kernel config:  https://syzkaller.appspot.com/x/.config?x=9fa436d3ae606638
dashboard link: https://syzkaller.appspot.com/bug?extid=c86cf7903306a6c201ba
compiler:       clang version 7.0.0 (trunk 329391)
syz repro:      https://syzkaller.appspot.com/x/repro.syz?x=15d1b87b800000
C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=11235417800000

IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+c86cf7903306a6c201ba@syzkaller.appspotmail.com

random: sshd: uninitialized urandom read (32 bytes read)
random: sshd: uninitialized urandom read (32 bytes read)
random: sshd: uninitialized urandom read (32 bytes read)
random: sshd: uninitialized urandom read (32 bytes read)
==================================================================
BUG: KMSAN: uninit-value in strlen lib/string.c:482 [inline]
BUG: KMSAN: uninit-value in strlcpy+0x68/0x1c0 lib/string.c:142
CPU: 0 PID: 4506 Comm: syz-executor160 Not tainted 4.17.0-rc5+ #95
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS  
Google 01/01/2011
Call Trace:
  __dump_stack lib/dump_stack.c:77 [inline]
  dump_stack+0x185/0x1d0 lib/dump_stack.c:113
  kmsan_report+0x142/0x240 mm/kmsan/kmsan.c:1084
  __msan_warning_32+0x6c/0xb0 mm/kmsan/kmsan_instr.c:683
  strlen lib/string.c:482 [inline]
  strlcpy+0x68/0x1c0 lib/string.c:142
  do_ip_vs_set_ctl+0x3f1/0x2760 net/netfilter/ipvs/ip_vs_ctl.c:2384
  nf_sockopt net/netfilter/nf_sockopt.c:106 [inline]
  nf_setsockopt+0x476/0x4d0 net/netfilter/nf_sockopt.c:115
  ip_setsockopt+0x24b/0x2b0 net/ipv4/ip_sockglue.c:1253
  udp_setsockopt+0x108/0x1b0 net/ipv4/udp.c:2416
  ipv6_setsockopt+0x30c/0x340 net/ipv6/ipv6_sockglue.c:917
  tcp_setsockopt+0x1bb/0x1f0 net/ipv4/tcp.c:2891
  sock_common_setsockopt+0x136/0x170 net/core/sock.c:3039
  __sys_setsockopt+0x4af/0x560 net/socket.c:1903
  __do_sys_setsockopt net/socket.c:1914 [inline]
  __se_sys_setsockopt net/socket.c:1911 [inline]
  __x64_sys_setsockopt+0x15c/0x1c0 net/socket.c:1911
  do_syscall_64+0x154/0x220 arch/x86/entry/common.c:287
  entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x43fce9
RSP: 002b:00007ffea6b1dd08 EFLAGS: 00000213 ORIG_RAX: 0000000000000036
RAX: ffffffffffffffda RBX: 00000000004002c8 RCX: 000000000043fce9
RDX: 000000000000048b RSI: 0000000000000000 RDI: 0000000000000003
RBP: 00000000006ca018 R08: 0000000000000018 R09: 00000000004002c8
R10: 00000000200001c0 R11: 0000000000000213 R12: 0000000000401610
R13: 00000000004016a0 R14: 0000000000000000 R15: 0000000000000000

Local variable description: ----arg@do_ip_vs_set_ctl
Variable was created at:
  read_pnet include/net/net_namespace.h:288 [inline]
  sock_net include/net/sock.h:2306 [inline]
  do_ip_vs_set_ctl+0x93/0x2760 net/netfilter/ipvs/ip_vs_ctl.c:2347
  nf_sockopt net/netfilter/nf_sockopt.c:106 [inline]
  nf_setsockopt+0x476/0x4d0 net/netfilter/nf_sockopt.c:115
==================================================================


---
This bug is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.

syzbot will keep track of this bug report. See:
https://goo.gl/tpsmEJ#bug-status-tracking for how to communicate with  
syzbot.
syzbot can test patches for this bug, for details see:
https://goo.gl/tpsmEJ#testing-patches

^ permalink raw reply

* Re: mlx5 driver loading failing on v4.19 / net-next / bpf-next
From: Saeed Mahameed @ 2018-09-14 18:26 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: Alexei Starovoitov, Moshe Shemesh, Eli Cohen, Or Gerlitz,
	Tariq Toukan, Saeed Mahameed, netdev@vger.kernel.org,
	Eran Ben Elisha
In-Reply-To: <20180914105235.65dfafcd@redhat.com>

On Fri, Sep 14, 2018 at 1:52 AM, Jesper Dangaard Brouer
<brouer@redhat.com> wrote:
> On Fri, 14 Sep 2018 01:22:15 -0700
> Saeed Mahameed <saeedm@dev.mellanox.co.il> wrote:
>
>> On Thu, Sep 13, 2018 at 11:36 PM, Jesper Dangaard Brouer
>> <brouer@redhat.com> wrote:
>> > On Thu, 13 Sep 2018 15:55:29 -0700
>> > Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
>> >
>> >> On Thu, Aug 30, 2018 at 1:35 AM, Tariq Toukan <tariqt@mellanox.com> wrote:
>> >> >
>> >> >
>> >> > On 29/08/2018 6:05 PM, Jesper Dangaard Brouer wrote:
>> >> >>
>> >> >> Hi Saeed,
>> >> >>
>> >> >> I'm having issues loading mlx5 driver on v4.19 kernels (tested both
>> >> >> net-next and bpf-next), while kernel v4.18 seems to work.  It happens
>> >> >> with a Mellanox ConnectX-5 NIC (and also a CX4-Lx but I removed that
>> >> >> from the system now).
>> >> >>
>> >> >
>> >> > Hi Jesper,
>> >> >
>> >> > Thanks for your report!
>> >> >
>> >> > We are working to analyze and debug the issue.
>> >>
>> >> looks like serious issue to me... while no news in 2 weeks.
>> >> any update?
>> >
>> > Mellanox took it offlist, and Sep 6th found that this is a regression
>> > introduced by commit 269d26f47f6f ("net/mlx5: Reduce command polling
>> > interval"), but only if CONFIG_PREEMPT is on.
>> >
>> > I can confirm that reverting this commit fixed the issue (and not the
>> > firmware upgrade I also did).
>> >
>> > I think Moshe (Cc) is responsible for this case, and I expect to soon
>> > see a revert or alternative solution to this!?
>> >
>> > Thanks for the kick Alexei :-)
>>
>> Thanks you Alexei and Jesper for following up,
>> the fix is already being tested [1] and will be submitted tomorrow,
>> as Jesper pointed out the issue happens only with 269d26f47f6f
>> ("net/mlx5: Reduce command polling
>> interval"), and only if CONFIG_PREEMPT is on.
>> the only affected kernel is 4.19 which is not GA yet.
>>
>> [1] https://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux.git/commit/?h=net-mlx5
>
> Sound good.
>
> I will appreciate if you add a:
>
> Reported-by: Jesper Dangaard Brouer <brouer@redhat.com>
>

Of course i will add it, simply the patch was in my review queue
before your report :).

> --
> Best regards,
>   Jesper Dangaard Brouer
>   MSc.CS, Principal Kernel Engineer at Red Hat
>   LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* Re: [Cake] [PATCH iproute2] q_cake: Also print nonat, nowash and no-ack-filter keywords
From: Stephen Hemminger @ 2018-09-14 18:35 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen; +Cc: netdev, cake
In-Reply-To: <20180914135139.16369-1-toke@toke.dk>

On Fri, 14 Sep 2018 15:51:39 +0200
Toke Høiland-Jørgensen <toke@toke.dk> wrote:

> Similar to the previous patch for no-split-gso, the negative keywords for
> 'nat', 'wash' and 'ack-filter' were not printed either. Add those as well.
> 
> Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>

Applied, thanks.

^ permalink raw reply

* Re: [PATCH net-next RFC 6/8] net: make gro configurable
From: Stephen Hemminger @ 2018-09-14 18:38 UTC (permalink / raw)
  To: Willem de Bruijn
  Cc: netdev, pabeni, steffen.klassert, davem, Willem de Bruijn
In-Reply-To: <20180914175941.213950-7-willemdebruijn.kernel@gmail.com>

On Fri, 14 Sep 2018 13:59:39 -0400
Willem de Bruijn <willemdebruijn.kernel@gmail.com> wrote:

> diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
> index e5d236595206..8cb8e02c8ab6 100644
> --- a/drivers/net/vxlan.c
> +++ b/drivers/net/vxlan.c
> @@ -572,6 +572,7 @@ static struct sk_buff *vxlan_gro_receive(struct sock *sk,
>  					 struct list_head *head,
>  					 struct sk_buff *skb)
>  {
> +	const struct net_offload *ops;
>  	struct sk_buff *pp = NULL;
>  	struct sk_buff *p;
>  	struct vxlanhdr *vh, *vh2;
> @@ -606,6 +607,12 @@ static struct sk_buff *vxlan_gro_receive(struct sock *sk,
>  			goto out;
>  	}
>  
> +	rcu_read_lock();
> +	ops = net_gro_receive(dev_offloads, ETH_P_TEB);
> +	rcu_read_unlock();
> +	if (!ops)
> +		goto out;

Isn't rcu_read_lock already held here?
RCU read lock is always held in the receive handler path

> +
>  	skb_gro_pull(skb, sizeof(struct vxlanhdr)); /* pull vxlan header */
>  
>  	list_for_each_entry(p, head, list) {
> @@ -621,6 +628,7 @@ static struct sk_buff *vxlan_gro_receive(struct sock *sk,
>  	}
>  
>  	pp = call_gro_receive(eth_gro_receive, head, skb);
> +
>  	flush = 0;

whitespace change crept into this patch.

^ permalink raw reply

* Re: [RFC PATCH net-next v1 00/14] rename and shrink i40evf
From: Jesse Brandeburg @ 2018-09-14 18:55 UTC (permalink / raw)
  To: Benjamin Poirier; +Cc: netdev, intel-wired-lan, jeffrey.t.kirsher
In-Reply-To: <20180914043917.GB24996@f2>

On Fri, 14 Sep 2018 13:39:17 +0900 Benjamin wrote:
> > Jesse Brandeburg (14):
> >   intel-ethernet: rename i40evf to iavf  
> 
> Seems like patch 1 didn't make it to netdev
> https://lists.osuosl.org/pipermail/intel-wired-lan/Week-of-Mon-20180910/014025.html

Hi Ben, Thanks for the note, I don't know why it didn't show up for
you, it's here if you want to take a look:
https://patchwork.ozlabs.org/patch/969557/

^ permalink raw reply

* [PATCH net] ipv6: fix possible use-after-free in ip6_xmit()
From: Eric Dumazet @ 2018-09-14 19:02 UTC (permalink / raw)
  To: David S . Miller; +Cc: netdev, Eric Dumazet, Eric Dumazet

In the unlikely case ip6_xmit() has to call skb_realloc_headroom(),
we need to call skb_set_owner_w() before consuming original skb,
otherwise we risk a use-after-free.

Bring IPv6 in line with what we do in IPv4 to fix this.

Fixes: 1da177e4c3f41 ("Linux-2.6.12-rc2")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
---
 net/ipv6/ip6_output.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 16f200f06500758c4cae84ea16229d5dbce912cb..f9f8f554d141676a7d342f85088d12d9a6815e9d 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -219,12 +219,10 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 				kfree_skb(skb);
 				return -ENOBUFS;
 			}
+			if (skb->sk)
+				skb_set_owner_w(skb2, skb->sk);
 			consume_skb(skb);
 			skb = skb2;
-			/* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
-			 * it is safe to call in our context (socket lock not held)
-			 */
-			skb_set_owner_w(skb, (struct sock *)sk);
 		}
 		if (opt->opt_flen)
 			ipv6_push_frag_opts(skb, opt, &proto);
-- 
2.19.0.397.gdd90340f6a-goog

^ permalink raw reply related

* Re: [RFC PATCH net-next v1 00/14] rename and shrink i40evf
From: Jesse Brandeburg @ 2018-09-14 19:17 UTC (permalink / raw)
  To: Or Gerlitz
  Cc: Linux Netdev List, intel-wired-lan, Jeff Kirsher, Saeed Mahameed
In-Reply-To: <CAJ3xEMj3OXwz=A5+5JQ0mXoq2OU3N7DP-TpJVMtdTWXZw5tQ9w@mail.gmail.com>

On Fri, 14 Sep 2018 12:10:45 +0300 Or wrote:
> On Fri, Sep 14, 2018 at 1:31 AM, Jesse Brandeburg
> <jesse.brandeburg@intel.com> wrote:
> on what HW ring format do you standardize? do i40e/Fortville and
> ice/what's-the-intel-code-name?  HWs can/use the same posting/completion
> descriptor?

The initial ring format is the same as used for XL710/X722 devices, and
planned to be supported for the Intel Ethernet E800 series (ice driver) and
future VF devices using SR-IOV.

> > This solves 2 issues we saw coming or were already present, the
> > first was constant code duplication happening with i40e/i40evf,
> > when much of the duplicate code in the i40evf was not used or was
> > not needed.  
> 
> could you spare few words on the origin/nature of these duplicates? were them
> just developer C&P mistakes for functionality which is irrelevant for
> a VF? like what?
> if not, what was there?

In particular, some of the code was not used at all, but was not caught
by any automation because it was in a header file and included into
multiple file scopes.  Other big chunk of the duplicate code was for
the PF's usage of the communication channel to firmware, which for some
reason was left in the VF driver code (probably just to avoid changing
the file) - but the VF driver doesn't communicate to firmware, just to
the PF.

> > The second was to remove the future confusion of why
> > future VF devices that were not considered "40GbE" only devices
> > were supported by i40evf.  
> 
> can elaborate further?

The name i40evf was generating customer questions, and was confusing
when you add in multiple generations of PF hardware that are no longer
using the i40e driver.

> > The thought is that iavf will be the virtual function driver for
> > all future devices, so it should have a "generic" name to properly
> > represent that it is the VF driver for multiple generations of
> > devices.  
> 
> for that end,  as I think was explained @ the netdev Tokyo AVF session,
> you would need a mechanism for feature negotiation, is it here or coming up?

The driver already has it (a feature negotiation), please see the
function called iavf_send_vf_config_msg, and follow from where it is
called.  Basically the VF driver negotiates with the PF for what it can
do, and the PF guarantees that the base set of features will always
work, with optional advanced features which the code may/may-not have
in the future.

> >  41 files changed, 3436 insertions(+), 7581 deletions(-)  
> 
> code diet is cool!

Thanks! ~4000 lines less made me very happy too.

^ permalink raw reply

* Re: [bpf-next, v4 0/5] Introduce eBPF flow dissector
From: Alexei Starovoitov @ 2018-09-14 19:22 UTC (permalink / raw)
  To: Petar Penkov
  Cc: netdev, davem, ast, daniel, simon.horman, ecree, songliubraving,
	tom, Petar Penkov
In-Reply-To: <20180914144622.16436-1-peterpenkov96@gmail.com>

On Fri, Sep 14, 2018 at 07:46:17AM -0700, Petar Penkov wrote:
> From: Petar Penkov <ppenkov@google.com>
> 
> This patch series hardens the RX stack by allowing flow dissection in BPF,
> as previously discussed [1]. Because of the rigorous checks of the BPF
> verifier, this provides significant security guarantees. In particular, the
> BPF flow dissector cannot get inside of an infinite loop, as with
> CVE-2013-4348, because BPF programs are guaranteed to terminate. It cannot
> read outside of packet bounds, because all memory accesses are checked.
> Also, with BPF the administrator can decide which protocols to support,
> reducing potential attack surface. Rarely encountered protocols can be
> excluded from dissection and the program can be updated without kernel
> recompile or reboot if a bug is discovered.
> 
> Patch 1 adds infrastructure to execute a BPF program in __skb_flow_dissect.
> This includes a new BPF program and attach type.
> 
> Patch 2 adds the new BPF flow dissector definitions to tools/uapi.
> 
> Patch 3 adds support for the new BPF program type to libbpf and bpftool.
> 
> Patch 4 adds a flow dissector program in BPF. This parses most protocols in
> __skb_flow_dissect in BPF for a subset of flow keys (basic, control, ports,
> and address types).
> 
> Patch 5 adds a selftest that attaches the BPF program to the flow dissector
> and sends traffic with different levels of encapsulation.
> 
> Performance Evaluation:
> The in-kernel implementation was compared against the demo program from
> patch 4 using the test in patch 5 with IPv4/UDP traffic over 10 seconds.
> 	$perf record -a -C 4 taskset -c 4 ./test_flow_dissector -i 4 -f 8 \
> 		-t 10

Looks great. Applied to bpf-next with one extra patch:
 SEC("dissect")
-int dissect(struct __sk_buff *skb)
+int _dissect(struct __sk_buff *skb)

otherwise the test doesn't build.
I'm not sure how it builds for you. Which llvm did you use?

Also above command works and ipv4 test in ./test_flow_dissector.sh
is passing as well, but it still fails at the end for me:
./test_flow_dissector.sh
bpffs not mounted. Mounting...
0: IP
1: IPV6
2: IPV6OP
3: IPV6FR
4: MPLS
5: VLAN
Testing IPv4...
inner.dest4: 127.0.0.1
inner.source4: 127.0.0.3
pkts: tx=10 rx=10
inner.dest4: 127.0.0.1
inner.source4: 127.0.0.3
pkts: tx=10 rx=0
inner.dest4: 127.0.0.1
inner.source4: 127.0.0.3
pkts: tx=10 rx=10
Testing IPIP...
tunnels before test:
tunl0: any/ip remote any local any ttl inherit nopmtudisc
sit_test_LV5N: any/ip remote 127.0.0.2 local 127.0.0.1 dev lo ttl inherit
ipip_test_LV5N: any/ip remote 127.0.0.2 local 127.0.0.1 dev lo ttl inherit
sit0: ipv6/ip remote any local any ttl 64 nopmtudisc
gre_test_LV5N: gre/ip remote 127.0.0.2 local 127.0.0.1 dev lo ttl inherit
gre0: gre/ip remote any local any ttl inherit nopmtudisc
inner.dest4: 192.168.0.1
inner.source4: 1.1.1.1
encap proto:   4
outer.dest4: 127.0.0.1
outer.source4: 127.0.0.2
pkts: tx=10 rx=0
tunnels after test:
tunl0: any/ip remote any local any ttl inherit nopmtudisc
sit0: ipv6/ip remote any local any ttl 64 nopmtudisc
gre0: gre/ip remote any local any ttl inherit nopmtudisc
selftests: test_flow_dissector [FAILED]

is it something in my setup or test is broken?

^ permalink raw reply

* Re: [PATCH net-next v2] net/tls: Add support for async decryption of tls records
From: John Fastabend @ 2018-09-14 19:39 UTC (permalink / raw)
  To: Vakul Garg, netdev; +Cc: borisp, aviadye, davejwatson, davem
In-Reply-To: <20180829095655.31963-1-vakul.garg@nxp.com>

On 08/29/2018 02:56 AM, Vakul Garg wrote:
> When tls records are decrypted using asynchronous accelerators such as
> NXP CAAM engine, the crypto apis return -EINPROGRESS. Presently, on
> getting -EINPROGRESS, the tls record processing stops till the time the
> crypto accelerator finishes off and returns the result. This incurs a
> context switch and is not an efficient way of accessing the crypto
> accelerators. Crypto accelerators work efficiently when they are queued
> with multiple crypto jobs without having to wait for the previous ones
> to complete.
> 
> The patch submits multiple crypto requests without having to wait for
> previous ones to complete. This has been implemented for records
> which are decrypted in zero-copy mode. At the end of recvmsg(), we wait
> for all the asynchronous decryption requests to complete.
> 
> The references to records which have been sent for async decryption are
> dropped. For cases where record decryption is not possible in zero-copy
> mode, asynchronous decryption is not used and we wait for decryption
> crypto api to complete.
> 
> For crypto requests executing in async fashion, the memory for
> aead_request, sglists and skb etc is freed from the decryption
> completion handler. The decryption completion handler wakes up the
> sleeping user context when recvmsg() flags that it has done sending
> all the decryption requests and there are no more decryption requests
> pending to be completed.
> 
> Signed-off-by: Vakul Garg <vakul.garg@nxp.com>
> Reviewed-by: Dave Watson <davejwatson@fb.com>
> ---

[...]


> @@ -1271,6 +1377,8 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
>  		goto free_aead;
>  
>  	if (sw_ctx_rx) {
> +		(*aead)->reqsize = sizeof(struct decrypt_req_ctx);
> +

This is not valid and may cause GPF or best case only a KASAN
warning. 'reqsize' should probably not be mangled outside the
internal crypto APIs but the real reason is the reqsize is used
to determine how much space is needed at the end of the aead_request
for crypto private ctx use in encrypt/decrypt. After this patch
when we submit an aead_request the crypto layer will think it
has room for its private structs at the end but now only 8B will
be there and crypto layer will happily memset some arbitrary
memory for you amongst other things.

Anyways testing a fix now will post shortly.

Thanks,
John

^ permalink raw reply

* [PATCH net] bnxt_en: Fix VF mac address regression.
From: Michael Chan @ 2018-09-14 19:41 UTC (permalink / raw)
  To: davem; +Cc: netdev, seth.forshee, loseweigh

The recent commit to always forward the VF MAC address to the PF for
approval may not work if the PF driver or the firmware is older.  This
will cause the VF driver to fail during probe:

  bnxt_en 0000:00:03.0 (unnamed net_device) (uninitialized): hwrm req_type 0xf seq id 0x5 error 0xffff
  bnxt_en 0000:00:03.0 (unnamed net_device) (uninitialized): VF MAC address 00:00:17:02:05:d0 not approved by the PF
  bnxt_en 0000:00:03.0: Unable to initialize mac address.
  bnxt_en: probe of 0000:00:03.0 failed with error -99

We fix it by treating the error as fatal only if the VF MAC address is
locally generated by the VF.

Fixes: 707e7e966026 ("bnxt_en: Always forward VF MAC address to the PF.")
Reported-by: Seth Forshee <seth.forshee@canonical.com>
Reported-by: Siwei Liu <loseweigh@gmail.com>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
---
Please queue this for stable as well.  Thanks.

 drivers/net/ethernet/broadcom/bnxt/bnxt.c       | 9 +++++++--
 drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c | 9 +++++----
 drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h | 2 +-
 3 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index cecbb1d..177587f 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -8027,7 +8027,7 @@ static int bnxt_change_mac_addr(struct net_device *dev, void *p)
 	if (ether_addr_equal(addr->sa_data, dev->dev_addr))
 		return 0;
 
-	rc = bnxt_approve_mac(bp, addr->sa_data);
+	rc = bnxt_approve_mac(bp, addr->sa_data, true);
 	if (rc)
 		return rc;
 
@@ -8827,14 +8827,19 @@ static int bnxt_init_mac_addr(struct bnxt *bp)
 	} else {
 #ifdef CONFIG_BNXT_SRIOV
 		struct bnxt_vf_info *vf = &bp->vf;
+		bool strict_approval = true;
 
 		if (is_valid_ether_addr(vf->mac_addr)) {
 			/* overwrite netdev dev_addr with admin VF MAC */
 			memcpy(bp->dev->dev_addr, vf->mac_addr, ETH_ALEN);
+			/* Older PF driver or firmware may not approve this
+			 * correctly.
+			 */
+			strict_approval = false;
 		} else {
 			eth_hw_addr_random(bp->dev);
 		}
-		rc = bnxt_approve_mac(bp, bp->dev->dev_addr);
+		rc = bnxt_approve_mac(bp, bp->dev->dev_addr, strict_approval);
 #endif
 	}
 	return rc;
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
index fcd085a..3962f6f 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
@@ -1104,7 +1104,7 @@ void bnxt_update_vf_mac(struct bnxt *bp)
 	mutex_unlock(&bp->hwrm_cmd_lock);
 }
 
-int bnxt_approve_mac(struct bnxt *bp, u8 *mac)
+int bnxt_approve_mac(struct bnxt *bp, u8 *mac, bool strict)
 {
 	struct hwrm_func_vf_cfg_input req = {0};
 	int rc = 0;
@@ -1122,12 +1122,13 @@ int bnxt_approve_mac(struct bnxt *bp, u8 *mac)
 	memcpy(req.dflt_mac_addr, mac, ETH_ALEN);
 	rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT);
 mac_done:
-	if (rc) {
+	if (rc && strict) {
 		rc = -EADDRNOTAVAIL;
 		netdev_warn(bp->dev, "VF MAC address %pM not approved by the PF\n",
 			    mac);
+		return rc;
 	}
-	return rc;
+	return 0;
 }
 #else
 
@@ -1144,7 +1145,7 @@ void bnxt_update_vf_mac(struct bnxt *bp)
 {
 }
 
-int bnxt_approve_mac(struct bnxt *bp, u8 *mac)
+int bnxt_approve_mac(struct bnxt *bp, u8 *mac, bool strict)
 {
 	return 0;
 }
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h
index e9b20cd..2eed9ed 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h
@@ -39,5 +39,5 @@ int bnxt_sriov_configure(struct pci_dev *pdev, int num_vfs);
 void bnxt_sriov_disable(struct bnxt *);
 void bnxt_hwrm_exec_fwd_req(struct bnxt *);
 void bnxt_update_vf_mac(struct bnxt *);
-int bnxt_approve_mac(struct bnxt *, u8 *);
+int bnxt_approve_mac(struct bnxt *, u8 *, bool);
 #endif
-- 
2.5.1

^ permalink raw reply related

* [net-next PATCH] tls: async support causes out-of-bounds access in crypto APIs
From: John Fastabend @ 2018-09-14 20:01 UTC (permalink / raw)
  To: vakul.garg, davejwatson
  Cc: doronrk, netdev, alexei.starovoitov, daniel, davem

When async support was added it needed to access the sk from the async
callback to report errors up the stack. The patch tried to use space
after the aead request struct by directly setting the reqsize field in
aead_request. This is an internal field that should not be used
outside the crypto APIs. It is used by the crypto code to define extra
space for private structures used in the crypto context. Users of the
API then use crypto_aead_reqsize() and add the returned amount of
bytes to the end of the request memory allocation before posting the
request to encrypt/decrypt APIs.

So this breaks (with general protection fault and KASAN error, if
enabled) because the request sent to decrypt is shorter than required
causing the crypto API out-of-bounds errors. Also it seems unlikely the
sk is even valid by the time it gets to the callback because of memset
in crypto layer.

Anyways, fix this by holding the sk in the skb->sk field when the
callback is set up and because the skb is already passed through to
the callback handler via void* we can access it in the handler. Then
in the handler we need to be careful to NULL the pointer again before
kfree_skb. I added comments on both the setup (in tls_do_decryption)
and when we clear it from the crypto callback handler
tls_decrypt_done(). After this selftests pass again and fixes KASAN
errors/warnings.

Fixes: 94524d8fc965 ("net/tls: Add support for async decryption of tls records")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
 include/net/tls.h |    4 ----
 net/tls/tls_sw.c  |   39 +++++++++++++++++++++++----------------
 2 files changed, 23 insertions(+), 20 deletions(-)

diff --git a/include/net/tls.h b/include/net/tls.h
index cd0a65b..8630d28 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -128,10 +128,6 @@ struct tls_sw_context_rx {
 	bool async_notify;
 };
 
-struct decrypt_req_ctx {
-	struct sock *sk;
-};
-
 struct tls_record_info {
 	struct list_head list;
 	u32 end_seq;
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index be4f2e9..cef69b6 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -122,25 +122,32 @@ static int skb_nsg(struct sk_buff *skb, int offset, int len)
 static void tls_decrypt_done(struct crypto_async_request *req, int err)
 {
 	struct aead_request *aead_req = (struct aead_request *)req;
-	struct decrypt_req_ctx *req_ctx =
-			(struct decrypt_req_ctx *)(aead_req + 1);
-
 	struct scatterlist *sgout = aead_req->dst;
-
-	struct tls_context *tls_ctx = tls_get_ctx(req_ctx->sk);
-	struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
-	int pending = atomic_dec_return(&ctx->decrypt_pending);
+	struct tls_sw_context_rx *ctx;
+	struct tls_context *tls_ctx;
 	struct scatterlist *sg;
+	struct sk_buff *skb;
 	unsigned int pages;
+	int pending;
+
+	skb = (struct sk_buff *)req->data;
+	tls_ctx = tls_get_ctx(skb->sk);
+	ctx = tls_sw_ctx_rx(tls_ctx);
+	pending = atomic_dec_return(&ctx->decrypt_pending);
 
 	/* Propagate if there was an err */
 	if (err) {
 		ctx->async_wait.err = err;
-		tls_err_abort(req_ctx->sk, err);
+		tls_err_abort(skb->sk, err);
 	}
 
+	/* After using skb->sk to propagate sk through crypto async callback
+	 * we need to NULL it again.
+	 */
+	skb->sk = NULL;
+
 	/* Release the skb, pages and memory allocated for crypto req */
-	kfree_skb(req->data);
+	kfree_skb(skb);
 
 	/* Skip the first S/G entry as it points to AAD */
 	for_each_sg(sg_next(sgout), sg, UINT_MAX, pages) {
@@ -175,11 +182,13 @@ static int tls_do_decryption(struct sock *sk,
 			       (u8 *)iv_recv);
 
 	if (async) {
-		struct decrypt_req_ctx *req_ctx;
-
-		req_ctx = (struct decrypt_req_ctx *)(aead_req + 1);
-		req_ctx->sk = sk;
-
+		/* Using skb->sk to push sk through to crypto async callback
+		 * handler. This allows propagating errors up to the socket
+		 * if needed. It _must_ be cleared in the async handler
+		 * before kfree_skb is called. We _know_ skb->sk is NULL
+		 * because it is a clone from strparser.
+		 */
+		skb->sk = sk;
 		aead_request_set_callback(aead_req,
 					  CRYPTO_TFM_REQ_MAY_BACKLOG,
 					  tls_decrypt_done, skb);
@@ -1455,8 +1464,6 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
 		goto free_aead;
 
 	if (sw_ctx_rx) {
-		(*aead)->reqsize = sizeof(struct decrypt_req_ctx);
-
 		/* Set up strparser */
 		memset(&cb, 0, sizeof(cb));
 		cb.rcv_msg = tls_queue;

^ permalink raw reply related

* Re: [PATCH v2,net-next 1/2] ip_gre: fix parsing gre header in ipgre_err
From: Haishuang Yan @ 2018-09-15  1:22 UTC (permalink / raw)
  To: Edward Cree; +Cc: David Miller, kuznet, jbenc, netdev, linux-kernel
In-Reply-To: <4bd44714-8190-feca-27dc-6f6b254341f8@solarflare.com>



> On 2018年9月14日, at 下午8:44, Edward Cree <ecree@solarflare.com> wrote:
> 
> On 13/09/18 18:58, David Miller wrote:
>> From: Haishuang Yan <yanhaishuang@cmss.chinamobile.com>
>> Date: Wed, 12 Sep 2018 17:21:21 +0800
>> 
>>> @@ -86,7 +86,7 @@ int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
>>> 
>>> 	options = (__be32 *)(greh + 1);
>>> 	if (greh->flags & GRE_CSUM) {
>>> -		if (skb_checksum_simple_validate(skb)) {
>>> +		if (csum_err && skb_checksum_simple_validate(skb)) {
>>> 			*csum_err = true;
>>> 			return -EINVAL;
>>> 		}
>> You want to ignore csum errors, but you do not want to elide the side
>> effects of the skb_checksum_simple_validate() call which are to set
>> skb->csum_valid and skb->csum.
>> 
>> Therefore, the skb_checksum_simple_validate() call still needs to be
> >> performed. We just won't return -EINVAL in the NULL csum_err case.
> 
> How about just reversing the order of the AND?
> 
> 	if (skb_checksum_simple_validate(skb) && csum_err) {
> 		*csum_err = true;
> 		return -EINVAL;
> 	}
> 
> 

It looks good to me, thanks!

But skb_checksum_try_convert only needs to be called after the checksum is
validated, so I suggest a better solution as follows:

   89                 if (!skb_checksum_simple_validate(skb)) {
   90                         skb_checksum_try_convert(skb, IPPROTO_GRE, 0,
   91                                                  null_compute_pseudo);
   92                 } else if (csum_err) {
   93                         *csum_err = true;
   94                         return -EINVAL;
   95                 }

^ permalink raw reply

* Re: [PATCH net] bnxt_en: Fix VF mac address regression.
From: Siwei Liu @ 2018-09-14 20:14 UTC (permalink / raw)
  To: Michael Chan; +Cc: David Miller, Netdev, seth.forshee, si-wei liu
In-Reply-To: <1536954089-6061-1-git-send-email-michael.chan@broadcom.com>

Ack. Looks fine to me.

-Siwei

On Fri, Sep 14, 2018 at 12:41 PM, Michael Chan
<michael.chan@broadcom.com> wrote:
> The recent commit to always forward the VF MAC address to the PF for
> approval may not work if the PF driver or the firmware is older.  This
> will cause the VF driver to fail during probe:
>
>   bnxt_en 0000:00:03.0 (unnamed net_device) (uninitialized): hwrm req_type 0xf seq id 0x5 error 0xffff
>   bnxt_en 0000:00:03.0 (unnamed net_device) (uninitialized): VF MAC address 00:00:17:02:05:d0 not approved by the PF
>   bnxt_en 0000:00:03.0: Unable to initialize mac address.
>   bnxt_en: probe of 0000:00:03.0 failed with error -99
>
> We fix it by treating the error as fatal only if the VF MAC address is
> locally generated by the VF.
>
> Fixes: 707e7e966026 ("bnxt_en: Always forward VF MAC address to the PF.")
> Reported-by: Seth Forshee <seth.forshee@canonical.com>
> Reported-by: Siwei Liu <loseweigh@gmail.com>
> Signed-off-by: Michael Chan <michael.chan@broadcom.com>
> ---
> Please queue this for stable as well.  Thanks.
>
>  drivers/net/ethernet/broadcom/bnxt/bnxt.c       | 9 +++++++--
>  drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c | 9 +++++----
>  drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h | 2 +-
>  3 files changed, 13 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> index cecbb1d..177587f 100644
> --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> @@ -8027,7 +8027,7 @@ static int bnxt_change_mac_addr(struct net_device *dev, void *p)
>         if (ether_addr_equal(addr->sa_data, dev->dev_addr))
>                 return 0;
>
> -       rc = bnxt_approve_mac(bp, addr->sa_data);
> +       rc = bnxt_approve_mac(bp, addr->sa_data, true);
>         if (rc)
>                 return rc;
>
> @@ -8827,14 +8827,19 @@ static int bnxt_init_mac_addr(struct bnxt *bp)
>         } else {
>  #ifdef CONFIG_BNXT_SRIOV
>                 struct bnxt_vf_info *vf = &bp->vf;
> +               bool strict_approval = true;
>
>                 if (is_valid_ether_addr(vf->mac_addr)) {
>                         /* overwrite netdev dev_addr with admin VF MAC */
>                         memcpy(bp->dev->dev_addr, vf->mac_addr, ETH_ALEN);
> +                       /* Older PF driver or firmware may not approve this
> +                        * correctly.
> +                        */
> +                       strict_approval = false;
>                 } else {
>                         eth_hw_addr_random(bp->dev);
>                 }
> -               rc = bnxt_approve_mac(bp, bp->dev->dev_addr);
> +               rc = bnxt_approve_mac(bp, bp->dev->dev_addr, strict_approval);
>  #endif
>         }
>         return rc;
> diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
> index fcd085a..3962f6f 100644
> --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
> +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
> @@ -1104,7 +1104,7 @@ void bnxt_update_vf_mac(struct bnxt *bp)
>         mutex_unlock(&bp->hwrm_cmd_lock);
>  }
>
> -int bnxt_approve_mac(struct bnxt *bp, u8 *mac)
> +int bnxt_approve_mac(struct bnxt *bp, u8 *mac, bool strict)
>  {
>         struct hwrm_func_vf_cfg_input req = {0};
>         int rc = 0;
> @@ -1122,12 +1122,13 @@ int bnxt_approve_mac(struct bnxt *bp, u8 *mac)
>         memcpy(req.dflt_mac_addr, mac, ETH_ALEN);
>         rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT);
>  mac_done:
> -       if (rc) {
> +       if (rc && strict) {
>                 rc = -EADDRNOTAVAIL;
>                 netdev_warn(bp->dev, "VF MAC address %pM not approved by the PF\n",
>                             mac);
> +               return rc;
>         }
> -       return rc;
> +       return 0;
>  }
>  #else
>
> @@ -1144,7 +1145,7 @@ void bnxt_update_vf_mac(struct bnxt *bp)
>  {
>  }
>
> -int bnxt_approve_mac(struct bnxt *bp, u8 *mac)
> +int bnxt_approve_mac(struct bnxt *bp, u8 *mac, bool strict)
>  {
>         return 0;
>  }
> diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h
> index e9b20cd..2eed9ed 100644
> --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h
> +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h
> @@ -39,5 +39,5 @@ int bnxt_sriov_configure(struct pci_dev *pdev, int num_vfs);
>  void bnxt_sriov_disable(struct bnxt *);
>  void bnxt_hwrm_exec_fwd_req(struct bnxt *);
>  void bnxt_update_vf_mac(struct bnxt *);
> -int bnxt_approve_mac(struct bnxt *, u8 *);
> +int bnxt_approve_mac(struct bnxt *, u8 *, bool);
>  #endif
> --
> 2.5.1
>

^ permalink raw reply

* [Patch net-next] ipv4: initialize ra_mutex in inet_init_net()
From: Cong Wang @ 2018-09-14 20:32 UTC (permalink / raw)
  To: netdev; +Cc: Cong Wang, Kirill Tkhai

ra_mutex is an IPv4-specific mutex; it is inside struct netns_ipv4,
but its initialization is in the generic netns code, setup_net().

Move it to the IPv4-specific net init code, inet_init_net().

Fixes: d9ff3049739e ("net: Replace ip_ra_lock with per-net mutex")
Cc: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
---
 net/core/net_namespace.c | 1 -
 net/ipv4/af_inet.c       | 2 ++
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 670c84b1bfc2..b272ccfcbf63 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -308,7 +308,6 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
 	net->user_ns = user_ns;
 	idr_init(&net->netns_ids);
 	spin_lock_init(&net->nsid_lock);
-	mutex_init(&net->ipv4.ra_mutex);
 
 	list_for_each_entry(ops, &pernet_list, list) {
 		error = ops_init(ops, net);
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 20fda8fb8ffd..57b7bffb93e5 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1817,6 +1817,8 @@ static __net_init int inet_init_net(struct net *net)
 	net->ipv4.sysctl_igmp_llm_reports = 1;
 	net->ipv4.sysctl_igmp_qrv = 2;
 
+	mutex_init(&net->ipv4.ra_mutex);
+
 	return 0;
 }
 
-- 
2.14.4

^ permalink raw reply related

* Re: [PATCH net-next v2] net: sched: change tcf_del_walker() to take idrinfo->lock
From: Cong Wang @ 2018-09-14 20:53 UTC (permalink / raw)
  To: Vlad Buslov
  Cc: Linux Kernel Network Developers, Jamal Hadi Salim, Jiri Pirko,
	David Miller
In-Reply-To: <vbfmusknypr.fsf@reg-r-vrt-018-180.mtr.labs.mlnx>

On Fri, Sep 14, 2018 at 3:46 AM Vlad Buslov <vladbu@mellanox.com> wrote:
>
>
> On Thu 13 Sep 2018 at 17:13, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > On Wed, Sep 12, 2018 at 1:51 AM Vlad Buslov <vladbu@mellanox.com> wrote:
> >>
> >>
> >> On Fri 07 Sep 2018 at 19:12, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >> > On Fri, Sep 7, 2018 at 6:52 AM Vlad Buslov <vladbu@mellanox.com> wrote:
> >> >>
> >> >> Action API was changed to work with actions and action_idr in concurrency
> >> >> safe manner, however tcf_del_walker() still uses actions without taking a
> >> >> reference or idrinfo->lock first, and deletes them directly, disregarding
> >> >> possible concurrent delete.
> >> >>
> >> >> Add tc_action_wq workqueue to action API. Implement
> >> >> tcf_idr_release_unsafe() that assumes external synchronization by caller
> >> >> and delays blocking action cleanup part to tc_action_wq workqueue. Extend
> >> >> tcf_action_cleanup() with 'async' argument to indicate that function should
> >> >> free action asynchronously.
> >> >
> >> > Where exactly is blocking in tcf_action_cleanup()?
> >> >
> >> > From your code, it looks like free_tcf(), but from my observation,
> >> > the only blocking function inside is tcf_action_goto_chain_fini()
> >> > which calls __tcf_chain_put(). But, __tcf_chain_put() is blocking
> >> > _ONLY_ when tc_chain_notify() is called, for tc action it is never
> >> > called.
> >> >
> >> > So, what else is blocking?
> >>
> >> __tcf_chain_put() calls tc_chain_tmplt_del(), which calls
> >> ops->tmplt_destroy(). This last function uses hw offload API, which is
> >> blocking.
> >
> > Good to know.
> >
> > Can we just make ops->tmplt_destroy() to use workqueue?
> > Making tc action to workqueue seems overkill, for me.
>
> How about changing tcf_chain_put_by_act() to use tc_filter_wq, instead
> of directly calling __tcf_chain_put()? IMO it is a better solution
> because it benefits all classifiers, instead of requiring every
> classifier with templates support to implement non-blocking
> ops->tmplt_destroy().

My point is, there is only one filter that implements ops->tmplt_destroy
so far, so there is no reason to make all filters adjust for this
single one. Not to mention actions; actions are innocent
here.

^ permalink raw reply

* Re: [PATCH v2 05/17] compat_ioctl: move more drivers to generic_compat_ioctl_ptrarg
From: Al Viro @ 2018-09-14 20:57 UTC (permalink / raw)
  To: Darren Hart
  Cc: Arnd Bergmann, linux-fsdevel, Greg Kroah-Hartman, David S. Miller,
	devel, linux-kernel, qat-linux, linux-crypto, linux-media,
	dri-devel, linaro-mm-sig, amd-gfx, linux-input, linux-iio,
	linux-rdma, linux-nvdimm, linux-nvme, linux-pci,
	platform-driver-x86, linux-remoteproc, sparclinux, linux-scsi,
	linux-usb, linux-fbdev, linuxppc-dev, linux-btrfs
In-Reply-To: <20180914203506.GE35251@wrath>

On Fri, Sep 14, 2018 at 01:35:06PM -0700, Darren Hart wrote:
 
> Acked-by: Darren Hart (VMware) <dvhart@infradead.org>
> 
> As for a longer term solution, would it be possible to init fops in such
> a way that the compat_ioctl call defaults to generic_compat_ioctl_ptrarg
> so we don't have to duplicate this boilerplate for every ioctl fops
> structure?

	Bad idea, that...  Because several years down the road somebody will add
an ioctl that takes an unsigned int for argument.  Without so much as looking
at your magical mystery macro being used to initialize file_operations.

	FWIW, I would name that helper in more blunt way - something like
compat_ioctl_only_compat_pointer_ioctls_here()...

^ permalink raw reply

* [PATCH net] tls: fix currently broken MSG_PEEK behavior
From: Daniel Borkmann @ 2018-09-14 21:00 UTC (permalink / raw)
  To: davejwatson
  Cc: doronrk, alexei.starovoitov, john.fastabend, davem, netdev,
	Daniel Borkmann

In kTLS MSG_PEEK behavior is currently failing, strace example:

  [pid  2430] socket(AF_INET, SOCK_STREAM, IPPROTO_IP) = 3
  [pid  2430] socket(AF_INET, SOCK_STREAM, IPPROTO_IP) = 4
  [pid  2430] bind(4, {sa_family=AF_INET, sin_port=htons(0), sin_addr=inet_addr("0.0.0.0")}, 16) = 0
  [pid  2430] listen(4, 10)               = 0
  [pid  2430] getsockname(4, {sa_family=AF_INET, sin_port=htons(38855), sin_addr=inet_addr("0.0.0.0")}, [16]) = 0
  [pid  2430] connect(3, {sa_family=AF_INET, sin_port=htons(38855), sin_addr=inet_addr("0.0.0.0")}, 16) = 0
  [pid  2430] setsockopt(3, SOL_TCP, 0x1f /* TCP_??? */, [7564404], 4) = 0
  [pid  2430] setsockopt(3, 0x11a /* SOL_?? */, 1, "\3\0033\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 40) = 0
  [pid  2430] accept(4, {sa_family=AF_INET, sin_port=htons(49636), sin_addr=inet_addr("127.0.0.1")}, [16]) = 5
  [pid  2430] setsockopt(5, SOL_TCP, 0x1f /* TCP_??? */, [7564404], 4) = 0
  [pid  2430] setsockopt(5, 0x11a /* SOL_?? */, 2, "\3\0033\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 40) = 0
  [pid  2430] close(4)                    = 0
  [pid  2430] sendto(3, "test_read_peek", 14, 0, NULL, 0) = 14
  [pid  2430] sendto(3, "_mult_recs\0", 11, 0, NULL, 0) = 11
  [pid  2430] recvfrom(5, "test_read_peektest_read_peektest"..., 64, MSG_PEEK, NULL, NULL) = 64

As can be seen from strace, there are two TLS records sent,
i) 'test_read_peek' and ii) '_mult_recs\0' where we end up
peeking 'test_read_peektest_read_peektest'. This is clearly
wrong, and what happens is that given peek cannot call into
tls_sw_advance_skb() to unpause strparser and proceed with
the next skb, we end up looping over the current one, copying
the 'test_read_peek' over and over into the user provided
buffer.

Here, we can only peek into the currently held skb (current,
full TLS record) as otherwise we would end up having to hold
all the original skb(s) (depending on the peek depth) in a
separate queue when unpausing strparser to process next
records. The minimally intrusive fix is to return only up to
the current record's size (which likely was what c46234ebb4d1
("tls: RX path for ktls") originally intended as well). Thus,
after the patch we properly peek the first record:

  [pid  2046] wait4(2075,  <unfinished ...>
  [pid  2075] socket(AF_INET, SOCK_STREAM, IPPROTO_IP) = 3
  [pid  2075] socket(AF_INET, SOCK_STREAM, IPPROTO_IP) = 4
  [pid  2075] bind(4, {sa_family=AF_INET, sin_port=htons(0), sin_addr=inet_addr("0.0.0.0")}, 16) = 0
  [pid  2075] listen(4, 10)               = 0
  [pid  2075] getsockname(4, {sa_family=AF_INET, sin_port=htons(55115), sin_addr=inet_addr("0.0.0.0")}, [16]) = 0
  [pid  2075] connect(3, {sa_family=AF_INET, sin_port=htons(55115), sin_addr=inet_addr("0.0.0.0")}, 16) = 0
  [pid  2075] setsockopt(3, SOL_TCP, 0x1f /* TCP_??? */, [7564404], 4) = 0
  [pid  2075] setsockopt(3, 0x11a /* SOL_?? */, 1, "\3\0033\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 40) = 0
  [pid  2075] accept(4, {sa_family=AF_INET, sin_port=htons(45732), sin_addr=inet_addr("127.0.0.1")}, [16]) = 5
  [pid  2075] setsockopt(5, SOL_TCP, 0x1f /* TCP_??? */, [7564404], 4) = 0
  [pid  2075] setsockopt(5, 0x11a /* SOL_?? */, 2, "\3\0033\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 40) = 0
  [pid  2075] close(4)                    = 0
  [pid  2075] sendto(3, "test_read_peek", 14, 0, NULL, 0) = 14
  [pid  2075] sendto(3, "_mult_recs\0", 11, 0, NULL, 0) = 11
  [pid  2075] recvfrom(5, "test_read_peek", 64, MSG_PEEK, NULL, NULL) = 14

Fixes: c46234ebb4d1 ("tls: RX path for ktls")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/tls/tls_sw.c                  |  8 +++++++
 tools/testing/selftests/net/tls.c | 49 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 57 insertions(+)

diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index e28a6ff..b0cea79 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -931,7 +931,15 @@ int tls_sw_recvmsg(struct sock *sk,
 				if (control != TLS_RECORD_TYPE_DATA)
 					goto recv_end;
 			}
+		} else {
+			/* MSG_PEEK right now cannot look beyond current skb
+			 * from strparser, meaning we cannot advance skb here
+			 * and thus unpause strparser since we'd loose original
+			 * one.
+			 */
+			break;
 		}
+
 		/* If we have a new message from strparser, continue now. */
 		if (copied >= target && !ctx->recv_pkt)
 			break;
diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c
index b3ebf26..8fdfeaf 100644
--- a/tools/testing/selftests/net/tls.c
+++ b/tools/testing/selftests/net/tls.c
@@ -502,6 +502,55 @@ TEST_F(tls, recv_peek_multiple)
 	EXPECT_EQ(memcmp(test_str, buf, send_len), 0);
 }
 
+TEST_F(tls, recv_peek_multiple_records)
+{
+	char const *test_str = "test_read_peek_mult_recs";
+	char const *test_str_first = "test_read_peek";
+	char const *test_str_second = "_mult_recs";
+	int len;
+	char buf[64];
+
+	len = strlen(test_str_first);
+	EXPECT_EQ(send(self->fd, test_str_first, len, 0), len);
+
+	len = strlen(test_str_second) + 1;
+	EXPECT_EQ(send(self->fd, test_str_second, len, 0), len);
+
+	len = sizeof(buf);
+	memset(buf, 0, len);
+	EXPECT_NE(recv(self->cfd, buf, len, MSG_PEEK), -1);
+
+	/* MSG_PEEK can only peek into the current record. */
+	len = strlen(test_str_first) + 1;
+	EXPECT_EQ(memcmp(test_str_first, buf, len), 0);
+
+	len = sizeof(buf);
+	memset(buf, 0, len);
+	EXPECT_NE(recv(self->cfd, buf, len, 0), -1);
+
+	/* Non-MSG_PEEK will advance strparser (and therefore record)
+	 * however.
+	 */
+	len = strlen(test_str) + 1;
+	EXPECT_EQ(memcmp(test_str, buf, len), 0);
+
+	/* MSG_MORE will hold current record open, so later MSG_PEEK
+	 * will see everything.
+	 */
+	len = strlen(test_str_first);
+	EXPECT_EQ(send(self->fd, test_str_first, len, MSG_MORE), len);
+
+	len = strlen(test_str_second) + 1;
+	EXPECT_EQ(send(self->fd, test_str_second, len, 0), len);
+
+	len = sizeof(buf);
+	memset(buf, 0, len);
+	EXPECT_NE(recv(self->cfd, buf, len, MSG_PEEK), -1);
+
+	len = strlen(test_str) + 1;
+	EXPECT_EQ(memcmp(test_str, buf, len), 0);
+}
+
 TEST_F(tls, pollin)
 {
 	char const *test_str = "test_poll";
-- 
2.9.5

^ permalink raw reply related

* Re: [PATCH net-next 5/5] net: phy: mscc: remove unneeded temporary variable
From: Florian Fainelli @ 2018-09-15  2:19 UTC (permalink / raw)
  To: Quentin Schulz, davem, andrew
  Cc: allan.nielsen, linux-kernel, netdev, thomas.petazzoni
In-Reply-To: <d9cca8eef36bb8918c9ed28574b79b7674fd36f6.1536913944.git-series.quentin.schulz@bootlin.com>



On 09/14/18 01:33, Quentin Schulz wrote:
> Here, the rc variable is either used only for the condition right after
> the assignment or right before being used as the return value of the
> function it's being used in.
> 
> So let's remove this unneeded temporary variable whenever possible.
> 
> Signed-off-by: Quentin Schulz <quentin.schulz@bootlin.com>

Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
-- 
Florian

^ permalink raw reply

* Re: [PATCH net-next 2/5] net: phy: mscc: Add EEE init sequence
From: Florian Fainelli @ 2018-09-15  2:21 UTC (permalink / raw)
  To: Quentin Schulz, davem, andrew
  Cc: allan.nielsen, linux-kernel, netdev, thomas.petazzoni,
	Raju Lakkaraju
In-Reply-To: <64809c5f01f3c6407257553a286b82949cef1ac0.1536913944.git-series.quentin.schulz@bootlin.com>



On 09/14/18 01:33, Quentin Schulz wrote:
> From: Raju Lakkaraju <Raju.Lakkaraju@microchip.com>
> 
> Microsemi PHYs (VSC 8530/31/40/41) need to update the Energy Efficient
> Ethernet initialization sequence.
> In order to avoid certain link state errors that could result in link
> drops and packet loss, the physical coding sublayer (PCS) must be
> updated with settings related to EEE in order to improve performance.
> 
> Signed-off-by: Raju Lakkaraju <Raju.Lakkaraju@microchip.com>
> Signed-off-by: Quentin Schulz <quentin.schulz@bootlin.com>
> ---

[snip]

> +	vsc85xx_tr_write(phydev, 0x0f82, 0x0012b00a);

Can you just make this an array of register + value pair? That would be
less error prone in case you need to update that sequence in the future.

With that:

Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
-- 
Florian

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox