Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH v3 net-next 1/8] flow_dissector: Change skbuf argument to be non const
From: Tom Herbert @ 2017-09-28 21:48 UTC (permalink / raw)
  To: davem; +Cc: netdev, rohit, Tom Herbert

Change the skbuf argument of __skb_flow_dissect to be non constant so
that the function can call functions that take non constant skbuf
arguments. This is needed if we are to call socket lookup or BPF in the
flow dissector path.

The changes include unraveling the call chain into __skb_flow_dissect so
that its callers also take non-constant skbufs.

Signed-off-by: Tom Herbert <tom@quantonium.net>
---
 include/linux/skbuff.h    | 12 ++++++------
 include/net/ip_fib.h      |  4 ++--
 include/net/route.h       |  4 ++--
 net/core/flow_dissector.c | 10 +++++-----
 net/ipv4/fib_semantics.c  |  2 +-
 net/ipv4/route.c          |  6 +++---
 net/sched/sch_sfq.c       |  2 +-
 7 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 19e64bfb1a66..5a6e765e120f 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1155,8 +1155,8 @@ __skb_set_sw_hash(struct sk_buff *skb, __u32 hash, bool is_l4)
 }
 
 void __skb_get_hash(struct sk_buff *skb);
-u32 __skb_get_hash_symmetric(const struct sk_buff *skb);
-u32 skb_get_poff(const struct sk_buff *skb);
+u32 __skb_get_hash_symmetric(struct sk_buff *skb);
+u32 skb_get_poff(struct sk_buff *skb);
 u32 __skb_get_poff(const struct sk_buff *skb, void *data,
 		   const struct flow_keys *keys, int hlen);
 __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
@@ -1172,13 +1172,13 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
 			     const struct flow_dissector_key *key,
 			     unsigned int key_count);
 
-bool __skb_flow_dissect(const struct sk_buff *skb,
+bool __skb_flow_dissect(struct sk_buff *skb,
 			struct flow_dissector *flow_dissector,
 			void *target_container,
 			void *data, __be16 proto, int nhoff, int hlen,
 			unsigned int flags);
 
-static inline bool skb_flow_dissect(const struct sk_buff *skb,
+static inline bool skb_flow_dissect(struct sk_buff *skb,
 				    struct flow_dissector *flow_dissector,
 				    void *target_container, unsigned int flags)
 {
@@ -1186,7 +1186,7 @@ static inline bool skb_flow_dissect(const struct sk_buff *skb,
 				  NULL, 0, 0, 0, flags);
 }
 
-static inline bool skb_flow_dissect_flow_keys(const struct sk_buff *skb,
+static inline bool skb_flow_dissect_flow_keys(struct sk_buff *skb,
 					      struct flow_keys *flow,
 					      unsigned int flags)
 {
@@ -1225,7 +1225,7 @@ static inline __u32 skb_get_hash_flowi6(struct sk_buff *skb, const struct flowi6
 	return skb->hash;
 }
 
-__u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb);
+__u32 skb_get_hash_perturb(struct sk_buff *skb, u32 perturb);
 
 static inline __u32 skb_get_hash_raw(const struct sk_buff *skb)
 {
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 1a7f7e424320..a376dfe1ad44 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -374,11 +374,11 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags);
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
-		       const struct sk_buff *skb);
+		       struct sk_buff *skb);
 #endif
 void fib_select_multipath(struct fib_result *res, int hash);
 void fib_select_path(struct net *net, struct fib_result *res,
-		     struct flowi4 *fl4, const struct sk_buff *skb);
+		     struct flowi4 *fl4, struct sk_buff *skb);
 
 /* Exported by fib_trie.c */
 void fib_trie_init(void);
diff --git a/include/net/route.h b/include/net/route.h
index 57dfc6850d37..cb95b79f0117 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -114,10 +114,10 @@ int ip_rt_init(void);
 void rt_cache_flush(struct net *net);
 void rt_flush_dev(struct net_device *dev);
 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *flp,
-					const struct sk_buff *skb);
+					struct sk_buff *skb);
 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *flp,
 					    struct fib_result *res,
-					    const struct sk_buff *skb);
+					    struct sk_buff *skb);
 
 static inline struct rtable *__ip_route_output_key(struct net *net,
 						   struct flowi4 *flp)
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 0a977373d003..76f5e5bc3177 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -424,7 +424,7 @@ static bool skb_flow_dissect_allowed(int *num_hdrs)
  *
  * Caller must take care of zeroing target container memory.
  */
-bool __skb_flow_dissect(const struct sk_buff *skb,
+bool __skb_flow_dissect(struct sk_buff *skb,
 			struct flow_dissector *flow_dissector,
 			void *target_container,
 			void *data, __be16 proto, int nhoff, int hlen,
@@ -1015,7 +1015,7 @@ u32 flow_hash_from_keys(struct flow_keys *keys)
 }
 EXPORT_SYMBOL(flow_hash_from_keys);
 
-static inline u32 ___skb_get_hash(const struct sk_buff *skb,
+static inline u32 ___skb_get_hash(struct sk_buff *skb,
 				  struct flow_keys *keys, u32 keyval)
 {
 	skb_flow_dissect_flow_keys(skb, keys,
@@ -1053,7 +1053,7 @@ EXPORT_SYMBOL(make_flow_keys_digest);
 
 static struct flow_dissector flow_keys_dissector_symmetric __read_mostly;
 
-u32 __skb_get_hash_symmetric(const struct sk_buff *skb)
+u32 __skb_get_hash_symmetric(struct sk_buff *skb)
 {
 	struct flow_keys keys;
 
@@ -1090,7 +1090,7 @@ void __skb_get_hash(struct sk_buff *skb)
 }
 EXPORT_SYMBOL(__skb_get_hash);
 
-__u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb)
+__u32 skb_get_hash_perturb(struct sk_buff *skb, u32 perturb)
 {
 	struct flow_keys keys;
 
@@ -1158,7 +1158,7 @@ u32 __skb_get_poff(const struct sk_buff *skb, void *data,
  * truncate packets without needing to push actual payload to the user
  * space and can analyze headers only, instead.
  */
-u32 skb_get_poff(const struct sk_buff *skb)
+u32 skb_get_poff(struct sk_buff *skb)
 {
 	struct flow_keys keys;
 
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 57a5d48acee8..dc610646bc4c 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1759,7 +1759,7 @@ void fib_select_multipath(struct fib_result *res, int hash)
 #endif
 
 void fib_select_path(struct net *net, struct fib_result *res,
-		     struct flowi4 *fl4, const struct sk_buff *skb)
+		     struct flowi4 *fl4, struct sk_buff *skb)
 {
 	bool oif_check;
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 94d4cd2d5ea4..94c5b81d8f2b 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1791,7 +1791,7 @@ static void ip_multipath_l3_keys(const struct sk_buff *skb,
 
 /* if skb is set it will be used and fl4 can be NULL */
 int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
-		       const struct sk_buff *skb)
+		       struct sk_buff *skb)
 {
 	struct net *net = fi->fib_net;
 	struct flow_keys hash_keys;
@@ -2270,7 +2270,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
  */
 
 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
-					const struct sk_buff *skb)
+					struct sk_buff *skb)
 {
 	__u8 tos = RT_FL_TOS(fl4);
 	struct fib_result res;
@@ -2295,7 +2295,7 @@ EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
 
 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
 					    struct fib_result *res,
-					    const struct sk_buff *skb)
+					    struct sk_buff *skb)
 {
 	struct net_device *dev_out = NULL;
 	int orig_oif = fl4->flowi4_oif;
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 74ea863b8240..0d2d3a8d03f0 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -158,7 +158,7 @@ static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index
 }
 
 static unsigned int sfq_hash(const struct sfq_sched_data *q,
-			     const struct sk_buff *skb)
+			     struct sk_buff *skb)
 {
 	return skb_get_hash_perturb(skb, q->perturbation) & (q->divisor - 1);
 }
-- 
2.11.0

^ permalink raw reply related

* [PATCH v3 net-next 2/8] flow_dissector: Move ETH_P_TEB processing to main switch
From: Tom Herbert @ 2017-09-28 21:48 UTC (permalink / raw)
  To: davem; +Cc: netdev, rohit, Tom Herbert
In-Reply-To: <20170928214823.2426-1-tom@quantonium.net>

Support for processing TEB is currently in GRE flow dissection as a
special case. This can be moved to be a case in the main proto switch in
__skb_flow_dissect.

Signed-off-by: Tom Herbert <tom@quantonium.net>
---
 net/core/flow_dissector.c | 45 ++++++++++++++++++++++++---------------------
 1 file changed, 24 insertions(+), 21 deletions(-)

diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 76f5e5bc3177..c15b41f96cbe 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -282,27 +282,8 @@ __skb_flow_dissect_gre(const struct sk_buff *skb,
 	if (hdr->flags & GRE_SEQ)
 		offset += sizeof(((struct pptp_gre_header *) 0)->seq);
 
-	if (gre_ver == 0) {
-		if (*p_proto == htons(ETH_P_TEB)) {
-			const struct ethhdr *eth;
-			struct ethhdr _eth;
-
-			eth = __skb_header_pointer(skb, *p_nhoff + offset,
-						   sizeof(_eth),
-						   data, *p_hlen, &_eth);
-			if (!eth)
-				return FLOW_DISSECT_RET_OUT_BAD;
-			*p_proto = eth->h_proto;
-			offset += sizeof(*eth);
-
-			/* Cap headers that we access via pointers at the
-			 * end of the Ethernet header as our maximum alignment
-			 * at that point is only 2 bytes.
-			 */
-			if (NET_IP_ALIGN)
-				*p_hlen = *p_nhoff + offset;
-		}
-	} else { /* version 1, must be PPTP */
+	/* version 1, must be PPTP */
+	if (gre_ver == 1) {
 		u8 _ppp_hdr[PPP_HDRLEN];
 		u8 *ppp_hdr;
 
@@ -595,6 +576,28 @@ bool __skb_flow_dissect(struct sk_buff *skb,
 
 		break;
 	}
+	case htons(ETH_P_TEB): {
+		const struct ethhdr *eth;
+		struct ethhdr _eth;
+
+		eth = __skb_header_pointer(skb, nhoff, sizeof(_eth),
+					   data, hlen, &_eth);
+		if (!eth)
+			goto out_bad;
+
+		proto = eth->h_proto;
+		nhoff += sizeof(*eth);
+
+		/* Cap headers that we access via pointers at the
+		 * end of the Ethernet header as our maximum alignment
+		 * at that point is only 2 bytes.
+		 */
+		if (NET_IP_ALIGN)
+			hlen = nhoff;
+
+		fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
+		break;
+	}
 	case htons(ETH_P_8021AD):
 	case htons(ETH_P_8021Q): {
 		const struct vlan_hdr *vlan;
-- 
2.11.0

^ permalink raw reply related

* [PATCH v3 net-next 3/8] udp: Check static key udp_encap_needed in udp_gro_receive
From: Tom Herbert @ 2017-09-28 21:48 UTC (permalink / raw)
  To: davem; +Cc: netdev, rohit, Tom Herbert
In-Reply-To: <20170928214823.2426-1-tom@quantonium.net>

Currently, the only support for udp gro is provided by UDP encapsulation
protocols. Since they always set udp_encap_needed we can check that in
udp_gro_receive functions before performing a socket lookup.

Signed-off-by: Tom Herbert <tom@quantonium.net>
---
 include/net/udp.h      | 2 ++
 net/ipv4/udp.c         | 4 +++-
 net/ipv4/udp_offload.c | 7 +++++++
 net/ipv6/udp_offload.c | 7 +++++++
 4 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/include/net/udp.h b/include/net/udp.h
index 12dfbfe2e2d7..c6b1c5d8d3c9 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -97,6 +97,8 @@ static inline struct udp_hslot *udp_hashslot2(struct udp_table *table,
 
 extern struct proto udp_prot;
 
+extern struct static_key udp_encap_needed;
+
 extern atomic_long_t udp_memory_allocated;
 
 /* sysctl variables for udp */
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 784ced0b9150..2788843e8eb2 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1813,7 +1813,9 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 	return 0;
 }
 
-static struct static_key udp_encap_needed __read_mostly;
+struct static_key udp_encap_needed __read_mostly;
+EXPORT_SYMBOL(udp_encap_needed);
+
 void udp_encap_enable(void)
 {
 	static_key_enable(&udp_encap_needed);
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 97658bfc1b58..a744bb515455 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -261,6 +261,13 @@ static struct sk_buff **udp4_gro_receive(struct sk_buff **head,
 {
 	struct udphdr *uh = udp_gro_udphdr(skb);
 
+	if (!static_key_false(&udp_encap_needed)) {
+		/* Currently udp_gro_receive only does something if
+		 * a UDP encapsulation has been set.
+		 */
+		goto flush;
+	}
+
 	if (unlikely(!uh))
 		goto flush;
 
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 455fd4e39333..111b026e4f03 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -34,6 +34,13 @@ static struct sk_buff **udp6_gro_receive(struct sk_buff **head,
 {
 	struct udphdr *uh = udp_gro_udphdr(skb);
 
+	if (!static_key_false(&udp_encap_needed)) {
+		/* Currently udp_gro_receive only does something if
+		 * a UDP encapsulation has been set.
+		 */
+		goto flush;
+	}
+
 	if (unlikely(!uh))
 		goto flush;
 
-- 
2.11.0

^ permalink raw reply related

* [PATCH v3 net-next 4/8] flow_dissector: Add protocol specific flow dissection offload
From: Tom Herbert @ 2017-09-28 21:48 UTC (permalink / raw)
  To: davem; +Cc: netdev, rohit, Tom Herbert
In-Reply-To: <20170928214823.2426-1-tom@quantonium.net>

Add offload capability for performing protocol specific flow dissection
(either by EtherType or IP protocol).

Specifically:

- Add flow_dissect to offload callbacks
- Move flow_dissect_ret enum to flow_dissector.h, clean up names and add a
  couple of values
- Unify handling of functions that return flow_dissect_ret enum
- In __skb_flow_dissect, add default case for switch(proto) as well as
  switch(ip_proto) that looks up and calls protocol specific flow
  dissection

Signed-off-by: Tom Herbert <tom@quantonium.net>
---
 include/linux/netdevice.h    | 27 ++++++++++++++++++
 include/net/flow_dissector.h |  1 +
 net/core/dev.c               | 65 ++++++++++++++++++++++++++++++++++++++++++++
 net/core/flow_dissector.c    | 16 +++++++++--
 net/ipv4/route.c             |  4 ++-
 5 files changed, 110 insertions(+), 3 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f535779d9dc1..565d7cdfe967 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2207,12 +2207,25 @@ struct offload_callbacks {
 	struct sk_buff		**(*gro_receive)(struct sk_buff **head,
 						 struct sk_buff *skb);
 	int			(*gro_complete)(struct sk_buff *skb, int nhoff);
+	enum flow_dissect_ret (*flow_dissect)(struct sk_buff *skb,
+			struct flow_dissector_key_control *key_control,
+			struct flow_dissector *flow_dissector,
+			void *target_container, void *data,
+			__be16 *p_proto, u8 *p_ip_proto, int *p_nhoff,
+			int *p_hlen, unsigned int flags);
 };
 
 struct packet_offload {
 	__be16			 type;	/* This is really htons(ether_type). */
 	u16			 priority;
 	struct offload_callbacks callbacks;
+	enum flow_dissect_ret (*proto_flow_dissect)(struct sk_buff *skb,
+			u8 proto,
+			struct flow_dissector_key_control *key_control,
+			struct flow_dissector *flow_dissector,
+			void *target_container, void *data,
+			__be16 *p_proto, u8 *p_ip_proto, int *p_nhoff,
+			int *p_hlen, unsigned int flags);
 	struct list_head	 list;
 };
 
@@ -3252,6 +3265,20 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi);
 gro_result_t napi_gro_frags(struct napi_struct *napi);
 struct packet_offload *gro_find_receive_by_type(__be16 type);
 struct packet_offload *gro_find_complete_by_type(__be16 type);
+enum flow_dissect_ret flow_dissect_by_type(struct sk_buff *skb,
+			__be16 type,
+			struct flow_dissector_key_control *key_control,
+			struct flow_dissector *flow_dissector,
+			void *target_container, void *data,
+			__be16 *p_proto, u8 *p_ip_proto, int *p_nhoff,
+			int *p_hlen, unsigned int flags);
+enum flow_dissect_ret flow_dissect_by_type_proto(struct sk_buff *skb,
+			__be16 type, u8 proto,
+			struct flow_dissector_key_control *key_control,
+			struct flow_dissector *flow_dissector,
+			void *target_container, void *data,
+			__be16 *p_proto, u8 *p_ip_proto, int *p_nhoff,
+			int *p_hlen, unsigned int flags);
 
 static inline void napi_free_frags(struct napi_struct *napi)
 {
diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index fc3dce730a6b..ad75bbfd1c9c 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -213,6 +213,7 @@ enum flow_dissector_key_id {
 #define FLOW_DISSECTOR_F_STOP_AT_L3		BIT(1)
 #define FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL	BIT(2)
 #define FLOW_DISSECTOR_F_STOP_AT_ENCAP		BIT(3)
+#define FLOW_DISSECTOR_F_STOP_AT_L4		BIT(4)
 
 struct flow_dissector_key {
 	enum flow_dissector_key_id key_id;
diff --git a/net/core/dev.c b/net/core/dev.c
index e350c768d4b5..f3cd884bd04b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -104,6 +104,7 @@
 #include <linux/stat.h>
 #include <net/dst.h>
 #include <net/dst_metadata.h>
+#include <net/flow_dissector.h>
 #include <net/pkt_sched.h>
 #include <net/pkt_cls.h>
 #include <net/checksum.h>
@@ -4907,6 +4908,70 @@ struct packet_offload *gro_find_complete_by_type(__be16 type)
 }
 EXPORT_SYMBOL(gro_find_complete_by_type);
 
+enum flow_dissect_ret flow_dissect_by_type(struct sk_buff *skb,
+			__be16 type,
+			struct flow_dissector_key_control *key_control,
+			struct flow_dissector *flow_dissector,
+			void *target_container, void *data,
+			__be16 *p_proto, u8 *p_ip_proto, int *p_nhoff,
+			int *p_hlen, unsigned int flags)
+{
+	enum flow_dissect_ret ret = FLOW_DISSECT_RET_CONTINUE;
+	struct list_head *offload_head = &offload_base;
+	struct packet_offload *ptype;
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(ptype, offload_head, list) {
+		if (ptype->type != type || !ptype->callbacks.flow_dissect)
+			continue;
+		ret = ptype->callbacks.flow_dissect(skb, key_control,
+						    flow_dissector,
+						    target_container,
+						    data, p_proto,
+						    p_ip_proto, p_nhoff,
+						    p_hlen, flags);
+		break;
+	}
+
+	rcu_read_unlock();
+
+	return ret;
+}
+EXPORT_SYMBOL(flow_dissect_by_type);
+
+enum flow_dissect_ret flow_dissect_by_type_proto(struct sk_buff *skb,
+			__be16 type, u8 proto,
+			struct flow_dissector_key_control *key_control,
+			struct flow_dissector *flow_dissector,
+			void *target_container, void *data,
+			__be16 *p_proto, u8 *p_ip_proto, int *p_nhoff,
+			int *p_hlen, unsigned int flags)
+{
+	enum flow_dissect_ret ret = FLOW_DISSECT_RET_CONTINUE;
+	struct list_head *offload_head = &offload_base;
+	struct packet_offload *ptype;
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(ptype, offload_head, list) {
+		if (ptype->type != type || !ptype->proto_flow_dissect)
+			continue;
+		ret = ptype->proto_flow_dissect(skb, proto, key_control,
+						    flow_dissector,
+						    target_container,
+						    data, p_proto,
+						    p_ip_proto, p_nhoff,
+						    p_hlen, flags);
+		break;
+	}
+
+	rcu_read_unlock();
+
+	return ret;
+}
+EXPORT_SYMBOL(flow_dissect_by_type_proto);
+
 static void napi_skb_free_stolen_head(struct sk_buff *skb)
 {
 	skb_dst_drop(skb);
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index c15b41f96cbe..84b8eb1f6664 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -9,6 +9,7 @@
 #include <net/ipv6.h>
 #include <net/gre.h>
 #include <net/pptp.h>
+#include <net/protocol.h>
 #include <linux/igmp.h>
 #include <linux/icmp.h>
 #include <linux/sctp.h>
@@ -721,7 +722,11 @@ bool __skb_flow_dissect(struct sk_buff *skb,
 		break;
 
 	default:
-		fdret = FLOW_DISSECT_RET_OUT_BAD;
+		fdret = flow_dissect_by_type(skb, proto, key_control,
+					     flow_dissector,
+					     target_container,
+					     data, &proto, &ip_proto, &nhoff,
+					     &hlen, flags);
 		break;
 	}
 
@@ -838,6 +843,12 @@ bool __skb_flow_dissect(struct sk_buff *skb,
 		break;
 
 	default:
+		fdret = flow_dissect_by_type_proto(skb, proto,
+						ip_proto, key_control,
+						flow_dissector,
+						target_container,
+						data, &proto, &ip_proto, &nhoff,
+						&hlen, flags);
 		break;
 	}
 
@@ -1022,7 +1033,8 @@ static inline u32 ___skb_get_hash(struct sk_buff *skb,
 				  struct flow_keys *keys, u32 keyval)
 {
 	skb_flow_dissect_flow_keys(skb, keys,
-				   FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
+				   FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL |
+				   FLOW_DISSECTOR_F_STOP_AT_L4);
 
 	return __flow_hash_from_keys(keys, keyval);
 }
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 94c5b81d8f2b..69d6ce7dfa18 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1811,7 +1811,9 @@ int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
 	case 1:
 		/* skb is currently provided only when forwarding */
 		if (skb) {
-			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
+			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP |
+					    FLOW_DISSECTOR_F_STOP_AT_L4;
+;
 			struct flow_keys keys;
 
 			/* short-circuit if we already have L4 hash present */
-- 
2.11.0

^ permalink raw reply related

* [PATCH v3 net-next 5/8] ip: Add callbacks to flow dissection by IP protocol
From: Tom Herbert @ 2017-09-28 21:48 UTC (permalink / raw)
  To: davem; +Cc: netdev, rohit, Tom Herbert
In-Reply-To: <20170928214823.2426-1-tom@quantonium.net>

Populate the proto_flow_dissect function for IPv4 and IPv6 packet
offloads. This allows the caller to flow dissect a packet starting
at the given IP protocol (as parsed to that point by flow dissector
for instance).

Signed-off-by: Tom Herbert <tom@quantonium.net>
---
 net/ipv4/af_inet.c     | 27 +++++++++++++++++++++++++++
 net/ipv6/ip6_offload.c | 27 +++++++++++++++++++++++++++
 2 files changed, 54 insertions(+)

diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index e31108e5ef79..18c1d884999a 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1440,6 +1440,32 @@ static struct sk_buff **ipip_gro_receive(struct sk_buff **head,
 	return inet_gro_receive(head, skb);
 }
 
+static enum flow_dissect_ret inet_proto_flow_dissect(struct sk_buff *skb,
+			u8 proto,
+			struct flow_dissector_key_control *key_control,
+			struct flow_dissector *flow_dissector,
+			void *target_container, void *data,
+			__be16 *p_proto, u8 *p_ip_proto, int *p_nhoff,
+			int *p_hlen, unsigned int flags)
+{
+	enum flow_dissect_ret ret = FLOW_DISSECT_RET_CONTINUE;
+	const struct net_offload *ops;
+
+	rcu_read_lock();
+
+	ops = rcu_dereference(inet_offloads[proto]);
+	if (ops && ops->callbacks.flow_dissect)
+		ret =  ops->callbacks.flow_dissect(skb, key_control,
+						   flow_dissector,
+						   target_container,
+						   data, p_proto, p_ip_proto,
+						   p_nhoff, p_hlen, flags);
+
+	rcu_read_unlock();
+
+	return ret;
+}
+
 #define SECONDS_PER_DAY	86400
 
 /* inet_current_timestamp - Return IP network timestamp
@@ -1763,6 +1789,7 @@ static int ipv4_proc_init(void);
 
 static struct packet_offload ip_packet_offload __read_mostly = {
 	.type = cpu_to_be16(ETH_P_IP),
+	.proto_flow_dissect = inet_proto_flow_dissect,
 	.callbacks = {
 		.gso_segment = inet_gso_segment,
 		.gro_receive = inet_gro_receive,
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index cdb3728faca7..a33a2b40b3d6 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -339,8 +339,35 @@ static int ip4ip6_gro_complete(struct sk_buff *skb, int nhoff)
 	return inet_gro_complete(skb, nhoff);
 }
 
+static enum flow_dissect_ret inet6_proto_flow_dissect(struct sk_buff *skb,
+			u8 proto,
+			struct flow_dissector_key_control *key_control,
+			struct flow_dissector *flow_dissector,
+			void *target_container, void *data,
+			__be16 *p_proto, u8 *p_ip_proto, int *p_nhoff,
+			int *p_hlen, unsigned int flags)
+{
+	enum flow_dissect_ret ret = FLOW_DISSECT_RET_CONTINUE;
+	const struct net_offload *ops;
+
+	rcu_read_lock();
+
+	ops = rcu_dereference(inet6_offloads[proto]);
+	if (ops && ops->callbacks.flow_dissect)
+		ret =  ops->callbacks.flow_dissect(skb, key_control,
+						   flow_dissector,
+						   target_container, data,
+						   p_proto, p_ip_proto, p_nhoff,
+						   p_hlen, flags);
+
+	rcu_read_unlock();
+
+	return ret;
+}
+
 static struct packet_offload ipv6_packet_offload __read_mostly = {
 	.type = cpu_to_be16(ETH_P_IPV6),
+	.proto_flow_dissect = inet6_proto_flow_dissect,
 	.callbacks = {
 		.gso_segment = ipv6_gso_segment,
 		.gro_receive = ipv6_gro_receive,
-- 
2.11.0

^ permalink raw reply related

* [PATCH v3 net-next 6/8] udp: flow dissector offload
From: Tom Herbert @ 2017-09-28 21:48 UTC (permalink / raw)
  To: davem; +Cc: netdev, rohit, Tom Herbert
In-Reply-To: <20170928214823.2426-1-tom@quantonium.net>

Add support to perform UDP specific flow dissection. This is
primarily intended for dissecting encapsulated packets in UDP
encapsulation.

This patch adds a flow_dissect offload for UDP4 and UDP6. The backend
function performs a socket lookup and calls the flow_dissect function
if a socket is found.

Signed-off-by: Tom Herbert <tom@quantonium.net>
---
 include/linux/udp.h      |  8 ++++++++
 include/net/udp.h        |  8 ++++++++
 include/net/udp_tunnel.h |  8 ++++++++
 net/ipv4/udp_offload.c   | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/udp_tunnel.c    |  1 +
 net/ipv6/udp_offload.c   | 16 ++++++++++++++++
 6 files changed, 89 insertions(+)

diff --git a/include/linux/udp.h b/include/linux/udp.h
index eaea63bc79bb..2e90b189ef6a 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -79,6 +79,14 @@ struct udp_sock {
 	int			(*gro_complete)(struct sock *sk,
 						struct sk_buff *skb,
 						int nhoff);
+	/* Flow dissector function for a UDP socket */
+	enum flow_dissect_ret (*flow_dissect)(struct sock *sk,
+			const struct sk_buff *skb,
+			struct flow_dissector_key_control *key_control,
+			struct flow_dissector *flow_dissector,
+			void *target_container, void *data,
+			__be16 *p_proto, u8 *p_ip_proto, int *p_nhoff,
+			int *p_hlen, unsigned int flags);
 
 	/* udp_recvmsg try to use this before splicing sk_receive_queue */
 	struct sk_buff_head	reader_queue ____cacheline_aligned_in_smp;
diff --git a/include/net/udp.h b/include/net/udp.h
index c6b1c5d8d3c9..4867f329538c 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -176,6 +176,14 @@ struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
 				 struct udphdr *uh, udp_lookup_t lookup);
 int udp_gro_complete(struct sk_buff *skb, int nhoff, udp_lookup_t lookup);
 
+enum flow_dissect_ret udp_flow_dissect(struct sk_buff *skb,
+			udp_lookup_t lookup,
+			struct flow_dissector_key_control *key_control,
+			struct flow_dissector *flow_dissector,
+			void *target_container, void *data,
+			__be16 *p_proto, u8 *p_ip_proto, int *p_nhoff,
+			int *p_hlen, unsigned int flags);
+
 static inline struct udphdr *udp_gro_udphdr(struct sk_buff *skb)
 {
 	struct udphdr *uh;
diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index 10cce0dd4450..b7102e0f41a9 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -69,6 +69,13 @@ typedef struct sk_buff **(*udp_tunnel_gro_receive_t)(struct sock *sk,
 						     struct sk_buff *skb);
 typedef int (*udp_tunnel_gro_complete_t)(struct sock *sk, struct sk_buff *skb,
 					 int nhoff);
+typedef enum flow_dissect_ret (*udp_tunnel_flow_dissect_t)(struct sock *sk,
+			const struct sk_buff *skb,
+			struct flow_dissector_key_control *key_control,
+			struct flow_dissector *flow_dissector,
+			void *target_container, void *data,
+			__be16 *p_proto, u8 *p_ip_proto, int *p_nhoff,
+			int *p_hlen, unsigned int flags);
 
 struct udp_tunnel_sock_cfg {
 	void *sk_user_data;     /* user data used by encap_rcv call back */
@@ -78,6 +85,7 @@ struct udp_tunnel_sock_cfg {
 	udp_tunnel_encap_destroy_t encap_destroy;
 	udp_tunnel_gro_receive_t gro_receive;
 	udp_tunnel_gro_complete_t gro_complete;
+	udp_tunnel_flow_dissect_t flow_dissect;
 };
 
 /* Setup the given (UDP) sock to receive UDP encapsulated packets */
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index a744bb515455..fddf923ef433 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -335,11 +335,59 @@ static int udp4_gro_complete(struct sk_buff *skb, int nhoff)
 	return udp_gro_complete(skb, nhoff, udp4_lib_lookup_skb);
 }
 
+enum flow_dissect_ret udp_flow_dissect(struct sk_buff *skb,
+			udp_lookup_t lookup,
+			struct flow_dissector_key_control *key_control,
+			struct flow_dissector *flow_dissector,
+			void *target_container, void *data,
+			__be16 *p_proto, u8 *p_ip_proto, int *p_nhoff,
+			int *p_hlen, unsigned int flags)
+{
+	enum flow_dissect_ret ret = FLOW_DISSECT_RET_CONTINUE;
+	struct udphdr *uh, _uh;
+	struct sock *sk;
+
+	uh = __skb_header_pointer(skb, *p_nhoff, sizeof(_uh), data,
+				  *p_hlen, &_uh);
+	if (!uh)
+		return FLOW_DISSECT_RET_OUT_BAD;
+
+	rcu_read_lock();
+
+	sk = (*lookup)(skb, uh->source, uh->dest);
+
+	if (sk && udp_sk(sk)->flow_dissect)
+		ret = udp_sk(sk)->flow_dissect(sk, skb, key_control,
+					       flow_dissector, target_container,
+					       data, p_proto, p_ip_proto,
+					       p_nhoff, p_hlen, flags);
+	rcu_read_unlock();
+
+	return ret;
+}
+EXPORT_SYMBOL(udp_flow_dissect);
+
+static enum flow_dissect_ret udp4_flow_dissect(struct sk_buff *skb,
+			struct flow_dissector_key_control *key_control,
+			struct flow_dissector *flow_dissector,
+			void *target_container, void *data,
+			__be16 *p_proto, u8 *p_ip_proto, int *p_nhoff,
+			int *p_hlen, unsigned int flags)
+{
+	if (!static_key_false(&udp_encap_needed))
+		return FLOW_DISSECT_RET_CONTINUE;
+
+	return udp_flow_dissect(skb, udp4_lib_lookup_skb, key_control,
+				flow_dissector, target_container, data,
+				p_proto, p_ip_proto, p_nhoff, p_hlen, flags);
+}
+
 static const struct net_offload udpv4_offload = {
 	.callbacks = {
 		.gso_segment = udp4_tunnel_segment,
 		.gro_receive  =	udp4_gro_receive,
 		.gro_complete =	udp4_gro_complete,
+		.flow_dissect = udp4_flow_dissect,
 	},
 };
 
diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c
index 6539ff15e9a3..a4eec2a044d2 100644
--- a/net/ipv4/udp_tunnel.c
+++ b/net/ipv4/udp_tunnel.c
@@ -71,6 +71,7 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock,
 	udp_sk(sk)->encap_destroy = cfg->encap_destroy;
 	udp_sk(sk)->gro_receive = cfg->gro_receive;
 	udp_sk(sk)->gro_complete = cfg->gro_complete;
+	udp_sk(sk)->flow_dissect = cfg->flow_dissect;
 
 	udp_tunnel_encap_enable(sock);
 }
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 111b026e4f03..45b77f92d77d 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -80,11 +80,27 @@ static int udp6_gro_complete(struct sk_buff *skb, int nhoff)
 	return udp_gro_complete(skb, nhoff, udp6_lib_lookup_skb);
 }
 
+static enum flow_dissect_ret udp6_flow_dissect(struct sk_buff *skb,
+			struct flow_dissector_key_control *key_control,
+			struct flow_dissector *flow_dissector,
+			void *target_container, void *data,
+			__be16 *p_proto, u8 *p_ip_proto, int *p_nhoff,
+			int *p_hlen, unsigned int flags)
+{
+	if (!static_key_false(&udp_encap_needed))
+		return FLOW_DISSECT_RET_CONTINUE;
+
+	return udp_flow_dissect(skb, udp6_lib_lookup_skb, key_control,
+				flow_dissector, target_container, data,
+				p_proto, p_ip_proto, p_nhoff, p_hlen, flags);
+}
+
 static const struct net_offload udpv6_offload = {
 	.callbacks = {
 		.gso_segment	=	udp6_tunnel_segment,
 		.gro_receive	=	udp6_gro_receive,
 		.gro_complete	=	udp6_gro_complete,
+		.flow_dissect	=	udp6_flow_dissect,
 	},
 };
 
-- 
2.11.0

^ permalink raw reply related

* [PATCH v3 net-next 7/8] fou: Support flow dissection
From: Tom Herbert @ 2017-09-28 21:48 UTC (permalink / raw)
  To: davem; +Cc: netdev, rohit, Tom Herbert
In-Reply-To: <20170928214823.2426-1-tom@quantonium.net>

Populate offload flow_dissect callback appropriately for fou and gue.

Signed-off-by: Tom Herbert <tom@quantonium.net>
---
 net/ipv4/fou.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)

diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index 1540db65241a..a831dd49fb28 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -282,6 +282,20 @@ static int fou_gro_complete(struct sock *sk, struct sk_buff *skb,
 	return err;
 }
 
+static enum flow_dissect_ret fou_flow_dissect(struct sock *sk,
+			const struct sk_buff *skb,
+			struct flow_dissector_key_control *key_control,
+			struct flow_dissector *flow_dissector,
+			void *target_container, void *data,
+			__be16 *p_proto, u8 *p_ip_proto, int *p_nhoff,
+			int *p_hlen, unsigned int flags)
+{
+	*p_ip_proto = fou_from_sock(sk)->protocol;
+	*p_nhoff += sizeof(struct udphdr);
+
+	return FLOW_DISSECT_RET_IPPROTO_AGAIN;
+}
+
 static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off,
 				      struct guehdr *guehdr, void *data,
 				      size_t hdrlen, struct gro_remcsum *grc,
@@ -500,6 +514,53 @@ static int gue_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff)
 	return err;
 }
 
+static enum flow_dissect_ret gue_flow_dissect(struct sock *sk,
+			const struct sk_buff *skb,
+			struct flow_dissector_key_control *key_control,
+			struct flow_dissector *flow_dissector,
+			void *target_container, void *data,
+			__be16 *p_proto, u8 *p_ip_proto, int *p_nhoff,
+			int *p_hlen, unsigned int flags)
+{
+	struct guehdr *guehdr, _guehdr;
+
+	guehdr = __skb_header_pointer(skb, *p_nhoff + sizeof(struct udphdr),
+				      sizeof(_guehdr), data, *p_hlen, &_guehdr);
+	if (!guehdr)
+		return FLOW_DISSECT_RET_OUT_BAD;
+
+	switch (guehdr->version) {
+	case 0:
+		if (unlikely(guehdr->control))
+			return FLOW_DISSECT_RET_CONTINUE;
+
+		*p_ip_proto = guehdr->proto_ctype;
+		*p_nhoff += sizeof(struct udphdr) +
+		    sizeof(*guehdr) + (guehdr->hlen << 2);
+
+		break;
+	case 1:
+		switch (((struct iphdr *)guehdr)->version) {
+		case 4:
+			*p_ip_proto = IPPROTO_IPIP;
+			break;
+		case 6:
+			*p_ip_proto = IPPROTO_IPV6;
+			break;
+		default:
+			return FLOW_DISSECT_RET_CONTINUE;
+		}
+
+		*p_nhoff += sizeof(struct udphdr);
+
+		break;
+	default:
+		return FLOW_DISSECT_RET_CONTINUE;
+	}
+
+	return FLOW_DISSECT_RET_IPPROTO_AGAIN;
+}
+
 static int fou_add_to_port_list(struct net *net, struct fou *fou)
 {
 	struct fou_net *fn = net_generic(net, fou_net_id);
@@ -570,12 +631,14 @@ static int fou_create(struct net *net, struct fou_cfg *cfg,
 		tunnel_cfg.encap_rcv = fou_udp_recv;
 		tunnel_cfg.gro_receive = fou_gro_receive;
 		tunnel_cfg.gro_complete = fou_gro_complete;
+		tunnel_cfg.flow_dissect = fou_flow_dissect;
 		fou->protocol = cfg->protocol;
 		break;
 	case FOU_ENCAP_GUE:
 		tunnel_cfg.encap_rcv = gue_udp_recv;
 		tunnel_cfg.gro_receive = gue_gro_receive;
 		tunnel_cfg.gro_complete = gue_gro_complete;
+		tunnel_cfg.flow_dissect = gue_flow_dissect;
 		break;
 	default:
 		err = -EINVAL;
-- 
2.11.0

^ permalink raw reply related

* [PATCH v3 net-next 8/8] vxlan: support flow dissect
From: Tom Herbert @ 2017-09-28 21:48 UTC (permalink / raw)
  To: davem; +Cc: netdev, rohit, Tom Herbert
In-Reply-To: <20170928214823.2426-1-tom@quantonium.net>

Populate offload flow_dissect callback appropriately for VXLAN and
VXLAN-GPE.

Signed-off-by: Tom Herbert <tom@quantonium.net>
---
 drivers/net/vxlan.c | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index d7c49cf1d5e9..80227050b2d4 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1327,6 +1327,45 @@ static bool vxlan_ecn_decapsulate(struct vxlan_sock *vs, void *oiph,
 	return err <= 1;
 }
 
+static enum flow_dissect_ret vxlan_flow_dissect(struct sock *sk,
+			const struct sk_buff *skb,
+			struct flow_dissector_key_control *key_control,
+			struct flow_dissector *flow_dissector,
+			void *target_container, void *data,
+			__be16 *p_proto, u8 *p_ip_proto, int *p_nhoff,
+			int *p_hlen, unsigned int flags)
+{
+	__be16 protocol = htons(ETH_P_TEB);
+	struct vxlanhdr *vhdr, _vhdr;
+	struct vxlan_sock *vs;
+
+	vhdr = __skb_header_pointer(skb, *p_nhoff + sizeof(struct udphdr),
+				    sizeof(_vhdr), data, *p_hlen, &_vhdr);
+	if (!vhdr)
+		return FLOW_DISSECT_RET_OUT_BAD;
+
+	vs = rcu_dereference_sk_user_data(sk);
+	if (!vs)
+		return FLOW_DISSECT_RET_OUT_BAD;
+
+	if (vs->flags & VXLAN_F_GPE) {
+		struct vxlanhdr_gpe *gpe = (struct vxlanhdr_gpe *)vhdr;
+
+		/* Need to have Next Protocol set for interfaces in GPE mode. */
+		if (gpe->version != 0 || !gpe->np_applied || gpe->oam_flag)
+			return FLOW_DISSECT_RET_CONTINUE;
+
+		protocol = tun_p_from_eth_p(gpe->next_protocol);
+		if (!protocol)
+			return FLOW_DISSECT_RET_CONTINUE;
+	}
+
+	*p_nhoff += sizeof(struct udphdr) + sizeof(_vhdr);
+	*p_proto = protocol;
+
+	return FLOW_DISSECT_RET_PROTO_AGAIN;
+}
+
 /* Callback from net/ipv4/udp.c to receive packets */
 static int vxlan_rcv(struct sock *sk, struct sk_buff *skb)
 {
@@ -2846,6 +2885,7 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, bool ipv6,
 	tunnel_cfg.encap_destroy = NULL;
 	tunnel_cfg.gro_receive = vxlan_gro_receive;
 	tunnel_cfg.gro_complete = vxlan_gro_complete;
+	tunnel_cfg.flow_dissect = vxlan_flow_dissect;
 
 	setup_udp_tunnel_sock(net, sock, &tunnel_cfg);
 
-- 
2.11.0

^ permalink raw reply related

* Re: [Patch net-next] net_sched: use idr to allocate u32 filter handles
From: Cong Wang @ 2017-09-28 22:19 UTC (permalink / raw)
  To: Simon Horman; +Cc: Linux Kernel Network Developers, Chris Mi, Jamal Hadi Salim
In-Reply-To: <20170928073423.GB15815@netronome.com>

On Thu, Sep 28, 2017 at 12:34 AM, Simon Horman
<simon.horman@netronome.com> wrote:
> Hi Cong,
>
> this looks like a nice enhancement to me. Did you measure any performance
> benefit from it?  Perhaps it could be described in the changelog. I also
> have a more detailed question below.

No, I was inspired by commit c15ab236d69d; I did not measure it.


>
>> ---
>>  net/sched/cls_u32.c | 108 ++++++++++++++++++++++++++++++++--------------------
>>  1 file changed, 67 insertions(+), 41 deletions(-)
>>
>> diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
>> index 10b8d851fc6b..316b8a791b13 100644
>> --- a/net/sched/cls_u32.c
>> +++ b/net/sched/cls_u32.c
>> @@ -46,6 +46,7 @@
>
> ...
>
>> @@ -937,22 +940,33 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
>>                       return -EINVAL;
>>               if (TC_U32_KEY(handle))
>>                       return -EINVAL;
>> -             if (handle == 0) {
>> -                     handle = gen_new_htid(tp->data);
>> -                     if (handle == 0)
>> -                             return -ENOMEM;
>> -             }
>>               ht = kzalloc(sizeof(*ht) + divisor*sizeof(void *), GFP_KERNEL);
>>               if (ht == NULL)
>>                       return -ENOBUFS;
>> +             if (handle == 0) {
>> +                     handle = gen_new_htid(tp->data, ht);
>> +                     if (handle == 0) {
>> +                             kfree(ht);
>> +                             return -ENOMEM;
>> +                     }
>> +             } else {
>> +                     err = idr_alloc_ext(&tp_c->handle_idr, ht, NULL,
>> +                                         handle, handle + 1, GFP_KERNEL);
>> +                     if (err) {
>> +                             kfree(ht);
>> +                             return err;
>> +                     }
>
> The above seems to check that handle is not already in use and mark it as
> in use. But I don't see that logic in the code prior to this patch.
> Am I missing something? If not perhaps this portion should be a separate
> patch or described in the changelog.

The logic is in upper layer, tc_ctl_tfilter(). It tries to get a
filter by handle
(if non-zero), and errors out if we are creating a new filter with the same
handle.

At the point you quote above, 'n' is already NULL and 'handle' is non-zero,
which means no existing filter has the same handle, so it is safe to just
mark it as in-use.

Thanks.

^ permalink raw reply

* Re: [PATCH V2] r8152: add Linksys USB3GIGV1 id
From: Rustad, Mark D @ 2017-09-28 22:28 UTC (permalink / raw)
  To: Grant Grundler
  Cc: Oliver Neukum, Doug Anderson, David S . Miller,
	Greg Kroah-Hartman, Hayes Wang, LKML, linux-usb, netdev
In-Reply-To: <CANEJEGtEfxR+8n3tRtsci=4Gtyhy021Z-DRvEgXPGFUqYK2u7Q@mail.gmail.com>

[-- Attachment #1: Type: text/plain, Size: 1206 bytes --]


> On Sep 27, 2017, at 9:39 AM, Grant Grundler <grundler@chromium.org> wrote:
> 
> On Wed, Sep 27, 2017 at 12:15 AM, Oliver Neukum <oneukum@suse.com> wrote:
>> Am Dienstag, den 26.09.2017, 08:19 -0700 schrieb Doug Anderson:
>>> 
>>> I know that for at least some of the adapters in the CDC Ethernet
>>> blacklist it was claimed that the CDC Ethernet support in the adapter
>>> was kinda broken anyway so the blacklist made sense.  ...but for the
>>> Linksys Gigabit adapter the CDC Ethernet driver seems to work OK, it's
>>> just not quite as full featured / efficient as the R8152 driver.
>>> 
>>> Is that not a concern?  I guess you could tell people in this
>>> situation that they simply need to enable the R8152 driver to get
>>> continued support for their Ethernet adapter?
>> 
>> Hi,
>> 
>> yes, it is a valid concern. An #ifdef will be needed.
> 
> Good idea - I will post V3 shortly.
> 
> I'm assuming you mean to add #ifdef CONFIG_USB_RTL8152 around the
> blacklist entry in cdc_ether driver.

Shouldn't that be an #if IS_ENABLED(...) test, since that seems to be the proper way to check configured drivers?

--
Mark Rustad, Networking Division, Intel Corporation


[-- Attachment #2: Message signed with OpenPGP --]
[-- Type: application/pgp-signature, Size: 841 bytes --]

^ permalink raw reply

* Re: [next-queue PATCH 2/3] net/sched: Introduce Credit Based Shaper (CBS) qdisc
From: Cong Wang @ 2017-09-28 22:34 UTC (permalink / raw)
  To: Vinicius Costa Gomes
  Cc: Linux Kernel Network Developers, intel-wired-lan,
	Jamal Hadi Salim, Jiri Pirko, andre.guedes, Ivan Briano,
	jesus.sanchez-palencia, boon.leong.ong, richardcochran,
	Henrik Austad
In-Reply-To: <87lgkzg7xv.fsf@intel.com>

On Wed, Sep 27, 2017 at 2:14 PM, Vinicius Costa Gomes
<vinicius.gomes@intel.com> wrote:
> Hi,
>
> Cong Wang <xiyou.wangcong@gmail.com> writes:
>
>> On Tue, Sep 26, 2017 at 4:39 PM, Vinicius Costa Gomes
>> <vinicius.gomes@intel.com> wrote:
>>> +static int cbs_init(struct Qdisc *sch, struct nlattr *opt)
>>> +{
>>> +       struct cbs_sched_data *q = qdisc_priv(sch);
>>> +       struct net_device *dev = qdisc_dev(sch);
>>> +
>>> +       if (!opt)
>>> +               return -EINVAL;
>>> +
>>> +       /* FIXME: this means that we can only install this qdisc
>>> +        * "under" mqprio. Do we need a more generic way to retrieve
>>> +        * the queue, or do we pass the netdev_queue to the driver?
>>> +        */
>>> +       q->queue = TC_H_MIN(sch->parent) - 1 - netdev_get_num_tc(dev);
>>> +
>>> +       return cbs_change(sch, opt);
>>> +}
>>
>> Yeah it is ugly to assume its parent is mqprio, at least you should
>> error out if it is not the case.
>
> Will add an error for this, for now.
>
>>
>> I am not sure how we can solve this elegantly, perhaps you should
>> extend mqprio rather than add a new one?
>
> Is the alternative hinted in the FIXME worse? Instead of passing the
> index of the hardware queue to the driver we pass the pointer to a
> netdev_queue to the driver and it "discovers" the HW queue from that.

Does this way solve the dependency on mqprio? If yes then it is good.
And you have to fix it before merge; we don't have any qdisc depending
on a specific type of qdisc to be its parent.

^ permalink raw reply

* Re: [net-next PATCH 0/5] New bpf cpumap type for XDP_REDIRECT
From: Daniel Borkmann @ 2017-09-28 22:45 UTC (permalink / raw)
  To: Jesper Dangaard Brouer, netdev
  Cc: jakub.kicinski, Michael S. Tsirkin, Jason Wang, mchan,
	John Fastabend, peter.waskiewicz.jr, Daniel Borkmann,
	Alexei Starovoitov, Andy Gospodarek, edumazet
In-Reply-To: <150660339205.2808.7084136789768233829.stgit@firesoul>

On 09/28/2017 02:57 PM, Jesper Dangaard Brouer wrote:
> Introducing a new way to redirect XDP frames.  Notice how no driver
> changes are necessary given the design of XDP_REDIRECT.
>
> This redirect map type is called 'cpumap', as it allows redirection
> XDP frames to remote CPUs.  The remote CPU will do the SKB allocation
> and start the network stack invocation on that CPU.
>
> This is a scalability and isolation mechanism, that allow separating
> the early driver network XDP layer, from the rest of the netstack, and
> assigning dedicated CPUs for this stage.  The sysadm control/configure
> the RX-CPU to NIC-RX queue (as usual) via procfs smp_affinity and how
> many queues are configured via ethtool --set-channels.  Benchmarks
> show that a single CPU can handle approx 11Mpps.  Thus, only assigning
> two NIC RX-queues (and two CPUs) is sufficient for handling 10Gbit/s
> wirespeed smallest packet 14.88Mpps.  Reducing the number of queues
> have the advantage that more packets being "bulk" available per hard
> interrupt[1].
>
> [1] https://www.netdevconf.org/2.1/papers/BusyPollingNextGen.pdf
>
> Use-cases:
>
> 1. End-host based pre-filtering for DDoS mitigation.  This is fast
>     enough to allow software to see and filter all packets wirespeed.
>     Thus, no packets getting silently dropped by hardware.
>
> 2. Given NIC HW unevenly distributes packets across RX queue, this
>     mechanism can be used for redistribution load across CPUs.  This
>     usually happens when HW is unaware of a new protocol.  This
>     resembles RPS (Receive Packet Steering), just faster, but with more
>     responsibility placed on the BPF program for correct steering.
>
> 3. Auto-scaling or power saving via only activating the appropriate
>     number of remote CPUs for handling the current load.  The cpumap
>     tracepoints can function as a feedback loop for this purpose.

Interesting work, thanks! Still digesting the code a bit. I think
it pretty much goes into the direction that Eric describes in his
netdev paper quoted above; not on a generic level though but specific
to XDP at least; theoretically XDP could just run transparently on
the CPU doing the filtering, and raw buffers are handed to remote
CPU with similar batching, but it would need some different config
interface at minimum.

Shouldn't we take the CPU(s) running XDP on the RX queues out from
the normal process scheduler, so that we have a guarantee that user
space or unrelated kernel tasks cannot interfere with them anymore,
and we could then turn them into busy polling eventually (e.g. as
long as XDP is running there and once off could put them back into
normal scheduling domain transparently)?

What about RPS/RFS in the sense that once you punt them to remote
CPU, could we reuse application locality information so they'd end
up on the right CPU in the first place (w/o backlog detour), or is
the intent to rather disable it and have some own orchestration
with relation to the CPU map?

Cheers,
Daniel

^ permalink raw reply

* Re: [PATCH] Add a driver for Renesas uPD60620 and uPD60620A PHYs
From: Andrew Lunn @ 2017-09-28 22:59 UTC (permalink / raw)
  To: Bernd Edlinger; +Cc: netdev@vger.kernel.org, Florian Fainelli
In-Reply-To: <ca1763c4-fb7b-cf49-6bcf-d9e2c29c7363@hotmail.de>

Hi Bernd

> >> +	if (phy_state & BMSR_ANEGCOMPLETE) {
> > 
> > It is worth comparing this against genphy_read_status() which is the
> > reference implementation. You would normally check if auto negotiation
> > is enabled, not if it has completed. If it is enabled you read the
> > current negotiated state, even if it is not completed.
> > 
> 
> Do you suggest that there are cases where auto negotiation does not
> reach completion, and still provides a usable link status?

My experience is that it often returns 10/half, since everything should
support that. And depending on what the MAC is doing, packets can
sometimes get across the link.

> I have tried to connect to link partners with fixed configuration
> but even then the auto negotiation always completes normally.

Which is a bit odd.

There are a few different possibilities here.  The peer PHY driver is
broken. Rather than doing fixed, it actually set the possible
negotiation options to just the one setting you tried to fix it
to. And hence the uPD60620 device negotiated fine. Or the uPD60620 is
broken and said it negotiated, but in fact it failed.

What was the result? 10/Half, or the fixed values you set the peer to?

> 
> >From 2e101aed8466b314251972d1eaccfb43cf177078 Mon Sep 17 00:00:00 2001
> From: Bernd Edlinger <bernd.edlinger@hotmail.de>
> Date: Thu, 21 Sep 2017 15:46:16 +0200
> Subject: [PATCH 2/5] Add a driver for Renesas uPD60620 and uPD60620A PHYs.
> 
> Signed-off-by: Bernd Edlinger <bernd.edlinger@hotmail.de>

Please send this as a new patch. If we were to take this as is, all
the comments above would end up in the commit message.

> ---

Under the --- you can however add comments which don't go into the
commit log. Good practice is to list the things you changed since the
previous version.

Thanks
	Andrew

^ permalink raw reply

* Re: [next-queue PATCH 2/3] net/sched: Introduce Credit Based Shaper (CBS) qdisc
From: Vinicius Costa Gomes @ 2017-09-28 23:07 UTC (permalink / raw)
  To: Cong Wang
  Cc: Linux Kernel Network Developers, intel-wired-lan,
	Jamal Hadi Salim, Jiri Pirko, andre.guedes, Ivan Briano,
	jesus.sanchez-palencia, boon.leong.ong, richardcochran,
	Henrik Austad
In-Reply-To: <CAM_iQpUUUrYeyqYK-CDAXzuKcy9FDtq_etQQvO4D4uCjV6As-w@mail.gmail.com>

Hi,

Cong Wang <xiyou.wangcong@gmail.com> writes:

[...]

>>>
>>> I am not sure how we can solve this elegantly, perhaps you should
>>> extend mqprio rather than add a new one?
>>
>> Is the alternative hinted in the FIXME worse? Instead of passing the
>> index of the hardware queue to the driver we pass the pointer to a
>> netdev_queue to the driver and it "discovers" the HW queue from that.
>
> Does this way solve the dependency on mqprio? If yes then it is good.
> And you have to fix it before merge; we don't have any qdisc depending
> on a specific type of qdisc to be its parent.

Yes, it does. And if we do like Jesus pointed out, we can do this on the
CBS qdisc side, no need to change the driver.


Cheers,

^ permalink raw reply

* Re: [net-next PATCH 3/5] bpf: cpumap xdp_buff to skb conversion and allocation
From: Daniel Borkmann @ 2017-09-28 23:21 UTC (permalink / raw)
  To: Jesper Dangaard Brouer, netdev
  Cc: jakub.kicinski, Michael S. Tsirkin, Jason Wang, mchan,
	John Fastabend, peter.waskiewicz.jr, Daniel Borkmann,
	Alexei Starovoitov, Andy Gospodarek
In-Reply-To: <150660343811.2808.7680200486950101509.stgit@firesoul>

On 09/28/2017 02:57 PM, Jesper Dangaard Brouer wrote:
[...]
> +/* Convert xdp_buff to xdp_pkt */
> +static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp)
> +{
> +	struct xdp_pkt *xdp_pkt;
> +	int headroom;
> +
> +	/* Assure headroom is available for storing info */
> +	headroom = xdp->data - xdp->data_hard_start;
> +	if (headroom < sizeof(*xdp_pkt))
> +		return NULL;
> +
> +	/* Store info in top of packet */
> +	xdp_pkt = xdp->data_hard_start;

(You'd also need to handle data_meta here if set, and for below
cpu_map_build_skb(), e.g. headroom is data_meta-data_hard_start.)

> +	xdp_pkt->data = xdp->data;
> +	xdp_pkt->len  = xdp->data_end - xdp->data;
> +	xdp_pkt->headroom = headroom - sizeof(*xdp_pkt);
> +
> +	return xdp_pkt;
> +}
> +
> +static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
> +					 struct xdp_pkt *xdp_pkt)
> +{
> +	unsigned int frame_size;
> +	void *pkt_data_start;
> +	struct sk_buff *skb;
> +
> +	/* build_skb need to place skb_shared_info after SKB end, and
> +	 * also want to know the memory "truesize".  Thus, need to
[...]
>   static int cpu_map_kthread_run(void *data)
>   {
> +	const unsigned long busy_poll_jiffies = usecs_to_jiffies(2000);
> +	unsigned long time_limit = jiffies + busy_poll_jiffies;
>   	struct bpf_cpu_map_entry *rcpu = data;
> +	unsigned int empty_cnt = 0;
>
>   	set_current_state(TASK_INTERRUPTIBLE);
>   	while (!kthread_should_stop()) {
> +		unsigned int processed = 0, drops = 0;
>   		struct xdp_pkt *xdp_pkt;
>
> -		schedule();
> -		/* Do work */
> -		while ((xdp_pkt = ptr_ring_consume(rcpu->queue))) {
> -			/* For now just "refcnt-free" */
> -			page_frag_free(xdp_pkt);
> +		/* Release CPU reschedule checks */
> +		if ((time_after_eq(jiffies, time_limit) || empty_cnt > 25) &&
> +		    __ptr_ring_empty(rcpu->queue)) {
> +			empty_cnt++;
> +			schedule();
> +			time_limit = jiffies + busy_poll_jiffies;
> +			WARN_ON(smp_processor_id() != rcpu->cpu);
> +		} else {
> +			cond_resched();
>   		}
> +
> +		/* Process packets in rcpu->queue */
> +		local_bh_disable();
> +		/*
> +		 * The bpf_cpu_map_entry is single consumer, with this
> +		 * kthread CPU pinned. Lockless access to ptr_ring
> +		 * consume side valid as no-resize allowed of queue.
> +		 */
> +		while ((xdp_pkt = __ptr_ring_consume(rcpu->queue))) {
> +			struct sk_buff *skb;
> +			int ret;
> +
> +			/* Allow busy polling again */
> +			empty_cnt = 0;
> +
> +			skb = cpu_map_build_skb(rcpu, xdp_pkt);
> +			if (!skb) {
> +				page_frag_free(xdp_pkt);
> +				continue;
> +			}
> +
> +			/* Inject into network stack */
> +			ret = netif_receive_skb(skb);

Have you looked into whether it's feasible to reuse GRO
engine here as well?

> +			if (ret == NET_RX_DROP)
> +				drops++;
> +
> +			/* Limit BH-disable period */
> +			if (++processed == 8)
> +				break;
> +		}
> +		local_bh_enable();
> +
>   		__set_current_state(TASK_INTERRUPTIBLE);
>   	}
>   	put_cpu_map_entry(rcpu);
[...]

^ permalink raw reply

* Re: [REGRESSION] Warning in tcp_fastretrans_alert() of net/ipv4/tcp_input.c
From: Yuchung Cheng @ 2017-09-28 23:36 UTC (permalink / raw)
  To: Oleksandr Natalenko
  Cc: Roman Gushchin, Hideaki YOSHIFUJI, Alexey Kuznetsov, netdev,
	linux-kernel@vger.kernel.org
In-Reply-To: <2325466.Xo6SG5M5hd@natalenko.name>

On Thu, Sep 28, 2017 at 1:14 AM, Oleksandr Natalenko
<oleksandr@natalenko.name> wrote:
> Hi.
>
> Won't tell about panic in tcp_sacktag_walk() since I cannot trigger it
> intentionally, but setting net.ipv4.tcp_retrans_collapse to 0 *does not* fix
> warning in tcp_fastretrans_alert() for me.

Hi Oleksandr: no, retrans_collapse should not matter for that warning
in tcp_fastretrans_alert(). The warning, as I explained earlier, is
likely false. Neal and I are more concerned about the panic in
tcp_sacktag_walk(). This was just a blind shot, but thx for retrying.

We can submit a one-liner to remove the fast retrans warning but want
to nail the bigger issue first.

>
> On středa 27. září 2017 2:18:32 CEST Yuchung Cheng wrote:
>> On Tue, Sep 26, 2017 at 5:12 PM, Yuchung Cheng <ycheng@google.com> wrote:
>> > On Tue, Sep 26, 2017 at 6:10 AM, Roman Gushchin <guro@fb.com> wrote:
>> >>> On Wed, Sep 20, 2017 at 6:46 PM, Roman Gushchin <guro@fb.com> wrote:
>> >>> > > Hello.
>> >>> > >
>> >>> > > Since, IIRC, v4.11, there is some regression in TCP stack resulting
>> >>> > > in the
>> >>> > > warning shown below. Most of the time it is harmless, but rarely it
>> >>> > > just
>> >>> > > causes either freeze or (I believe, this is related too) panic in
>> >>> > > tcp_sacktag_walk() (because sk_buff passed to this function is
>> >>> > > NULL).
>> >>> > > Unfortunately, I still do not have proper stacktrace from panic, but
>> >>> > > will try to capture it if possible.
>> >>> > >
>> >>> > > Also, I have custom settings regarding TCP stack, shown below as
>> >>> > > well. ifb is used to shape traffic with tc.
>> >>> > >
>> >>> > > Please note this regression was already reported as BZ [1] and as a
>> >>> > > letter to ML [2], but got neither attention nor resolution. It is
>> >>> > > reproducible for (not only) me on my home router since v4.11 till
>> >>> > > v4.13.1 incl.
>> >>> > >
>> >>> > > Please advise on how to deal with it. I'll provide any additional
>> >>> > > info if
>> >>> > > necessary, also ready to test patches if any.
>> >>> > >
>> >>> > > Thanks.
>> >>> > >
>> >>> > > [1] https://bugzilla.kernel.org/show_bug.cgi?id=195835
>> >>> > > [2]
>> >>> > > https://urldefense.proofpoint.com/v2/url?u=https-3A__www.spinics.ne
>> >>> > > t_lists_netdev_msg436158.html&d=DwIBaQ&c=5VD0RTtNlTh3ycd41b3MUw&r=jJ
>> >>> > > YgtDM7QT-W-Fz_d29HYQ&m=MDDRfLG5DvdOeniMpaZDJI8ulKQ6PQ6OX_1YtRsiTMA&s
>> >>> > > =-n3dGZw-pQ95kMBUfq5G9nYZFcuWtbTDlYFkcvQPoKc&e=>>> >
>> >>> > We're experiencing the same problems on some machines in our fleet.
>> >>> > Exactly the same symptoms: tcp_fastretrans_alert() warnings and
>> >>> > sometimes panics in tcp_sacktag_walk().
>> >>
>> >>> > Here is an example of a backtrace with the panic log:
>> >> Hi Yuchung!
>> >>
>> >>> do you still see the panics if you disable RACK?
>> >>> sysctl net.ipv4.tcp_recovery=0?
>> >>
>> >> No, we haven't seen any crash since that.
>> >
>> > I am out of ideas how RACK can potentially cause tcp_sacktag_walk to
>> > take an empty skb :-( Do you have stack trace or any hint on which call
>> > to tcp_sacktag_walk triggered the panic? internally at Google we never
>> > see that.
>>
>> hmm something just struck me: could you try
>> sysctl net.ipv4.tcp_recovery=1 net.ipv4.tcp_retrans_collapse=0
>> and see if kernel still panics on sack processing?
>>
>> >>> also have you experienced any sack reneg? could you post the output of
>> >>> ' nstat |grep -i TCP' thanks
>> >>
>> >> hostname        TcpActiveOpens                  2289680            0.0
>> >> hostname        TcpPassiveOpens                 3592758            0.0
>> >> hostname        TcpAttemptFails                 746910             0.0
>> >> hostname        TcpEstabResets                  154988             0.0
>> >> hostname        TcpInSegs                       16258678255        0.0
>> >> hostname        TcpOutSegs                      46967011611        0.0
>> >> hostname        TcpRetransSegs                  13724310           0.0
>> >> hostname        TcpInErrs                       2                  0.0
>> >> hostname        TcpOutRsts                      9418798            0.0
>> >> hostname        TcpExtEmbryonicRsts             2303               0.0
>> >> hostname        TcpExtPruneCalled               90192              0.0
>> >> hostname        TcpExtOfoPruned                 57274              0.0
>> >> hostname        TcpExtOutOfWindowIcmps          3                  0.0
>> >> hostname        TcpExtTW                        1164705            0.0
>> >> hostname        TcpExtTWRecycled                2                  0.0
>> >> hostname        TcpExtPAWSEstab                 159                0.0
>> >> hostname        TcpExtDelayedACKs               209207209          0.0
>> >> hostname        TcpExtDelayedACKLocked          508571             0.0
>> >> hostname        TcpExtDelayedACKLost            1713248            0.0
>> >> hostname        TcpExtListenOverflows           625                0.0
>> >> hostname        TcpExtListenDrops               625                0.0
>> >> hostname        TcpExtTCPHPHits                 9341188489         0.0
>> >> hostname        TcpExtTCPPureAcks               1434646465         0.0
>> >> hostname        TcpExtTCPHPAcks                 5733614672         0.0
>> >> hostname        TcpExtTCPSackRecovery           3261698            0.0
>> >> hostname        TcpExtTCPSACKReneging           12203              0.0
>> >> hostname        TcpExtTCPSACKReorder            433189             0.0
>> >> hostname        TcpExtTCPTSReorder              22694              0.0
>> >> hostname        TcpExtTCPFullUndo               45092              0.0
>> >> hostname        TcpExtTCPPartialUndo            22016              0.0
>> >> hostname        TcpExtTCPLossUndo               2150040            0.0
>> >> hostname        TcpExtTCPLostRetransmit         60119              0.0
>> >> hostname        TcpExtTCPSackFailures           2626782            0.0
>> >> hostname        TcpExtTCPLossFailures           182999             0.0
>> >> hostname        TcpExtTCPFastRetrans            4334275            0.0
>> >> hostname        TcpExtTCPSlowStartRetrans       3453348            0.0
>> >> hostname        TcpExtTCPTimeouts               1070997            0.0
>> >> hostname        TcpExtTCPLossProbes             2633545            0.0
>> >> hostname        TcpExtTCPLossProbeRecovery      941647             0.0
>> >> hostname        TcpExtTCPSackRecoveryFail       336302             0.0
>> >> hostname        TcpExtTCPRcvCollapsed           461354             0.0
>> >> hostname        TcpExtTCPAbortOnData            349196             0.0
>> >> hostname        TcpExtTCPAbortOnClose           3395               0.0
>> >> hostname        TcpExtTCPAbortOnTimeout         51201              0.0
>> >> hostname        TcpExtTCPMemoryPressures        2                  0.0
>> >> hostname        TcpExtTCPSpuriousRTOs           2120503            0.0
>> >> hostname        TcpExtTCPSackShifted            2613736            0.0
>> >> hostname        TcpExtTCPSackMerged             21358743           0.0
>> >> hostname        TcpExtTCPSackShiftFallback      8769387            0.0
>> >> hostname        TcpExtTCPBacklogDrop            5                  0.0
>> >> hostname        TcpExtTCPRetransFail            843                0.0
>> >> hostname        TcpExtTCPRcvCoalesce            949068035          0.0
>> >> hostname        TcpExtTCPOFOQueue               470118             0.0
>> >> hostname        TcpExtTCPOFODrop                9915               0.0
>> >> hostname        TcpExtTCPOFOMerge               9                  0.0
>> >> hostname        TcpExtTCPChallengeACK           90                 0.0
>> >> hostname        TcpExtTCPSYNChallenge           3                  0.0
>> >> hostname        TcpExtTCPFastOpenActive         2089               0.0
>> >> hostname        TcpExtTCPSpuriousRtxHostQueues  896596             0.0
>> >> hostname        TcpExtTCPAutoCorking            547386735          0.0
>> >> hostname        TcpExtTCPFromZeroWindowAdv      28757              0.0
>> >> hostname        TcpExtTCPToZeroWindowAdv        28761              0.0
>> >> hostname        TcpExtTCPWantZeroWindowAdv      322431             0.0
>> >> hostname        TcpExtTCPSynRetrans             3026               0.0
>> >> hostname        TcpExtTCPOrigDataSent           40976870977        0.0
>> >> hostname        TcpExtTCPHystartTrainDetect     453920             0.0
>> >> hostname        TcpExtTCPHystartTrainCwnd       11586273           0.0
>> >> hostname        TcpExtTCPHystartDelayDetect     10943              0.0
>> >> hostname        TcpExtTCPHystartDelayCwnd       763554             0.0
>> >> hostname        TcpExtTCPACKSkippedPAWS         30                 0.0
>> >> hostname        TcpExtTCPACKSkippedSeq          218                0.0
>> >> hostname        TcpExtTCPWinProbe               2408               0.0
>> >> hostname        TcpExtTCPKeepAlive              213768             0.0
>> >> hostname        TcpExtTCPMTUPFail               69                 0.0
>> >> hostname        TcpExtTCPMTUPSuccess            8811               0.0
>> >>
>> >> Thanks!
>
>

^ permalink raw reply

* [PATCH v4 net-next 0/8] flow_dissector: Protocol specific flow dissector offload
From: Tom Herbert @ 2017-09-28 23:52 UTC (permalink / raw)
  To: davem; +Cc: netdev, rohit, Tom Herbert

This patch set adds a new offload type to perform flow dissection for
specific protocols (either by EtherType or by IP protocol). This is
primarily useful to crack open UDP encapsulations (like VXLAN, GUE) for
the purposes of parsing the encapsulated packet.

Items in this patch set:
- Create new protocol case in __skb_flow_dissect for ETH_P_TEB. This is based
  on the code in the GRE dissect function and the special handling in
  GRE can now be removed (it sets protocol to ETH_P_TEB and returns so
  goto proto_again is done)
- Add infrastructure for protocol specific flow dissection offload
- Add infrastructure to perform UDP flow dissection. Uses same model of
  GRO where a flow_dissect callback can be associated with a UDP
  socket
- Use the infrastructure to support flow dissection of VXLAN and GUE

Tested:

Forced RPS to call flow dissection for VXLAN, FOU, and GUE. Observed
that inner packet was being properly dissected.

v2: Add signed off

v3:
   - Make skb argument of flow dissector to be non const
   - Change UDP GRO to only do something if encap_needed static
     key is set
   - don't reference inet6_offloads or inet_offloads, get to
     them through ptype

v4:
   - skb argument to ndo_rx_flow_steer also needs to become
     non constant

Tom Herbert (8):
  flow_dissector: Change skbuf argument to be non const
  flow_dissector: Move ETH_P_TEB processing to main switch
  udp: Check static key udp_encap_needed in udp_gro_receive
  flow_dissector: Add protocol specific flow dissection offload
  ip: Add callbacks to flow dissection by IP protocol
  udp: flow dissector offload
  fou: Support flow dissection
  vxlan: support flow dissect

 drivers/net/ethernet/broadcom/bnxt/bnxt.c         |  2 +-
 drivers/net/ethernet/cisco/enic/enic_clsf.c       |  2 +-
 drivers/net/ethernet/cisco/enic/enic_clsf.h       |  2 +-
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c    |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en.h      |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c |  2 +-
 drivers/net/ethernet/qlogic/qede/qede.h           |  2 +-
 drivers/net/ethernet/qlogic/qede/qede_filter.c    |  2 +-
 drivers/net/ethernet/sfc/efx.h                    |  2 +-
 drivers/net/ethernet/sfc/falcon/efx.h             |  2 +-
 drivers/net/ethernet/sfc/falcon/rx.c              |  2 +-
 drivers/net/ethernet/sfc/rx.c                     |  2 +-
 drivers/net/vxlan.c                               | 40 +++++++++++++
 include/linux/netdevice.h                         | 31 +++++++++-
 include/linux/skbuff.h                            | 12 ++--
 include/linux/udp.h                               |  8 +++
 include/net/flow_dissector.h                      |  1 +
 include/net/ip_fib.h                              |  4 +-
 include/net/route.h                               |  4 +-
 include/net/udp.h                                 | 10 ++++
 include/net/udp_tunnel.h                          |  8 +++
 net/core/dev.c                                    | 65 +++++++++++++++++++++
 net/core/flow_dissector.c                         | 71 ++++++++++++++---------
 net/ipv4/af_inet.c                                | 27 +++++++++
 net/ipv4/fib_semantics.c                          |  2 +-
 net/ipv4/fou.c                                    | 63 ++++++++++++++++++++
 net/ipv4/route.c                                  | 10 ++--
 net/ipv4/udp.c                                    |  4 +-
 net/ipv4/udp_offload.c                            | 55 ++++++++++++++++++
 net/ipv4/udp_tunnel.c                             |  1 +
 net/ipv6/ip6_offload.c                            | 27 +++++++++
 net/ipv6/udp_offload.c                            | 23 ++++++++
 net/sched/sch_sfq.c                               |  2 +-
 33 files changed, 433 insertions(+), 59 deletions(-)

-- 
2.11.0

^ permalink raw reply

* [PATCH v4 net-next 1/8] flow_dissector: Change skbuf argument to be non const
From: Tom Herbert @ 2017-09-28 23:52 UTC (permalink / raw)
  To: davem; +Cc: netdev, rohit, Tom Herbert
In-Reply-To: <20170928235230.22158-1-tom@quantonium.net>

Change the skbuf argument of __skb_flow_dissect to be non constant so
that the function can call functions that take non constant skbuf
arguments. This is needed if we are to call socket lookup or BPF in the
flow dissector path.

The changes include unraveling the call chain into __skb_flow_dissect so
that those also use non constant skbufs.

Signed-off-by: Tom Herbert <tom@quantonium.net>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c         |  2 +-
 drivers/net/ethernet/cisco/enic/enic_clsf.c       |  2 +-
 drivers/net/ethernet/cisco/enic/enic_clsf.h       |  2 +-
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c    |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en.h      |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c |  2 +-
 drivers/net/ethernet/qlogic/qede/qede.h           |  2 +-
 drivers/net/ethernet/qlogic/qede/qede_filter.c    |  2 +-
 drivers/net/ethernet/sfc/efx.h                    |  2 +-
 drivers/net/ethernet/sfc/falcon/efx.h             |  2 +-
 drivers/net/ethernet/sfc/falcon/rx.c              |  2 +-
 drivers/net/ethernet/sfc/rx.c                     |  2 +-
 include/linux/netdevice.h                         |  4 ++--
 include/linux/skbuff.h                            | 12 ++++++------
 include/net/ip_fib.h                              |  4 ++--
 include/net/route.h                               |  4 ++--
 net/core/flow_dissector.c                         | 10 +++++-----
 net/ipv4/fib_semantics.c                          |  2 +-
 net/ipv4/route.c                                  |  6 +++---
 net/sched/sch_sfq.c                               |  2 +-
 20 files changed, 34 insertions(+), 34 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 5ba49938ba55..29f5cf6bea4a 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -7344,7 +7344,7 @@ static bool bnxt_fltr_match(struct bnxt_ntuple_filter *f1,
 	return false;
 }
 
-static int bnxt_rx_flow_steer(struct net_device *dev, const struct sk_buff *skb,
+static int bnxt_rx_flow_steer(struct net_device *dev, struct sk_buff *skb,
 			      u16 rxq_index, u32 flow_id)
 {
 	struct bnxt *bp = netdev_priv(dev);
diff --git a/drivers/net/ethernet/cisco/enic/enic_clsf.c b/drivers/net/ethernet/cisco/enic/enic_clsf.c
index 3c677ed3c29e..7ee2aa1c3184 100644
--- a/drivers/net/ethernet/cisco/enic/enic_clsf.c
+++ b/drivers/net/ethernet/cisco/enic/enic_clsf.c
@@ -167,7 +167,7 @@ static struct enic_rfs_fltr_node *htbl_key_search(struct hlist_head *h,
 	return NULL;
 }
 
-int enic_rx_flow_steer(struct net_device *dev, const struct sk_buff *skb,
+int enic_rx_flow_steer(struct net_device *dev, struct sk_buff *skb,
 		       u16 rxq_index, u32 flow_id)
 {
 	struct flow_keys keys;
diff --git a/drivers/net/ethernet/cisco/enic/enic_clsf.h b/drivers/net/ethernet/cisco/enic/enic_clsf.h
index 4bfbf25f9ddc..0e7f533f81b9 100644
--- a/drivers/net/ethernet/cisco/enic/enic_clsf.h
+++ b/drivers/net/ethernet/cisco/enic/enic_clsf.h
@@ -13,7 +13,7 @@ void enic_rfs_flw_tbl_free(struct enic *enic);
 struct enic_rfs_fltr_node *htbl_fltr_search(struct enic *enic, u16 fltr_id);
 
 #ifdef CONFIG_RFS_ACCEL
-int enic_rx_flow_steer(struct net_device *dev, const struct sk_buff *skb,
+int enic_rx_flow_steer(struct net_device *dev, struct sk_buff *skb,
 		       u16 rxq_index, u32 flow_id);
 void enic_flow_may_expire(unsigned long data);
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index 9c218f1cfc6c..9f7afbfb09f9 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -348,7 +348,7 @@ mlx4_en_filter_find(struct mlx4_en_priv *priv, __be32 src_ip, __be32 dst_ip,
 }
 
 static int
-mlx4_en_filter_rfs(struct net_device *net_dev, const struct sk_buff *skb,
+mlx4_en_filter_rfs(struct net_device *net_dev, struct sk_buff *skb,
 		   u16 rxq_index, u32 flow_id)
 {
 	struct mlx4_en_priv *priv = netdev_priv(net_dev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index cc13d3dbd366..897c9d46702c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -1017,7 +1017,7 @@ int mlx5e_arfs_create_tables(struct mlx5e_priv *priv);
 void mlx5e_arfs_destroy_tables(struct mlx5e_priv *priv);
 int mlx5e_arfs_enable(struct mlx5e_priv *priv);
 int mlx5e_arfs_disable(struct mlx5e_priv *priv);
-int mlx5e_rx_flow_steer(struct net_device *dev, const struct sk_buff *skb,
+int mlx5e_rx_flow_steer(struct net_device *dev, struct sk_buff *skb,
 			u16 rxq_index, u32 flow_id);
 #endif
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
index 12d3ced61114..f5e182bd613d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
@@ -699,7 +699,7 @@ static struct arfs_rule *arfs_find_rule(struct arfs_table *arfs_t,
 	return NULL;
 }
 
-int mlx5e_rx_flow_steer(struct net_device *dev, const struct sk_buff *skb,
+int mlx5e_rx_flow_steer(struct net_device *dev, struct sk_buff *skb,
 			u16 rxq_index, u32 flow_id)
 {
 	struct mlx5e_priv *priv = netdev_priv(dev);
diff --git a/drivers/net/ethernet/qlogic/qede/qede.h b/drivers/net/ethernet/qlogic/qede/qede.h
index adb700512baa..56c364811929 100644
--- a/drivers/net/ethernet/qlogic/qede/qede.h
+++ b/drivers/net/ethernet/qlogic/qede/qede.h
@@ -445,7 +445,7 @@ struct qede_fastpath {
 #define QEDE_SP_RX_MODE			1
 
 #ifdef CONFIG_RFS_ACCEL
-int qede_rx_flow_steer(struct net_device *dev, const struct sk_buff *skb,
+int qede_rx_flow_steer(struct net_device *dev, struct sk_buff *skb,
 		       u16 rxq_index, u32 flow_id);
 #define QEDE_SP_ARFS_CONFIG	4
 #define QEDE_SP_TASK_POLL_DELAY	(5 * HZ)
diff --git a/drivers/net/ethernet/qlogic/qede/qede_filter.c b/drivers/net/ethernet/qlogic/qede/qede_filter.c
index f79e36e4060a..2d2b473fbff8 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_filter.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_filter.c
@@ -411,7 +411,7 @@ qede_alloc_filter(struct qede_dev *edev, int min_hlen)
 	return n;
 }
 
-int qede_rx_flow_steer(struct net_device *dev, const struct sk_buff *skb,
+int qede_rx_flow_steer(struct net_device *dev, struct sk_buff *skb,
 		       u16 rxq_index, u32 flow_id)
 {
 	struct qede_dev *edev = netdev_priv(dev);
diff --git a/drivers/net/ethernet/sfc/efx.h b/drivers/net/ethernet/sfc/efx.h
index d407adf59610..805c7880df8d 100644
--- a/drivers/net/ethernet/sfc/efx.h
+++ b/drivers/net/ethernet/sfc/efx.h
@@ -171,7 +171,7 @@ static inline s32 efx_filter_get_rx_ids(struct efx_nic *efx,
 	return efx->type->filter_get_rx_ids(efx, priority, buf, size);
 }
 #ifdef CONFIG_RFS_ACCEL
-int efx_filter_rfs(struct net_device *net_dev, const struct sk_buff *skb,
+int efx_filter_rfs(struct net_device *net_dev, struct sk_buff *skb,
 		   u16 rxq_index, u32 flow_id);
 bool __efx_filter_rfs_expire(struct efx_nic *efx, unsigned quota);
 static inline void efx_filter_rfs_expire(struct efx_channel *channel)
diff --git a/drivers/net/ethernet/sfc/falcon/efx.h b/drivers/net/ethernet/sfc/falcon/efx.h
index 4f3bb30661ea..e3b9b7cbbb39 100644
--- a/drivers/net/ethernet/sfc/falcon/efx.h
+++ b/drivers/net/ethernet/sfc/falcon/efx.h
@@ -164,7 +164,7 @@ static inline s32 ef4_filter_get_rx_ids(struct ef4_nic *efx,
 	return efx->type->filter_get_rx_ids(efx, priority, buf, size);
 }
 #ifdef CONFIG_RFS_ACCEL
-int ef4_filter_rfs(struct net_device *net_dev, const struct sk_buff *skb,
+int ef4_filter_rfs(struct net_device *net_dev, struct sk_buff *skb,
 		   u16 rxq_index, u32 flow_id);
 bool __ef4_filter_rfs_expire(struct ef4_nic *efx, unsigned quota);
 static inline void ef4_filter_rfs_expire(struct ef4_channel *channel)
diff --git a/drivers/net/ethernet/sfc/falcon/rx.c b/drivers/net/ethernet/sfc/falcon/rx.c
index 6a8406dc0c2b..d5d2816b30dd 100644
--- a/drivers/net/ethernet/sfc/falcon/rx.c
+++ b/drivers/net/ethernet/sfc/falcon/rx.c
@@ -833,7 +833,7 @@ MODULE_PARM_DESC(rx_refill_threshold,
 
 #ifdef CONFIG_RFS_ACCEL
 
-int ef4_filter_rfs(struct net_device *net_dev, const struct sk_buff *skb,
+int ef4_filter_rfs(struct net_device *net_dev, struct sk_buff *skb,
 		   u16 rxq_index, u32 flow_id)
 {
 	struct ef4_nic *efx = netdev_priv(net_dev);
diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c
index 42443f434569..35898054aced 100644
--- a/drivers/net/ethernet/sfc/rx.c
+++ b/drivers/net/ethernet/sfc/rx.c
@@ -827,7 +827,7 @@ MODULE_PARM_DESC(rx_refill_threshold,
 
 #ifdef CONFIG_RFS_ACCEL
 
-int efx_filter_rfs(struct net_device *net_dev, const struct sk_buff *skb,
+int efx_filter_rfs(struct net_device *net_dev, struct sk_buff *skb,
 		   u16 rxq_index, u32 flow_id)
 {
 	struct efx_nic *efx = netdev_priv(net_dev);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f535779d9dc1..06b173200e23 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1010,7 +1010,7 @@ struct xfrmdev_ops {
  *	protocol stack to use.
  *
  *	RFS acceleration.
- * int (*ndo_rx_flow_steer)(struct net_device *dev, const struct sk_buff *skb,
+ * int (*ndo_rx_flow_steer)(struct net_device *dev, struct sk_buff *skb,
  *			    u16 rxq_index, u32 flow_id);
  *	Set hardware filter for RFS.  rxq_index is the target queue index;
  *	flow_id is a flow ID to be passed to rps_may_expire_flow() later.
@@ -1236,7 +1236,7 @@ struct net_device_ops {
 
 #ifdef CONFIG_RFS_ACCEL
 	int			(*ndo_rx_flow_steer)(struct net_device *dev,
-						     const struct sk_buff *skb,
+						     struct sk_buff *skb,
 						     u16 rxq_index,
 						     u32 flow_id);
 #endif
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 19e64bfb1a66..5a6e765e120f 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1155,8 +1155,8 @@ __skb_set_sw_hash(struct sk_buff *skb, __u32 hash, bool is_l4)
 }
 
 void __skb_get_hash(struct sk_buff *skb);
-u32 __skb_get_hash_symmetric(const struct sk_buff *skb);
-u32 skb_get_poff(const struct sk_buff *skb);
+u32 __skb_get_hash_symmetric(struct sk_buff *skb);
+u32 skb_get_poff(struct sk_buff *skb);
 u32 __skb_get_poff(const struct sk_buff *skb, void *data,
 		   const struct flow_keys *keys, int hlen);
 __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
@@ -1172,13 +1172,13 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
 			     const struct flow_dissector_key *key,
 			     unsigned int key_count);
 
-bool __skb_flow_dissect(const struct sk_buff *skb,
+bool __skb_flow_dissect(struct sk_buff *skb,
 			struct flow_dissector *flow_dissector,
 			void *target_container,
 			void *data, __be16 proto, int nhoff, int hlen,
 			unsigned int flags);
 
-static inline bool skb_flow_dissect(const struct sk_buff *skb,
+static inline bool skb_flow_dissect(struct sk_buff *skb,
 				    struct flow_dissector *flow_dissector,
 				    void *target_container, unsigned int flags)
 {
@@ -1186,7 +1186,7 @@ static inline bool skb_flow_dissect(const struct sk_buff *skb,
 				  NULL, 0, 0, 0, flags);
 }
 
-static inline bool skb_flow_dissect_flow_keys(const struct sk_buff *skb,
+static inline bool skb_flow_dissect_flow_keys(struct sk_buff *skb,
 					      struct flow_keys *flow,
 					      unsigned int flags)
 {
@@ -1225,7 +1225,7 @@ static inline __u32 skb_get_hash_flowi6(struct sk_buff *skb, const struct flowi6
 	return skb->hash;
 }
 
-__u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb);
+__u32 skb_get_hash_perturb(struct sk_buff *skb, u32 perturb);
 
 static inline __u32 skb_get_hash_raw(const struct sk_buff *skb)
 {
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 1a7f7e424320..a376dfe1ad44 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -374,11 +374,11 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags);
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
-		       const struct sk_buff *skb);
+		       struct sk_buff *skb);
 #endif
 void fib_select_multipath(struct fib_result *res, int hash);
 void fib_select_path(struct net *net, struct fib_result *res,
-		     struct flowi4 *fl4, const struct sk_buff *skb);
+		     struct flowi4 *fl4, struct sk_buff *skb);
 
 /* Exported by fib_trie.c */
 void fib_trie_init(void);
diff --git a/include/net/route.h b/include/net/route.h
index 57dfc6850d37..cb95b79f0117 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -114,10 +114,10 @@ int ip_rt_init(void);
 void rt_cache_flush(struct net *net);
 void rt_flush_dev(struct net_device *dev);
 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *flp,
-					const struct sk_buff *skb);
+					struct sk_buff *skb);
 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *flp,
 					    struct fib_result *res,
-					    const struct sk_buff *skb);
+					    struct sk_buff *skb);
 
 static inline struct rtable *__ip_route_output_key(struct net *net,
 						   struct flowi4 *flp)
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 0a977373d003..76f5e5bc3177 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -424,7 +424,7 @@ static bool skb_flow_dissect_allowed(int *num_hdrs)
  *
  * Caller must take care of zeroing target container memory.
  */
-bool __skb_flow_dissect(const struct sk_buff *skb,
+bool __skb_flow_dissect(struct sk_buff *skb,
 			struct flow_dissector *flow_dissector,
 			void *target_container,
 			void *data, __be16 proto, int nhoff, int hlen,
@@ -1015,7 +1015,7 @@ u32 flow_hash_from_keys(struct flow_keys *keys)
 }
 EXPORT_SYMBOL(flow_hash_from_keys);
 
-static inline u32 ___skb_get_hash(const struct sk_buff *skb,
+static inline u32 ___skb_get_hash(struct sk_buff *skb,
 				  struct flow_keys *keys, u32 keyval)
 {
 	skb_flow_dissect_flow_keys(skb, keys,
@@ -1053,7 +1053,7 @@ EXPORT_SYMBOL(make_flow_keys_digest);
 
 static struct flow_dissector flow_keys_dissector_symmetric __read_mostly;
 
-u32 __skb_get_hash_symmetric(const struct sk_buff *skb)
+u32 __skb_get_hash_symmetric(struct sk_buff *skb)
 {
 	struct flow_keys keys;
 
@@ -1090,7 +1090,7 @@ void __skb_get_hash(struct sk_buff *skb)
 }
 EXPORT_SYMBOL(__skb_get_hash);
 
-__u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb)
+__u32 skb_get_hash_perturb(struct sk_buff *skb, u32 perturb)
 {
 	struct flow_keys keys;
 
@@ -1158,7 +1158,7 @@ u32 __skb_get_poff(const struct sk_buff *skb, void *data,
  * truncate packets without needing to push actual payload to the user
  * space and can analyze headers only, instead.
  */
-u32 skb_get_poff(const struct sk_buff *skb)
+u32 skb_get_poff(struct sk_buff *skb)
 {
 	struct flow_keys keys;
 
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 57a5d48acee8..dc610646bc4c 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1759,7 +1759,7 @@ void fib_select_multipath(struct fib_result *res, int hash)
 #endif
 
 void fib_select_path(struct net *net, struct fib_result *res,
-		     struct flowi4 *fl4, const struct sk_buff *skb)
+		     struct flowi4 *fl4, struct sk_buff *skb)
 {
 	bool oif_check;
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 94d4cd2d5ea4..94c5b81d8f2b 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1791,7 +1791,7 @@ static void ip_multipath_l3_keys(const struct sk_buff *skb,
 
 /* if skb is set it will be used and fl4 can be NULL */
 int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
-		       const struct sk_buff *skb)
+		       struct sk_buff *skb)
 {
 	struct net *net = fi->fib_net;
 	struct flow_keys hash_keys;
@@ -2270,7 +2270,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
  */
 
 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
-					const struct sk_buff *skb)
+					struct sk_buff *skb)
 {
 	__u8 tos = RT_FL_TOS(fl4);
 	struct fib_result res;
@@ -2295,7 +2295,7 @@ EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
 
 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
 					    struct fib_result *res,
-					    const struct sk_buff *skb)
+					    struct sk_buff *skb)
 {
 	struct net_device *dev_out = NULL;
 	int orig_oif = fl4->flowi4_oif;
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 74ea863b8240..0d2d3a8d03f0 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -158,7 +158,7 @@ static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index
 }
 
 static unsigned int sfq_hash(const struct sfq_sched_data *q,
-			     const struct sk_buff *skb)
+			     struct sk_buff *skb)
 {
 	return skb_get_hash_perturb(skb, q->perturbation) & (q->divisor - 1);
 }
-- 
2.11.0

^ permalink raw reply related

* [PATCH v4 net-next 2/8] flow_dissector: Move ETH_P_TEB processing to main switch
From: Tom Herbert @ 2017-09-28 23:52 UTC (permalink / raw)
  To: davem; +Cc: netdev, rohit, Tom Herbert
In-Reply-To: <20170928235230.22158-1-tom@quantonium.net>

Support for processing TEB is currently in GRE flow dissection as a
special case. This can be moved to be a case in the main proto switch in
__skb_flow_dissect.

Signed-off-by: Tom Herbert <tom@quantonium.net>
---
 net/core/flow_dissector.c | 45 ++++++++++++++++++++++++---------------------
 1 file changed, 24 insertions(+), 21 deletions(-)

diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 76f5e5bc3177..c15b41f96cbe 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -282,27 +282,8 @@ __skb_flow_dissect_gre(const struct sk_buff *skb,
 	if (hdr->flags & GRE_SEQ)
 		offset += sizeof(((struct pptp_gre_header *) 0)->seq);
 
-	if (gre_ver == 0) {
-		if (*p_proto == htons(ETH_P_TEB)) {
-			const struct ethhdr *eth;
-			struct ethhdr _eth;
-
-			eth = __skb_header_pointer(skb, *p_nhoff + offset,
-						   sizeof(_eth),
-						   data, *p_hlen, &_eth);
-			if (!eth)
-				return FLOW_DISSECT_RET_OUT_BAD;
-			*p_proto = eth->h_proto;
-			offset += sizeof(*eth);
-
-			/* Cap headers that we access via pointers at the
-			 * end of the Ethernet header as our maximum alignment
-			 * at that point is only 2 bytes.
-			 */
-			if (NET_IP_ALIGN)
-				*p_hlen = *p_nhoff + offset;
-		}
-	} else { /* version 1, must be PPTP */
+	/* version 1, must be PPTP */
+	if (gre_ver == 1) {
 		u8 _ppp_hdr[PPP_HDRLEN];
 		u8 *ppp_hdr;
 
@@ -595,6 +576,28 @@ bool __skb_flow_dissect(struct sk_buff *skb,
 
 		break;
 	}
+	case htons(ETH_P_TEB): {
+		const struct ethhdr *eth;
+		struct ethhdr _eth;
+
+		eth = __skb_header_pointer(skb, nhoff, sizeof(_eth),
+					   data, hlen, &_eth);
+		if (!eth)
+			goto out_bad;
+
+		proto = eth->h_proto;
+		nhoff += sizeof(*eth);
+
+		/* Cap headers that we access via pointers at the
+		 * end of the Ethernet header as our maximum alignment
+		 * at that point is only 2 bytes.
+		 */
+		if (NET_IP_ALIGN)
+			hlen = nhoff;
+
+		fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
+		break;
+	}
 	case htons(ETH_P_8021AD):
 	case htons(ETH_P_8021Q): {
 		const struct vlan_hdr *vlan;
-- 
2.11.0

^ permalink raw reply related

* [PATCH v4 net-next 3/8] udp: Check static key udp_encap_needed in udp_gro_receive
From: Tom Herbert @ 2017-09-28 23:52 UTC (permalink / raw)
  To: davem; +Cc: netdev, rohit, Tom Herbert
In-Reply-To: <20170928235230.22158-1-tom@quantonium.net>

Currently, the only support for udp gro is provided by UDP encapsulation
protocols. Since they always set udp_encap_needed we can check that in
udp_gro_receive functions before performing a socket lookup.

Signed-off-by: Tom Herbert <tom@quantonium.net>
---
 include/net/udp.h      | 2 ++
 net/ipv4/udp.c         | 4 +++-
 net/ipv4/udp_offload.c | 7 +++++++
 net/ipv6/udp_offload.c | 7 +++++++
 4 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/include/net/udp.h b/include/net/udp.h
index 12dfbfe2e2d7..c6b1c5d8d3c9 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -97,6 +97,8 @@ static inline struct udp_hslot *udp_hashslot2(struct udp_table *table,
 
 extern struct proto udp_prot;
 
+extern struct static_key udp_encap_needed;
+
 extern atomic_long_t udp_memory_allocated;
 
 /* sysctl variables for udp */
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 784ced0b9150..2788843e8eb2 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1813,7 +1813,9 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 	return 0;
 }
 
-static struct static_key udp_encap_needed __read_mostly;
+struct static_key udp_encap_needed __read_mostly;
+EXPORT_SYMBOL(udp_encap_needed);
+
 void udp_encap_enable(void)
 {
 	static_key_enable(&udp_encap_needed);
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 97658bfc1b58..a744bb515455 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -261,6 +261,13 @@ static struct sk_buff **udp4_gro_receive(struct sk_buff **head,
 {
 	struct udphdr *uh = udp_gro_udphdr(skb);
 
+	if (!static_key_false(&udp_encap_needed)) {
+		/* Currently udp_gro_receive only does something if
+		 * a UDP encapsulation has been set.
+		 */
+		goto flush;
+	}
+
 	if (unlikely(!uh))
 		goto flush;
 
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 455fd4e39333..111b026e4f03 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -34,6 +34,13 @@ static struct sk_buff **udp6_gro_receive(struct sk_buff **head,
 {
 	struct udphdr *uh = udp_gro_udphdr(skb);
 
+	if (!static_key_false(&udp_encap_needed)) {
+		/* Currently udp_gro_receive only does something if
+		 * a UDP encapsulation has been set.
+		 */
+		goto flush;
+	}
+
 	if (unlikely(!uh))
 		goto flush;
 
-- 
2.11.0

^ permalink raw reply related

* [PATCH v4 net-next 4/8] flow_dissector: Add protocol specific flow dissection offload
From: Tom Herbert @ 2017-09-28 23:52 UTC (permalink / raw)
  To: davem; +Cc: netdev, rohit, Tom Herbert
In-Reply-To: <20170928235230.22158-1-tom@quantonium.net>

Add offload capability for performing protocol specific flow dissection
(either by EtherType or IP protocol).

Specifically:

- Add flow_dissect to offload callbacks
- Move flow_dissect_ret enum to flow_dissector.h, cleanup names and add a
  couple of values
- Unify handling of functions that return flow_dissect_ret enum
- In __skb_flow_dissect, add default case for switch(proto) as well as
  switch(ip_proto) that looks up and calls protocol specific flow
  dissection

Signed-off-by: Tom Herbert <tom@quantonium.net>
---
 include/linux/netdevice.h    | 27 ++++++++++++++++++
 include/net/flow_dissector.h |  1 +
 net/core/dev.c               | 65 ++++++++++++++++++++++++++++++++++++++++++++
 net/core/flow_dissector.c    | 16 +++++++++--
 net/ipv4/route.c             |  4 ++-
 5 files changed, 110 insertions(+), 3 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 06b173200e23..f186b6ab480a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2207,12 +2207,25 @@ struct offload_callbacks {
 	struct sk_buff		**(*gro_receive)(struct sk_buff **head,
 						 struct sk_buff *skb);
 	int			(*gro_complete)(struct sk_buff *skb, int nhoff);
+	enum flow_dissect_ret (*flow_dissect)(struct sk_buff *skb,
+			struct flow_dissector_key_control *key_control,
+			struct flow_dissector *flow_dissector,
+			void *target_container, void *data,
+			__be16 *p_proto, u8 *p_ip_proto, int *p_nhoff,
+			int *p_hlen, unsigned int flags);
 };
 
 struct packet_offload {
 	__be16			 type;	/* This is really htons(ether_type). */
 	u16			 priority;
 	struct offload_callbacks callbacks;
+	enum flow_dissect_ret (*proto_flow_dissect)(struct sk_buff *skb,
+			u8 proto,
+			struct flow_dissector_key_control *key_control,
+			struct flow_dissector *flow_dissector,
+			void *target_container, void *data,
+			__be16 *p_proto, u8 *p_ip_proto, int *p_nhoff,
+			int *p_hlen, unsigned int flags);
 	struct list_head	 list;
 };
 
@@ -3252,6 +3265,20 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi);
 gro_result_t napi_gro_frags(struct napi_struct *napi);
 struct packet_offload *gro_find_receive_by_type(__be16 type);
 struct packet_offload *gro_find_complete_by_type(__be16 type);
+enum flow_dissect_ret flow_dissect_by_type(struct sk_buff *skb,
+			__be16 type,
+			struct flow_dissector_key_control *key_control,
+			struct flow_dissector *flow_dissector,
+			void *target_container, void *data,
+			__be16 *p_proto, u8 *p_ip_proto, int *p_nhoff,
+			int *p_hlen, unsigned int flags);
+enum flow_dissect_ret flow_dissect_by_type_proto(struct sk_buff *skb,
+			__be16 type, u8 proto,
+			struct flow_dissector_key_control *key_control,
+			struct flow_dissector *flow_dissector,
+			void *target_container, void *data,
+			__be16 *p_proto, u8 *p_ip_proto, int *p_nhoff,
+			int *p_hlen, unsigned int flags);
 
 static inline void napi_free_frags(struct napi_struct *napi)
 {
diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index fc3dce730a6b..ad75bbfd1c9c 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -213,6 +213,7 @@ enum flow_dissector_key_id {
 #define FLOW_DISSECTOR_F_STOP_AT_L3		BIT(1)
 #define FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL	BIT(2)
 #define FLOW_DISSECTOR_F_STOP_AT_ENCAP		BIT(3)
+#define FLOW_DISSECTOR_F_STOP_AT_L4		BIT(4)
 
 struct flow_dissector_key {
 	enum flow_dissector_key_id key_id;
diff --git a/net/core/dev.c b/net/core/dev.c
index e350c768d4b5..f3cd884bd04b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -104,6 +104,7 @@
 #include <linux/stat.h>
 #include <net/dst.h>
 #include <net/dst_metadata.h>
+#include <net/flow_dissector.h>
 #include <net/pkt_sched.h>
 #include <net/pkt_cls.h>
 #include <net/checksum.h>
@@ -4907,6 +4908,70 @@ struct packet_offload *gro_find_complete_by_type(__be16 type)
 }
 EXPORT_SYMBOL(gro_find_complete_by_type);
 
+enum flow_dissect_ret flow_dissect_by_type(struct sk_buff *skb,
+			__be16 type,
+			struct flow_dissector_key_control *key_control,
+			struct flow_dissector *flow_dissector,
+			void *target_container, void *data,
+			__be16 *p_proto, u8 *p_ip_proto, int *p_nhoff,
+			int *p_hlen, unsigned int flags)
+{
+	enum flow_dissect_ret ret = FLOW_DISSECT_RET_CONTINUE;
+	struct list_head *offload_head = &offload_base;
+	struct packet_offload *ptype;
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(ptype, offload_head, list) {
+		if (ptype->type != type || !ptype->callbacks.flow_dissect)
+			continue;
+		ret = ptype->callbacks.flow_dissect(skb, key_control,
+						    flow_dissector,
+						    target_container,
+						    data, p_proto,
+						    p_ip_proto, p_nhoff,
+						    p_hlen, flags);
+		break;
+	}
+
+	rcu_read_unlock();
+
+	return ret;
+}
+EXPORT_SYMBOL(flow_dissect_by_type);
+
+enum flow_dissect_ret flow_dissect_by_type_proto(struct sk_buff *skb,
+			__be16 type, u8 proto,
+			struct flow_dissector_key_control *key_control,
+			struct flow_dissector *flow_dissector,
+			void *target_container, void *data,
+			__be16 *p_proto, u8 *p_ip_proto, int *p_nhoff,
+			int *p_hlen, unsigned int flags)
+{
+	enum flow_dissect_ret ret = FLOW_DISSECT_RET_CONTINUE;
+	struct list_head *offload_head = &offload_base;
+	struct packet_offload *ptype;
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(ptype, offload_head, list) {
+		if (ptype->type != type || !ptype->proto_flow_dissect)
+			continue;
+		ret = ptype->proto_flow_dissect(skb, proto, key_control,
+						    flow_dissector,
+						    target_container,
+						    data, p_proto,
+						    p_ip_proto, p_nhoff,
+						    p_hlen, flags);
+		break;
+	}
+
+	rcu_read_unlock();
+
+	return ret;
+}
+EXPORT_SYMBOL(flow_dissect_by_type_proto);
+
 static void napi_skb_free_stolen_head(struct sk_buff *skb)
 {
 	skb_dst_drop(skb);
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index c15b41f96cbe..84b8eb1f6664 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -9,6 +9,7 @@
 #include <net/ipv6.h>
 #include <net/gre.h>
 #include <net/pptp.h>
+#include <net/protocol.h>
 #include <linux/igmp.h>
 #include <linux/icmp.h>
 #include <linux/sctp.h>
@@ -721,7 +722,11 @@ bool __skb_flow_dissect(struct sk_buff *skb,
 		break;
 
 	default:
-		fdret = FLOW_DISSECT_RET_OUT_BAD;
+		fdret = flow_dissect_by_type(skb, proto, key_control,
+					     flow_dissector,
+					     target_container,
+					     data, &proto, &ip_proto, &nhoff,
+					     &hlen, flags);
 		break;
 	}
 
@@ -838,6 +843,12 @@ bool __skb_flow_dissect(struct sk_buff *skb,
 		break;
 
 	default:
+		fdret = flow_dissect_by_type_proto(skb, proto,
+						ip_proto, key_control,
+						flow_dissector,
+						target_container,
+						data, &proto, &ip_proto, &nhoff,
+						&hlen, flags);
 		break;
 	}
 
@@ -1022,7 +1033,8 @@ static inline u32 ___skb_get_hash(struct sk_buff *skb,
 				  struct flow_keys *keys, u32 keyval)
 {
 	skb_flow_dissect_flow_keys(skb, keys,
-				   FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
+				   FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL |
+				   FLOW_DISSECTOR_F_STOP_AT_L4);
 
 	return __flow_hash_from_keys(keys, keyval);
 }
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 94c5b81d8f2b..69d6ce7dfa18 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1811,7 +1811,9 @@ int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
 	case 1:
 		/* skb is currently provided only when forwarding */
 		if (skb) {
-			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
+			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP |
+					    FLOW_DISSECTOR_F_STOP_AT_L4;
+;
 			struct flow_keys keys;
 
 			/* short-circuit if we already have L4 hash present */
-- 
2.11.0

^ permalink raw reply related

* [PATCH v4 net-next 5/8] ip: Add callbacks to flow dissection by IP protocol
From: Tom Herbert @ 2017-09-28 23:52 UTC (permalink / raw)
  To: davem; +Cc: netdev, rohit, Tom Herbert
In-Reply-To: <20170928235230.22158-1-tom@quantonium.net>

Populate the proto_flow_dissect function for IPv4 and IPv6 packet
offloads. This allows the caller to flow dissect a packet starting
at the given IP protocol (as parsed to that point by flow dissector
for instance).

Signed-off-by: Tom Herbert <tom@quantonium.net>
---
 net/ipv4/af_inet.c     | 27 +++++++++++++++++++++++++++
 net/ipv6/ip6_offload.c | 27 +++++++++++++++++++++++++++
 2 files changed, 54 insertions(+)

diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index e31108e5ef79..18c1d884999a 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1440,6 +1440,32 @@ static struct sk_buff **ipip_gro_receive(struct sk_buff **head,
 	return inet_gro_receive(head, skb);
 }
 
+static enum flow_dissect_ret inet_proto_flow_dissect(struct sk_buff *skb,
+			u8 proto,
+			struct flow_dissector_key_control *key_control,
+			struct flow_dissector *flow_dissector,
+			void *target_container, void *data,
+			__be16 *p_proto, u8 *p_ip_proto, int *p_nhoff,
+			int *p_hlen, unsigned int flags)
+{
+	enum flow_dissect_ret ret = FLOW_DISSECT_RET_CONTINUE;
+	const struct net_offload *ops;
+
+	rcu_read_lock();
+
+	ops = rcu_dereference(inet_offloads[proto]);
+	if (ops && ops->callbacks.flow_dissect)
+		ret =  ops->callbacks.flow_dissect(skb, key_control,
+						   flow_dissector,
+						   target_container,
+						   data, p_proto, p_ip_proto,
+						   p_nhoff, p_hlen, flags);
+
+	rcu_read_unlock();
+
+	return ret;
+}
+
 #define SECONDS_PER_DAY	86400
 
 /* inet_current_timestamp - Return IP network timestamp
@@ -1763,6 +1789,7 @@ static int ipv4_proc_init(void);
 
 static struct packet_offload ip_packet_offload __read_mostly = {
 	.type = cpu_to_be16(ETH_P_IP),
+	.proto_flow_dissect = inet_proto_flow_dissect,
 	.callbacks = {
 		.gso_segment = inet_gso_segment,
 		.gro_receive = inet_gro_receive,
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index cdb3728faca7..a33a2b40b3d6 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -339,8 +339,35 @@ static int ip4ip6_gro_complete(struct sk_buff *skb, int nhoff)
 	return inet_gro_complete(skb, nhoff);
 }
 
+static enum flow_dissect_ret inet6_proto_flow_dissect(struct sk_buff *skb,
+			u8 proto,
+			struct flow_dissector_key_control *key_control,
+			struct flow_dissector *flow_dissector,
+			void *target_container, void *data,
+			__be16 *p_proto, u8 *p_ip_proto, int *p_nhoff,
+			int *p_hlen, unsigned int flags)
+{
+	enum flow_dissect_ret ret = FLOW_DISSECT_RET_CONTINUE;
+	const struct net_offload *ops;
+
+	rcu_read_lock();
+
+	ops = rcu_dereference(inet6_offloads[proto]);
+	if (ops && ops->callbacks.flow_dissect)
+		ret =  ops->callbacks.flow_dissect(skb, key_control,
+						   flow_dissector,
+						   target_container, data,
+						   p_proto, p_ip_proto, p_nhoff,
+						   p_hlen, flags);
+
+	rcu_read_unlock();
+
+	return ret;
+}
+
 static struct packet_offload ipv6_packet_offload __read_mostly = {
 	.type = cpu_to_be16(ETH_P_IPV6),
+	.proto_flow_dissect = inet6_proto_flow_dissect,
 	.callbacks = {
 		.gso_segment = ipv6_gso_segment,
 		.gro_receive = ipv6_gro_receive,
-- 
2.11.0

^ permalink raw reply related

* [PATCH v4 net-next 6/8] udp: flow dissector offload
From: Tom Herbert @ 2017-09-28 23:52 UTC (permalink / raw)
  To: davem; +Cc: netdev, rohit, Tom Herbert
In-Reply-To: <20170928235230.22158-1-tom@quantonium.net>

Add support to perform UDP-specific flow dissection. This is
primarily intended for dissecting packets carried in UDP
encapsulation.

This patch adds a flow_dissect offload for UDP4 and UDP6. The backend
function performs a socket lookup and calls the flow_dissect function
if a socket is found.

Signed-off-by: Tom Herbert <tom@quantonium.net>
---
 include/linux/udp.h      |  8 ++++++++
 include/net/udp.h        |  8 ++++++++
 include/net/udp_tunnel.h |  8 ++++++++
 net/ipv4/udp_offload.c   | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/udp_tunnel.c    |  1 +
 net/ipv6/udp_offload.c   | 16 ++++++++++++++++
 6 files changed, 89 insertions(+)

diff --git a/include/linux/udp.h b/include/linux/udp.h
index eaea63bc79bb..2e90b189ef6a 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -79,6 +79,14 @@ struct udp_sock {
 	int			(*gro_complete)(struct sock *sk,
 						struct sk_buff *skb,
 						int nhoff);
+	/* Flow dissector function for a UDP socket */
+	enum flow_dissect_ret (*flow_dissect)(struct sock *sk,
+			const struct sk_buff *skb,
+			struct flow_dissector_key_control *key_control,
+			struct flow_dissector *flow_dissector,
+			void *target_container, void *data,
+			__be16 *p_proto, u8 *p_ip_proto, int *p_nhoff,
+			int *p_hlen, unsigned int flags);
 
 	/* udp_recvmsg try to use this before splicing sk_receive_queue */
 	struct sk_buff_head	reader_queue ____cacheline_aligned_in_smp;
diff --git a/include/net/udp.h b/include/net/udp.h
index c6b1c5d8d3c9..4867f329538c 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -176,6 +176,14 @@ struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
 				 struct udphdr *uh, udp_lookup_t lookup);
 int udp_gro_complete(struct sk_buff *skb, int nhoff, udp_lookup_t lookup);
 
+enum flow_dissect_ret udp_flow_dissect(struct sk_buff *skb,
+			udp_lookup_t lookup,
+			struct flow_dissector_key_control *key_control,
+			struct flow_dissector *flow_dissector,
+			void *target_container, void *data,
+			__be16 *p_proto, u8 *p_ip_proto, int *p_nhoff,
+			int *p_hlen, unsigned int flags);
+
 static inline struct udphdr *udp_gro_udphdr(struct sk_buff *skb)
 {
 	struct udphdr *uh;
diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index 10cce0dd4450..b7102e0f41a9 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -69,6 +69,13 @@ typedef struct sk_buff **(*udp_tunnel_gro_receive_t)(struct sock *sk,
 						     struct sk_buff *skb);
 typedef int (*udp_tunnel_gro_complete_t)(struct sock *sk, struct sk_buff *skb,
 					 int nhoff);
+typedef enum flow_dissect_ret (*udp_tunnel_flow_dissect_t)(struct sock *sk,
+			const struct sk_buff *skb,
+			struct flow_dissector_key_control *key_control,
+			struct flow_dissector *flow_dissector,
+			void *target_container, void *data,
+			__be16 *p_proto, u8 *p_ip_proto, int *p_nhoff,
+			int *p_hlen, unsigned int flags);
 
 struct udp_tunnel_sock_cfg {
 	void *sk_user_data;     /* user data used by encap_rcv call back */
@@ -78,6 +85,7 @@ struct udp_tunnel_sock_cfg {
 	udp_tunnel_encap_destroy_t encap_destroy;
 	udp_tunnel_gro_receive_t gro_receive;
 	udp_tunnel_gro_complete_t gro_complete;
+	udp_tunnel_flow_dissect_t flow_dissect;
 };
 
 /* Setup the given (UDP) sock to receive UDP encapsulated packets */
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index a744bb515455..fddf923ef433 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -335,11 +335,59 @@ static int udp4_gro_complete(struct sk_buff *skb, int nhoff)
 	return udp_gro_complete(skb, nhoff, udp4_lib_lookup_skb);
 }
 
+enum flow_dissect_ret udp_flow_dissect(struct sk_buff *skb,
+			udp_lookup_t lookup,
+			struct flow_dissector_key_control *key_control,
+			struct flow_dissector *flow_dissector,
+			void *target_container, void *data,
+			__be16 *p_proto, u8 *p_ip_proto, int *p_nhoff,
+			int *p_hlen, unsigned int flags)
+{
+	enum flow_dissect_ret ret = FLOW_DISSECT_RET_CONTINUE;
+	struct udphdr *uh, _uh;
+	struct sock *sk;
+
+	uh = __skb_header_pointer(skb, *p_nhoff, sizeof(_uh), data,
+				  *p_hlen, &_uh);
+	if (!uh)
+		return FLOW_DISSECT_RET_OUT_BAD;
+
+	rcu_read_lock();
+
+	sk = (*lookup)(skb, uh->source, uh->dest);
+
+	if (sk && udp_sk(sk)->flow_dissect)
+		ret = udp_sk(sk)->flow_dissect(sk, skb, key_control,
+					       flow_dissector, target_container,
+					       data, p_proto, p_ip_proto,
+					       p_nhoff, p_hlen, flags);
+	rcu_read_unlock();
+
+	return ret;
+}
+EXPORT_SYMBOL(udp_flow_dissect);
+
+static enum flow_dissect_ret udp4_flow_dissect(struct sk_buff *skb,
+			struct flow_dissector_key_control *key_control,
+			struct flow_dissector *flow_dissector,
+			void *target_container, void *data,
+			__be16 *p_proto, u8 *p_ip_proto, int *p_nhoff,
+			int *p_hlen, unsigned int flags)
+{
+	if (!static_key_false(&udp_encap_needed))
+		return FLOW_DISSECT_RET_CONTINUE;
+
+	return udp_flow_dissect(skb, udp4_lib_lookup_skb, key_control,
+				flow_dissector, target_container, data,
+				p_proto, p_ip_proto, p_nhoff, p_hlen, flags);
+}
+
 static const struct net_offload udpv4_offload = {
 	.callbacks = {
 		.gso_segment = udp4_tunnel_segment,
 		.gro_receive  =	udp4_gro_receive,
 		.gro_complete =	udp4_gro_complete,
+		.flow_dissect = udp4_flow_dissect,
 	},
 };
 
diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c
index 6539ff15e9a3..a4eec2a044d2 100644
--- a/net/ipv4/udp_tunnel.c
+++ b/net/ipv4/udp_tunnel.c
@@ -71,6 +71,7 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock,
 	udp_sk(sk)->encap_destroy = cfg->encap_destroy;
 	udp_sk(sk)->gro_receive = cfg->gro_receive;
 	udp_sk(sk)->gro_complete = cfg->gro_complete;
+	udp_sk(sk)->flow_dissect = cfg->flow_dissect;
 
 	udp_tunnel_encap_enable(sock);
 }
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 111b026e4f03..45b77f92d77d 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -80,11 +80,27 @@ static int udp6_gro_complete(struct sk_buff *skb, int nhoff)
 	return udp_gro_complete(skb, nhoff, udp6_lib_lookup_skb);
 }
 
+static enum flow_dissect_ret udp6_flow_dissect(struct sk_buff *skb,
+			struct flow_dissector_key_control *key_control,
+			struct flow_dissector *flow_dissector,
+			void *target_container, void *data,
+			__be16 *p_proto, u8 *p_ip_proto, int *p_nhoff,
+			int *p_hlen, unsigned int flags)
+{
+	if (!static_key_false(&udp_encap_needed))
+		return FLOW_DISSECT_RET_CONTINUE;
+
+	return udp_flow_dissect(skb, udp6_lib_lookup_skb, key_control,
+				flow_dissector, target_container, data,
+				p_proto, p_ip_proto, p_nhoff, p_hlen, flags);
+}
+
 static const struct net_offload udpv6_offload = {
 	.callbacks = {
 		.gso_segment	=	udp6_tunnel_segment,
 		.gro_receive	=	udp6_gro_receive,
 		.gro_complete	=	udp6_gro_complete,
+		.flow_dissect	=	udp6_flow_dissect,
 	},
 };
 
-- 
2.11.0

^ permalink raw reply related

* [PATCH v4 net-next 7/8] fou: Support flow dissection
From: Tom Herbert @ 2017-09-28 23:52 UTC (permalink / raw)
  To: davem; +Cc: netdev, rohit, Tom Herbert
In-Reply-To: <20170928235230.22158-1-tom@quantonium.net>

Populate offload flow_dissect callback appropriately for fou and gue.

Signed-off-by: Tom Herbert <tom@quantonium.net>
---
 net/ipv4/fou.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)

diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index 1540db65241a..a831dd49fb28 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -282,6 +282,20 @@ static int fou_gro_complete(struct sock *sk, struct sk_buff *skb,
 	return err;
 }
 
+static enum flow_dissect_ret fou_flow_dissect(struct sock *sk,
+			const struct sk_buff *skb,
+			struct flow_dissector_key_control *key_control,
+			struct flow_dissector *flow_dissector,
+			void *target_container, void *data,
+			__be16 *p_proto, u8 *p_ip_proto, int *p_nhoff,
+			int *p_hlen, unsigned int flags)
+{
+	*p_ip_proto = fou_from_sock(sk)->protocol;
+	*p_nhoff += sizeof(struct udphdr);
+
+	return FLOW_DISSECT_RET_IPPROTO_AGAIN;
+}
+
 static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off,
 				      struct guehdr *guehdr, void *data,
 				      size_t hdrlen, struct gro_remcsum *grc,
@@ -500,6 +514,53 @@ static int gue_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff)
 	return err;
 }
 
+static enum flow_dissect_ret gue_flow_dissect(struct sock *sk,
+			const struct sk_buff *skb,
+			struct flow_dissector_key_control *key_control,
+			struct flow_dissector *flow_dissector,
+			void *target_container, void *data,
+			__be16 *p_proto, u8 *p_ip_proto, int *p_nhoff,
+			int *p_hlen, unsigned int flags)
+{
+	struct guehdr *guehdr, _guehdr;
+
+	guehdr = __skb_header_pointer(skb, *p_nhoff + sizeof(struct udphdr),
+				      sizeof(_guehdr), data, *p_hlen, &_guehdr);
+	if (!guehdr)
+		return FLOW_DISSECT_RET_OUT_BAD;
+
+	switch (guehdr->version) {
+	case 0:
+		if (unlikely(guehdr->control))
+			return FLOW_DISSECT_RET_CONTINUE;
+
+		*p_ip_proto = guehdr->proto_ctype;
+		*p_nhoff += sizeof(struct udphdr) +
+		    sizeof(*guehdr) + (guehdr->hlen << 2);
+
+		break;
+	case 1:
+		switch (((struct iphdr *)guehdr)->version) {
+		case 4:
+			*p_ip_proto = IPPROTO_IPIP;
+			break;
+		case 6:
+			*p_ip_proto = IPPROTO_IPV6;
+			break;
+		default:
+			return FLOW_DISSECT_RET_CONTINUE;
+		}
+
+		*p_nhoff += sizeof(struct udphdr);
+
+		break;
+	default:
+		return FLOW_DISSECT_RET_CONTINUE;
+	}
+
+	return FLOW_DISSECT_RET_IPPROTO_AGAIN;
+}
+
 static int fou_add_to_port_list(struct net *net, struct fou *fou)
 {
 	struct fou_net *fn = net_generic(net, fou_net_id);
@@ -570,12 +631,14 @@ static int fou_create(struct net *net, struct fou_cfg *cfg,
 		tunnel_cfg.encap_rcv = fou_udp_recv;
 		tunnel_cfg.gro_receive = fou_gro_receive;
 		tunnel_cfg.gro_complete = fou_gro_complete;
+		tunnel_cfg.flow_dissect = fou_flow_dissect;
 		fou->protocol = cfg->protocol;
 		break;
 	case FOU_ENCAP_GUE:
 		tunnel_cfg.encap_rcv = gue_udp_recv;
 		tunnel_cfg.gro_receive = gue_gro_receive;
 		tunnel_cfg.gro_complete = gue_gro_complete;
+		tunnel_cfg.flow_dissect = gue_flow_dissect;
 		break;
 	default:
 		err = -EINVAL;
-- 
2.11.0

^ permalink raw reply related

* [PATCH v4 net-next 8/8] vxlan: support flow dissect
From: Tom Herbert @ 2017-09-28 23:52 UTC (permalink / raw)
  To: davem; +Cc: netdev, rohit, Tom Herbert
In-Reply-To: <20170928235230.22158-1-tom@quantonium.net>

Populate offload flow_dissect callback appropriately for VXLAN and
VXLAN-GPE.

Signed-off-by: Tom Herbert <tom@quantonium.net>
---
 drivers/net/vxlan.c | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index d7c49cf1d5e9..80227050b2d4 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1327,6 +1327,45 @@ static bool vxlan_ecn_decapsulate(struct vxlan_sock *vs, void *oiph,
 	return err <= 1;
 }
 
+static enum flow_dissect_ret vxlan_flow_dissect(struct sock *sk,
+			const struct sk_buff *skb,
+			struct flow_dissector_key_control *key_control,
+			struct flow_dissector *flow_dissector,
+			void *target_container, void *data,
+			__be16 *p_proto, u8 *p_ip_proto, int *p_nhoff,
+			int *p_hlen, unsigned int flags)
+{
+	__be16 protocol = htons(ETH_P_TEB);
+	struct vxlanhdr *vhdr, _vhdr;
+	struct vxlan_sock *vs;
+
+	vhdr = __skb_header_pointer(skb, *p_nhoff + sizeof(struct udphdr),
+				    sizeof(_vhdr), data, *p_hlen, &_vhdr);
+	if (!vhdr)
+		return FLOW_DISSECT_RET_OUT_BAD;
+
+	vs = rcu_dereference_sk_user_data(sk);
+	if (!vs)
+		return FLOW_DISSECT_RET_OUT_BAD;
+
+	if (vs->flags & VXLAN_F_GPE) {
+		struct vxlanhdr_gpe *gpe = (struct vxlanhdr_gpe *)vhdr;
+
+		/* Need to have Next Protocol set for interfaces in GPE mode. */
+		if (gpe->version != 0 || !gpe->np_applied || gpe->oam_flag)
+			return FLOW_DISSECT_RET_CONTINUE;
+
+		protocol = tun_p_from_eth_p(gpe->next_protocol);
+		if (!protocol)
+			return FLOW_DISSECT_RET_CONTINUE;
+	}
+
+	*p_nhoff += sizeof(struct udphdr) + sizeof(_vhdr);
+	*p_proto = protocol;
+
+	return FLOW_DISSECT_RET_PROTO_AGAIN;
+}
+
 /* Callback from net/ipv4/udp.c to receive packets */
 static int vxlan_rcv(struct sock *sk, struct sk_buff *skb)
 {
@@ -2846,6 +2885,7 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, bool ipv6,
 	tunnel_cfg.encap_destroy = NULL;
 	tunnel_cfg.gro_receive = vxlan_gro_receive;
 	tunnel_cfg.gro_complete = vxlan_gro_complete;
+	tunnel_cfg.flow_dissect = vxlan_flow_dissect;
 
 	setup_udp_tunnel_sock(net, sock, &tunnel_cfg);
 
-- 
2.11.0

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox