Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH v3 net-next 11/12] gtp: Experimental support encpasulating over IPv6
From: Tom Herbert @ 2017-09-25  3:29 UTC (permalink / raw)
  To: davem; +Cc: pablo, laforge, aschultz, netdev, rohit, Tom Herbert
In-Reply-To: <20170925032941.14586-1-tom@quantonium.net>

Allows using GTP datapath over IPv6. Remote peers are indicated by IPv6.

Note this is experimental, more work is needed to make this
compliant with 3GPP standard.

Signed-off-by: Tom Herbert <tom@quantonium.net>
---
 drivers/net/gtp.c            | 248 ++++++++++++++++++++++++++++++++++---------
 include/uapi/linux/gtp.h     |   1 +
 include/uapi/linux/if_link.h |   3 +
 3 files changed, 200 insertions(+), 52 deletions(-)

diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c
index 919ec6e14973..1c580df4cfc5 100644
--- a/drivers/net/gtp.c
+++ b/drivers/net/gtp.c
@@ -28,6 +28,7 @@
 #include <net/net_namespace.h>
 #include <net/protocol.h>
 #include <net/ip.h>
+#include <net/ip6_tunnel.h>
 #include <net/udp.h>
 #include <net/udp_tunnel.h>
 #include <net/icmp.h>
@@ -59,16 +60,22 @@ struct pdp_ctx {
 	__be16			gtp_port;
 
 	u16			ms_af;
+	u16			peer_af;
 #if GTP_IPV6
 	union {
 		struct in_addr	ms_addr_ip4;
 		struct in6_addr	ms_addr_ip6;
 	};
+
+	union {
+		struct in_addr	peer_addr_ip4;
+		struct in6_addr	peer_addr_ip6;
+	};
 #else
 	struct in_addr	ms_addr_ip4;
+	struct in_addr	peer_addr_ip4;
 #endif
 
-	struct in_addr		peer_addr_ip4;
 
 	struct sock		*sk;
 	struct net_device       *dev;
@@ -93,8 +100,11 @@ struct gtp_dev {
 	struct hlist_head	*tid_hash;
 
 	struct hlist_head	*addr4_hash;
+
 #if GTP_IPV6
 	struct hlist_head	*addr6_hash;
+
+	unsigned int		is_ipv6:1;
 #endif
 
 	struct gro_cells	gro_cells;
@@ -534,8 +544,6 @@ static int gtp_xmit(struct sk_buff *skb, struct net_device *dev,
 {
 	struct iphdr *inner_iph = NULL;
 	struct sock *sk = pctx->sk;
-	__be32 saddr = inet_sk(sk)->inet_saddr;
-	struct rtable *rt;
 	int err = 0;
 
 	if (skb->protocol == ETH_P_IP)
@@ -548,38 +556,84 @@ static int gtp_xmit(struct sk_buff *skb, struct net_device *dev,
 
 	skb_reset_inner_headers(skb);
 
-	/* Source address returned by route lookup is ignored since
-	 * we get the address from a socket.
-	 */
-	rt = ip_tunnel_get_route(dev, skb, sk->sk_protocol,
-				 sk->sk_bound_dev_if, RT_CONN_FLAGS(sk),
-				 pctx->peer_addr_ip4.s_addr, &saddr,
-				 pctx->gtp_port, pctx->gtp_port,
-				 &pctx->dst_cache, NULL);
-
-	if (IS_ERR(rt)) {
-		err = PTR_ERR(rt);
-		goto out_err;
-	}
+	if (pctx->peer_af == AF_INET) {
+		__be32 saddr = inet_sk(sk)->inet_saddr;
+		struct rtable *rt;
+
+		/* Source address returned by route lookup is ignored since
+		 * we get the address from a socket.
+		 */
+		rt = ip_tunnel_get_route(dev, skb, sk->sk_protocol,
+					 sk->sk_bound_dev_if, RT_CONN_FLAGS(sk),
+					 pctx->peer_addr_ip4.s_addr, &saddr,
+					 pctx->gtp_port, pctx->gtp_port,
+					 &pctx->dst_cache, NULL);
+
+		if (IS_ERR(rt)) {
+			err = PTR_ERR(rt);
+			goto out_err;
+		}
+
+		skb_dst_drop(skb);
 
-	skb_dst_drop(skb);
+		gtp_push_header(skb, pctx);
 
-	gtp_push_header(skb, pctx);
+		if (inner_iph)
+			__iptunnel_update_pmtu(dev, skb, &rt->dst,
+					       !!inner_iph->frag_off,
+					       inner_iph, pctx->hlen,
+					       pctx->peer_addr_ip4.s_addr);
 
-	if (inner_iph)
-		__iptunnel_update_pmtu(dev, skb, &rt->dst,
-				       !!inner_iph->frag_off,
-				       inner_iph, pctx->hlen,
-				       pctx->peer_addr_ip4.s_addr);
+		udp_tunnel_xmit_skb(rt, sk, skb, saddr,
+				    pctx->peer_addr_ip4.s_addr,
+				    0, ip4_dst_hoplimit(&rt->dst), 0,
+				    pctx->gtp_port, pctx->gtp_port,
+				    false, false);
 
-	udp_tunnel_xmit_skb(rt, sk, skb, saddr,
-			    pctx->peer_addr_ip4.s_addr,
-			    0, ip4_dst_hoplimit(&rt->dst), 0,
-			    pctx->gtp_port, pctx->gtp_port,
-			    false, false);
+		netdev_dbg(dev, "gtp -> IP src: %pI4 dst: %pI4\n",
+			   &saddr, &pctx->peer_addr_ip4.s_addr);
 
-	netdev_dbg(dev, "gtp -> IP src: %pI4 dst: %pI4\n",
-		   &saddr, &pctx->peer_addr_ip4.s_addr);
+#if GTP_IPV6
+#if IS_ENABLED(CONFIG_IPV6)
+	} else if (pctx->peer_af == AF_INET6) {
+		struct in6_addr saddr = inet6_sk(sk)->saddr;
+		struct dst_entry *dst;
+
+		/* Source address returned by route lookup is ignored since
+		 * we get the address from a socket.
+		 */
+		dst = ip6_tnl_get_route(dev, skb, sk, sk->sk_protocol,
+					sk->sk_bound_dev_if, 0,
+					0, &pctx->peer_addr_ip6, &saddr,
+					pctx->gtp_port, pctx->gtp_port,
+					&pctx->dst_cache, NULL);
+
+		if (IS_ERR(dst)) {
+			err = PTR_ERR(dst);
+			goto out_err;
+		}
+
+		skb_dst_drop(skb);
+
+		gtp_push_header(skb, pctx);
+
+		if (inner_iph)
+			__iptunnel_update_pmtu(dev, skb, dst,
+					       !!inner_iph->frag_off,
+					       inner_iph, pctx->hlen, 0);
+
+		udp_tunnel6_xmit_skb(dst, sk, skb, dev,
+				     &saddr, &pctx->peer_addr_ip6,
+				     0, ip6_dst_hoplimit(dst), 0,
+				     pctx->gtp_port, pctx->gtp_port,
+				     false);
+
+		netdev_dbg(dev, "gtp -> IP src: %pI6 dst: %pI6\n",
+			   &saddr, &pctx->peer_addr_ip6);
+
+#endif
+#endif
+	}
 
 	return 0;
 
@@ -688,7 +742,12 @@ static void gtp_link_setup(struct net_device *dev)
 
 	/* Assume largest header, ie. GTPv0. */
 	dev->needed_headroom	= LL_MAX_HEADER +
+#if GTP_IPV6
+				  max_t(int, sizeof(struct iphdr),
+					sizeof(struct ipv6hdr)) +
+#else
 				  sizeof(struct iphdr) +
+#endif
 				  sizeof(struct udphdr) +
 				  sizeof(struct gtp0_header);
 
@@ -697,12 +756,15 @@ static void gtp_link_setup(struct net_device *dev)
 
 static int gtp_hashtable_new(struct gtp_dev *gtp, int hsize);
 static void gtp_hashtable_free(struct gtp_dev *gtp);
-static int gtp_encap_enable(struct gtp_dev *gtp, struct nlattr *data[]);
+static int gtp_encap_enable(struct gtp_dev *gtp, struct nlattr *data[],
+			    bool is_ipv6);
 
 static int gtp_newlink(struct net *src_net, struct net_device *dev,
 		       struct nlattr *tb[], struct nlattr *data[],
 		       struct netlink_ext_ack *extack)
 {
+	unsigned int role = GTP_ROLE_GGSN;
+	bool is_ipv6 = false;
 	struct gtp_dev *gtp;
 	struct gtp_net *gn;
 	int hashsize, err;
@@ -710,9 +772,32 @@ static int gtp_newlink(struct net *src_net, struct net_device *dev,
 	if (!data[IFLA_GTP_FD0] && !data[IFLA_GTP_FD1])
 		return -EINVAL;
 
+	if (data[IFLA_GTP_ROLE]) {
+		role = nla_get_u32(data[IFLA_GTP_ROLE]);
+		if (role > GTP_ROLE_SGSN)
+			return -EINVAL;
+	}
+
+	if (data[IFLA_GTP_AF]) {
+		u16 af = nla_get_u16(data[IFLA_GTP_AF]);
+
+		switch (af) {
+		case AF_INET:
+			is_ipv6 = false;
+			break;
+#if GTP_IPV6
+		case AF_INET6:
+			is_ipv6 = true;
+			break;
+#endif
+		default:
+			return -EINVAL;
+		}
+	}
+
 	gtp = netdev_priv(dev);
 
-	err = gtp_encap_enable(gtp, data);
+	err = gtp_encap_enable(gtp, data, is_ipv6);
 	if (err < 0)
 		return err;
 
@@ -731,6 +816,11 @@ static int gtp_newlink(struct net *src_net, struct net_device *dev,
 		goto out_hashtable;
 	}
 
+	gtp->role = role;
+#if GTP_IPV6
+	gtp->is_ipv6 = is_ipv6;
+#endif
+
 	gn = net_generic(dev_net(dev), gtp_net_id);
 	list_add_rcu(&gtp->list, &gn->gtp_dev_list);
 
@@ -860,7 +950,8 @@ static void gtp_hashtable_free(struct gtp_dev *gtp)
 }
 
 static struct sock *gtp_encap_enable_socket(int fd, int type,
-					    struct gtp_dev *gtp)
+					    struct gtp_dev *gtp,
+					    bool is_ipv6)
 {
 	struct udp_tunnel_sock_cfg tuncfg = {NULL};
 	struct socket *sock;
@@ -881,6 +972,12 @@ static struct sock *gtp_encap_enable_socket(int fd, int type,
 		goto out_sock;
 	}
 
+	if (sock->sk->sk_family != (is_ipv6 ? AF_INET6 : AF_INET)) {
+		pr_debug("socket fd=%d not right family\n", fd);
+		sk = ERR_PTR(-EINVAL);
+		goto out_sock;
+	}
+
 	if (rcu_dereference_sk_user_data(sock->sk)) {
 		sk = ERR_PTR(-EBUSY);
 		goto out_sock;
@@ -913,16 +1010,16 @@ static struct sock *gtp_encap_enable_socket(int fd, int type,
 	return sk;
 }
 
-static int gtp_encap_enable(struct gtp_dev *gtp, struct nlattr *data[])
+static int gtp_encap_enable(struct gtp_dev *gtp, struct nlattr *data[],
+			    bool is_ipv6)
 {
-	struct sock *sk1u = NULL;
-	struct sock *sk0 = NULL;
-	unsigned int role = GTP_ROLE_GGSN;
+	struct sock *sk0 = NULL, *sk1u = NULL;
 
 	if (data[IFLA_GTP_FD0]) {
 		u32 fd0 = nla_get_u32(data[IFLA_GTP_FD0]);
 
-		sk0 = gtp_encap_enable_socket(fd0, UDP_ENCAP_GTP0, gtp);
+		sk0 = gtp_encap_enable_socket(fd0, UDP_ENCAP_GTP0, gtp,
+					      is_ipv6);
 		if (IS_ERR(sk0))
 			return PTR_ERR(sk0);
 	}
@@ -930,7 +1027,8 @@ static int gtp_encap_enable(struct gtp_dev *gtp, struct nlattr *data[])
 	if (data[IFLA_GTP_FD1]) {
 		u32 fd1 = nla_get_u32(data[IFLA_GTP_FD1]);
 
-		sk1u = gtp_encap_enable_socket(fd1, UDP_ENCAP_GTP1U, gtp);
+		sk1u = gtp_encap_enable_socket(fd1, UDP_ENCAP_GTP1U, gtp,
+					       is_ipv6);
 		if (IS_ERR(sk1u)) {
 			if (sk0)
 				gtp_encap_disable_sock(sk0);
@@ -938,15 +1036,8 @@ static int gtp_encap_enable(struct gtp_dev *gtp, struct nlattr *data[])
 		}
 	}
 
-	if (data[IFLA_GTP_ROLE]) {
-		role = nla_get_u32(data[IFLA_GTP_ROLE]);
-		if (role > GTP_ROLE_SGSN)
-			return -EINVAL;
-	}
-
 	gtp->sk0 = sk0;
 	gtp->sk1u = sk1u;
-	gtp->role = role;
 
 	return 0;
 }
@@ -982,8 +1073,18 @@ static void pdp_fill(struct pdp_ctx *pctx, struct genl_info *info)
 	__be16 default_port = 0;
 
 	pctx->gtp_version = nla_get_u32(info->attrs[GTPA_VERSION]);
-	pctx->peer_addr_ip4.s_addr =
-		nla_get_be32(info->attrs[GTPA_PEER_ADDRESS]);
+
+	if (info->attrs[GTPA_PEER_ADDRESS]) {
+		pctx->peer_af = AF_INET;
+		pctx->peer_addr_ip4.s_addr =
+			nla_get_in_addr(info->attrs[GTPA_PEER_ADDRESS]);
+#if GTP_IPV6
+	} else if (info->attrs[GTPA_PEER6_ADDRESS]) {
+		pctx->peer_af = AF_INET6;
+		pctx->peer_addr_ip6 = nla_get_in6_addr(
+					info->attrs[GTPA_PEER6_ADDRESS]);
+#endif
+	}
 
 	switch (pctx->gtp_version) {
 	case GTP_V0:
@@ -1162,11 +1263,17 @@ static int gtp_genl_new_pdp(struct sk_buff *skb, struct genl_info *info)
 	int err;
 
 	if (!info->attrs[GTPA_VERSION] ||
-	   !info->attrs[GTPA_LINK] ||
-	   !info->attrs[GTPA_PEER_ADDRESS])
+	    !info->attrs[GTPA_LINK])
 		return -EINVAL;
 
 #if GTP_IPV6
+	if (!(!!info->attrs[GTPA_PEER_ADDRESS] ^
+	      !!info->attrs[GTPA_PEER6_ADDRESS])) {
+		/* Either v4 or v6 peer address must be set */
+
+		return -EINVAL;
+	}
+
 	if (!(!!info->attrs[GTPA_MS_ADDRESS] ^
 	      !!info->attrs[GTPA_MS6_ADDRESS])) {
 		/* Either v4 or v6 mobile subscriber address must be set */
@@ -1174,6 +1281,12 @@ static int gtp_genl_new_pdp(struct sk_buff *skb, struct genl_info *info)
 		return -EINVAL;
 	}
 #else
+	if (!info->attrs[GTPA_PEER_ADDRESS]) {
+		/* v4 peer address must be set */
+
+		return -EINVAL;
+	}
+
 	if (!info->attrs[GTPA_MS_ADDRESS]) {
 		/* v4 mobile subscriber address must be set */
 
@@ -1207,6 +1320,14 @@ static int gtp_genl_new_pdp(struct sk_buff *skb, struct genl_info *info)
 		goto out_unlock;
 	}
 
+#if GTP_IPV6
+	if ((info->attrs[GTPA_PEER_ADDRESS] && gtp->is_ipv6) ||
+	    (info->attrs[GTPA_PEER6_ADDRESS] && !gtp->is_ipv6)) {
+		err = -EINVAL;
+		goto out_unlock;
+	}
+#endif
+
 	if (version == GTP_V0)
 		sk = gtp->sk0;
 	else if (version == GTP_V1)
@@ -1315,10 +1436,31 @@ static int gtp_genl_fill_info(struct sk_buff *skb, u32 snd_portid, u32 snd_seq,
 	if (genlh == NULL)
 		goto nlmsg_failure;
 
-	if (nla_put_u32(skb, GTPA_VERSION, pctx->gtp_version) ||
-	    nla_put_be32(skb, GTPA_PEER_ADDRESS, pctx->peer_addr_ip4.s_addr))
+	if (nla_put_u32(skb, GTPA_VERSION, pctx->gtp_version))
 		goto nla_put_failure;
 
+	if (nla_put_u32(skb, GTPA_LINK, pctx->dev->ifindex))
+		goto nla_put_failure;
+
+	switch (pctx->peer_af) {
+	case AF_INET:
+		if (nla_put_be32(skb, GTPA_PEER_ADDRESS,
+				 pctx->peer_addr_ip4.s_addr))
+			goto nla_put_failure;
+
+		break;
+#if GTP_IPV6
+	case AF_INET6:
+		if (nla_put_in6_addr(skb, GTPA_PEER6_ADDRESS,
+				     &pctx->peer_addr_ip6))
+			goto nla_put_failure;
+
+		break;
+#endif
+	default:
+		goto nla_put_failure;
+	}
+
 	switch (pctx->ms_af) {
 	case AF_INET:
 		if (nla_put_be32(skb, GTPA_MS_ADDRESS,
@@ -1448,6 +1590,8 @@ static struct nla_policy gtp_genl_policy[GTPA_MAX + 1] = {
 	[GTPA_PEER_ADDRESS]	= { .type = NLA_U32, },
 	[GTPA_MS_ADDRESS]	= { .type = NLA_U32, },
 #if GTP_IPV6
+	[GTPA_PEER6_ADDRESS]	= { .len = FIELD_SIZEOF(struct ipv6hdr,
+							daddr) },
 	[GTPA_MS6_ADDRESS]	= { .len = FIELD_SIZEOF(struct ipv6hdr,
 							daddr) },
 #endif
diff --git a/include/uapi/linux/gtp.h b/include/uapi/linux/gtp.h
index ae4e632c0360..8eec519fa754 100644
--- a/include/uapi/linux/gtp.h
+++ b/include/uapi/linux/gtp.h
@@ -29,6 +29,7 @@ enum gtp_attrs {
 	GTPA_PAD,
 	GTPA_PORT,
 	GTPA_MS6_ADDRESS,
+	GTPA_PEER6_ADDRESS,
 	__GTPA_MAX,
 };
 #define GTPA_MAX (__GTPA_MAX + 1)
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 8d062c58d5cb..81c26864abeb 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -552,6 +552,9 @@ enum {
 	IFLA_GTP_FD1,
 	IFLA_GTP_PDP_HASHSIZE,
 	IFLA_GTP_ROLE,
+	IFLA_GTP_AF,
+	IFLA_GTP_PORT0,
+	IFLA_GTP_PORT1,
 	__IFLA_GTP_MAX,
 };
 #define IFLA_GTP_MAX (__IFLA_GTP_MAX - 1)
-- 
2.11.0

^ permalink raw reply related

* [PATCH v3 net-next 10/12] gtp: Experimental encapsulation of IPv6 packets
From: Tom Herbert @ 2017-09-25  3:29 UTC (permalink / raw)
  To: davem; +Cc: pablo, laforge, aschultz, netdev, rohit, Tom Herbert
In-Reply-To: <20170925032941.14586-1-tom@quantonium.net>

Allow IPv6 mobile subscriber packets. This entails adding an IPv6 mobile
subscriber address to pdp context and IPv6 specific variants to find pdp
contexts by address.

Note that this is experimental support of IPv6, more work is
necessary to make this compliant with 3GPP standard.

Signed-off-by: Tom Herbert <tom@quantonium.net>
---
 drivers/net/Kconfig      |  12 +-
 drivers/net/gtp.c        | 324 +++++++++++++++++++++++++++++++++++++++--------
 include/uapi/linux/gtp.h |   1 +
 3 files changed, 280 insertions(+), 57 deletions(-)

diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index aba0d652095b..8e55367ab6d4 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -225,7 +225,17 @@ config GTP
 	  3GPP TS 29.060 standards.
 
 	  To compile this drivers as a module, choose M here: the module
-	  wil be called gtp.
+	  will be called gtp.
+
+config GTP_IPV6_EXPERIMENTAL
+	bool "GTP IPv6 datapath (EXPERIMENTAL)"
+	default n
+	depends on GTP
+	---help---
+	  This is an experimental implementation that allows encapsulating
+	  IPv6 over GTP and using GTP over IPv6 for testing and development
+	  purpose. This is not a standards conformant implementation for
+	  IPv6 and GTP. More work is needed reach that level.
 
 config MACSEC
 	tristate "IEEE 802.1AE MAC-level encryption (MACsec)"
diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c
index 44844eba8df2..919ec6e14973 100644
--- a/drivers/net/gtp.c
+++ b/drivers/net/gtp.c
@@ -36,6 +36,8 @@
 #include <net/netns/generic.h>
 #include <net/gtp.h>
 
+#define GTP_IPV6 IS_ENABLED(CONFIG_GTP_IPV6_EXPERIMENTAL)
+
 /* An active session for the subscriber. */
 struct pdp_ctx {
 	struct hlist_node	hlist_tid;
@@ -55,9 +57,17 @@ struct pdp_ctx {
 	u8			gtp_version;
 	u8			hlen;
 	__be16			gtp_port;
-	u16			af;
 
-	struct in_addr		ms_addr_ip4;
+	u16			ms_af;
+#if GTP_IPV6
+	union {
+		struct in_addr	ms_addr_ip4;
+		struct in6_addr	ms_addr_ip6;
+	};
+#else
+	struct in_addr	ms_addr_ip4;
+#endif
+
 	struct in_addr		peer_addr_ip4;
 
 	struct sock		*sk;
@@ -81,7 +91,11 @@ struct gtp_dev {
 	unsigned int		role;
 	unsigned int		hash_size;
 	struct hlist_head	*tid_hash;
-	struct hlist_head	*addr_hash;
+
+	struct hlist_head	*addr4_hash;
+#if GTP_IPV6
+	struct hlist_head	*addr6_hash;
+#endif
 
 	struct gro_cells	gro_cells;
 };
@@ -99,6 +113,7 @@ static void pdp_context_delete(struct pdp_ctx *pctx);
 static inline u32 gtp0_hashfn(u64 tid)
 {
 	u32 *tid32 = (u32 *) &tid;
+
 	return jhash_2words(tid32[0], tid32[1], gtp_h_initval);
 }
 
@@ -107,11 +122,6 @@ static inline u32 gtp1u_hashfn(u32 tid)
 	return jhash_1word(tid, gtp_h_initval);
 }
 
-static inline u32 ipv4_hashfn(__be32 ip)
-{
-	return jhash_1word((__force u32)ip, gtp_h_initval);
-}
-
 /* Resolve a PDP context structure based on the 64bit TID. */
 static struct pdp_ctx *gtp0_pdp_find(struct gtp_dev *gtp, u64 tid)
 {
@@ -144,16 +154,21 @@ static struct pdp_ctx *gtp1_pdp_find(struct gtp_dev *gtp, u32 tid)
 	return NULL;
 }
 
+static inline u32 gtp_ipv4_hashfn(__be32 ip)
+{
+	return jhash_1word((__force u32)ip, gtp_h_initval);
+}
+
 /* Resolve a PDP context based on IPv4 address of MS. */
 static struct pdp_ctx *ipv4_pdp_find(struct gtp_dev *gtp, __be32 ms_addr)
 {
 	struct hlist_head *head;
 	struct pdp_ctx *pdp;
 
-	head = &gtp->addr_hash[ipv4_hashfn(ms_addr) % gtp->hash_size];
+	head = &gtp->addr4_hash[gtp_ipv4_hashfn(ms_addr) % gtp->hash_size];
 
 	hlist_for_each_entry_rcu(pdp, head, hlist_addr) {
-		if (pdp->af == AF_INET &&
+		if (pdp->ms_af == AF_INET &&
 		    pdp->ms_addr_ip4.s_addr == ms_addr)
 			return pdp;
 	}
@@ -177,33 +192,109 @@ static bool gtp_check_ms_ipv4(struct sk_buff *skb, struct pdp_ctx *pctx,
 		return iph->saddr == pctx->ms_addr_ip4.s_addr;
 }
 
+#if GTP_IPV6
+
+static inline u32 gtp_ipv6_hashfn(const struct in6_addr *a)
+{
+	return __ipv6_addr_jhash(a, gtp_h_initval);
+}
+
+/* Resolve a PDP context based on IPv6 address of MS. */
+static struct pdp_ctx *ipv6_pdp_find(struct gtp_dev *gtp,
+				     const struct in6_addr *ms_addr)
+{
+	struct hlist_head *head;
+	struct pdp_ctx *pdp;
+
+	head = &gtp->addr6_hash[gtp_ipv6_hashfn(ms_addr) % gtp->hash_size];
+
+	hlist_for_each_entry_rcu(pdp, head, hlist_addr) {
+		if (pdp->ms_af == AF_INET6 &&
+		    ipv6_addr_equal(&pdp->ms_addr_ip6, ms_addr))
+			return pdp;
+	}
+
+	return NULL;
+}
+
+static bool gtp_check_ms_ipv6(struct sk_buff *skb, struct pdp_ctx *pctx,
+			      unsigned int hdrlen, unsigned int role)
+{
+	struct ipv6hdr *ipv6h;
+
+	if (!pskb_may_pull(skb, hdrlen + sizeof(struct ipv6hdr)))
+		return false;
+
+	ipv6h = (struct ipv6hdr *)(skb->data + hdrlen);
+
+	if (role == GTP_ROLE_SGSN)
+		return ipv6_addr_equal(&ipv6h->daddr, &pctx->ms_addr_ip6);
+	else
+		return ipv6_addr_equal(&ipv6h->saddr, &pctx->ms_addr_ip6);
+}
+
+#endif
+
 /* Check if the inner IP address in this packet is assigned to any
  * existing mobile subscriber.
  */
 static bool gtp_check_ms(struct sk_buff *skb, struct pdp_ctx *pctx,
 			     unsigned int hdrlen, unsigned int role)
 {
-	switch (ntohs(skb->protocol)) {
-	case ETH_P_IP:
+	struct iphdr *iph;
+
+	/* Minimally there needs to be an IPv4 header */
+	if (!pskb_may_pull(skb, hdrlen + sizeof(struct iphdr)))
+		return false;
+
+	iph = (struct iphdr *)(skb->data + hdrlen);
+
+	switch (iph->version) {
+	case 4:
 		return gtp_check_ms_ipv4(skb, pctx, hdrlen, role);
+#if GTP_IPV6
+	case 6:
+		return gtp_check_ms_ipv6(skb, pctx, hdrlen, role);
+#endif
 	}
+
 	return false;
 }
 
+static u16 ipver_to_eth(struct iphdr *iph)
+{
+	switch (iph->version) {
+	case 4:
+		return htons(ETH_P_IP);
+#if GTP_IPV6
+	case 6:
+		return htons(ETH_P_IPV6);
+#endif
+	default:
+		return 0;
+	}
+}
+
 static int gtp_rx(struct pdp_ctx *pctx, struct sk_buff *skb,
-			unsigned int hdrlen, unsigned int role)
+		  unsigned int hdrlen, unsigned int role)
 {
 	struct gtp_dev *gtp = netdev_priv(pctx->dev);
 	struct pcpu_sw_netstats *stats;
+	u16 inner_protocol;
 
 	if (!gtp_check_ms(skb, pctx, hdrlen, role)) {
 		netdev_dbg(pctx->dev, "No PDP ctx for this MS\n");
 		return 1;
 	}
 
+	inner_protocol = ipver_to_eth((struct iphdr *)(skb->data + hdrlen));
+	if (!inner_protocol)
+		return -1;
+
 	/* Get rid of the GTP + UDP headers. */
-	if (iptunnel_pull_header(skb, hdrlen, skb->protocol,
-				 !net_eq(sock_net(pctx->sk), dev_net(pctx->dev))))
+	if (iptunnel_pull_header(skb, hdrlen, inner_protocol,
+				 !net_eq(sock_net(pctx->sk),
+					 dev_net(pctx->dev))))
 		return -1;
 
 	netdev_dbg(pctx->dev, "forwarding packet from GGSN to uplink\n");
@@ -241,7 +332,8 @@ static int gtp0_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 	if (!gtp)
 		goto pass;
 
-	if (!pskb_may_pull(skb, hdrlen))
+	/* Pull through IP header since gtp_rx looks at IP version */
+	if (!pskb_may_pull(skb, hdrlen + sizeof(struct iphdr)))
 		goto drop;
 
 	gtp0 = (struct gtp0_header *)(skb->data + sizeof(struct udphdr));
@@ -287,7 +379,8 @@ static int gtp1u_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 	if (!gtp)
 		goto pass;
 
-	if (!pskb_may_pull(skb, hdrlen))
+	/* Pull through IP header since gtp_rx looks at IP version */
+	if (!pskb_may_pull(skb, hdrlen + sizeof(struct iphdr)))
 		goto drop;
 
 	gtp1 = (struct gtp1_header *)(skb->data + sizeof(struct udphdr));
@@ -309,8 +402,10 @@ static int gtp1u_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 	if (gtp1->flags & GTP1_F_MASK)
 		hdrlen += 4;
 
-	/* Make sure the header is larger enough, including extensions. */
-	if (!pskb_may_pull(skb, hdrlen))
+	/* Make sure the header is larger enough, including extensions and
+	 * also an IP header since gtp_rx looks at IP version
+	 */
+	if (!pskb_may_pull(skb, hdrlen + sizeof(struct iphdr)))
 		goto drop;
 
 	gtp1 = (struct gtp1_header *)(skb->data + sizeof(struct udphdr));
@@ -391,7 +486,8 @@ static inline void gtp0_push_header(struct sk_buff *skb, struct pdp_ctx *pctx)
 	gtp0->flags	= 0x1e; /* v0, GTP-non-prime. */
 	gtp0->type	= GTP_TPDU;
 	gtp0->length	= htons(payload_len);
-	gtp0->seq	= htons((atomic_inc_return(&pctx->tx_seq) - 1) % 0xffff);
+	gtp0->seq	= htons((atomic_inc_return(&pctx->tx_seq) - 1) %
+				0xffff);
 	gtp0->flow	= htons(pctx->u.v0.flow);
 	gtp0->number	= 0xff;
 	gtp0->spare[0]	= gtp0->spare[1] = gtp0->spare[2] = 0xff;
@@ -523,6 +619,25 @@ static netdev_tx_t gtp_dev_xmit(struct sk_buff *skb, struct net_device *dev)
 
 		break;
 	}
+#if GTP_IPV6
+	case ETH_P_IPV6: {
+		struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+
+		if (gtp->role == GTP_ROLE_SGSN)
+			pctx = ipv6_pdp_find(gtp, &ipv6h->saddr);
+		else
+			pctx = ipv6_pdp_find(gtp, &ipv6h->daddr);
+
+		if (!pctx) {
+			netdev_dbg(dev, "no PDP ctx found for %pI6, skip\n",
+				   &ipv6h->daddr);
+			err = -ENOENT;
+			goto tx_err;
+		}
+
+		break;
+	}
+#endif
 	default:
 		err = -EOPNOTSUPP;
 		goto tx_err;
@@ -692,23 +807,38 @@ static int gtp_hashtable_new(struct gtp_dev *gtp, int hsize)
 {
 	int i;
 
-	gtp->addr_hash = kmalloc(sizeof(struct hlist_head) * hsize, GFP_KERNEL);
-	if (gtp->addr_hash == NULL)
-		return -ENOMEM;
+	gtp->addr4_hash = kmalloc_array(hsize, sizeof(*gtp->addr4_hash),
+					GFP_KERNEL);
+	if (!gtp->addr4_hash)
+		goto err;
+
+#if GTP_IPV6
+	gtp->addr6_hash = kmalloc_array(hsize, sizeof(*gtp->addr6_hash),
+					GFP_KERNEL);
+	if (!gtp->addr6_hash)
+		goto err;
+#endif
 
-	gtp->tid_hash = kmalloc(sizeof(struct hlist_head) * hsize, GFP_KERNEL);
-	if (gtp->tid_hash == NULL)
-		goto err1;
+	gtp->tid_hash = kmalloc_array(hsize, sizeof(struct hlist_head),
+				      GFP_KERNEL);
+	if (!gtp->tid_hash)
+		goto err;
 
 	gtp->hash_size = hsize;
 
 	for (i = 0; i < hsize; i++) {
-		INIT_HLIST_HEAD(&gtp->addr_hash[i]);
+		INIT_HLIST_HEAD(&gtp->addr4_hash[i]);
+#if GTP_IPV6
+		INIT_HLIST_HEAD(&gtp->addr6_hash[i]);
+#endif
 		INIT_HLIST_HEAD(&gtp->tid_hash[i]);
 	}
 	return 0;
-err1:
-	kfree(gtp->addr_hash);
+err:
+	kfree(gtp->addr4_hash);
+#if GTP_IPV6
+	kfree(gtp->addr6_hash);
+#endif
 	return -ENOMEM;
 }
 
@@ -722,7 +852,10 @@ static void gtp_hashtable_free(struct gtp_dev *gtp)
 			pdp_context_delete(pctx);
 
 	synchronize_rcu();
-	kfree(gtp->addr_hash);
+	kfree(gtp->addr4_hash);
+#if GTP_IPV6
+	kfree(gtp->addr6_hash);
+#endif
 	kfree(gtp->tid_hash);
 }
 
@@ -844,16 +977,13 @@ static struct gtp_dev *gtp_find_dev(struct net *src_net, struct nlattr *nla[])
 	return gtp;
 }
 
-static void ipv4_pdp_fill(struct pdp_ctx *pctx, struct genl_info *info)
+static void pdp_fill(struct pdp_ctx *pctx, struct genl_info *info)
 {
 	__be16 default_port = 0;
 
 	pctx->gtp_version = nla_get_u32(info->attrs[GTPA_VERSION]);
-	pctx->af = AF_INET;
 	pctx->peer_addr_ip4.s_addr =
 		nla_get_be32(info->attrs[GTPA_PEER_ADDRESS]);
-	pctx->ms_addr_ip4.s_addr =
-		nla_get_be32(info->attrs[GTPA_MS_ADDRESS]);
 
 	switch (pctx->gtp_version) {
 	case GTP_V0:
@@ -882,33 +1012,59 @@ static void ipv4_pdp_fill(struct pdp_ctx *pctx, struct genl_info *info)
 		pctx->gtp_port = default_port;
 }
 
-static int ipv4_pdp_add(struct gtp_dev *gtp, struct sock *sk,
-			struct genl_info *info)
+static int gtp_pdp_add(struct gtp_dev *gtp, struct sock *sk,
+		       struct genl_info *info)
 {
 	struct net_device *dev = gtp->dev;
+	struct hlist_head *addr_list;
+	struct pdp_ctx *pctx = NULL;
 	u32 hash_ms, hash_tid = 0;
-	struct pdp_ctx *pctx;
-	bool found = false;
-	__be32 ms_addr;
+#if GTP_IPV6
+	struct in6_addr ms6_addr;
+#endif
+	__be32 ms_addr = 0;
+	int ms_af;
 	int err;
 
-	ms_addr = nla_get_be32(info->attrs[GTPA_MS_ADDRESS]);
-	hash_ms = ipv4_hashfn(ms_addr) % gtp->hash_size;
+#if GTP_IPV6
+	/* Caller ensures we have either v4 or v6 mobile subscriber address */
+	if (info->attrs[GTPA_MS_ADDRESS]) {
+		/* IPv4 mobile subscriber */
 
-	hlist_for_each_entry_rcu(pctx, &gtp->addr_hash[hash_ms], hlist_addr) {
-		if (pctx->ms_addr_ip4.s_addr == ms_addr) {
-			found = true;
-			break;
-		}
+		ms_addr = nla_get_in_addr(info->attrs[GTPA_MS_ADDRESS]);
+		hash_ms = gtp_ipv4_hashfn(ms_addr) % gtp->hash_size;
+		addr_list = &gtp->addr4_hash[hash_ms];
+		ms_af = AF_INET;
+
+		pctx = ipv4_pdp_find(gtp, ms_addr);
+	} else {
+		/* IPv6 mobile subscriber */
+
+		ms6_addr = nla_get_in6_addr(info->attrs[GTPA_MS6_ADDRESS]);
+		hash_ms = gtp_ipv6_hashfn(&ms6_addr) % gtp->hash_size;
+		addr_list = &gtp->addr6_hash[hash_ms];
+		ms_af = AF_INET6;
+
+		pctx = ipv6_pdp_find(gtp, &ms6_addr);
 	}
+#else
+	/* IPv4 mobile subscriber */
 
-	if (found) {
+	ms_addr = nla_get_in_addr(info->attrs[GTPA_MS_ADDRESS]);
+	hash_ms = gtp_ipv4_hashfn(ms_addr) % gtp->hash_size;
+	addr_list = &gtp->addr4_hash[hash_ms];
+	ms_af = AF_INET;
+
+	pctx = ipv4_pdp_find(gtp, ms_addr);
+#endif
+
+	if (pctx) {
 		if (info->nlhdr->nlmsg_flags & NLM_F_EXCL)
 			return -EEXIST;
 		if (info->nlhdr->nlmsg_flags & NLM_F_REPLACE)
 			return -EOPNOTSUPP;
 
-		ipv4_pdp_fill(pctx, info);
+		pdp_fill(pctx, info);
 
 		if (pctx->gtp_version == GTP_V0)
 			netdev_dbg(dev, "GTPv0-U: update tunnel id = %llx (pdp %p)\n",
@@ -934,7 +1090,20 @@ static int ipv4_pdp_add(struct gtp_dev *gtp, struct sock *sk,
 	sock_hold(sk);
 	pctx->sk = sk;
 	pctx->dev = gtp->dev;
-	ipv4_pdp_fill(pctx, info);
+	pctx->ms_af = ms_af;
+
+	switch (ms_af) {
+	case AF_INET:
+		pctx->ms_addr_ip4.s_addr = ms_addr;
+		break;
+#if GTP_IPV6
+	case AF_INET6:
+		pctx->ms_addr_ip6 = ms6_addr;
+		break;
+#endif
+	}
+
+	pdp_fill(pctx, info);
 	atomic_set(&pctx->tx_seq, 0);
 
 	switch (pctx->gtp_version) {
@@ -951,7 +1120,7 @@ static int ipv4_pdp_add(struct gtp_dev *gtp, struct sock *sk,
 		break;
 	}
 
-	hlist_add_head_rcu(&pctx->hlist_addr, &gtp->addr_hash[hash_ms]);
+	hlist_add_head_rcu(&pctx->hlist_addr, addr_list);
 	hlist_add_head_rcu(&pctx->hlist_tid, &gtp->tid_hash[hash_tid]);
 
 	switch (pctx->gtp_version) {
@@ -993,11 +1162,25 @@ static int gtp_genl_new_pdp(struct sk_buff *skb, struct genl_info *info)
 	int err;
 
 	if (!info->attrs[GTPA_VERSION] ||
-	    !info->attrs[GTPA_LINK] ||
-	    !info->attrs[GTPA_PEER_ADDRESS] ||
-	    !info->attrs[GTPA_MS_ADDRESS])
+	   !info->attrs[GTPA_LINK] ||
+	   !info->attrs[GTPA_PEER_ADDRESS])
 		return -EINVAL;
 
+#if GTP_IPV6
+	if (!(!!info->attrs[GTPA_MS_ADDRESS] ^
+	      !!info->attrs[GTPA_MS6_ADDRESS])) {
+		/* Either v4 or v6 mobile subscriber address must be set */
+
+		return -EINVAL;
+	}
+#else
+	if (!info->attrs[GTPA_MS_ADDRESS]) {
+		/* v4 mobile subscriber address must be set */
+
+		return -EINVAL;
+	}
+#endif
+
 	version = nla_get_u32(info->attrs[GTPA_VERSION]);
 
 	switch (version) {
@@ -1036,7 +1219,7 @@ static int gtp_genl_new_pdp(struct sk_buff *skb, struct genl_info *info)
 		goto out_unlock;
 	}
 
-	err = ipv4_pdp_add(gtp, sk, info);
+	err = gtp_pdp_add(gtp, sk, info);
 
 out_unlock:
 	rcu_read_unlock();
@@ -1056,6 +1239,13 @@ static struct pdp_ctx *gtp_find_pdp_by_link(struct net *net,
 		__be32 ip = nla_get_be32(nla[GTPA_MS_ADDRESS]);
 
 		return ipv4_pdp_find(gtp, ip);
+#if GTP_IPV6
+	} else if (nla[GTPA_MS6_ADDRESS]) {
+		struct in6_addr ip6 =
+		    nla_get_in6_addr(nla[GTPA_MS6_ADDRESS]);
+
+		return ipv6_pdp_find(gtp, &ip6);
+#endif
 	} else if (nla[GTPA_VERSION]) {
 		u32 gtp_version = nla_get_u32(nla[GTPA_VERSION]);
 
@@ -1126,9 +1316,27 @@ static int gtp_genl_fill_info(struct sk_buff *skb, u32 snd_portid, u32 snd_seq,
 		goto nlmsg_failure;
 
 	if (nla_put_u32(skb, GTPA_VERSION, pctx->gtp_version) ||
-	    nla_put_be32(skb, GTPA_PEER_ADDRESS, pctx->peer_addr_ip4.s_addr) ||
-	    nla_put_be32(skb, GTPA_MS_ADDRESS, pctx->ms_addr_ip4.s_addr))
+	    nla_put_be32(skb, GTPA_PEER_ADDRESS, pctx->peer_addr_ip4.s_addr))
+		goto nla_put_failure;
+
+	switch (pctx->ms_af) {
+	case AF_INET:
+		if (nla_put_be32(skb, GTPA_MS_ADDRESS,
+				 pctx->ms_addr_ip4.s_addr))
+			goto nla_put_failure;
+
+		break;
+#if GTP_IPV6
+	case AF_INET6:
+		if (nla_put_in6_addr(skb, GTPA_MS6_ADDRESS,
+				     &pctx->ms_addr_ip6))
+			goto nla_put_failure;
+
+		break;
+#endif
+	default:
 		goto nla_put_failure;
+	}
 
 	switch (pctx->gtp_version) {
 	case GTP_V0:
@@ -1239,6 +1447,10 @@ static struct nla_policy gtp_genl_policy[GTPA_MAX + 1] = {
 	[GTPA_TID]		= { .type = NLA_U64, },
 	[GTPA_PEER_ADDRESS]	= { .type = NLA_U32, },
 	[GTPA_MS_ADDRESS]	= { .type = NLA_U32, },
+#if GTP_IPV6
+	[GTPA_MS6_ADDRESS]	= { .len = FIELD_SIZEOF(struct ipv6hdr,
+							daddr) },
+#endif
 	[GTPA_FLOW]		= { .type = NLA_U16, },
 	[GTPA_NET_NS_FD]	= { .type = NLA_U32, },
 	[GTPA_I_TEI]		= { .type = NLA_U32, },
diff --git a/include/uapi/linux/gtp.h b/include/uapi/linux/gtp.h
index b2283a5c6d7f..ae4e632c0360 100644
--- a/include/uapi/linux/gtp.h
+++ b/include/uapi/linux/gtp.h
@@ -28,6 +28,7 @@ enum gtp_attrs {
 	GTPA_O_TEI,	/* for GTPv1 only */
 	GTPA_PAD,
 	GTPA_PORT,
+	GTPA_MS6_ADDRESS,
 	__GTPA_MAX,
 };
 #define GTPA_MAX (__GTPA_MAX + 1)
-- 
2.11.0

^ permalink raw reply related

* [PATCH v3 net-next 09/12] gtp: Eliminate pktinfo and add port configuration
From: Tom Herbert @ 2017-09-25  3:29 UTC (permalink / raw)
  To: davem; +Cc: pablo, laforge, aschultz, netdev, rohit, Tom Herbert
In-Reply-To: <20170925032941.14586-1-tom@quantonium.net>

The gtp pktinfo structure is unnecessary and needs a lot of code to
manage it. Remove it. Also, add per pdp port configuration for transmit.

Signed-off-by: Tom Herbert <tom@quantonium.net>
---
 drivers/net/gtp.c        | 177 +++++++++++++++++++++--------------------------
 include/uapi/linux/gtp.h |   1 +
 2 files changed, 80 insertions(+), 98 deletions(-)

diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c
index bbb08f8849d3..44844eba8df2 100644
--- a/drivers/net/gtp.c
+++ b/drivers/net/gtp.c
@@ -54,6 +54,7 @@ struct pdp_ctx {
 	} u;
 	u8			gtp_version;
 	u8			hlen;
+	__be16			gtp_port;
 	u16			af;
 
 	struct in_addr		ms_addr_ip4;
@@ -420,73 +421,36 @@ static inline void gtp1_push_header(struct sk_buff *skb, struct pdp_ctx *pctx)
 	 */
 }
 
-struct gtp_pktinfo {
-	struct sock		*sk;
-	struct iphdr		*iph;
-	struct flowi4		fl4;
-	struct rtable		*rt;
-	struct pdp_ctx		*pctx;
-	struct net_device	*dev;
-	__be16			gtph_port;
-};
-
-static void gtp_push_header(struct sk_buff *skb, struct gtp_pktinfo *pktinfo)
+static void gtp_push_header(struct sk_buff *skb, struct pdp_ctx *pctx)
 {
-	switch (pktinfo->pctx->gtp_version) {
+	switch (pctx->gtp_version) {
 	case GTP_V0:
-		pktinfo->gtph_port = htons(GTP0_PORT);
-		gtp0_push_header(skb, pktinfo->pctx);
+		gtp0_push_header(skb, pctx);
 		break;
 	case GTP_V1:
-		pktinfo->gtph_port = htons(GTP1U_PORT);
-		gtp1_push_header(skb, pktinfo->pctx);
+		gtp1_push_header(skb, pctx);
 		break;
 	}
 }
 
-static inline void gtp_set_pktinfo_ipv4(struct gtp_pktinfo *pktinfo,
-					struct sock *sk, struct iphdr *iph,
-					struct pdp_ctx *pctx, struct rtable *rt,
-					struct flowi4 *fl4,
-					struct net_device *dev)
-{
-	pktinfo->sk	= sk;
-	pktinfo->iph	= iph;
-	pktinfo->pctx	= pctx;
-	pktinfo->rt	= rt;
-	pktinfo->fl4	= *fl4;
-	pktinfo->dev	= dev;
-}
-
-static int gtp_build_skb_ip4(struct sk_buff *skb, struct net_device *dev,
-			     struct gtp_pktinfo *pktinfo)
+static int gtp_xmit(struct sk_buff *skb, struct net_device *dev,
+		    struct pdp_ctx *pctx)
 {
-	struct gtp_dev *gtp = netdev_priv(dev);
-	struct pdp_ctx *pctx;
+	struct iphdr *inner_iph = NULL;
+	struct sock *sk = pctx->sk;
+	__be32 saddr = inet_sk(sk)->inet_saddr;
 	struct rtable *rt;
-	struct flowi4 fl4;
-	struct iphdr *iph;
-	struct sock *sk;
-	__be32 saddr;
+	int err = 0;
 
-	/* Read the IP destination address and resolve the PDP context.
-	 * Prepend PDP header with TEI/TID from PDP ctx.
-	 */
-	iph = ip_hdr(skb);
-	if (gtp->role == GTP_ROLE_SGSN)
-		pctx = ipv4_pdp_find(gtp, iph->saddr);
-	else
-		pctx = ipv4_pdp_find(gtp, iph->daddr);
+	if (skb->protocol == ETH_P_IP)
+		inner_iph = ip_hdr(skb);
 
-	if (!pctx) {
-		netdev_dbg(dev, "no PDP ctx found for %pI4, skip\n",
-			   &iph->daddr);
-		return -ENOENT;
-	}
-	netdev_dbg(dev, "found PDP context %p\n", pctx);
+	/* Ensure there is sufficient headroom. */
+	err = skb_cow_head(skb, dev->needed_headroom);
+	if (unlikely(err))
+		goto out_err;
 
-	sk = pctx->sk;
-	saddr = inet_sk(sk)->inet_saddr;
+	skb_reset_inner_headers(skb);
 
 	/* Source address returned by route lookup is ignored since
 	 * we get the address from a socket.
@@ -494,81 +458,89 @@ static int gtp_build_skb_ip4(struct sk_buff *skb, struct net_device *dev,
 	rt = ip_tunnel_get_route(dev, skb, sk->sk_protocol,
 				 sk->sk_bound_dev_if, RT_CONN_FLAGS(sk),
 				 pctx->peer_addr_ip4.s_addr, &saddr,
-				 pktinfo->gtph_port, pktinfo->gtph_port,
+				 pctx->gtp_port, pctx->gtp_port,
 				 &pctx->dst_cache, NULL);
 
 	if (IS_ERR(rt)) {
-		if (rt == ERR_PTR(-ELOOP)) {
-			netdev_dbg(dev, "circular route to SSGN %pI4\n",
-				   &pctx->peer_addr_ip4.s_addr);
-			dev->stats.collisions++;
-			goto err_rt;
-		} else {
-			netdev_dbg(dev, "no route to SSGN %pI4\n",
-				   &pctx->peer_addr_ip4.s_addr);
-			dev->stats.tx_carrier_errors++;
-			goto err;
-		}
+		err = PTR_ERR(rt);
+		goto out_err;
 	}
 
 	skb_dst_drop(skb);
 
-	gtp_set_pktinfo_ipv4(pktinfo, sk, iph, pctx, rt, &fl4, dev);
-	gtp_push_header(skb, pktinfo);
+	gtp_push_header(skb, pctx);
+
+	if (inner_iph)
+		__iptunnel_update_pmtu(dev, skb, &rt->dst,
+				       !!inner_iph->frag_off,
+				       inner_iph, pctx->hlen,
+				       pctx->peer_addr_ip4.s_addr);
 
-	__iptunnel_update_pmtu(dev, skb, &rt->dst, !!iph->frag_off, iph,
-			       pctx->hlen, pctx->peer_addr_ip4.s_addr);
+	udp_tunnel_xmit_skb(rt, sk, skb, saddr,
+			    pctx->peer_addr_ip4.s_addr,
+			    0, ip4_dst_hoplimit(&rt->dst), 0,
+			    pctx->gtp_port, pctx->gtp_port,
+			    false, false);
+
+	netdev_dbg(dev, "gtp -> IP src: %pI4 dst: %pI4\n",
+		   &saddr, &pctx->peer_addr_ip4.s_addr);
 
 	return 0;
-err_rt:
-	ip_rt_put(rt);
-err:
-	return -EBADMSG;
+
+out_err:
+	if (err == -ELOOP)
+		dev->stats.collisions++;
+	else
+		dev->stats.tx_carrier_errors++;
+
+	return err;
 }
 
 static netdev_tx_t gtp_dev_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	unsigned int proto = ntohs(skb->protocol);
-	struct gtp_pktinfo pktinfo;
+	struct gtp_dev *gtp = netdev_priv(dev);
+	struct pdp_ctx *pctx;
 	int err;
 
-	/* Ensure there is sufficient headroom. */
-	if (skb_cow_head(skb, dev->needed_headroom))
-		goto tx_err;
-
-	skb_reset_inner_headers(skb);
-
 	/* PDP context lookups in gtp_build_skb_*() need rcu read-side lock. */
 	rcu_read_lock();
 	switch (proto) {
-	case ETH_P_IP:
-		err = gtp_build_skb_ip4(skb, dev, &pktinfo);
+	case ETH_P_IP: {
+		struct iphdr *iph = ip_hdr(skb);
+
+		if (gtp->role == GTP_ROLE_SGSN)
+			pctx = ipv4_pdp_find(gtp, iph->saddr);
+		else
+			pctx = ipv4_pdp_find(gtp, iph->daddr);
+
+		if (!pctx) {
+			netdev_dbg(dev, "no PDP ctx found for %pI4, skip\n",
+				   &iph->daddr);
+			err = -ENOENT;
+			goto tx_err;
+		}
+
 		break;
+	}
 	default:
 		err = -EOPNOTSUPP;
-		break;
+		goto tx_err;
 	}
-	rcu_read_unlock();
+
+	netdev_dbg(dev, "found PDP context %p\n", pctx);
+
+	err = gtp_xmit(skb, dev, pctx);
 
 	if (err < 0)
 		goto tx_err;
 
-	switch (proto) {
-	case ETH_P_IP:
-		netdev_dbg(pktinfo.dev, "gtp -> IP src: %pI4 dst: %pI4\n",
-			   &pktinfo.iph->saddr, &pktinfo.iph->daddr);
-		udp_tunnel_xmit_skb(pktinfo.rt, pktinfo.sk, skb,
-				    pktinfo.fl4.saddr, pktinfo.fl4.daddr,
-				    pktinfo.iph->tos,
-				    ip4_dst_hoplimit(&pktinfo.rt->dst),
-				    0,
-				    pktinfo.gtph_port, pktinfo.gtph_port,
-				    true, false);
-		break;
-	}
+	rcu_read_unlock();
 
 	return NETDEV_TX_OK;
+
 tx_err:
+	rcu_read_unlock();
 	dev->stats.tx_errors++;
 	dev_kfree_skb(skb);
 	return NETDEV_TX_OK;
@@ -874,6 +846,8 @@ static struct gtp_dev *gtp_find_dev(struct net *src_net, struct nlattr *nla[])
 
 static void ipv4_pdp_fill(struct pdp_ctx *pctx, struct genl_info *info)
 {
+	__be16 default_port = 0;
+
 	pctx->gtp_version = nla_get_u32(info->attrs[GTPA_VERSION]);
 	pctx->af = AF_INET;
 	pctx->peer_addr_ip4.s_addr =
@@ -890,15 +864,22 @@ static void ipv4_pdp_fill(struct pdp_ctx *pctx, struct genl_info *info)
 		pctx->u.v0.tid = nla_get_u64(info->attrs[GTPA_TID]);
 		pctx->u.v0.flow = nla_get_u16(info->attrs[GTPA_FLOW]);
 		pctx->hlen = sizeof(struct udphdr) + sizeof(struct gtp0_header);
+		default_port = htons(GTP0_PORT);
 		break;
 	case GTP_V1:
 		pctx->u.v1.i_tei = nla_get_u32(info->attrs[GTPA_I_TEI]);
 		pctx->u.v1.o_tei = nla_get_u32(info->attrs[GTPA_O_TEI]);
 		pctx->hlen = sizeof(struct udphdr) + sizeof(struct gtp1_header);
+		default_port = htons(GTP1U_PORT);
 		break;
 	default:
 		break;
 	}
+
+	if (info->attrs[GTPA_PORT])
+		pctx->gtp_port = nla_get_u16(info->attrs[GTPA_PORT]);
+	else
+		pctx->gtp_port = default_port;
 }
 
 static int ipv4_pdp_add(struct gtp_dev *gtp, struct sock *sk,
diff --git a/include/uapi/linux/gtp.h b/include/uapi/linux/gtp.h
index 57d1edb8efd9..b2283a5c6d7f 100644
--- a/include/uapi/linux/gtp.h
+++ b/include/uapi/linux/gtp.h
@@ -27,6 +27,7 @@ enum gtp_attrs {
 	GTPA_I_TEI,	/* for GTPv1 only */
 	GTPA_O_TEI,	/* for GTPv1 only */
 	GTPA_PAD,
+	GTPA_PORT,
 	__GTPA_MAX,
 };
 #define GTPA_MAX (__GTPA_MAX + 1)
-- 
2.11.0

^ permalink raw reply related

* [PATCH v3 net-next 08/12] gtp: Call function to update path mtu
From: Tom Herbert @ 2017-09-25  3:29 UTC (permalink / raw)
  To: davem; +Cc: pablo, laforge, aschultz, netdev, rohit, Tom Herbert
In-Reply-To: <20170925032941.14586-1-tom@quantonium.net>

Replace mtu handling with call to __iptunnel_update_pmtu.

Signed-off-by: Tom Herbert <tom@quantonium.net>
---
 drivers/net/gtp.c | 36 ++++++------------------------------
 1 file changed, 6 insertions(+), 30 deletions(-)

diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c
index a6e2e0a1f424..bbb08f8849d3 100644
--- a/drivers/net/gtp.c
+++ b/drivers/net/gtp.c
@@ -53,6 +53,7 @@ struct pdp_ctx {
 		} v1;
 	} u;
 	u8			gtp_version;
+	u8			hlen;
 	u16			af;
 
 	struct in_addr		ms_addr_ip4;
@@ -467,8 +468,6 @@ static int gtp_build_skb_ip4(struct sk_buff *skb, struct net_device *dev,
 	struct iphdr *iph;
 	struct sock *sk;
 	__be32 saddr;
-	__be16 df;
-	int mtu;
 
 	/* Read the IP destination address and resolve the PDP context.
 	 * Prepend PDP header with TEI/TID from PDP ctx.
@@ -514,37 +513,12 @@ static int gtp_build_skb_ip4(struct sk_buff *skb, struct net_device *dev,
 
 	skb_dst_drop(skb);
 
-	/* This is similar to tnl_update_pmtu(). */
-	df = iph->frag_off;
-	if (df) {
-		mtu = dst_mtu(&rt->dst) - dev->hard_header_len -
-			sizeof(struct iphdr) - sizeof(struct udphdr);
-		switch (pctx->gtp_version) {
-		case GTP_V0:
-			mtu -= sizeof(struct gtp0_header);
-			break;
-		case GTP_V1:
-			mtu -= sizeof(struct gtp1_header);
-			break;
-		}
-	} else {
-		mtu = dst_mtu(&rt->dst);
-	}
-
-	rt->dst.ops->update_pmtu(&rt->dst, NULL, skb, mtu);
-
-	if (!skb_is_gso(skb) && (iph->frag_off & htons(IP_DF)) &&
-	    mtu < ntohs(iph->tot_len)) {
-		netdev_dbg(dev, "packet too big, fragmentation needed\n");
-		memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
-		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
-			  htonl(mtu));
-		goto err_rt;
-	}
-
 	gtp_set_pktinfo_ipv4(pktinfo, sk, iph, pctx, rt, &fl4, dev);
 	gtp_push_header(skb, pktinfo);
 
+	__iptunnel_update_pmtu(dev, skb, &rt->dst, !!iph->frag_off, iph,
+			       pctx->hlen, pctx->peer_addr_ip4.s_addr);
+
 	return 0;
 err_rt:
 	ip_rt_put(rt);
@@ -915,10 +889,12 @@ static void ipv4_pdp_fill(struct pdp_ctx *pctx, struct genl_info *info)
 		 */
 		pctx->u.v0.tid = nla_get_u64(info->attrs[GTPA_TID]);
 		pctx->u.v0.flow = nla_get_u16(info->attrs[GTPA_FLOW]);
+		pctx->hlen = sizeof(struct udphdr) + sizeof(struct gtp0_header);
 		break;
 	case GTP_V1:
 		pctx->u.v1.i_tei = nla_get_u32(info->attrs[GTPA_I_TEI]);
 		pctx->u.v1.o_tei = nla_get_u32(info->attrs[GTPA_O_TEI]);
+		pctx->hlen = sizeof(struct udphdr) + sizeof(struct gtp1_header);
 		break;
 	default:
 		break;
-- 
2.11.0

^ permalink raw reply related

* [PATCH v3 net-next 07/12] gtp: udp recv clean up
From: Tom Herbert @ 2017-09-25  3:29 UTC (permalink / raw)
  To: davem; +Cc: pablo, laforge, aschultz, netdev, rohit, Tom Herbert
In-Reply-To: <20170925032941.14586-1-tom@quantonium.net>

Create separate UDP receive functions for GTP version 0 and version 1.
Set encap_rcv appropriately when configuring a socket.

Signed-off-by: Tom Herbert <tom@quantonium.net>
---
 drivers/net/gtp.c | 100 ++++++++++++++++++++++++++----------------------------
 1 file changed, 49 insertions(+), 51 deletions(-)

diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c
index 00e5ea5cb935..a6e2e0a1f424 100644
--- a/drivers/net/gtp.c
+++ b/drivers/net/gtp.c
@@ -225,14 +225,20 @@ static int gtp_rx(struct pdp_ctx *pctx, struct sk_buff *skb,
 	return 0;
 }
 
-/* 1 means pass up to the stack, -1 means drop and 0 means decapsulated. */
-static int gtp0_udp_encap_recv(struct gtp_dev *gtp, struct sk_buff *skb)
+/* UDP encapsulation receive handler for GTPv0-U . See net/ipv4/udp.c.
+ * Return codes: 0: success, <0: error, >0: pass up to userspace UDP socket.
+ */
+static int gtp0_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 {
+	struct gtp_dev *gtp = rcu_dereference_sk_user_data(sk);
 	unsigned int hdrlen = sizeof(struct udphdr) +
 			      sizeof(struct gtp0_header);
 	struct gtp0_header *gtp0;
 	struct pdp_ctx *pctx;
 
+	if (!gtp)
+		goto pass;
+
 	if (!pskb_may_pull(skb, hdrlen))
 		goto drop;
 
@@ -244,26 +250,41 @@ static int gtp0_udp_encap_recv(struct gtp_dev *gtp, struct sk_buff *skb)
 	if (gtp0->type != GTP_TPDU)
 		goto pass;
 
+	netdev_dbg(gtp->dev, "received GTP0 packet\n");
+
 	pctx = gtp0_pdp_find(gtp, be64_to_cpu(gtp0->tid));
 	if (!pctx) {
 		netdev_dbg(gtp->dev, "No PDP ctx to decap skb=%p\n", skb);
 		goto pass;
 	}
 
-	return gtp_rx(pctx, skb, hdrlen, gtp->role);
+	if (!gtp_rx(pctx, skb, hdrlen, gtp->role)) {
+		/* Successfully received */
+		return 0;
+	}
+
 drop:
-	return -1;
+	kfree_skb(skb);
+	return 0;
+
 pass:
 	return 1;
 }
 
-static int gtp1u_udp_encap_recv(struct gtp_dev *gtp, struct sk_buff *skb)
+/* UDP encapsulation receive handler for GTPv0-U . See net/ipv4/udp.c.
+ * Return codes: 0: success, <0: error, >0: pass up to userspace UDP socket.
+ */
+static int gtp1u_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 {
+	struct gtp_dev *gtp = rcu_dereference_sk_user_data(sk);
 	unsigned int hdrlen = sizeof(struct udphdr) +
 			      sizeof(struct gtp1_header);
 	struct gtp1_header *gtp1;
 	struct pdp_ctx *pctx;
 
+	if (!gtp)
+		goto pass;
+
 	if (!pskb_may_pull(skb, hdrlen))
 		goto drop;
 
@@ -275,6 +296,8 @@ static int gtp1u_udp_encap_recv(struct gtp_dev *gtp, struct sk_buff *skb)
 	if (gtp1->type != GTP_TPDU)
 		goto pass;
 
+	netdev_dbg(gtp->dev, "received GTP1 packet\n");
+
 	/* From 29.060: "This field shall be present if and only if any one or
 	 * more of the S, PN and E flags are set.".
 	 *
@@ -296,9 +319,15 @@ static int gtp1u_udp_encap_recv(struct gtp_dev *gtp, struct sk_buff *skb)
 		goto drop;
 	}
 
-	return gtp_rx(pctx, skb, hdrlen, gtp->role);
+	if (!gtp_rx(pctx, skb, hdrlen, gtp->role)) {
+		/* Successfully received */
+		return 0;
+	}
+
 drop:
-	return -1;
+	kfree_skb(skb);
+	return 0;
+
 pass:
 	return 1;
 }
@@ -329,49 +358,6 @@ static void gtp_encap_disable(struct gtp_dev *gtp)
 	gtp_encap_disable_sock(gtp->sk1u);
 }
 
-/* UDP encapsulation receive handler. See net/ipv4/udp.c.
- * Return codes: 0: success, <0: error, >0: pass up to userspace UDP socket.
- */
-static int gtp_encap_recv(struct sock *sk, struct sk_buff *skb)
-{
-	struct gtp_dev *gtp;
-	int ret = 0;
-
-	gtp = rcu_dereference_sk_user_data(sk);
-	if (!gtp)
-		return 1;
-
-	netdev_dbg(gtp->dev, "encap_recv sk=%p\n", sk);
-
-	switch (udp_sk(sk)->encap_type) {
-	case UDP_ENCAP_GTP0:
-		netdev_dbg(gtp->dev, "received GTP0 packet\n");
-		ret = gtp0_udp_encap_recv(gtp, skb);
-		break;
-	case UDP_ENCAP_GTP1U:
-		netdev_dbg(gtp->dev, "received GTP1U packet\n");
-		ret = gtp1u_udp_encap_recv(gtp, skb);
-		break;
-	default:
-		ret = -1; /* Shouldn't happen. */
-	}
-
-	switch (ret) {
-	case 1:
-		netdev_dbg(gtp->dev, "pass up to the process\n");
-		break;
-	case 0:
-		break;
-	case -1:
-		netdev_dbg(gtp->dev, "GTP packet has been dropped\n");
-		kfree_skb(skb);
-		ret = 0;
-		break;
-	}
-
-	return ret;
-}
-
 static int gtp_dev_init(struct net_device *dev)
 {
 	struct gtp_dev *gtp = netdev_priv(dev);
@@ -824,9 +810,21 @@ static struct sock *gtp_encap_enable_socket(int fd, int type,
 	sk = sock->sk;
 	sock_hold(sk);
 
+	switch (type) {
+	case UDP_ENCAP_GTP0:
+		tuncfg.encap_rcv = gtp0_udp_encap_recv;
+		break;
+	case UDP_ENCAP_GTP1U:
+		tuncfg.encap_rcv = gtp1u_udp_encap_recv;
+		break;
+	default:
+		pr_debug("Unknown encap type %u\n", type);
+		sk = ERR_PTR(-EINVAL);
+		goto out_sock;
+	}
+
 	tuncfg.sk_user_data = gtp;
 	tuncfg.encap_type = type;
-	tuncfg.encap_rcv = gtp_encap_recv;
 	tuncfg.encap_destroy = gtp_encap_destroy;
 
 	setup_udp_tunnel_sock(sock_net(sock->sk), sock, &tuncfg);
-- 
2.11.0

^ permalink raw reply related

* [PATCH v3 net-next 06/12] gtp: Use goto for exceptions in gtp_udp_encap_recv funcs
From: Tom Herbert @ 2017-09-25  3:29 UTC (permalink / raw)
  To: davem; +Cc: pablo, laforge, aschultz, netdev, rohit, Tom Herbert
In-Reply-To: <20170925032941.14586-1-tom@quantonium.net>

Consolidate return logic to make it easier to extend.

Signed-off-by: Tom Herbert <tom@quantonium.net>
---
 drivers/net/gtp.c | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c
index f2aac5d01143..00e5ea5cb935 100644
--- a/drivers/net/gtp.c
+++ b/drivers/net/gtp.c
@@ -234,23 +234,27 @@ static int gtp0_udp_encap_recv(struct gtp_dev *gtp, struct sk_buff *skb)
 	struct pdp_ctx *pctx;
 
 	if (!pskb_may_pull(skb, hdrlen))
-		return -1;
+		goto drop;
 
 	gtp0 = (struct gtp0_header *)(skb->data + sizeof(struct udphdr));
 
 	if ((gtp0->flags >> 5) != GTP_V0)
-		return 1;
+		goto pass;
 
 	if (gtp0->type != GTP_TPDU)
-		return 1;
+		goto pass;
 
 	pctx = gtp0_pdp_find(gtp, be64_to_cpu(gtp0->tid));
 	if (!pctx) {
 		netdev_dbg(gtp->dev, "No PDP ctx to decap skb=%p\n", skb);
-		return 1;
+		goto pass;
 	}
 
 	return gtp_rx(pctx, skb, hdrlen, gtp->role);
+drop:
+	return -1;
+pass:
+	return 1;
 }
 
 static int gtp1u_udp_encap_recv(struct gtp_dev *gtp, struct sk_buff *skb)
@@ -261,15 +265,15 @@ static int gtp1u_udp_encap_recv(struct gtp_dev *gtp, struct sk_buff *skb)
 	struct pdp_ctx *pctx;
 
 	if (!pskb_may_pull(skb, hdrlen))
-		return -1;
+		goto drop;
 
 	gtp1 = (struct gtp1_header *)(skb->data + sizeof(struct udphdr));
 
 	if ((gtp1->flags >> 5) != GTP_V1)
-		return 1;
+		goto pass;
 
 	if (gtp1->type != GTP_TPDU)
-		return 1;
+		goto pass;
 
 	/* From 29.060: "This field shall be present if and only if any one or
 	 * more of the S, PN and E flags are set.".
@@ -282,17 +286,21 @@ static int gtp1u_udp_encap_recv(struct gtp_dev *gtp, struct sk_buff *skb)
 
 	/* Make sure the header is larger enough, including extensions. */
 	if (!pskb_may_pull(skb, hdrlen))
-		return -1;
+		goto drop;
 
 	gtp1 = (struct gtp1_header *)(skb->data + sizeof(struct udphdr));
 
 	pctx = gtp1_pdp_find(gtp, ntohl(gtp1->tid));
 	if (!pctx) {
 		netdev_dbg(gtp->dev, "No PDP ctx to decap skb=%p\n", skb);
-		return 1;
+		goto drop;
 	}
 
 	return gtp_rx(pctx, skb, hdrlen, gtp->role);
+drop:
+	return -1;
+pass:
+	return 1;
 }
 
 static void gtp_encap_destroy(struct sock *sk)
-- 
2.11.0

^ permalink raw reply related

* [PATCH v3 net-next 05/12] gtp: Change to use gro_cells
From: Tom Herbert @ 2017-09-25  3:29 UTC (permalink / raw)
  To: davem; +Cc: pablo, laforge, aschultz, netdev, rohit, Tom Herbert
In-Reply-To: <20170925032941.14586-1-tom@quantonium.net>

Call gro_cells_receive instead of netif_rx.

Signed-off-by: Tom Herbert <tom@quantonium.net>
---
 drivers/net/gtp.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c
index 6dabd605607c..f2aac5d01143 100644
--- a/drivers/net/gtp.c
+++ b/drivers/net/gtp.c
@@ -80,6 +80,8 @@ struct gtp_dev {
 	unsigned int		hash_size;
 	struct hlist_head	*tid_hash;
 	struct hlist_head	*addr_hash;
+
+	struct gro_cells	gro_cells;
 };
 
 static unsigned int gtp_net_id __read_mostly;
@@ -189,6 +191,7 @@ static bool gtp_check_ms(struct sk_buff *skb, struct pdp_ctx *pctx,
 static int gtp_rx(struct pdp_ctx *pctx, struct sk_buff *skb,
 			unsigned int hdrlen, unsigned int role)
 {
+	struct gtp_dev *gtp = netdev_priv(pctx->dev);
 	struct pcpu_sw_netstats *stats;
 
 	if (!gtp_check_ms(skb, pctx, hdrlen, role)) {
@@ -217,7 +220,8 @@ static int gtp_rx(struct pdp_ctx *pctx, struct sk_buff *skb,
 	stats->rx_bytes += skb->len;
 	u64_stats_update_end(&stats->syncp);
 
-	netif_rx(skb);
+	gro_cells_receive(&gtp->gro_cells, skb);
+
 	return 0;
 }
 
@@ -611,6 +615,8 @@ static const struct net_device_ops gtp_netdev_ops = {
 
 static void gtp_link_setup(struct net_device *dev)
 {
+	struct gtp_dev *gtp = netdev_priv(dev);
+
 	dev->netdev_ops		= &gtp_netdev_ops;
 	dev->needs_free_netdev	= true;
 
@@ -630,6 +636,8 @@ static void gtp_link_setup(struct net_device *dev)
 				  sizeof(struct iphdr) +
 				  sizeof(struct udphdr) +
 				  sizeof(struct gtp0_header);
+
+	gro_cells_init(&gtp->gro_cells, dev);
 }
 
 static int gtp_hashtable_new(struct gtp_dev *gtp, int hsize);
@@ -686,6 +694,7 @@ static void gtp_dellink(struct net_device *dev, struct list_head *head)
 {
 	struct gtp_dev *gtp = netdev_priv(dev);
 
+	gro_cells_destroy(&gtp->gro_cells);
 	gtp_encap_disable(gtp);
 	gtp_hashtable_free(gtp);
 	list_del_rcu(&gtp->list);
-- 
2.11.0

^ permalink raw reply related

* [PATCH v3 net-next 04/12] iptunnel: Generalize tunnel update pmtu
From: Tom Herbert @ 2017-09-25  3:29 UTC (permalink / raw)
  To: davem; +Cc: pablo, laforge, aschultz, netdev, rohit, Tom Herbert
In-Reply-To: <20170925032941.14586-1-tom@quantonium.net>

Add __iptunnel_update_pmtu exported function which does not take
an iptunnel argument but instead includes the fields from the
iptunnel structure as arguments which are needed in the function.

iptunnel_update_pmtu was modified to call __iptunnel_update_pmtu.

Signed-off-by: Tom Herbert <tom@quantonium.net>
---
 include/net/ip_tunnels.h |  4 ++++
 net/ipv4/ip_tunnel.c     | 30 ++++++++++++++++++++----------
 2 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 9650efff33d7..880c9ea5b08c 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -318,6 +318,10 @@ static inline struct rtable *ip_tunnel_get_route(struct net_device *dev,
 				     dst_cache, info, use_cache);
 }
 
+int __iptunnel_update_pmtu(struct net_device *dev, struct sk_buff *skb,
+			   struct dst_entry *dst, __be16 df,
+			   const struct iphdr *inner_iph, int hlen, u32 daddr);
+
 struct ip_tunnel_encap_ops {
 	size_t (*encap_hlen)(struct ip_tunnel_encap *e);
 	int (*build_header)(struct sk_buff *skb, struct ip_tunnel_encap *e,
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index ea8f8bc0aaf9..31d6dc9f6859 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -506,17 +506,16 @@ int ip_tunnel_encap_setup(struct ip_tunnel *t,
 }
 EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
 
-static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
-			    struct rtable *rt, __be16 df,
-			    const struct iphdr *inner_iph)
+int __iptunnel_update_pmtu(struct net_device *dev, struct sk_buff *skb,
+			   struct dst_entry *dst, __be16 df,
+			   const struct iphdr *inner_iph, int hlen, u32 daddr)
 {
-	struct ip_tunnel *tunnel = netdev_priv(dev);
-	int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
+	int pkt_size = skb->len - hlen - dev->hard_header_len;
 	int mtu;
 
 	if (df)
-		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
-					- sizeof(struct iphdr) - tunnel->hlen;
+		mtu = dst_mtu(dst) - dev->hard_header_len
+				   - sizeof(struct iphdr) - hlen;
 	else
 		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
 
@@ -538,8 +537,7 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
 
 		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
 			   mtu >= IPV6_MIN_MTU) {
-			if ((tunnel->parms.iph.daddr &&
-			    !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
+			if ((daddr && !ipv4_is_multicast(daddr)) ||
 			    rt6->rt6i_dst.plen == 128) {
 				rt6->rt6i_flags |= RTF_MODIFIED;
 				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
@@ -555,6 +553,17 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
 #endif
 	return 0;
 }
+EXPORT_SYMBOL(__iptunnel_update_pmtu);
+
+static int iptunnel_update_pmtu(struct net_device *dev, struct sk_buff *skb,
+				struct rtable *rt, __be16 df,
+				const struct iphdr *inner_iph)
+{
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+
+	return __iptunnel_update_pmtu(dev, skb, &rt->dst, df, inner_iph,
+				      tunnel->hlen, tunnel->parms.iph.daddr);
+}
 
 void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto)
 {
@@ -739,7 +748,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 		goto tx_error;
 	}
 
-	if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph)) {
+	if (iptunnel_update_pmtu(dev, skb, rt, tnl_params->frag_off,
+				 inner_iph)) {
 		ip_rt_put(rt);
 		goto tx_error;
 	}
-- 
2.11.0

^ permalink raw reply related

* [PATCH v3 net-next 03/12] gtp: Call common functions to get tunnel routes and add dst_cache
From: Tom Herbert @ 2017-09-25  3:29 UTC (permalink / raw)
  To: davem; +Cc: pablo, laforge, aschultz, netdev, rohit, Tom Herbert
In-Reply-To: <20170925032941.14586-1-tom@quantonium.net>

Call ip_tunnel_get_route and dst_cache to pdp context which should
improve performance by obviating the need to perform a route lookup
on every packet.

Signed-off-by: Tom Herbert <tom@quantonium.net>
---
 drivers/net/gtp.c | 62 +++++++++++++++++++++++++++++++------------------------
 1 file changed, 35 insertions(+), 27 deletions(-)

diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c
index f38e32a7ec9c..6dabd605607c 100644
--- a/drivers/net/gtp.c
+++ b/drivers/net/gtp.c
@@ -63,6 +63,8 @@ struct pdp_ctx {
 
 	atomic_t		tx_seq;
 	struct rcu_head		rcu_head;
+
+	struct dst_cache	dst_cache;
 };
 
 /* One instance of the GTP device. */
@@ -379,20 +381,6 @@ static void gtp_dev_uninit(struct net_device *dev)
 	free_percpu(dev->tstats);
 }
 
-static struct rtable *ip4_route_output_gtp(struct flowi4 *fl4,
-					   const struct sock *sk,
-					   __be32 daddr)
-{
-	memset(fl4, 0, sizeof(*fl4));
-	fl4->flowi4_oif		= sk->sk_bound_dev_if;
-	fl4->daddr		= daddr;
-	fl4->saddr		= inet_sk(sk)->inet_saddr;
-	fl4->flowi4_tos		= RT_CONN_FLAGS(sk);
-	fl4->flowi4_proto	= sk->sk_protocol;
-
-	return ip_route_output_key(sock_net(sk), fl4);
-}
-
 static inline void gtp0_push_header(struct sk_buff *skb, struct pdp_ctx *pctx)
 {
 	int payload_len = skb->len;
@@ -479,6 +467,8 @@ static int gtp_build_skb_ip4(struct sk_buff *skb, struct net_device *dev,
 	struct rtable *rt;
 	struct flowi4 fl4;
 	struct iphdr *iph;
+	struct sock *sk;
+	__be32 saddr;
 	__be16 df;
 	int mtu;
 
@@ -498,19 +488,30 @@ static int gtp_build_skb_ip4(struct sk_buff *skb, struct net_device *dev,
 	}
 	netdev_dbg(dev, "found PDP context %p\n", pctx);
 
-	rt = ip4_route_output_gtp(&fl4, pctx->sk, pctx->peer_addr_ip4.s_addr);
-	if (IS_ERR(rt)) {
-		netdev_dbg(dev, "no route to SSGN %pI4\n",
-			   &pctx->peer_addr_ip4.s_addr);
-		dev->stats.tx_carrier_errors++;
-		goto err;
-	}
+	sk = pctx->sk;
+	saddr = inet_sk(sk)->inet_saddr;
 
-	if (rt->dst.dev == dev) {
-		netdev_dbg(dev, "circular route to SSGN %pI4\n",
-			   &pctx->peer_addr_ip4.s_addr);
-		dev->stats.collisions++;
-		goto err_rt;
+	/* Source address returned by route lookup is ignored since
+	 * we get the address from a socket.
+	 */
+	rt = ip_tunnel_get_route(dev, skb, sk->sk_protocol,
+				 sk->sk_bound_dev_if, RT_CONN_FLAGS(sk),
+				 pctx->peer_addr_ip4.s_addr, &saddr,
+				 pktinfo->gtph_port, pktinfo->gtph_port,
+				 &pctx->dst_cache, NULL);
+
+	if (IS_ERR(rt)) {
+		if (rt == ERR_PTR(-ELOOP)) {
+			netdev_dbg(dev, "circular route to SSGN %pI4\n",
+				   &pctx->peer_addr_ip4.s_addr);
+			dev->stats.collisions++;
+			goto err_rt;
+		} else {
+			netdev_dbg(dev, "no route to SSGN %pI4\n",
+				   &pctx->peer_addr_ip4.s_addr);
+			dev->stats.tx_carrier_errors++;
+			goto err;
+		}
 	}
 
 	skb_dst_drop(skb);
@@ -543,7 +544,7 @@ static int gtp_build_skb_ip4(struct sk_buff *skb, struct net_device *dev,
 		goto err_rt;
 	}
 
-	gtp_set_pktinfo_ipv4(pktinfo, pctx->sk, iph, pctx, rt, &fl4, dev);
+	gtp_set_pktinfo_ipv4(pktinfo, sk, iph, pctx, rt, &fl4, dev);
 	gtp_push_header(skb, pktinfo);
 
 	return 0;
@@ -917,6 +918,7 @@ static int ipv4_pdp_add(struct gtp_dev *gtp, struct sock *sk,
 	struct pdp_ctx *pctx;
 	bool found = false;
 	__be32 ms_addr;
+	int err;
 
 	ms_addr = nla_get_be32(info->attrs[GTPA_MS_ADDRESS]);
 	hash_ms = ipv4_hashfn(ms_addr) % gtp->hash_size;
@@ -951,6 +953,12 @@ static int ipv4_pdp_add(struct gtp_dev *gtp, struct sock *sk,
 	if (pctx == NULL)
 		return -ENOMEM;
 
+	err = dst_cache_init(&pctx->dst_cache, GFP_KERNEL);
+	if (err) {
+		kfree(pctx);
+		return err;
+	}
+
 	sock_hold(sk);
 	pctx->sk = sk;
 	pctx->dev = gtp->dev;
-- 
2.11.0

^ permalink raw reply related

* [PATCH v3 net-next 02/12] vxlan: Call common functions to get tunnel routes
From: Tom Herbert @ 2017-09-25  3:29 UTC (permalink / raw)
  To: davem; +Cc: pablo, laforge, aschultz, netdev, rohit, Tom Herbert
In-Reply-To: <20170925032941.14586-1-tom@quantonium.net>

Call ip_tunnel_get_route and ip6_tnl_get_route to handle getting a route
and dealing with the dst_cache.

Signed-off-by: Tom Herbert <tom@quantonium.net>
---
 drivers/net/vxlan.c | 84 ++++-------------------------------------------------
 1 file changed, 5 insertions(+), 79 deletions(-)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index d7c49cf1d5e9..810caa9adf37 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1867,47 +1867,11 @@ static struct rtable *vxlan_get_route(struct vxlan_dev *vxlan, struct net_device
 				      struct dst_cache *dst_cache,
 				      const struct ip_tunnel_info *info)
 {
-	bool use_cache = ip_tunnel_dst_cache_usable(skb, info);
-	struct rtable *rt = NULL;
-	struct flowi4 fl4;
-
 	if (!sock4)
 		return ERR_PTR(-EIO);
 
-	if (tos && !info)
-		use_cache = false;
-	if (use_cache) {
-		rt = dst_cache_get_ip4(dst_cache, saddr);
-		if (rt)
-			return rt;
-	}
-
-	memset(&fl4, 0, sizeof(fl4));
-	fl4.flowi4_oif = oif;
-	fl4.flowi4_tos = RT_TOS(tos);
-	fl4.flowi4_mark = skb->mark;
-	fl4.flowi4_proto = IPPROTO_UDP;
-	fl4.daddr = daddr;
-	fl4.saddr = *saddr;
-	fl4.fl4_dport = dport;
-	fl4.fl4_sport = sport;
-
-	rt = ip_route_output_key(vxlan->net, &fl4);
-	if (likely(!IS_ERR(rt))) {
-		if (rt->dst.dev == dev) {
-			netdev_dbg(dev, "circular route to %pI4\n", &daddr);
-			ip_rt_put(rt);
-			return ERR_PTR(-ELOOP);
-		}
-
-		*saddr = fl4.saddr;
-		if (use_cache)
-			dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
-	} else {
-		netdev_dbg(dev, "no route to %pI4\n", &daddr);
-		return ERR_PTR(-ENETUNREACH);
-	}
-	return rt;
+	return ip_tunnel_get_route(dev, skb, IPPROTO_UDP, oif, tos, daddr,
+				   saddr, dport, sport, dst_cache, info);
 }
 
 #if IS_ENABLED(CONFIG_IPV6)
@@ -1922,50 +1886,12 @@ static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan,
 					  struct dst_cache *dst_cache,
 					  const struct ip_tunnel_info *info)
 {
-	bool use_cache = ip_tunnel_dst_cache_usable(skb, info);
-	struct dst_entry *ndst;
-	struct flowi6 fl6;
-	int err;
-
 	if (!sock6)
 		return ERR_PTR(-EIO);
 
-	if (tos && !info)
-		use_cache = false;
-	if (use_cache) {
-		ndst = dst_cache_get_ip6(dst_cache, saddr);
-		if (ndst)
-			return ndst;
-	}
-
-	memset(&fl6, 0, sizeof(fl6));
-	fl6.flowi6_oif = oif;
-	fl6.daddr = *daddr;
-	fl6.saddr = *saddr;
-	fl6.flowlabel = ip6_make_flowinfo(RT_TOS(tos), label);
-	fl6.flowi6_mark = skb->mark;
-	fl6.flowi6_proto = IPPROTO_UDP;
-	fl6.fl6_dport = dport;
-	fl6.fl6_sport = sport;
-
-	err = ipv6_stub->ipv6_dst_lookup(vxlan->net,
-					 sock6->sock->sk,
-					 &ndst, &fl6);
-	if (unlikely(err < 0)) {
-		netdev_dbg(dev, "no route to %pI6\n", daddr);
-		return ERR_PTR(-ENETUNREACH);
-	}
-
-	if (unlikely(ndst->dev == dev)) {
-		netdev_dbg(dev, "circular route to %pI6\n", daddr);
-		dst_release(ndst);
-		return ERR_PTR(-ELOOP);
-	}
-
-	*saddr = fl6.saddr;
-	if (use_cache)
-		dst_cache_set_ip6(dst_cache, ndst, saddr);
-	return ndst;
+	return ip6_tnl_get_route(dev, skb, sock6->sock->sk, IPPROTO_UDP, oif,
+				   tos, label, daddr, saddr, dport, sport,
+				   dst_cache, info);
 }
 #endif
 
-- 
2.11.0

^ permalink raw reply related

* [PATCH v3 net-next 01/12] iptunnel: Add common functions to get a tunnel route
From: Tom Herbert @ 2017-09-25  3:29 UTC (permalink / raw)
  To: davem; +Cc: pablo, laforge, aschultz, netdev, rohit, Tom Herbert
In-Reply-To: <20170925032941.14586-1-tom@quantonium.net>

ip_tunnel_get_route and ip6_tnl_get_route are created to return
routes for a tunnel. These functions are derived from the VXLAN
functions.

Signed-off-by: Tom Herbert <tom@quantonium.net>
---
 include/net/ip6_tunnel.h | 35 +++++++++++++++++++++++++++++++++++
 include/net/ip_tunnels.h | 33 +++++++++++++++++++++++++++++++++
 net/ipv4/ip_tunnel.c     | 41 +++++++++++++++++++++++++++++++++++++++++
 net/ipv6/ip6_tunnel.c    | 43 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 152 insertions(+)

diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h
index 08fbc7f7d8d7..5a67301b0416 100644
--- a/include/net/ip6_tunnel.h
+++ b/include/net/ip6_tunnel.h
@@ -142,6 +142,41 @@ __u32 ip6_tnl_get_cap(struct ip6_tnl *t, const struct in6_addr *laddr,
 struct net *ip6_tnl_get_link_net(const struct net_device *dev);
 int ip6_tnl_get_iflink(const struct net_device *dev);
 int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu);
+struct dst_entry *__ip6_tnl_get_route(struct net_device *dev,
+				      struct sk_buff *skb, struct sock *sk,
+				      u8 proto, int oif, u8 tos, __be32 label,
+				      const struct in6_addr *daddr,
+				      struct in6_addr *saddr,
+				      __be16 dport, __be16 sport,
+				      struct dst_cache *dst_cache,
+				      const struct ip_tunnel_info *info,
+				      bool use_cache);
+
+static inline struct dst_entry *ip6_tnl_get_route(struct net_device *dev,
+			struct sk_buff *skb, struct sock *sk, u8 proto,
+			int oif, u8 tos, __be32 label,
+			const struct in6_addr *daddr,
+			struct in6_addr *saddr,
+			__be16 dport, __be16 sport,
+			struct dst_cache *dst_cache,
+			const struct ip_tunnel_info *info)
+{
+	 bool use_cache = (ip_tunnel_dst_cache_usable(skb, info) &&
+		(!tos || info));
+
+#if IS_ENABLED(CONFIG_IPV6)
+	if (use_cache) {
+		struct dst_entry *ndst = dst_cache_get_ip6(dst_cache, saddr);
+
+		if (ndst)
+			return ndst;
+	}
+#endif
+
+	return __ip6_tnl_get_route(dev, skb, sk, proto, oif, tos, label,
+				   daddr, saddr, dport, sport, dst_cache,
+				   info, use_cache);
+}
 
 static inline void ip6tunnel_xmit(struct sock *sk, struct sk_buff *skb,
 				  struct net_device *dev)
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index b41a1e057fce..9650efff33d7 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -285,6 +285,39 @@ int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
 		      struct ip_tunnel_parm *p, __u32 fwmark);
 void ip_tunnel_setup(struct net_device *dev, unsigned int net_id);
 
+struct rtable *__ip_tunnel_get_route(struct net_device *dev,
+				     struct sk_buff *skb, u8 proto,
+				     int oif, u8 tos,
+				     __be32 daddr, __be32 *saddr,
+				     __be16 dport, __be16 sport,
+				     struct dst_cache *dst_cache,
+				     const struct ip_tunnel_info *info,
+				     bool use_cache);
+
+static inline struct rtable *ip_tunnel_get_route(struct net_device *dev,
+				     struct sk_buff *skb, u8 proto,
+				     int oif, u8 tos,
+				     __be32 daddr, __be32 *saddr,
+				     __be16 dport, __be16 sport,
+				     struct dst_cache *dst_cache,
+				     const struct ip_tunnel_info *info)
+{
+	bool use_cache = (ip_tunnel_dst_cache_usable(skb, info) &&
+		(!tos || info));
+
+	if (use_cache) {
+		struct rtable *rt;
+
+		rt = dst_cache_get_ip4(dst_cache, saddr);
+		if (rt)
+			return rt;
+	}
+
+	return __ip_tunnel_get_route(dev, skb, proto, oif, tos,
+				     daddr, saddr, dport, sport,
+				     dst_cache, info, use_cache);
+}
+
 struct ip_tunnel_encap_ops {
 	size_t (*encap_hlen)(struct ip_tunnel_encap *e);
 	int (*build_header)(struct sk_buff *skb, struct ip_tunnel_encap *e,
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index fe6fee728ce4..ea8f8bc0aaf9 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -935,6 +935,47 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
 }
 EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
 
+struct rtable *__ip_tunnel_get_route(struct net_device *dev,
+				     struct sk_buff *skb, u8 proto,
+				    int oif, u8 tos,
+				     __be32 daddr, __be32 *saddr,
+				     __be16 dport, __be16 sport,
+				     struct dst_cache *dst_cache,
+				     const struct ip_tunnel_info *info,
+				     bool use_cache)
+{
+	struct rtable *rt = NULL;
+	struct flowi4 fl4;
+
+	memset(&fl4, 0, sizeof(fl4));
+	fl4.flowi4_oif = oif;
+	fl4.flowi4_tos = RT_TOS(tos);
+	fl4.flowi4_mark = skb->mark;
+	fl4.flowi4_proto = proto;
+	fl4.daddr = daddr;
+	fl4.saddr = *saddr;
+	fl4.fl4_dport = dport;
+	fl4.fl4_sport = sport;
+
+	rt = ip_route_output_key(dev_net(dev), &fl4);
+	if (likely(!IS_ERR(rt))) {
+		if (rt->dst.dev == dev) {
+			netdev_dbg(dev, "circular route to %pI4\n", &daddr);
+			ip_rt_put(rt);
+			return ERR_PTR(-ELOOP);
+		}
+
+		*saddr = fl4.saddr;
+		if (use_cache)
+			dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
+	} else {
+		netdev_dbg(dev, "no route to %pI4\n", &daddr);
+		return ERR_PTR(-ENETUNREACH);
+	}
+	return rt;
+}
+EXPORT_SYMBOL_GPL(__ip_tunnel_get_route);
+
 int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 3d6df489b39f..a541daea1379 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1663,6 +1663,49 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 	return err;
 }
 
+struct dst_entry *__ip6_tnl_get_route(struct net_device *dev,
+				      struct sk_buff *skb, struct sock *sk,
+				      u8 proto, int oif, u8 tos, __be32 label,
+				      const struct in6_addr *daddr,
+				      struct in6_addr *saddr,
+				      __be16 dport, __be16 sport,
+				      struct dst_cache *dst_cache,
+				      const struct ip_tunnel_info *info,
+				      bool use_cache)
+{
+	struct dst_entry *ndst;
+	struct flowi6 fl6;
+	int err;
+
+	memset(&fl6, 0, sizeof(fl6));
+	fl6.flowi6_oif = oif;
+	fl6.daddr = *daddr;
+	fl6.saddr = *saddr;
+	fl6.flowlabel = ip6_make_flowinfo(RT_TOS(tos), label);
+	fl6.flowi6_mark = skb->mark;
+	fl6.flowi6_proto = proto;
+	fl6.fl6_dport = dport;
+	fl6.fl6_sport = sport;
+
+	err = ipv6_stub->ipv6_dst_lookup(dev_net(dev), sk, &ndst, &fl6);
+	if (unlikely(err < 0)) {
+		netdev_dbg(dev, "no route to %pI6\n", daddr);
+		return ERR_PTR(-ENETUNREACH);
+	}
+
+	if (unlikely(ndst->dev == dev)) {
+		netdev_dbg(dev, "circular route to %pI6\n", daddr);
+		dst_release(ndst);
+		return ERR_PTR(-ELOOP);
+	}
+
+	*saddr = fl6.saddr;
+	if (use_cache)
+		dst_cache_set_ip6(dst_cache, ndst, saddr);
+	return ndst;
+}
+EXPORT_SYMBOL_GPL(__ip6_tnl_get_route);
+
 /**
  * ip6_tnl_change_mtu - change mtu manually for tunnel device
  *   @dev: virtual device associated with tunnel
-- 
2.11.0

^ permalink raw reply related

* [PATCH v3 net-next 00/12] gtp: Additional feature support - Part I
From: Tom Herbert @ 2017-09-25  3:29 UTC (permalink / raw)
  To: davem; +Cc: pablo, laforge, aschultz, netdev, rohit, Tom Herbert

This patch set builds upon the initial GTP implementation to make
support closer to that enjoyed by other encapsulation protocols.

The major items are:

  - Experimental IPv6 support
  - Configurable networking interfaces so that GTP kernel can be
    used and tested without needing GSN network emulation (i.e. no user
    space daemon needed).
  - Addition of a dst_cache in the GTP structure and other cleanup

Additionally, this patch set also includes:

  - Common functions to get a route fo for an IP tunnel

For IPv6 support, the mobile subscriber needs to allow IPv6 addresses,
and the remote endpoint can be IPv6.

For configurable interfaces, configuration is added to allow an
alternate means to configure a GTP and device. This follows the
typical UDP encapsulation model of specifying a listener port for
receive, and a remote address and port for transmit. 

Configuration is performed by iproute2/ip. I will post that
in a subsequent patch set.

Tested:

Configured the matrix of IPv4/IPv6 mobile subscriber, IPv4/IPv6 remote
peer, and GTP version 0 and 1 (eight combinations). Observed
connectivity and functional netperf. Also, tested VXLAN for
regression.

Test using openggs with ggsn and kernel module on one side and
emulated sgsn on the other. Observed connectivity and
functional netperf.

v2:
  - Split the otiginal patch to post in parts in order to make
    review more manageable
  - Make IPv6 support experimental with a configuration option for it
  - Prepend hash functions with gtp
  - Generalize iptunnel update path MTU function and call it from gtp
    instead using custom code
  - Split original patch cleaning up udp_recv into several for easier
    review
v3: Properly include netdev on cc

Tom Herbert (12):
  iptunnel: Add common functions to get a tunnel route
  vxlan: Call common functions to get tunnel routes
  gtp: Call common functions to get tunnel routes and add dst_cache
  iptunnel: Generalize tunnel update pmtu
  gtp: Change to use gro_cells
  gtp: Use goto for exceptions in gtp_udp_encap_recv funcs
  gtp: udp recv clean up
  gtp: Call function to update path mtu
  gtp: Eliminate pktinfo and add port configuration
  gtp: Experimental encapsulation of IPv6 packets
  gtp: Experimental support encpasulating over IPv6
  gtp: Allow configuring GTP interface as standalone

 drivers/net/Kconfig          |   12 +-
 drivers/net/gtp.c            | 1043 ++++++++++++++++++++++++++++++------------
 drivers/net/vxlan.c          |   84 +---
 include/net/ip6_tunnel.h     |   35 ++
 include/net/ip_tunnels.h     |   37 ++
 include/uapi/linux/gtp.h     |    8 +
 include/uapi/linux/if_link.h |    3 +
 net/ipv4/ip_tunnel.c         |   71 ++-
 net/ipv6/ip6_tunnel.c        |   43 ++
 9 files changed, 949 insertions(+), 387 deletions(-)

-- 
2.11.0

^ permalink raw reply

* Re: [PATCH net-next] sch_netem: faster rb tree removal
From: David Ahern @ 2017-09-25  2:05 UTC (permalink / raw)
  To: Eric Dumazet, David Miller; +Cc: netdev, Stephen Hemminger
In-Reply-To: <b175f7ab-12b6-061a-4233-ba53641ef99b@gmail.com>

On 9/24/17 7:57 PM, David Ahern wrote:
> On 9/23/17 12:07 PM, Eric Dumazet wrote:
>> From: Eric Dumazet <edumazet@google.com>
>>
>> While running TCP tests involving netem storing millions of packets,
>> I had the idea to speed up tfifo_reset() and did experiments.
>>
>> I tried the rbtree_postorder_for_each_entry_safe() method that is
>> used in skb_rbtree_purge() but discovered it was slower than the
>> current tfifo_reset() method.
>>
>> I measured time taken to release skbs with three occupation levels :
>> 10^4, 10^5 and 10^6 skbs with three methods :
>>
>> 1) (current 'naive' method)
>>
>> 	while ((p = rb_first(&q->t_root))) {
>> 		struct sk_buff *skb = netem_rb_to_skb(p);
>>  
>> 		rb_erase(p, &q->t_root);
>> 		rtnl_kfree_skbs(skb, skb);
>> 	}
>>
>> 2) Use rb_next() instead of rb_first() in the loop :
>>
>> 	p = rb_first(&q->t_root);
>> 	while (p) {
>> 		struct sk_buff *skb = netem_rb_to_skb(p);
>>
>> 		p = rb_next(p);
>> 		rb_erase(&skb->rbnode, &q->t_root);
>> 		rtnl_kfree_skbs(skb, skb);
>> 	}
>>
>> 3) "optimized" method using rbtree_postorder_for_each_entry_safe()
>>
>> 	struct sk_buff *skb, *next;
>>
>> 	rbtree_postorder_for_each_entry_safe(skb, next,
>> 					     &q->t_root, rbnode) {
>>                rtnl_kfree_skbs(skb, skb);
>> 	}
>> 	q->t_root = RB_ROOT;
>>
>> Results :
>>
>> method_1:while (rb_first()) rb_erase() 10000 skbs in 690378 ns (69 ns per skb)
>> method_2:rb_first; while (p) { p = rb_next(p); ...}  10000 skbs in 541846 ns (54 ns per skb)
>> method_3:rbtree_postorder_for_each_entry_safe() 10000 skbs in 868307 ns (86 ns per skb)
>>
>> method_1:while (rb_first()) rb_erase() 99996 skbs in 7804021 ns (78 ns per skb)
>> method_2:rb_first; while (p) { p = rb_next(p); ...}  100000 skbs in 5942456 ns (59 ns per skb)
>> method_3:rbtree_postorder_for_each_entry_safe() 100000 skbs in 11584940 ns (115 ns per skb)
>>
>> method_1:while (rb_first()) rb_erase() 1000000 skbs in 108577838 ns (108 ns per skb)
>> method_2:rb_first; while (p) { p = rb_next(p); ...}  1000000 skbs in 82619635 ns (82 ns per skb)
>> method_3:rbtree_postorder_for_each_entry_safe() 1000000 skbs in 127328743 ns (127 ns per skb)
>>
>> Method 2) is simply faster, probably because it maintains a smaller
>> working size set.
>>
>> Note that this is the method we use in tcp_ofo_queue() already.
>>
>> I will also change skb_rbtree_purge() in a second patch.
>>
>> Signed-off-by: Eric Dumazet <edumazet@google.com>
>> ---
>>  net/sched/sch_netem.c |    7 ++++---
>>  1 file changed, 4 insertions(+), 3 deletions(-)
>>
>> diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
>> index 063a4bdb9ee6f26b01387959e8f6ccd15ec16191..5a4f1008029068372019a965186e7a3c0a18aac3 100644
>> --- a/net/sched/sch_netem.c
>> +++ b/net/sched/sch_netem.c
>> @@ -361,12 +361,13 @@ static psched_time_t packet_len_2_sched_time(unsigned int len, struct netem_sche
>>  static void tfifo_reset(struct Qdisc *sch)
>>  {
>>  	struct netem_sched_data *q = qdisc_priv(sch);
>> -	struct rb_node *p;
>> +	struct rb_node *p = rb_first(&q->t_root);
>>  
>> -	while ((p = rb_first(&q->t_root))) {
>> +	while (p) {
>>  		struct sk_buff *skb = netem_rb_to_skb(p);
>>  
>> -		rb_erase(p, &q->t_root);
>> +		p = rb_next(p);
>> +		rb_erase(&skb->rbnode, &q->t_root);
>>  		rtnl_kfree_skbs(skb, skb);
>>  	}
>>  }
>>
>>
> 
> Hi Eric:
> 
> I'm guessing the cost is in the rb_first and rb_next computations. Did
> you consider something like this:
> 
>         struct rb_root *root
>         struct rb_node **p = &root->rb_node;
> 
>         while (*p != NULL) {
>                 struct foobar *fb;
> 
>                 fb = container_of(*p, struct foobar, rb_node);
>                 // fb processing
		  rb_erase(&nh->rb_node, root);

>                 p = &root->rb_node;
>         }
> 

Oops, dropped the rb_erase in my consolidating the code to this snippet.

^ permalink raw reply

* Re: [PATCH net-next RFC 2/5] vhost: introduce helper to prefetch desc index
From: Jason Wang @ 2017-09-25  2:04 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: mst, virtualization, netdev, linux-kernel, kvm
In-Reply-To: <20170922090257.GB9243@stefanha-x1.localdomain>



On 2017年09月22日 17:02, Stefan Hajnoczi wrote:
> On Fri, Sep 22, 2017 at 04:02:32PM +0800, Jason Wang wrote:
>> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
>> index f87ec75..8424166d 100644
>> --- a/drivers/vhost/vhost.c
>> +++ b/drivers/vhost/vhost.c
>> @@ -2437,6 +2437,61 @@ struct vhost_msg_node *vhost_dequeue_msg(struct vhost_dev *dev,
>>   }
>>   EXPORT_SYMBOL_GPL(vhost_dequeue_msg);
>>   
>> +int vhost_prefetch_desc_indices(struct vhost_virtqueue *vq,
>> +				struct vring_used_elem *heads,
>> +				u16 num, bool used_update)
> Missing doc comment.

Will fix this.

>
>> +{
>> +	int ret, ret2;
>> +	u16 last_avail_idx, last_used_idx, total, copied;
>> +	__virtio16 avail_idx;
>> +	struct vring_used_elem __user *used;
>> +	int i;
> The following variable names are a little confusing:
>
> last_avail_idx vs vq->last_avail_idx.  last_avail_idx is a wrapped
> avail->ring[] index, vq->last_avail_idx is a free-running counter.  The
> same for last_used_idx vs vq->last_used_idx.
>
> num argument vs vq->num.  The argument could be called nheads instead to
> make it clear that this is heads[] and not the virtqueue size.
>
> Not a bug but it took me a while to figure out what was going on.

I admit the name is confusing. Let me try better ones in V2.

Thanks

^ permalink raw reply

* Re: [PATCH net-next RFC 1/5] vhost: split out ring head fetching logic
From: Jason Wang @ 2017-09-25  2:03 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: mst, virtualization, netdev, linux-kernel, kvm
In-Reply-To: <20170922083157.GA9243@stefanha-x1.localdomain>



On 2017年09月22日 16:31, Stefan Hajnoczi wrote:
> On Fri, Sep 22, 2017 at 04:02:31PM +0800, Jason Wang wrote:
>> +/* This looks in the virtqueue and for the first available buffer, and converts
>> + * it to an iovec for convenient access.  Since descriptors consist of some
>> + * number of output then some number of input descriptors, it's actually two
>> + * iovecs, but we pack them into one and note how many of each there were.
>> + *
>> + * This function returns the descriptor number found, or vq->num (which is
>> + * never a valid descriptor number) if none was found.  A negative code is
>> + * returned on error. */
>> +int __vhost_get_vq_desc(struct vhost_virtqueue *vq,
>> +			struct iovec iov[], unsigned int iov_size,
>> +			unsigned int *out_num, unsigned int *in_num,
>> +			struct vhost_log *log, unsigned int *log_num,
>> +			__virtio16 head)
> [...]
>> +int vhost_get_vq_desc(struct vhost_virtqueue *vq,
>> +		      struct iovec iov[], unsigned int iov_size,
>> +		      unsigned int *out_num, unsigned int *in_num,
>> +		      struct vhost_log *log, unsigned int *log_num)
> Please document vhost_get_vq_desc().
>
> Please also explain the difference between __vhost_get_vq_desc() and
> vhost_get_vq_desc() in the documentation.

Right, will document this in next version.

Thanks

^ permalink raw reply

* Re: [PATCH net-next] sch_netem: faster rb tree removal
From: David Ahern @ 2017-09-25  1:57 UTC (permalink / raw)
  To: Eric Dumazet, David Miller; +Cc: netdev, Stephen Hemminger
In-Reply-To: <1506190048.29839.206.camel@edumazet-glaptop3.roam.corp.google.com>

On 9/23/17 12:07 PM, Eric Dumazet wrote:
> From: Eric Dumazet <edumazet@google.com>
> 
> While running TCP tests involving netem storing millions of packets,
> I had the idea to speed up tfifo_reset() and did experiments.
> 
> I tried the rbtree_postorder_for_each_entry_safe() method that is
> used in skb_rbtree_purge() but discovered it was slower than the
> current tfifo_reset() method.
> 
> I measured time taken to release skbs with three occupation levels :
> 10^4, 10^5 and 10^6 skbs with three methods :
> 
> 1) (current 'naive' method)
> 
> 	while ((p = rb_first(&q->t_root))) {
> 		struct sk_buff *skb = netem_rb_to_skb(p);
>  
> 		rb_erase(p, &q->t_root);
> 		rtnl_kfree_skbs(skb, skb);
> 	}
> 
> 2) Use rb_next() instead of rb_first() in the loop :
> 
> 	p = rb_first(&q->t_root);
> 	while (p) {
> 		struct sk_buff *skb = netem_rb_to_skb(p);
> 
> 		p = rb_next(p);
> 		rb_erase(&skb->rbnode, &q->t_root);
> 		rtnl_kfree_skbs(skb, skb);
> 	}
> 
> 3) "optimized" method using rbtree_postorder_for_each_entry_safe()
> 
> 	struct sk_buff *skb, *next;
> 
> 	rbtree_postorder_for_each_entry_safe(skb, next,
> 					     &q->t_root, rbnode) {
>                rtnl_kfree_skbs(skb, skb);
> 	}
> 	q->t_root = RB_ROOT;
> 
> Results :
> 
> method_1:while (rb_first()) rb_erase() 10000 skbs in 690378 ns (69 ns per skb)
> method_2:rb_first; while (p) { p = rb_next(p); ...}  10000 skbs in 541846 ns (54 ns per skb)
> method_3:rbtree_postorder_for_each_entry_safe() 10000 skbs in 868307 ns (86 ns per skb)
> 
> method_1:while (rb_first()) rb_erase() 99996 skbs in 7804021 ns (78 ns per skb)
> method_2:rb_first; while (p) { p = rb_next(p); ...}  100000 skbs in 5942456 ns (59 ns per skb)
> method_3:rbtree_postorder_for_each_entry_safe() 100000 skbs in 11584940 ns (115 ns per skb)
> 
> method_1:while (rb_first()) rb_erase() 1000000 skbs in 108577838 ns (108 ns per skb)
> method_2:rb_first; while (p) { p = rb_next(p); ...}  1000000 skbs in 82619635 ns (82 ns per skb)
> method_3:rbtree_postorder_for_each_entry_safe() 1000000 skbs in 127328743 ns (127 ns per skb)
> 
> Method 2) is simply faster, probably because it maintains a smaller
> working size set.
> 
> Note that this is the method we use in tcp_ofo_queue() already.
> 
> I will also change skb_rbtree_purge() in a second patch.
> 
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> ---
>  net/sched/sch_netem.c |    7 ++++---
>  1 file changed, 4 insertions(+), 3 deletions(-)
> 
> diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
> index 063a4bdb9ee6f26b01387959e8f6ccd15ec16191..5a4f1008029068372019a965186e7a3c0a18aac3 100644
> --- a/net/sched/sch_netem.c
> +++ b/net/sched/sch_netem.c
> @@ -361,12 +361,13 @@ static psched_time_t packet_len_2_sched_time(unsigned int len, struct netem_sche
>  static void tfifo_reset(struct Qdisc *sch)
>  {
>  	struct netem_sched_data *q = qdisc_priv(sch);
> -	struct rb_node *p;
> +	struct rb_node *p = rb_first(&q->t_root);
>  
> -	while ((p = rb_first(&q->t_root))) {
> +	while (p) {
>  		struct sk_buff *skb = netem_rb_to_skb(p);
>  
> -		rb_erase(p, &q->t_root);
> +		p = rb_next(p);
> +		rb_erase(&skb->rbnode, &q->t_root);
>  		rtnl_kfree_skbs(skb, skb);
>  	}
>  }
> 
> 

Hi Eric:

I'm guessing the cost is in the rb_first and rb_next computations. Did
you consider something like this:

        struct rb_root *root
        struct rb_node **p = &root->rb_node;

        while (*p != NULL) {
                struct foobar *fb;

                fb = container_of(*p, struct foobar, rb_node);
                // fb processing

                p = &root->rb_node;
        }

^ permalink raw reply

* Re: [patch net-next v2 07/12] mlxsw: spectrum: Add the multicast routing offloading logic
From: Yunsheng Lin @ 2017-09-25  1:48 UTC (permalink / raw)
  To: Jiri Pirko, netdev; +Cc: davem, yotamg, idosch, mlxsw, nikolay, andrew
In-Reply-To: <20170924172212.10096-8-jiri@resnulli.us>

Hi, Jiri

On 2017/9/25 1:22, Jiri Pirko wrote:
> From: Yotam Gigi <yotamg@mellanox.com>
> 
> Add the multicast router offloading logic, which is in charge of handling
> the VIF and MFC notifications and translating it to the hardware logic API.
> 
> The offloading logic has to overcome several obstacles in order to safely
> comply with the kernel multicast router user API:
>  - It must keep track of the mapping between VIFs to netdevices. The user
>    can add an MFC cache entry pointing to a VIF, delete the VIF and add
>    re-add it with a different netdevice. The offloading logic has to handle
>    this in order to be compatible with the kernel logic.
>  - It must keep track of the mapping between netdevices to spectrum RIFs,
>    as the current hardware implementation assume having a RIF for every
>    port in a multicast router.
>  - It must handle routes pointing to pimreg device to be trapped to the
>    kernel, as the packet should be delivered to userspace.
>  - It must handle routes pointing tunnel VIFs. The current implementation
>    does not support multicast forwarding to tunnels, thus routes that point
>    to a tunnel should be trapped to the kernel.
>  - It must be aware of proxy multicast routes, which include both (*,*)
>    routes and duplicate routes. Currently proxy routes are not offloaded
>    and trigger the abort mechanism: removal of all routes from hardware and
>    triggering the traffic to go through the kernel.
> 
> The multicast routing offloading logic also updates the counters of the
> offloaded MFC routes in a periodic work.
> 
> Signed-off-by: Yotam Gigi <yotamg@mellanox.com>
> Reviewed-by: Ido Schimmel <idosch@mellanox.com>
> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
> ---
> v1->v2:
>  - Update the lastuse MFC entry field too, in addition to packets an bytes.
> ---
>  drivers/net/ethernet/mellanox/mlxsw/Makefile      |    3 +-
>  drivers/net/ethernet/mellanox/mlxsw/spectrum.h    |    1 +
>  drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c | 1014 +++++++++++++++++++++
>  drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.h |  133 +++
>  4 files changed, 1150 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c
>  create mode 100644 drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.h
> 
> diff --git a/drivers/net/ethernet/mellanox/mlxsw/Makefile b/drivers/net/ethernet/mellanox/mlxsw/Makefile
> index 4b88158..9b29764 100644
> --- a/drivers/net/ethernet/mellanox/mlxsw/Makefile
> +++ b/drivers/net/ethernet/mellanox/mlxsw/Makefile
> @@ -17,7 +17,8 @@ mlxsw_spectrum-objs		:= spectrum.o spectrum_buffers.o \
>  				   spectrum_kvdl.o spectrum_acl_tcam.o \
>  				   spectrum_acl.o spectrum_flower.o \
>  				   spectrum_cnt.o spectrum_fid.o \
> -				   spectrum_ipip.o spectrum_acl_flex_actions.o
> +				   spectrum_ipip.o spectrum_acl_flex_actions.o \
> +				   spectrum_mr.o
>  mlxsw_spectrum-$(CONFIG_MLXSW_SPECTRUM_DCB)	+= spectrum_dcb.o
>  mlxsw_spectrum-$(CONFIG_NET_DEVLINK) += spectrum_dpipe.o
>  obj-$(CONFIG_MLXSW_MINIMAL)	+= mlxsw_minimal.o
> diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
> index e907ec4..51d8b9f 100644
> --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
> +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
> @@ -153,6 +153,7 @@ struct mlxsw_sp {
>  	struct mlxsw_sp_sb *sb;
>  	struct mlxsw_sp_bridge *bridge;
>  	struct mlxsw_sp_router *router;
> +	struct mlxsw_sp_mr *mr;
>  	struct mlxsw_afa *afa;
>  	struct mlxsw_sp_acl *acl;
>  	struct mlxsw_sp_fid_core *fid_core;
> diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c
> new file mode 100644
> index 0000000..89b2e60
> --- /dev/null
> +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c
> @@ -0,0 +1,1014 @@
> +/*
> + * drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c
> + * Copyright (c) 2017 Mellanox Technologies. All rights reserved.
> + * Copyright (c) 2017 Yotam Gigi <yotamg@mellanox.com>
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions are met:
> + *
> + * 1. Redistributions of source code must retain the above copyright
> + *    notice, this list of conditions and the following disclaimer.
> + * 2. Redistributions in binary form must reproduce the above copyright
> + *    notice, this list of conditions and the following disclaimer in the
> + *    documentation and/or other materials provided with the distribution.
> + * 3. Neither the names of the copyright holders nor the names of its
> + *    contributors may be used to endorse or promote products derived from
> + *    this software without specific prior written permission.
> + *
> + * Alternatively, this software may be distributed under the terms of the
> + * GNU General Public License ("GPL") version 2 as published by the Free
> + * Software Foundation.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
> + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
> + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
> + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
> + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
> + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
> + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
> + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
> + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
> + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
> + * POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#include <linux/rhashtable.h>
> +
> +#include "spectrum_mr.h"
> +#include "spectrum_router.h"
> +
> +struct mlxsw_sp_mr {
> +	const struct mlxsw_sp_mr_ops *mr_ops;
> +	void *catchall_route_priv;
> +	struct delayed_work stats_update_dw;
> +	struct list_head table_list;
> +#define MLXSW_SP_MR_ROUTES_COUNTER_UPDATE_INTERVAL 5000 /* ms */
> +	unsigned long priv[0];
> +	/* priv has to be always the last item */
> +};
> +
> +struct mlxsw_sp_mr_vif {
> +	struct net_device *dev;
> +	const struct mlxsw_sp_rif *rif;
> +	unsigned long vif_flags;
> +
> +	/* A list of route_vif_entry structs that point to routes that the VIF
> +	 * instance is used as one of the egress VIFs
> +	 */
> +	struct list_head route_evif_list;
> +
> +	/* A list of route_vif_entry structs that point to routes that the VIF
> +	 * instance is used as an ingress VIF
> +	 */
> +	struct list_head route_ivif_list;
> +};
> +
> +struct mlxsw_sp_mr_route_vif_entry {
> +	struct list_head vif_node;
> +	struct list_head route_node;
> +	struct mlxsw_sp_mr_vif *mr_vif;
> +	struct mlxsw_sp_mr_route *mr_route;
> +};
> +
> +struct mlxsw_sp_mr_table {
> +	struct list_head node;
> +	enum mlxsw_sp_l3proto proto;
> +	struct mlxsw_sp *mlxsw_sp;
> +	u32 vr_id;
> +	struct mlxsw_sp_mr_vif vifs[MAXVIFS];
> +	struct list_head route_list;
> +	struct rhashtable route_ht;
> +	char catchall_route_priv[0];
> +	/* catchall_route_priv has to be always the last item */
> +};
> +
> +struct mlxsw_sp_mr_route {
> +	struct list_head node;
> +	struct rhash_head ht_node;
> +	struct mlxsw_sp_mr_route_key key;
> +	enum mlxsw_sp_mr_route_action route_action;
> +	u16 min_mtu;
> +	struct mfc_cache *mfc4;
> +	void *route_priv;
> +	const struct mlxsw_sp_mr_table *mr_table;
> +	/* A list of route_vif_entry structs that point to the egress VIFs */
> +	struct list_head evif_list;
> +	/* A route_vif_entry struct that point to the ingress VIF */
> +	struct mlxsw_sp_mr_route_vif_entry ivif;
> +};
> +
> +static const struct rhashtable_params mlxsw_sp_mr_route_ht_params = {
> +	.key_len = sizeof(struct mlxsw_sp_mr_route_key),
> +	.key_offset = offsetof(struct mlxsw_sp_mr_route, key),
> +	.head_offset = offsetof(struct mlxsw_sp_mr_route, ht_node),
> +	.automatic_shrinking = true,
> +};
> +
> +static bool mlxsw_sp_mr_vif_regular(const struct mlxsw_sp_mr_vif *vif)
> +{
> +	return !(vif->vif_flags & (VIFF_TUNNEL | VIFF_REGISTER));
> +}
> +
> +static bool mlxsw_sp_mr_vif_valid(const struct mlxsw_sp_mr_vif *vif)
> +{
> +	return mlxsw_sp_mr_vif_regular(vif) && vif->dev && vif->rif;
> +}
> +
> +static bool mlxsw_sp_mr_vif_rif_invalid(const struct mlxsw_sp_mr_vif *vif)
> +{
> +	return mlxsw_sp_mr_vif_regular(vif) && vif->dev && !vif->rif;
> +}
> +
> +static bool
> +mlxsw_sp_mr_route_ivif_in_evifs(const struct mlxsw_sp_mr_route *mr_route)
> +{
> +	vifi_t ivif;
> +
> +	switch (mr_route->mr_table->proto) {
> +	case MLXSW_SP_L3_PROTO_IPV4:
> +		ivif = mr_route->mfc4->mfc_parent;
> +		return mr_route->mfc4->mfc_un.res.ttls[ivif] != 255;
> +	case MLXSW_SP_L3_PROTO_IPV6:
> +		/* fall through */
> +	default:
> +		WARN_ON_ONCE(1);
> +	}
> +	return false;
> +}
> +
> +static int
> +mlxsw_sp_mr_route_valid_evifs_num(const struct mlxsw_sp_mr_route *mr_route)
> +{
> +	struct mlxsw_sp_mr_route_vif_entry *rve;
> +	int valid_evifs = 0;
> +
> +	valid_evifs = 0;

you are doing valid_evifs = 0 twice.

> +	list_for_each_entry(rve, &mr_route->evif_list, route_node)
> +		if (mlxsw_sp_mr_vif_valid(rve->mr_vif))
> +			valid_evifs++;
> +	return valid_evifs;
> +}
> +
> +static bool mlxsw_sp_mr_route_starg(const struct mlxsw_sp_mr_route *mr_route)
> +{
> +	switch (mr_route->mr_table->proto) {
> +	case MLXSW_SP_L3_PROTO_IPV4:
> +		return mr_route->key.source_mask.addr4 == INADDR_ANY;
> +	case MLXSW_SP_L3_PROTO_IPV6:
> +		/* fall through */
> +	default:
> +		WARN_ON_ONCE(1);
> +	}
> +	return false;
> +}
> +
> +static enum mlxsw_sp_mr_route_action
> +mlxsw_sp_mr_route_action(const struct mlxsw_sp_mr_route *mr_route)
> +{
> +	struct mlxsw_sp_mr_route_vif_entry *rve;
> +
> +	/* If the ingress port is not regular and resolved, trap the route */
> +	if (!mlxsw_sp_mr_vif_valid(mr_route->ivif.mr_vif))
> +		return MLXSW_SP_MR_ROUTE_ACTION_TRAP;
> +
> +	/* The kernel does not match a (*,G) route that the ingress interface is
> +	 * not one of the egress interfaces, so trap these kind of routes.
> +	 */
> +	if (mlxsw_sp_mr_route_starg(mr_route) &&
> +	    !mlxsw_sp_mr_route_ivif_in_evifs(mr_route))
> +		return MLXSW_SP_MR_ROUTE_ACTION_TRAP;
> +
> +	/* If the route has no valid eVIFs, trap it. */
> +	if (!mlxsw_sp_mr_route_valid_evifs_num(mr_route))
> +		return MLXSW_SP_MR_ROUTE_ACTION_TRAP;
> +
> +	/* If either one of the eVIFs is not regular (VIF of type pimreg or
> +	 * tunnel) or one of the VIFs has no matching RIF, trap the packet.
> +	 */
> +	list_for_each_entry(rve, &mr_route->evif_list, route_node) {
> +		if (!mlxsw_sp_mr_vif_regular(rve->mr_vif) ||
> +		    mlxsw_sp_mr_vif_rif_invalid(rve->mr_vif))
> +			return MLXSW_SP_MR_ROUTE_ACTION_TRAP;
> +	}
> +	return MLXSW_SP_MR_ROUTE_ACTION_FORWARD;
> +}
> +
> +static enum mlxsw_sp_mr_route_prio
> +mlxsw_sp_mr_route_prio(const struct mlxsw_sp_mr_route *mr_route)
> +{
> +	return mlxsw_sp_mr_route_starg(mr_route) ?
> +		MLXSW_SP_MR_ROUTE_PRIO_STARG : MLXSW_SP_MR_ROUTE_PRIO_SG;
> +}
> +
> +static void mlxsw_sp_mr_route4_key(struct mlxsw_sp_mr_table *mr_table,
> +				   struct mlxsw_sp_mr_route_key *key,
> +				   const struct mfc_cache *mfc)
> +{
> +	bool starg = (mfc->mfc_origin == INADDR_ANY);
> +
> +	memset(key, 0, sizeof(*key));
> +	key->vrid = mr_table->vr_id;
> +	key->proto = mr_table->proto;
> +	key->group.addr4 = mfc->mfc_mcastgrp;
> +	key->group_mask.addr4 = 0xffffffff;
> +	key->source.addr4 = mfc->mfc_origin;
> +	key->source_mask.addr4 = starg ? 0 : 0xffffffff;
> +}
> +
> +static int mlxsw_sp_mr_route_evif_link(struct mlxsw_sp_mr_route *mr_route,
> +				       struct mlxsw_sp_mr_vif *mr_vif)
> +{
> +	struct mlxsw_sp_mr_route_vif_entry *rve;
> +
> +	rve = kzalloc(sizeof(*rve), GFP_KERNEL);
> +	if (!rve)
> +		return -ENOMEM;
> +	rve->mr_route = mr_route;
> +	rve->mr_vif = mr_vif;
> +	list_add_tail(&rve->route_node, &mr_route->evif_list);
> +	list_add_tail(&rve->vif_node, &mr_vif->route_evif_list);
> +	return 0;
> +}
> +
> +static void
> +mlxsw_sp_mr_route_evif_unlink(struct mlxsw_sp_mr_route_vif_entry *rve)
> +{
> +	list_del(&rve->route_node);
> +	list_del(&rve->vif_node);
> +	kfree(rve);
> +}
> +
> +static void mlxsw_sp_mr_route_ivif_link(struct mlxsw_sp_mr_route *mr_route,
> +					struct mlxsw_sp_mr_vif *mr_vif)
> +{
> +	mr_route->ivif.mr_route = mr_route;
> +	mr_route->ivif.mr_vif = mr_vif;
> +	list_add_tail(&mr_route->ivif.vif_node, &mr_vif->route_ivif_list);
> +}
> +
> +static void mlxsw_sp_mr_route_ivif_unlink(struct mlxsw_sp_mr_route *mr_route)
> +{
> +	list_del(&mr_route->ivif.vif_node);
> +}
> +
> +static int
> +mlxsw_sp_mr_route_info_create(struct mlxsw_sp_mr_table *mr_table,
> +			      struct mlxsw_sp_mr_route *mr_route,
> +			      struct mlxsw_sp_mr_route_info *route_info)
> +{
> +	struct mlxsw_sp_mr_route_vif_entry *rve;
> +	u16 *erif_indices;
> +	u16 irif_index;
> +	u16 erif = 0;
> +
> +	erif_indices = kmalloc_array(MAXVIFS, sizeof(*erif_indices),
> +				     GFP_KERNEL);
> +	if (!erif_indices)
> +		return -ENOMEM;
> +
> +	list_for_each_entry(rve, &mr_route->evif_list, route_node) {
> +		if (mlxsw_sp_mr_vif_valid(rve->mr_vif)) {
> +			u16 rifi = mlxsw_sp_rif_index(rve->mr_vif->rif);
> +
> +			erif_indices[erif++] = rifi;
> +		}
> +	}
> +
> +	if (mlxsw_sp_mr_vif_valid(mr_route->ivif.mr_vif))
> +		irif_index = mlxsw_sp_rif_index(mr_route->ivif.mr_vif->rif);
> +	else
> +		irif_index = 0;
> +
> +	route_info->irif_index = irif_index;
> +	route_info->erif_indices = erif_indices;
> +	route_info->min_mtu = mr_route->min_mtu;
> +	route_info->route_action = mr_route->route_action;
> +	route_info->erif_num = erif;
> +	return 0;
> +}
> +
> +static void
> +mlxsw_sp_mr_route_info_destroy(struct mlxsw_sp_mr_route_info *route_info)
> +{
> +	kfree(route_info->erif_indices);
> +}
> +
> +static int mlxsw_sp_mr_route_write(struct mlxsw_sp_mr_table *mr_table,
> +				   struct mlxsw_sp_mr_route *mr_route,
> +				   bool replace)
> +{
> +	struct mlxsw_sp *mlxsw_sp = mr_table->mlxsw_sp;
> +	struct mlxsw_sp_mr_route_info route_info;
> +	struct mlxsw_sp_mr *mr = mlxsw_sp->mr;
> +	int err;
> +
> +	err = mlxsw_sp_mr_route_info_create(mr_table, mr_route, &route_info);
> +	if (err)
> +		return err;
> +
> +	if (!replace) {
> +		struct mlxsw_sp_mr_route_params route_params;
> +
> +		mr_route->route_priv = kzalloc(mr->mr_ops->route_priv_size,
> +					       GFP_KERNEL);
> +		if (!mr_route->route_priv) {
> +			err = -ENOMEM;
> +			goto out;
> +		}
> +
> +		route_params.key = mr_route->key;
> +		route_params.value = route_info;
> +		route_params.prio = mlxsw_sp_mr_route_prio(mr_route);
> +		err = mr->mr_ops->route_create(mlxsw_sp, mr->priv,
> +					       mr_route->route_priv,
> +					       &route_params);
> +		if (err)
> +			kfree(mr_route->route_priv);
> +	} else {
> +		err = mr->mr_ops->route_update(mlxsw_sp, mr_route->route_priv,
> +					       &route_info);
> +	}
> +out:
> +	mlxsw_sp_mr_route_info_destroy(&route_info);
> +	return err;
> +}
> +
> +static void mlxsw_sp_mr_route_erase(struct mlxsw_sp_mr_table *mr_table,
> +				    struct mlxsw_sp_mr_route *mr_route)
> +{
> +	struct mlxsw_sp *mlxsw_sp = mr_table->mlxsw_sp;
> +	struct mlxsw_sp_mr *mr = mlxsw_sp->mr;
> +
> +	mr->mr_ops->route_destroy(mlxsw_sp, mr->priv, mr_route->route_priv);
> +	kfree(mr_route->route_priv);
> +}
> +
> +static struct mlxsw_sp_mr_route *
> +mlxsw_sp_mr_route4_create(struct mlxsw_sp_mr_table *mr_table,
> +			  struct mfc_cache *mfc)
> +{
> +	struct mlxsw_sp_mr_route_vif_entry *rve, *tmp;
> +	struct mlxsw_sp_mr_route *mr_route;
> +	int err;
> +	int i;
> +
> +	/* Allocate and init a new route and fill it with parameters */
> +	mr_route = kzalloc(sizeof(*mr_table), GFP_KERNEL);
> +	if (!mr_route)
> +		return ERR_PTR(-ENOMEM);
> +	INIT_LIST_HEAD(&mr_route->evif_list);
> +	mlxsw_sp_mr_route4_key(mr_table, &mr_route->key, mfc);
> +
> +	/* Find min_mtu and link iVIF and eVIFs */
> +	mr_route->min_mtu = ETH_MAX_MTU;
> +	ipmr_cache_hold(mfc);
> +	mr_route->mfc4 = mfc;
> +	mr_route->mr_table = mr_table;
> +	for (i = 0; i < MAXVIFS; i++) {
> +		if (mfc->mfc_un.res.ttls[i] != 255) {
> +			err = mlxsw_sp_mr_route_evif_link(mr_route,
> +							  &mr_table->vifs[i]);
> +			if (err)
> +				goto err;
> +			if (mr_table->vifs[i].dev &&
> +			    mr_table->vifs[i].dev->mtu < mr_route->min_mtu)
> +				mr_route->min_mtu = mr_table->vifs[i].dev->mtu;
> +		}
> +	}
> +	mlxsw_sp_mr_route_ivif_link(mr_route, &mr_table->vifs[mfc->mfc_parent]);
> +	if (err)
> +		goto err;
> +
> +	mr_route->route_action = mlxsw_sp_mr_route_action(mr_route);
> +	return mr_route;
> +err:
> +	ipmr_cache_put(mfc);
> +	list_for_each_entry_safe(rve, tmp, &mr_route->evif_list, route_node)
> +		mlxsw_sp_mr_route_evif_unlink(rve);
> +	kfree(mr_route);
> +	return ERR_PTR(err);
> +}
> +
> +static void mlxsw_sp_mr_route4_destroy(struct mlxsw_sp_mr_table *mr_table,
> +				       struct mlxsw_sp_mr_route *mr_route)
> +{
> +	struct mlxsw_sp_mr_route_vif_entry *rve, *tmp;
> +
> +	mlxsw_sp_mr_route_ivif_unlink(mr_route);
> +	ipmr_cache_put(mr_route->mfc4);
> +	list_for_each_entry_safe(rve, tmp, &mr_route->evif_list, route_node)
> +		mlxsw_sp_mr_route_evif_unlink(rve);
> +	kfree(mr_route);
> +}
> +
> +static void mlxsw_sp_mr_route_destroy(struct mlxsw_sp_mr_table *mr_table,
> +				      struct mlxsw_sp_mr_route *mr_route)
> +{
> +	switch (mr_table->proto) {
> +	case MLXSW_SP_L3_PROTO_IPV4:
> +		mlxsw_sp_mr_route4_destroy(mr_table, mr_route);
> +		break;
> +	case MLXSW_SP_L3_PROTO_IPV6:
> +		/* fall through */
> +	default:
> +		WARN_ON_ONCE(1);
> +	}
> +}
> +
> +static void mlxsw_sp_mr_mfc_offload_set(struct mlxsw_sp_mr_route *mr_route,
> +					bool offload)
> +{
> +	switch (mr_route->mr_table->proto) {
> +	case MLXSW_SP_L3_PROTO_IPV4:
> +		if (offload)
> +			mr_route->mfc4->mfc_flags |= MFC_OFFLOAD;
> +		else
> +			mr_route->mfc4->mfc_flags &= ~MFC_OFFLOAD;
> +		break;
> +	case MLXSW_SP_L3_PROTO_IPV6:
> +		/* fall through */
> +	default:
> +		WARN_ON_ONCE(1);
> +	}
> +}
> +
> +static void mlxsw_sp_mr_mfc_offload_update(struct mlxsw_sp_mr_route *mr_route)
> +{
> +	bool offload;
> +
> +	offload = mr_route->route_action != MLXSW_SP_MR_ROUTE_ACTION_TRAP;
> +	mlxsw_sp_mr_mfc_offload_set(mr_route, offload);
> +}
> +
> +static void __mlxsw_sp_mr_route_del(struct mlxsw_sp_mr_table *mr_table,
> +				    struct mlxsw_sp_mr_route *mr_route)
> +{
> +	mlxsw_sp_mr_mfc_offload_set(mr_route, false);
> +	mlxsw_sp_mr_route_erase(mr_table, mr_route);
> +	rhashtable_remove_fast(&mr_table->route_ht, &mr_route->ht_node,
> +			       mlxsw_sp_mr_route_ht_params);
> +	list_del(&mr_route->node);
> +	mlxsw_sp_mr_route_destroy(mr_table, mr_route);
> +}
> +
> +int mlxsw_sp_mr_route4_add(struct mlxsw_sp_mr_table *mr_table,
> +			   struct mfc_cache *mfc, bool replace)
> +{
> +	struct mlxsw_sp_mr_route *mr_orig_route = NULL;
> +	struct mlxsw_sp_mr_route *mr_route;
> +	int err;
> +
> +	/* If the route is a (*,*) route, abort, as these kind of routes are
> +	 * used for proxy routes.
> +	 */
> +	if (mfc->mfc_origin == INADDR_ANY && mfc->mfc_mcastgrp == INADDR_ANY) {
> +		dev_warn(mr_table->mlxsw_sp->bus_info->dev,
> +			 "Offloading proxy routes is not supported.\n");

You are return err, why not use dev_err?

> +		return -EINVAL;
> +	}
> +
> +	/* Create a new route */
> +	mr_route = mlxsw_sp_mr_route4_create(mr_table, mfc);
> +	if (IS_ERR(mr_route))
> +		return PTR_ERR(mr_route);
> +
> +	/* Find any route with a matching key */
> +	mr_orig_route = rhashtable_lookup_fast(&mr_table->route_ht,
> +					       &mr_route->key,
> +					       mlxsw_sp_mr_route_ht_params);
> +	if (replace) {
> +		/* On replace case, make the route point to the new route_priv.
> +		 */
> +		if (WARN_ON(!mr_orig_route)) {
> +			err = -ENOENT;
> +			goto err_no_orig_route;
> +		}
> +		mr_route->route_priv = mr_orig_route->route_priv;
> +	} else if (mr_orig_route) {
> +		/* On non replace case, if another route with the same key was
> +		 * found, abort, as duplicate routes are used for proxy routes.
> +		 */
> +		dev_warn(mr_table->mlxsw_sp->bus_info->dev,
> +			 "Offloading proxy routes is not supported.\n");

Same as here.

> +		err = -EINVAL;
> +		goto err_duplicate_route;
> +	}
> +
> +	/* Put it in the table data-structures */
> +	list_add_tail(&mr_route->node, &mr_table->route_list);
> +	err = rhashtable_insert_fast(&mr_table->route_ht,
> +				     &mr_route->ht_node,
> +				     mlxsw_sp_mr_route_ht_params);
> +	if (err)
> +		goto err_rhashtable_insert;
> +
> +	/* Write the route to the hardware */
> +	err = mlxsw_sp_mr_route_write(mr_table, mr_route, replace);
> +	if (err)
> +		goto err_mr_route_write;
> +
> +	/* Destroy the original route */
> +	if (replace) {
> +		rhashtable_remove_fast(&mr_table->route_ht,
> +				       &mr_orig_route->ht_node,
> +				       mlxsw_sp_mr_route_ht_params);
> +		list_del(&mr_orig_route->node);
> +		mlxsw_sp_mr_route4_destroy(mr_table, mr_orig_route);
> +	}
> +
> +	mlxsw_sp_mr_mfc_offload_update(mr_route);
> +	return 0;
> +
> +err_mr_route_write:
> +	rhashtable_remove_fast(&mr_table->route_ht, &mr_route->ht_node,
> +			       mlxsw_sp_mr_route_ht_params);
> +err_rhashtable_insert:
> +	list_del(&mr_route->node);
> +err_no_orig_route:
> +err_duplicate_route:
> +	mlxsw_sp_mr_route4_destroy(mr_table, mr_route);
> +	return err;
> +}
> +
> +void mlxsw_sp_mr_route4_del(struct mlxsw_sp_mr_table *mr_table,
> +			    struct mfc_cache *mfc)
> +{
> +	struct mlxsw_sp_mr_route *mr_route;
> +	struct mlxsw_sp_mr_route_key key;
> +
> +	mlxsw_sp_mr_route4_key(mr_table, &key, mfc);
> +	mr_route = rhashtable_lookup_fast(&mr_table->route_ht, &key,
> +					  mlxsw_sp_mr_route_ht_params);
> +	if (mr_route)
> +		__mlxsw_sp_mr_route_del(mr_table, mr_route);
> +}
> +
> +/* Should be called after the VIF struct is updated */
> +static int
> +mlxsw_sp_mr_route_ivif_resolve(struct mlxsw_sp_mr_table *mr_table,
> +			       struct mlxsw_sp_mr_route_vif_entry *rve)
> +{
> +	struct mlxsw_sp *mlxsw_sp = mr_table->mlxsw_sp;
> +	enum mlxsw_sp_mr_route_action route_action;
> +	struct mlxsw_sp_mr *mr = mlxsw_sp->mr;
> +	u16 irif_index;
> +	int err;
> +
> +	route_action = mlxsw_sp_mr_route_action(rve->mr_route);
> +	if (route_action == MLXSW_SP_MR_ROUTE_ACTION_TRAP)
> +		return 0;
> +
> +	/* rve->mr_vif->rif is guaranteed to be valid at this stage */
> +	irif_index = mlxsw_sp_rif_index(rve->mr_vif->rif);
> +	err = mr->mr_ops->route_irif_update(mlxsw_sp, rve->mr_route->route_priv,
> +					    irif_index);
> +	if (err)
> +		return err;
> +
> +	err = mr->mr_ops->route_action_update(mlxsw_sp,
> +					      rve->mr_route->route_priv,
> +					      route_action);
> +	if (err)
> +		/* No need to rollback here because the iRIF change only takes
> +		 * place after the action has been updated.
> +		 */
> +		return err;
> +
> +	rve->mr_route->route_action = route_action;
> +	mlxsw_sp_mr_mfc_offload_update(rve->mr_route);
> +	return 0;
> +}
> +
> +static void
> +mlxsw_sp_mr_route_ivif_unresolve(struct mlxsw_sp_mr_table *mr_table,
> +				 struct mlxsw_sp_mr_route_vif_entry *rve)
> +{
> +	struct mlxsw_sp *mlxsw_sp = mr_table->mlxsw_sp;
> +	struct mlxsw_sp_mr *mr = mlxsw_sp->mr;
> +
> +	mr->mr_ops->route_action_update(mlxsw_sp, rve->mr_route->route_priv,
> +					MLXSW_SP_MR_ROUTE_ACTION_TRAP);
> +	rve->mr_route->route_action = MLXSW_SP_MR_ROUTE_ACTION_TRAP;
> +	mlxsw_sp_mr_mfc_offload_update(rve->mr_route);
> +}
> +
> +/* Should be called after the RIF struct is updated */
> +static int
> +mlxsw_sp_mr_route_evif_resolve(struct mlxsw_sp_mr_table *mr_table,
> +			       struct mlxsw_sp_mr_route_vif_entry *rve)
> +{
> +	struct mlxsw_sp *mlxsw_sp = mr_table->mlxsw_sp;
> +	enum mlxsw_sp_mr_route_action route_action;
> +	struct mlxsw_sp_mr *mr = mlxsw_sp->mr;
> +	u16 erif_index = 0;
> +	int err;
> +
> +	/* Update the route action, as the new eVIF can be a tunnel or a pimreg
> +	 * device which will require updating the action.
> +	 */
> +	route_action = mlxsw_sp_mr_route_action(rve->mr_route);
> +	if (route_action != rve->mr_route->route_action) {
> +		err = mr->mr_ops->route_action_update(mlxsw_sp,
> +						      rve->mr_route->route_priv,
> +						      route_action);
> +		if (err)
> +			return err;
> +	}
> +
> +	/* Add the eRIF */
> +	if (mlxsw_sp_mr_vif_valid(rve->mr_vif)) {
> +		erif_index = mlxsw_sp_rif_index(rve->mr_vif->rif);
> +		err = mr->mr_ops->route_erif_add(mlxsw_sp,
> +						 rve->mr_route->route_priv,
> +						 erif_index);
> +		if (err)
> +			goto err_route_erif_add;
> +	}
> +
> +	/* Update the minimum MTU */
> +	if (rve->mr_vif->dev->mtu < rve->mr_route->min_mtu) {
> +		rve->mr_route->min_mtu = rve->mr_vif->dev->mtu;
> +		err = mr->mr_ops->route_min_mtu_update(mlxsw_sp,
> +						       rve->mr_route->route_priv,
> +						       rve->mr_route->min_mtu);
> +		if (err)
> +			goto err_route_min_mtu_update;
> +	}
> +
> +	rve->mr_route->route_action = route_action;
> +	mlxsw_sp_mr_mfc_offload_update(rve->mr_route);
> +	return 0;
> +
> +err_route_min_mtu_update:
> +	if (mlxsw_sp_mr_vif_valid(rve->mr_vif))
> +		mr->mr_ops->route_erif_del(mlxsw_sp, rve->mr_route->route_priv,
> +					   erif_index);
> +err_route_erif_add:
> +	if (route_action != rve->mr_route->route_action)
> +		mr->mr_ops->route_action_update(mlxsw_sp,
> +						rve->mr_route->route_priv,
> +						rve->mr_route->route_action);
> +	return err;
> +}
> +
> +/* Should be called before the RIF struct is updated */
> +static void
> +mlxsw_sp_mr_route_evif_unresolve(struct mlxsw_sp_mr_table *mr_table,
> +				 struct mlxsw_sp_mr_route_vif_entry *rve)
> +{
> +	struct mlxsw_sp *mlxsw_sp = mr_table->mlxsw_sp;
> +	enum mlxsw_sp_mr_route_action route_action;
> +	struct mlxsw_sp_mr *mr = mlxsw_sp->mr;
> +	u16 rifi;
> +
> +	/* If the unresolved RIF was not valid, no need to delete it */
> +	if (!mlxsw_sp_mr_vif_valid(rve->mr_vif))
> +		return;
> +
> +	/* Update the route action: if there is only one valid eVIF in the
> +	 * route, set the action to trap as the VIF deletion will lead to zero
> +	 * valid eVIFs. On any other case, use the mlxsw_sp_mr_route_action to
> +	 * determine the route action.
> +	 */
> +	if (mlxsw_sp_mr_route_valid_evifs_num(rve->mr_route) == 1)
> +		route_action = MLXSW_SP_MR_ROUTE_ACTION_TRAP;
> +	else
> +		route_action = mlxsw_sp_mr_route_action(rve->mr_route);
> +	if (route_action != rve->mr_route->route_action)
> +		mr->mr_ops->route_action_update(mlxsw_sp,
> +						rve->mr_route->route_priv,
> +						route_action);
> +
> +	/* Delete the erif from the route */
> +	rifi = mlxsw_sp_rif_index(rve->mr_vif->rif);
> +	mr->mr_ops->route_erif_del(mlxsw_sp, rve->mr_route->route_priv, rifi);
> +	rve->mr_route->route_action = route_action;
> +	mlxsw_sp_mr_mfc_offload_update(rve->mr_route);
> +}
> +
> +static int mlxsw_sp_mr_vif_resolve(struct mlxsw_sp_mr_table *mr_table,
> +				   struct net_device *dev,
> +				   struct mlxsw_sp_mr_vif *mr_vif,
> +				   unsigned long vif_flags,
> +				   const struct mlxsw_sp_rif *rif)
> +{
> +	struct mlxsw_sp_mr_route_vif_entry *irve, *erve;
> +	int err;
> +
> +	/* Update the VIF */
> +	mr_vif->dev = dev;
> +	mr_vif->rif = rif;
> +	mr_vif->vif_flags = vif_flags;
> +
> +	/* Update all routes where this VIF is used as an unresolved iRIF */
> +	list_for_each_entry(irve, &mr_vif->route_ivif_list, vif_node) {
> +		err = mlxsw_sp_mr_route_ivif_resolve(mr_table, irve);
> +		if (err)
> +			goto err_irif_unresolve;
> +	}
> +
> +	/* Update all routes where this VIF is used as an unresolved eRIF */
> +	list_for_each_entry(erve, &mr_vif->route_evif_list, vif_node) {
> +		err = mlxsw_sp_mr_route_evif_resolve(mr_table, erve);
> +		if (err)
> +			goto err_erif_unresolve;
> +	}
> +	return 0;
> +
> +err_erif_unresolve:
> +	list_for_each_entry_from_reverse(erve, &mr_vif->route_evif_list,
> +					 vif_node)
> +		mlxsw_sp_mr_route_evif_unresolve(mr_table, erve);
> +err_irif_unresolve:
> +	list_for_each_entry_from_reverse(irve, &mr_vif->route_ivif_list,
> +					 vif_node)
> +		mlxsw_sp_mr_route_ivif_unresolve(mr_table, irve);
> +	mr_vif->rif = NULL;
> +	return err;
> +}
> +
> +static void mlxsw_sp_mr_vif_unresolve(struct mlxsw_sp_mr_table *mr_table,
> +				      struct net_device *dev,
> +				      struct mlxsw_sp_mr_vif *mr_vif)
> +{
> +	struct mlxsw_sp_mr_route_vif_entry *rve;
> +
> +	/* Update all routes where this VIF is used as an unresolved eRIF */
> +	list_for_each_entry(rve, &mr_vif->route_evif_list, vif_node)
> +		mlxsw_sp_mr_route_evif_unresolve(mr_table, rve);
> +
> +	/* Update all routes where this VIF is used as an unresolved iRIF */
> +	list_for_each_entry(rve, &mr_vif->route_ivif_list, vif_node)
> +		mlxsw_sp_mr_route_ivif_unresolve(mr_table, rve);
> +
> +	/* Update the VIF */
> +	mr_vif->dev = dev;
> +	mr_vif->rif = NULL;
> +}
> +
> +int mlxsw_sp_mr_vif_add(struct mlxsw_sp_mr_table *mr_table,
> +			struct net_device *dev, vifi_t vif_index,
> +			unsigned long vif_flags, const struct mlxsw_sp_rif *rif)
> +{
> +	struct mlxsw_sp_mr_vif *mr_vif = &mr_table->vifs[vif_index];
> +
> +	if (WARN_ON(vif_index >= MAXVIFS))
> +		return -EINVAL;
> +	if (mr_vif->dev)
> +		return -EEXIST;

-ENODEV?

> +	return mlxsw_sp_mr_vif_resolve(mr_table, dev, mr_vif, vif_flags, rif);
> +}
> +
> +void mlxsw_sp_mr_vif_del(struct mlxsw_sp_mr_table *mr_table, vifi_t vif_index)
> +{
> +	struct mlxsw_sp_mr_vif *mr_vif = &mr_table->vifs[vif_index];
> +
> +	if (WARN_ON(vif_index >= MAXVIFS))
> +		return;
> +	if (WARN_ON(!mr_vif->dev))
> +		return;
> +	mlxsw_sp_mr_vif_unresolve(mr_table, NULL, mr_vif);
> +}
> +
> +struct mlxsw_sp_mr_vif *
> +mlxsw_sp_mr_dev_vif_lookup(struct mlxsw_sp_mr_table *mr_table,
> +			   const struct net_device *dev)
> +{
> +	vifi_t vif_index;
> +
> +	for (vif_index = 0; vif_index < MAXVIFS; vif_index++)
> +		if (mr_table->vifs[vif_index].dev == dev)
> +			return &mr_table->vifs[vif_index];
> +	return NULL;
> +}
> +
> +int mlxsw_sp_mr_rif_add(struct mlxsw_sp_mr_table *mr_table,
> +			const struct mlxsw_sp_rif *rif)
> +{
> +	const struct net_device *rif_dev = mlxsw_sp_rif_dev(rif);
> +	struct mlxsw_sp_mr_vif *mr_vif;
> +
> +	if (!rif_dev)
> +		return 0;
> +
> +	mr_vif = mlxsw_sp_mr_dev_vif_lookup(mr_table, rif_dev);
> +	if (!mr_vif)
> +		return 0;
> +	return mlxsw_sp_mr_vif_resolve(mr_table, mr_vif->dev, mr_vif,
> +				       mr_vif->vif_flags, rif);
> +}
> +
> +void mlxsw_sp_mr_rif_del(struct mlxsw_sp_mr_table *mr_table,
> +			 const struct mlxsw_sp_rif *rif)
> +{
> +	const struct net_device *rif_dev = mlxsw_sp_rif_dev(rif);
> +	struct mlxsw_sp_mr_vif *mr_vif;
> +
> +	if (!rif_dev)
> +		return;
> +
> +	mr_vif = mlxsw_sp_mr_dev_vif_lookup(mr_table, rif_dev);
> +	if (!mr_vif)
> +		return;
> +	mlxsw_sp_mr_vif_unresolve(mr_table, mr_vif->dev, mr_vif);
> +}
> +
> +void mlxsw_sp_mr_rif_mtu_update(struct mlxsw_sp_mr_table *mr_table,
> +				const struct mlxsw_sp_rif *rif, int mtu)
> +{
> +	const struct net_device *rif_dev = mlxsw_sp_rif_dev(rif);
> +	struct mlxsw_sp *mlxsw_sp = mr_table->mlxsw_sp;
> +	struct mlxsw_sp_mr_route_vif_entry *rve;
> +	struct mlxsw_sp_mr *mr = mlxsw_sp->mr;
> +	struct mlxsw_sp_mr_vif *mr_vif;
> +
> +	if (!rif_dev)
> +		return;
> +
> +	/* Search for a VIF that use that RIF */
> +	mr_vif = mlxsw_sp_mr_dev_vif_lookup(mr_table, rif_dev);
> +	if (!mr_vif)
> +		return;
> +
> +	/* Update all the routes that uses that VIF as eVIF */
> +	list_for_each_entry(rve, &mr_vif->route_evif_list, vif_node) {
> +		if (mtu < rve->mr_route->min_mtu) {
> +			rve->mr_route->min_mtu = mtu;
> +			mr->mr_ops->route_min_mtu_update(mlxsw_sp,
> +							 rve->mr_route->route_priv,
> +							 mtu);
> +		}
> +	}
> +}
> +
> +struct mlxsw_sp_mr_table *mlxsw_sp_mr_table_create(struct mlxsw_sp *mlxsw_sp,
> +						   u32 vr_id,
> +						   enum mlxsw_sp_l3proto proto)
> +{
> +	struct mlxsw_sp_mr_route_params catchall_route_params = {
> +		.prio = MLXSW_SP_MR_ROUTE_PRIO_CATCHALL,
> +		.key = {
> +			.vrid = vr_id,
> +		},
> +		.value = {
> +			.route_action = MLXSW_SP_MR_ROUTE_ACTION_TRAP,
> +		}
> +	};
> +	struct mlxsw_sp_mr *mr = mlxsw_sp->mr;
> +	struct mlxsw_sp_mr_table *mr_table;
> +	int err;
> +	int i;
> +
> +	mr_table = kzalloc(sizeof(*mr_table) + mr->mr_ops->route_priv_size,
> +			   GFP_KERNEL);
> +	if (!mr_table)
> +		return ERR_PTR(-ENOMEM);
> +
> +	mr_table->vr_id = vr_id;
> +	mr_table->mlxsw_sp = mlxsw_sp;
> +	mr_table->proto = proto;
> +	INIT_LIST_HEAD(&mr_table->route_list);
> +
> +	err = rhashtable_init(&mr_table->route_ht,
> +			      &mlxsw_sp_mr_route_ht_params);
> +	if (err)
> +		goto err_route_rhashtable_init;
> +
> +	for (i = 0; i < MAXVIFS; i++) {
> +		INIT_LIST_HEAD(&mr_table->vifs[i].route_evif_list);
> +		INIT_LIST_HEAD(&mr_table->vifs[i].route_ivif_list);
> +	}
> +
> +	err = mr->mr_ops->route_create(mlxsw_sp, mr->priv,
> +				       mr_table->catchall_route_priv,
> +				       &catchall_route_params);
> +	if (err)
> +		goto err_ops_route_create;
> +	list_add_tail(&mr_table->node, &mr->table_list);
> +	return mr_table;
> +
> +err_ops_route_create:
> +	rhashtable_destroy(&mr_table->route_ht);
> +err_route_rhashtable_init:
> +	kfree(mr_table);
> +	return ERR_PTR(err);
> +}
> +
> +void mlxsw_sp_mr_table_destroy(struct mlxsw_sp_mr_table *mr_table)
> +{
> +	struct mlxsw_sp *mlxsw_sp = mr_table->mlxsw_sp;
> +	struct mlxsw_sp_mr *mr = mlxsw_sp->mr;
> +
> +	WARN_ON(!mlxsw_sp_mr_table_empty(mr_table));
> +	list_del(&mr_table->node);
> +	mr->mr_ops->route_destroy(mlxsw_sp, mr->priv,
> +				  &mr_table->catchall_route_priv);
> +	rhashtable_destroy(&mr_table->route_ht);
> +	kfree(mr_table);
> +}
> +
> +void mlxsw_sp_mr_table_flush(struct mlxsw_sp_mr_table *mr_table)
> +{
> +	struct mlxsw_sp_mr_route *mr_route, *tmp;
> +	int i;
> +
> +	list_for_each_entry_safe(mr_route, tmp, &mr_table->route_list, node)
> +		__mlxsw_sp_mr_route_del(mr_table, mr_route);
> +
> +	for (i = 0; i < MAXVIFS; i++) {
> +		mr_table->vifs[i].dev = NULL;
> +		mr_table->vifs[i].rif = NULL;
> +	}
> +}
> +
> +bool mlxsw_sp_mr_table_empty(const struct mlxsw_sp_mr_table *mr_table)
> +{
> +	int i;
> +
> +	for (i = 0; i < MAXVIFS; i++)
> +		if (mr_table->vifs[i].dev)
> +			return false;
> +	return list_empty(&mr_table->route_list);
> +}
> +
> +static void mlxsw_sp_mr_route_stats_update(struct mlxsw_sp *mlxsw_sp,
> +					   struct mlxsw_sp_mr_route *mr_route)
> +{
> +	struct mlxsw_sp_mr *mr = mlxsw_sp->mr;
> +	u64 packets, bytes;
> +
> +	if (mr_route->route_action == MLXSW_SP_MR_ROUTE_ACTION_TRAP)
> +		return;
> +
> +	mr->mr_ops->route_stats(mlxsw_sp, mr_route->route_priv, &packets,
> +				&bytes);
> +
> +	switch (mr_route->mr_table->proto) {
> +	case MLXSW_SP_L3_PROTO_IPV4:
> +		if (mr_route->mfc4->mfc_un.res.pkt != packets)
> +			mr_route->mfc4->mfc_un.res.lastuse = jiffies;
> +		mr_route->mfc4->mfc_un.res.pkt = packets;
> +		mr_route->mfc4->mfc_un.res.bytes = bytes;
> +		break;
> +	case MLXSW_SP_L3_PROTO_IPV6:
> +		/* fall through */
> +	default:
> +		WARN_ON_ONCE(1);
> +	}
> +}
> +
> +static void mlxsw_sp_mr_stats_update(struct work_struct *work)
> +{
> +	struct mlxsw_sp_mr *mr = container_of(work, struct mlxsw_sp_mr,
> +					      stats_update_dw.work);
> +	struct mlxsw_sp_mr_table *mr_table;
> +	struct mlxsw_sp_mr_route *mr_route;
> +	unsigned long interval;
> +
> +	rtnl_lock();
> +	list_for_each_entry(mr_table, &mr->table_list, node)
> +		list_for_each_entry(mr_route, &mr_table->route_list, node)
> +			mlxsw_sp_mr_route_stats_update(mr_table->mlxsw_sp,
> +						       mr_route);
> +	rtnl_unlock();
> +
> +	interval = msecs_to_jiffies(MLXSW_SP_MR_ROUTES_COUNTER_UPDATE_INTERVAL);
> +	mlxsw_core_schedule_dw(&mr->stats_update_dw, interval);
> +}
> +
> +int mlxsw_sp_mr_init(struct mlxsw_sp *mlxsw_sp,
> +		     const struct mlxsw_sp_mr_ops *mr_ops)
> +{
> +	struct mlxsw_sp_mr *mr;
> +	unsigned long interval;
> +	int err;
> +
> +	mr = kzalloc(sizeof(*mr) + mr_ops->priv_size, GFP_KERNEL);
> +	if (!mr)
> +		return -ENOMEM;
> +	mr->mr_ops = mr_ops;
> +	mlxsw_sp->mr = mr;
> +	INIT_LIST_HEAD(&mr->table_list);
> +
> +	err = mr_ops->init(mlxsw_sp, mr->priv);
> +	if (err)
> +		goto err;
> +
> +	/* Create the delayed work for counter updates */
> +	INIT_DELAYED_WORK(&mr->stats_update_dw, mlxsw_sp_mr_stats_update);
> +	interval = msecs_to_jiffies(MLXSW_SP_MR_ROUTES_COUNTER_UPDATE_INTERVAL);
> +	mlxsw_core_schedule_dw(&mr->stats_update_dw, interval);
> +	return 0;
> +err:
> +	kfree(mr);
> +	return err;
> +}
> +
> +void mlxsw_sp_mr_fini(struct mlxsw_sp *mlxsw_sp)
> +{
> +	struct mlxsw_sp_mr *mr = mlxsw_sp->mr;
> +
> +	cancel_delayed_work_sync(&mr->stats_update_dw);
> +	mr->mr_ops->fini(mr->priv);
> +	kfree(mr);
> +}
> diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.h
> new file mode 100644
> index 0000000..c851b23
> --- /dev/null
> +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.h
> @@ -0,0 +1,133 @@
> +/*
> + * drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.h
> + * Copyright (c) 2017 Mellanox Technologies. All rights reserved.
> + * Copyright (c) 2017 Yotam Gigi <yotamg@mellanox.com>
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions are met:
> + *
> + * 1. Redistributions of source code must retain the above copyright
> + *    notice, this list of conditions and the following disclaimer.
> + * 2. Redistributions in binary form must reproduce the above copyright
> + *    notice, this list of conditions and the following disclaimer in the
> + *    documentation and/or other materials provided with the distribution.
> + * 3. Neither the names of the copyright holders nor the names of its
> + *    contributors may be used to endorse or promote products derived from
> + *    this software without specific prior written permission.
> + *
> + * Alternatively, this software may be distributed under the terms of the
> + * GNU General Public License ("GPL") version 2 as published by the Free
> + * Software Foundation.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
> + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
> + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
> + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
> + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
> + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
> + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
> + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
> + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
> + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
> + * POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#ifndef _MLXSW_SPECTRUM_MCROUTER_H
> +#define _MLXSW_SPECTRUM_MCROUTER_H
> +
> +#include <linux/mroute.h>
> +#include "spectrum_router.h"
> +#include "spectrum.h"
> +
> +enum mlxsw_sp_mr_route_action {
> +	MLXSW_SP_MR_ROUTE_ACTION_FORWARD,
> +	MLXSW_SP_MR_ROUTE_ACTION_TRAP,
> +};
> +
> +enum mlxsw_sp_mr_route_prio {
> +	MLXSW_SP_MR_ROUTE_PRIO_SG,
> +	MLXSW_SP_MR_ROUTE_PRIO_STARG,
> +	MLXSW_SP_MR_ROUTE_PRIO_CATCHALL,
> +	__MLXSW_SP_MR_ROUTE_PRIO_MAX
> +};
> +
> +#define MLXSW_SP_MR_ROUTE_PRIO_MAX (__MLXSW_SP_MR_ROUTE_PRIO_MAX - 1)
> +
> +struct mlxsw_sp_mr_route_key {
> +	int vrid;
> +	enum mlxsw_sp_l3proto proto;
> +	union mlxsw_sp_l3addr group;
> +	union mlxsw_sp_l3addr group_mask;
> +	union mlxsw_sp_l3addr source;
> +	union mlxsw_sp_l3addr source_mask;
> +};
> +
> +struct mlxsw_sp_mr_route_info {
> +	enum mlxsw_sp_mr_route_action route_action;
> +	u16 irif_index;
> +	u16 *erif_indices;
> +	size_t erif_num;
> +	u16 min_mtu;
> +};
> +
> +struct mlxsw_sp_mr_route_params {
> +	struct mlxsw_sp_mr_route_key key;
> +	struct mlxsw_sp_mr_route_info value;
> +	enum mlxsw_sp_mr_route_prio prio;
> +};
> +
> +struct mlxsw_sp_mr_ops {
> +	int priv_size;
> +	int route_priv_size;
> +	int (*init)(struct mlxsw_sp *mlxsw_sp, void *priv);
> +	int (*route_create)(struct mlxsw_sp *mlxsw_sp, void *priv,
> +			    void *route_priv,
> +			    struct mlxsw_sp_mr_route_params *route_params);
> +	int (*route_update)(struct mlxsw_sp *mlxsw_sp, void *route_priv,
> +			    struct mlxsw_sp_mr_route_info *route_info);
> +	int (*route_stats)(struct mlxsw_sp *mlxsw_sp, void *route_priv,
> +			   u64 *packets, u64 *bytes);
> +	int (*route_action_update)(struct mlxsw_sp *mlxsw_sp, void *route_priv,
> +				   enum mlxsw_sp_mr_route_action route_action);
> +	int (*route_min_mtu_update)(struct mlxsw_sp *mlxsw_sp, void *route_priv,
> +				    u16 min_mtu);
> +	int (*route_irif_update)(struct mlxsw_sp *mlxsw_sp, void *route_priv,
> +				 u16 irif_index);
> +	int (*route_erif_add)(struct mlxsw_sp *mlxsw_sp, void *route_priv,
> +			      u16 erif_index);
> +	int (*route_erif_del)(struct mlxsw_sp *mlxsw_sp, void *route_priv,
> +			      u16 erif_index);
> +	void (*route_destroy)(struct mlxsw_sp *mlxsw_sp, void *priv,
> +			      void *route_priv);
> +	void (*fini)(void *priv);
> +};
> +
> +struct mlxsw_sp_mr;
> +struct mlxsw_sp_mr_table;
> +
> +int mlxsw_sp_mr_init(struct mlxsw_sp *mlxsw_sp,
> +		     const struct mlxsw_sp_mr_ops *mr_ops);
> +void mlxsw_sp_mr_fini(struct mlxsw_sp *mlxsw_sp);
> +int mlxsw_sp_mr_route4_add(struct mlxsw_sp_mr_table *mr_table,
> +			   struct mfc_cache *mfc, bool replace);
> +void mlxsw_sp_mr_route4_del(struct mlxsw_sp_mr_table *mr_table,
> +			    struct mfc_cache *mfc);
> +int mlxsw_sp_mr_vif_add(struct mlxsw_sp_mr_table *mr_table,
> +			struct net_device *dev, vifi_t vif_index,
> +			unsigned long vif_flags,
> +			const struct mlxsw_sp_rif *rif);
> +void mlxsw_sp_mr_vif_del(struct mlxsw_sp_mr_table *mr_table, vifi_t vif_index);
> +int mlxsw_sp_mr_rif_add(struct mlxsw_sp_mr_table *mr_table,
> +			const struct mlxsw_sp_rif *rif);
> +void mlxsw_sp_mr_rif_del(struct mlxsw_sp_mr_table *mr_table,
> +			 const struct mlxsw_sp_rif *rif);
> +void mlxsw_sp_mr_rif_mtu_update(struct mlxsw_sp_mr_table *mr_table,
> +				const struct mlxsw_sp_rif *rif, int mtu);
> +struct mlxsw_sp_mr_table *mlxsw_sp_mr_table_create(struct mlxsw_sp *mlxsw_sp,
> +						   u32 tb_id,
> +						   enum mlxsw_sp_l3proto proto);
> +void mlxsw_sp_mr_table_destroy(struct mlxsw_sp_mr_table *mr_table);
> +void mlxsw_sp_mr_table_flush(struct mlxsw_sp_mr_table *mr_table);
> +bool mlxsw_sp_mr_table_empty(const struct mlxsw_sp_mr_table *mr_table);
> +
> +#endif
> 

^ permalink raw reply

* Re: [patch net-next v2 06/12] net: mroute: Check if rule is a default rule
From: Yunsheng Lin @ 2017-09-25  1:28 UTC (permalink / raw)
  To: Jiri Pirko, netdev; +Cc: davem, yotamg, idosch, mlxsw, nikolay, andrew
In-Reply-To: <20170924172212.10096-7-jiri@resnulli.us>

Hi, Jiri

On 2017/9/25 1:22, Jiri Pirko wrote:
> From: Yotam Gigi <yotamg@mellanox.com>
> 
> When the ipmr starts, it adds one default FIB rule that matches all packets
> and sends them to the DEFAULT (multicast) FIB table. A more complex rule
> can be added by user to specify that for a specific interface, a packet
> should be look up at either an arbitrary table or according to the l3mdev
> of the interface.
> 
> For drivers willing to offload the ipmr logic into a hardware but don't
> want to offload all the FIB rules functionality, provide a function that
> can indicate whether the FIB rule is the default multicast rule, thus only
> one routing table is needed.
> 
> This way, a driver can register to the FIB notification chain, get
> notifications about FIB rules added and trigger some kind of an internal
> abort mechanism when a non default rule is added by the user.
> 
> Signed-off-by: Yotam Gigi <yotamg@mellanox.com>
> Reviewed-by: Ido Schimmel <idosch@mellanox.com>
> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
> ---
>  include/linux/mroute.h |  7 +++++++
>  net/ipv4/ipmr.c        | 10 ++++++++++
>  2 files changed, 17 insertions(+)
> 
> diff --git a/include/linux/mroute.h b/include/linux/mroute.h
> index 5566580..b072a84 100644
> --- a/include/linux/mroute.h
> +++ b/include/linux/mroute.h
> @@ -5,6 +5,7 @@
>  #include <linux/pim.h>
>  #include <linux/rhashtable.h>
>  #include <net/sock.h>
> +#include <net/fib_rules.h>
>  #include <net/fib_notifier.h>
>  #include <uapi/linux/mroute.h>
>  
> @@ -19,6 +20,7 @@ int ip_mroute_getsockopt(struct sock *, int, char __user *, int __user *);
>  int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg);
>  int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg);
>  int ip_mr_init(void);
> +bool ipmr_rule_default(const struct fib_rule *rule);
>  #else
>  static inline int ip_mroute_setsockopt(struct sock *sock, int optname,
>  				       char __user *optval, unsigned int optlen)
> @@ -46,6 +48,11 @@ static inline int ip_mroute_opt(int opt)
>  {
>  	return 0;
>  }
> +
> +static inline bool ipmr_rule_default(const struct fib_rule *rule)
> +{
> +	return true;
> +}
>  #endif
>  
>  struct vif_device {
> diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
> index 2a795d2..a714f55 100644
> --- a/net/ipv4/ipmr.c
> +++ b/net/ipv4/ipmr.c
> @@ -320,6 +320,16 @@ static unsigned int ipmr_rules_seq_read(struct net *net)
>  }
>  #endif
>  
> +bool ipmr_rule_default(const struct fib_rule *rule)
> +{
> +#if IS_ENABLED(CONFIG_FIB_RULES)
> +	return fib_rule_matchall(rule) && rule->table == RT_TABLE_DEFAULT;
> +#else
> +	return true;
> +#endif

In patch 02, You have the following, can you do the same for the above?
+#ifdef CONFIG_IP_MROUTE
+void ipmr_cache_free(struct mfc_cache *mfc_cache);
+#else
+static inline void ipmr_cache_free(struct mfc_cache *mfc_cache)
+{
+}
+#endif

> +}
> +EXPORT_SYMBOL(ipmr_rule_default);
> +
>  static inline int ipmr_hash_cmp(struct rhashtable_compare_arg *arg,
>  				const void *ptr)
>  {
> 

^ permalink raw reply

* Re: [patch net-next v2 03/12] ipmr: Add FIB notification access functions
From: Yunsheng Lin @ 2017-09-25  1:19 UTC (permalink / raw)
  To: Jiri Pirko, netdev; +Cc: davem, yotamg, idosch, mlxsw, nikolay, andrew
In-Reply-To: <20170924172212.10096-4-jiri@resnulli.us>

Hi, Jiri

On 2017/9/25 1:22, Jiri Pirko wrote:
> From: Yotam Gigi <yotamg@mellanox.com>
> 
> Make the ipmr module register as a FIB notifier. To do that, implement both
> the ipmr_seq_read and ipmr_dump ops.
> 
> The ipmr_seq_read op returns a sequence counter that is incremented on
> every notification related operation done by the ipmr. To implement that,
> add a sequence counter in the netns_ipv4 struct and increment it whenever a
> new MFC route or VIF are added or deleted. The sequence operations are
> protected by the RTNL lock.
> 
> The ipmr_dump iterates the list of MFC routes and the list of VIF entries
> and sends notifications about them. The entries dump is done under RCU
> where the VIF dump uses the mrt_lock too, as the vif->dev field can change
> under RCU.
> 
> Signed-off-by: Yotam Gigi <yotamg@mellanox.com>
> Reviewed-by: Ido Schimmel <idosch@mellanox.com>
> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
> ---
> v1->v2:
>  - Take the mrt_lock when dumping VIF entries.
> ---
>  include/linux/mroute.h   |  15 ++++++
>  include/net/netns/ipv4.h |   3 ++
>  net/ipv4/ipmr.c          | 137 ++++++++++++++++++++++++++++++++++++++++++++++-
>  3 files changed, 153 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/mroute.h b/include/linux/mroute.h
> index 10028f2..54c5cb8 100644
> --- a/include/linux/mroute.h
> +++ b/include/linux/mroute.h
> @@ -5,6 +5,7 @@
>  #include <linux/pim.h>
>  #include <linux/rhashtable.h>
>  #include <net/sock.h>
> +#include <net/fib_notifier.h>
>  #include <uapi/linux/mroute.h>
>  
>  #ifdef CONFIG_IP_MROUTE
> @@ -58,6 +59,14 @@ struct vif_device {
>  	int		link;			/* Physical interface index	*/
>  };
>  
> +struct vif_entry_notifier_info {
> +	struct fib_notifier_info info;
> +	struct net_device *dev;
> +	vifi_t vif_index;
> +	unsigned short vif_flags;
> +	u32 tb_id;
> +};
> +
>  #define VIFF_STATIC 0x8000
>  
>  #define VIF_EXISTS(_mrt, _idx) ((_mrt)->vif_table[_idx].dev != NULL)
> @@ -146,6 +155,12 @@ struct mfc_cache {
>  	struct rcu_head	rcu;
>  };
>  
> +struct mfc_entry_notifier_info {
> +	struct fib_notifier_info info;
> +	struct mfc_cache *mfc;
> +	u32 tb_id;
> +};
> +
>  struct rtmsg;
>  int ipmr_get_route(struct net *net, struct sk_buff *skb,
>  		   __be32 saddr, __be32 daddr,
> diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
> index 8387f09..abc84d9 100644
> --- a/include/net/netns/ipv4.h
> +++ b/include/net/netns/ipv4.h
> @@ -163,6 +163,9 @@ struct netns_ipv4 {
>  	struct fib_notifier_ops	*notifier_ops;
>  	unsigned int	fib_seq;	/* protected by rtnl_mutex */
>  
> +	struct fib_notifier_ops	*ipmr_notifier_ops;

Can we add a const here?

> +	unsigned int	ipmr_seq;	/* protected by rtnl_mutex */
> +
>  	atomic_t	rt_genid;
>  };
>  #endif
> diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
> index 86dc5f9..49879c3 100644
> --- a/net/ipv4/ipmr.c
> +++ b/net/ipv4/ipmr.c
> @@ -264,6 +264,16 @@ static void __net_exit ipmr_rules_exit(struct net *net)
>  	fib_rules_unregister(net->ipv4.mr_rules_ops);
>  	rtnl_unlock();
>  }
> +
> +static int ipmr_rules_dump(struct net *net, struct notifier_block *nb)
> +{
> +	return fib_rules_dump(net, nb, RTNL_FAMILY_IPMR);
> +}
> +
> +static unsigned int ipmr_rules_seq_read(struct net *net)
> +{
> +	return fib_rules_seq_read(net, RTNL_FAMILY_IPMR);
> +}
>  #else
>  #define ipmr_for_each_table(mrt, net) \
>  	for (mrt = net->ipv4.mrt; mrt; mrt = NULL)
> @@ -298,6 +308,16 @@ static void __net_exit ipmr_rules_exit(struct net *net)
>  	net->ipv4.mrt = NULL;
>  	rtnl_unlock();
>  }
> +
> +static int ipmr_rules_dump(struct net *net, struct notifier_block *nb)
> +{
> +	return 0;
> +}
> +
> +static unsigned int ipmr_rules_seq_read(struct net *net)
> +{
> +	return 0;
> +}
>  #endif
>  
>  static inline int ipmr_hash_cmp(struct rhashtable_compare_arg *arg,
> @@ -587,6 +607,43 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
>  }
>  #endif
>  
> +static int call_ipmr_vif_entry_notifier(struct notifier_block *nb,
> +					struct net *net,
> +					enum fib_event_type event_type,
> +					struct vif_device *vif,
> +					vifi_t vif_index, u32 tb_id)
> +{
> +	struct vif_entry_notifier_info info = {
> +		.info = {
> +			.family = RTNL_FAMILY_IPMR,
> +			.net = net,
> +		},
> +		.dev = vif->dev,
> +		.vif_index = vif_index,
> +		.vif_flags = vif->flags,
> +		.tb_id = tb_id,
> +	};

We only use info.info which is fib_notifier_info, the
vif_entry_notifier_info seems to be not needed, why not just
use fib_notifier_info?

> +
> +	return call_fib_notifier(nb, net, event_type, &info.info);
> +}
> +
> +static int call_ipmr_mfc_entry_notifier(struct notifier_block *nb,
> +					struct net *net,
> +					enum fib_event_type event_type,
> +					struct mfc_cache *mfc, u32 tb_id)
> +{
> +	struct mfc_entry_notifier_info info = {
> +		.info = {
> +			.family = RTNL_FAMILY_IPMR,
> +			.net = net,
> +		},
> +		.mfc = mfc,
> +		.tb_id = tb_id
> +	};
> +

As above.

> +	return call_fib_notifier(nb, net, event_type, &info.info);
> +}
> +
>  /**
>   *	vif_delete - Delete a VIF entry
>   *	@notify: Set to 1, if the caller is a notifier_call
> @@ -3050,14 +3107,87 @@ static const struct net_protocol pim_protocol = {
>  };
>  #endif
>  
> +static unsigned int ipmr_seq_read(struct net *net)
> +{
> +	ASSERT_RTNL();
> +
> +	return net->ipv4.ipmr_seq + ipmr_rules_seq_read(net);
> +}
> +
> +static int ipmr_dump(struct net *net, struct notifier_block *nb)
> +{
> +	struct mr_table *mrt;
> +	int err;
> +
> +	err = ipmr_rules_dump(net, nb);
> +	if (err)
> +		return err;
> +
> +	ipmr_for_each_table(mrt, net) {
> +		struct vif_device *v = &mrt->vif_table[0];
> +		struct mfc_cache *mfc;
> +		int vifi;
> +
> +		/* Notifiy on table VIF entries */
> +		read_lock(&mrt_lock);
> +		for (vifi = 0; vifi < mrt->maxvif; vifi++, v++) {
> +			if (!v->dev)
> +				continue;
> +
> +			call_ipmr_vif_entry_notifier(nb, net, FIB_EVENT_VIF_ADD,
> +						     v, vifi, mrt->id);
> +		}
> +		read_unlock(&mrt_lock);
> +
> +		/* Notify on table MFC entries */
> +		list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list)
> +			call_ipmr_mfc_entry_notifier(nb, net,
> +						     FIB_EVENT_ENTRY_ADD, mfc,
> +						     mrt->id);
> +	}
> +
> +	return 0;
> +}
> +
> +static const struct fib_notifier_ops ipmr_notifier_ops_template = {
> +	.family		= RTNL_FAMILY_IPMR,
> +	.fib_seq_read	= ipmr_seq_read,
> +	.fib_dump	= ipmr_dump,
> +	.owner		= THIS_MODULE,
> +};
> +
> +int __net_init ipmr_notifier_init(struct net *net)
> +{
> +	struct fib_notifier_ops *ops;
> +
> +	net->ipv4.ipmr_seq = 0;
> +
> +	ops = fib_notifier_ops_register(&ipmr_notifier_ops_template, net);
> +	if (IS_ERR(ops))
> +		return PTR_ERR(ops);
> +	net->ipv4.ipmr_notifier_ops = ops;
> +
> +	return 0;
> +}
> +
> +static void __net_exit ipmr_notifier_exit(struct net *net)
> +{
> +	fib_notifier_ops_unregister(net->ipv4.ipmr_notifier_ops);
> +	net->ipv4.ipmr_notifier_ops = NULL;
> +}
> +
>  /* Setup for IP multicast routing */
>  static int __net_init ipmr_net_init(struct net *net)
>  {
>  	int err;
>  
> +	err = ipmr_notifier_init(net);
> +	if (err)
> +		goto ipmr_notifier_fail;
> +
>  	err = ipmr_rules_init(net);
>  	if (err < 0)
> -		goto fail;
> +		goto ipmr_rules_fail;
>  
>  #ifdef CONFIG_PROC_FS
>  	err = -ENOMEM;
> @@ -3074,7 +3204,9 @@ static int __net_init ipmr_net_init(struct net *net)
>  proc_vif_fail:
>  	ipmr_rules_exit(net);
>  #endif
> -fail:
> +ipmr_rules_fail:
> +	ipmr_notifier_exit(net);
> +ipmr_notifier_fail:
>  	return err;
>  }
>  
> @@ -3084,6 +3216,7 @@ static void __net_exit ipmr_net_exit(struct net *net)
>  	remove_proc_entry("ip_mr_cache", net->proc_net);
>  	remove_proc_entry("ip_mr_vif", net->proc_net);
>  #endif
> +	ipmr_notifier_exit(net);
>  	ipmr_rules_exit(net);
>  }
>  
> 

^ permalink raw reply

* Re: [PATCH net-next 10/10] net: hns3: Add mqprio support when interacting with network stack
From: Yunsheng Lin @ 2017-09-25  0:45 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: davem@davemloft.net, huangdaode, xuwei (O), Liguozhu (Kenneth),
	Zhuangyuzeng (Yisen), Gabriele Paoloni, John Garry, Linuxarm,
	Salil Mehta, lipeng (Y), netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org
In-Reply-To: <20170924113724.GA2029@nanopsycho>

Hi, Jiri

On 2017/9/24 19:37, Jiri Pirko wrote:
> Sat, Sep 23, 2017 at 02:47:20AM CEST, linyunsheng@huawei.com wrote:
>> Hi, Jiri
>>
>> On 2017/9/23 0:03, Jiri Pirko wrote:
>>> Fri, Sep 22, 2017 at 04:11:51PM CEST, linyunsheng@huawei.com wrote:
>>>> Hi, Jiri
>>>>
>>>>>> - if (!tc) {
>>>>>> + if (if_running) {
>>>>>> + (void)hns3_nic_net_stop(netdev);
>>>>>> + msleep(100);
>>>>>> + }
>>>>>> +
>>>>>> + ret = (kinfo->dcb_ops && kinfo->dcb_ops->>setup_tc) ?
>>>>>> + kinfo->dcb_ops->setup_tc(h, tc, prio_tc) : ->EOPNOTSUPP;
>>>>
>>>>> This is most odd. Why do you call dcb_ops from >ndo_setup_tc callback?
>>>>> Why are you mixing this together? prio->tc mapping >can be done
>>>>> directly in dcbnl
>>>>
>>>> Here is what we do in dcb_ops->setup_tc:
>>>> Firstly, if current tc num is different from the tc num
>>>> that user provide, then we setup the queues for each
>>>> tc.
>>>>
>>>> Secondly, we tell hardware the pri to tc mapping that
>>>> the stack is using. In rx direction, our hardware need
>>>> that mapping to put different packet into different tc'
>>>> queues according to the priority of the packet, then
>>>> rss decides which specific queue in the tc should the
>>>> packet goto.
>>>>
>>>> By mixing, I suppose you meant why we need the
>>>> pri to tc infomation?
>>>
>>> by mixing, I mean what I wrote. You are calling dcb_ops callback from
>>> ndo_setup_tc callback. So you are mixing DCBNL subsystem and TC
>>> subsystem. Why? Why do you need sch_mqprio? Why DCBNL is not enough for
>>> all?
>>
>> When using lldptool, dcbnl is involved.
>>
>> But when using tc qdisc, dcbbl is not involved, below is the a few key
>> call graph in the kernel when tc qdisc cmd is executed.
>>
>> cmd:
>> tc qdisc add dev eth0 root handle 1:0 mqprio num_tc 4 map 1 2 3 3 1 3 1 1 hw 1
>>
>> call graph:
>> rtnetlink_rcv_msg -> tc_modify_qdisc -> qdisc_create -> mqprio_init ->
>> hns3_nic_setup_tc
>>
>> When hns3_nic_setup_tc is called, we need to know how many tc num and
>> prio_tc mapping from the tc_mqprio_qopt which is provided in the paramter
>> in the ndo_setup_tc function, and dcb_ops is the our hardware specific
>> method to setup the tc related parameter to the hardware, so this is why
>> we call dcb_ops callback in ndo_setup_tc callback.
>>
>> I hope this will answer your question, thanks for your time.
> 
> Okay. I understand that you have a usecase for mqprio mapping offload
> without lldptool being involved. Ok. I believe it is wrong to call dcb_ops
> from tc callback. You should have a generic layer inside the driver and
> call it from both dcb_ops and tc callbacks.

Actually, dcb_ops is our generic layer inside the driver.
Below is high level architecture:

       [ tc qdisc ]	       [ lldpad ]
             |                     |
             |                     |
             |                     |
       [ hns3_enet ]        [ hns3_dcbnl ]
             \                    /
                \              /
                   \        /
                 [ hclge_dcb ]
                   /      \
                /            \
             /                  \
     [ hclgc_main ]        [ hclge_tm ]

hns3_enet.c implements the ndo_setup_tc callback.
hns3_dcbnl.c implements the dcbnl_rtnl_ops for stack's DCBNL system.
hclge_dcb implements the dcb_ops.
So we already have a generic layer that tc and dcbnl all call from.

> 
> Also, what happens If I run lldptool concurrently with mqprio? Who wins
> and is going to configure the mapping?

Both lldptool and tc qdisc cmd use rtnl interface provided by stack, so
they are both protected by rtnl_lock, so we do not have to do the locking
in the driver.

The locking is in rtnetlink_rcv_msg:

	rtnl_lock();
	handlers = rtnl_dereference(rtnl_msg_handlers[family]);
	if (handlers) {
		doit = READ_ONCE(handlers[type].doit);
		if (doit)
			err = doit(skb, nlh, extack);
	}
	rtnl_unlock();

Thanks.

> 
> 
>>
>>>
>>>
>>>
>>>> I hope I did not misunderstand your question, thanks
>>>> for your time reviewing.
>>>
>>> .
>>>
>>
> 
> .
> 

^ permalink raw reply

* [PATCH net-next 6/6] bpf, ixgbe: add meta data support
From: Daniel Borkmann @ 2017-09-25  0:25 UTC (permalink / raw)
  To: davem
  Cc: alexei.starovoitov, john.fastabend, peter.waskiewicz.jr,
	jakub.kicinski, netdev, Daniel Borkmann
In-Reply-To: <cover.1506297988.git.daniel@iogearbox.net>

Implement support for transferring XDP meta data into skb for
ixgbe driver; before calling into the program, xdp.data_meta points
to xdp.data, where on program return with pass verdict, we call
into skb_metadata_set().

We implement this for the default ixgbe_build_skb() variant. For the
ixgbe_construct_skb() that is used when legacy-rx buffer mananagement
mode is turned on via ethtool, I found that XDP gets 0 headroom, so
neither xdp_adjust_head() nor xdp_adjust_meta() can be used with this.
Just add a comment with explanation for this operating mode.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 30 +++++++++++++++++++++++----
 1 file changed, 26 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 04bb03b..3942c62 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -2133,6 +2133,21 @@ static struct sk_buff *ixgbe_construct_skb(struct ixgbe_ring *rx_ring,
 #if L1_CACHE_BYTES < 128
 	prefetch(xdp->data + L1_CACHE_BYTES);
 #endif
+	/* Note, we get here by enabling legacy-rx via:
+	 *
+	 *    ethtool --set-priv-flags <dev> legacy-rx on
+	 *
+	 * In this mode, we currently get 0 extra XDP headroom as
+	 * opposed to having legacy-rx off, where we process XDP
+	 * packets going to stack via ixgbe_build_skb(). The latter
+	 * provides us currently with 192 bytes of headroom.
+	 *
+	 * For ixgbe_construct_skb() mode it means that the
+	 * xdp->data_meta will always point to xdp->data, since
+	 * the helper cannot expand the head. Should this ever
+	 * change in future for legacy-rx mode on, then lets also
+	 * add xdp->data_meta handling here.
+	 */
 
 	/* allocate a skb to store the frags */
 	skb = napi_alloc_skb(&rx_ring->q_vector->napi, IXGBE_RX_HDR_SIZE);
@@ -2165,6 +2180,7 @@ static struct sk_buff *ixgbe_build_skb(struct ixgbe_ring *rx_ring,
 				       struct xdp_buff *xdp,
 				       union ixgbe_adv_rx_desc *rx_desc)
 {
+	unsigned int metasize = xdp->data - xdp->data_meta;
 #if (PAGE_SIZE < 8192)
 	unsigned int truesize = ixgbe_rx_pg_size(rx_ring) / 2;
 #else
@@ -2174,10 +2190,14 @@ static struct sk_buff *ixgbe_build_skb(struct ixgbe_ring *rx_ring,
 #endif
 	struct sk_buff *skb;
 
-	/* prefetch first cache line of first page */
-	prefetch(xdp->data);
+	/* Prefetch first cache line of first page. If xdp->data_meta
+	 * is unused, this points extactly as xdp->data, otherwise we
+	 * likely have a consumer accessing first few bytes of meta
+	 * data, and then actual data.
+	 */
+	prefetch(xdp->data_meta);
 #if L1_CACHE_BYTES < 128
-	prefetch(xdp->data + L1_CACHE_BYTES);
+	prefetch(xdp->data_meta + L1_CACHE_BYTES);
 #endif
 
 	/* build an skb to around the page buffer */
@@ -2188,6 +2208,8 @@ static struct sk_buff *ixgbe_build_skb(struct ixgbe_ring *rx_ring,
 	/* update pointers within the skb to store the data */
 	skb_reserve(skb, xdp->data - xdp->data_hard_start);
 	__skb_put(skb, xdp->data_end - xdp->data);
+	if (metasize)
+		skb_metadata_set(skb, metasize);
 
 	/* record DMA address if this is the start of a chain of buffers */
 	if (!ixgbe_test_staterr(rx_desc, IXGBE_RXD_STAT_EOP))
@@ -2326,7 +2348,7 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
 		if (!skb) {
 			xdp.data = page_address(rx_buffer->page) +
 				   rx_buffer->page_offset;
-			xdp_set_data_meta_invalid(&xdp);
+			xdp.data_meta = xdp.data;
 			xdp.data_hard_start = xdp.data -
 					      ixgbe_rx_offset(rx_ring);
 			xdp.data_end = xdp.data + size;
-- 
1.9.3

^ permalink raw reply related

* [PATCH net-next 4/6] bpf: improve selftests and add tests for meta pointer
From: Daniel Borkmann @ 2017-09-25  0:25 UTC (permalink / raw)
  To: davem
  Cc: alexei.starovoitov, john.fastabend, peter.waskiewicz.jr,
	jakub.kicinski, netdev, Daniel Borkmann
In-Reply-To: <cover.1506297988.git.daniel@iogearbox.net>

Add various test_verifier selftests, and a simple xdp/tc functional
test that is being attached to veths. Also let new versions of clang
use the recently added -mcpu=probe support [1] for the BPF target,
so that it can probe the underlying kernel for BPF insn set extensions.
We could also just set this options always, where older versions just
ignore it and give a note to the user that the -mcpu value is not
supported, but given emitting the note cannot be turned off from clang
side lets not confuse users running selftests with it, thus fallback
to the default generic one when we see that clang doesn't support it.
Also allow CPU option to be overridden in the Makefile from command
line.

  [1] https://github.com/llvm-mirror/llvm/commit/d7276a40d87b89aed89978dec6457a5b8b3a0db5

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
---
 tools/testing/selftests/bpf/Makefile         |  21 ++-
 tools/testing/selftests/bpf/bpf_helpers.h    |   2 +
 tools/testing/selftests/bpf/test_verifier.c  | 247 +++++++++++++++++++++++++++
 tools/testing/selftests/bpf/test_xdp_meta.c  |  53 ++++++
 tools/testing/selftests/bpf/test_xdp_meta.sh |  51 ++++++
 5 files changed, 370 insertions(+), 4 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/test_xdp_meta.c
 create mode 100755 tools/testing/selftests/bpf/test_xdp_meta.sh

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index f4b23d6..924af8d7 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -15,9 +15,10 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test
 	test_align
 
 TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test_obj_id.o \
-	test_pkt_md_access.o test_xdp_redirect.o sockmap_parse_prog.o sockmap_verdict_prog.o
+	test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o     \
+	sockmap_verdict_prog.o
 
-TEST_PROGS := test_kmod.sh test_xdp_redirect.sh
+TEST_PROGS := test_kmod.sh test_xdp_redirect.sh test_xdp_meta.sh
 
 include ../lib.mk
 
@@ -34,8 +35,20 @@ $(BPFOBJ): force
 	$(MAKE) -C $(BPFDIR) OUTPUT=$(OUTPUT)/
 
 CLANG ?= clang
+LLC   ?= llc
+
+PROBE := $(shell llc -march=bpf -mcpu=probe -filetype=null /dev/null 2>&1)
+
+# Let newer LLVM versions transparently probe the kernel for availability
+# of full BPF instruction set.
+ifeq ($(PROBE),)
+  CPU ?= probe
+else
+  CPU ?= generic
+endif
 
 %.o: %.c
 	$(CLANG) -I. -I./include/uapi -I../../../include/uapi \
-		-Wno-compare-distinct-pointer-types \
-		-O2 -target bpf -c $< -o $@
+		 -Wno-compare-distinct-pointer-types          \
+		 -O2 -target bpf -emit-llvm -c $< -o - |      \
+	$(LLC) -march=bpf -mcpu=$(CPU) -filetype=obj -o $@
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index 4875395..a56053d 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -62,6 +62,8 @@ static unsigned long long (*bpf_get_prandom_u32)(void) =
 	(void *) BPF_FUNC_get_prandom_u32;
 static int (*bpf_xdp_adjust_head)(void *ctx, int offset) =
 	(void *) BPF_FUNC_xdp_adjust_head;
+static int (*bpf_xdp_adjust_meta)(void *ctx, int offset) =
+	(void *) BPF_FUNC_xdp_adjust_meta;
 static int (*bpf_setsockopt)(void *ctx, int level, int optname, void *optval,
 			     int optlen) =
 	(void *) BPF_FUNC_setsockopt;
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 26f3250..a042614 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -6645,6 +6645,253 @@ struct test_val {
 		.errstr = "BPF_END uses reserved fields",
 		.result = REJECT,
 	},
+	{
+		"meta access, test1",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+				    offsetof(struct xdp_md, data_meta)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+				    offsetof(struct xdp_md, data)),
+			BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+			BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 1),
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_XDP,
+	},
+	{
+		"meta access, test2",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+				    offsetof(struct xdp_md, data_meta)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+				    offsetof(struct xdp_md, data)),
+			BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+			BPF_ALU64_IMM(BPF_SUB, BPF_REG_0, 8),
+			BPF_MOV64_REG(BPF_REG_4, BPF_REG_2),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 8),
+			BPF_JMP_REG(BPF_JGT, BPF_REG_4, BPF_REG_3, 1),
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = REJECT,
+		.errstr = "invalid access to packet, off=-8",
+		.prog_type = BPF_PROG_TYPE_XDP,
+	},
+	{
+		"meta access, test3",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+				    offsetof(struct xdp_md, data_meta)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+				    offsetof(struct xdp_md, data_end)),
+			BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+			BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 1),
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = REJECT,
+		.errstr = "invalid access to packet",
+		.prog_type = BPF_PROG_TYPE_XDP,
+	},
+	{
+		"meta access, test4",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+				    offsetof(struct xdp_md, data_meta)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+				    offsetof(struct xdp_md, data_end)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1,
+				    offsetof(struct xdp_md, data)),
+			BPF_MOV64_REG(BPF_REG_0, BPF_REG_4),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+			BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 1),
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = REJECT,
+		.errstr = "invalid access to packet",
+		.prog_type = BPF_PROG_TYPE_XDP,
+	},
+	{
+		"meta access, test5",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+				    offsetof(struct xdp_md, data_meta)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1,
+				    offsetof(struct xdp_md, data)),
+			BPF_MOV64_REG(BPF_REG_0, BPF_REG_3),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+			BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_4, 3),
+			BPF_MOV64_IMM(BPF_REG_2, -8),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_xdp_adjust_meta),
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_3, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = REJECT,
+		.errstr = "R3 !read_ok",
+		.prog_type = BPF_PROG_TYPE_XDP,
+	},
+	{
+		"meta access, test6",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+				    offsetof(struct xdp_md, data_meta)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+				    offsetof(struct xdp_md, data)),
+			BPF_MOV64_REG(BPF_REG_0, BPF_REG_3),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+			BPF_MOV64_REG(BPF_REG_4, BPF_REG_2),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 8),
+			BPF_JMP_REG(BPF_JGT, BPF_REG_4, BPF_REG_0, 1),
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = REJECT,
+		.errstr = "invalid access to packet",
+		.prog_type = BPF_PROG_TYPE_XDP,
+	},
+	{
+		"meta access, test7",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+				    offsetof(struct xdp_md, data_meta)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+				    offsetof(struct xdp_md, data)),
+			BPF_MOV64_REG(BPF_REG_0, BPF_REG_3),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+			BPF_MOV64_REG(BPF_REG_4, BPF_REG_2),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 8),
+			BPF_JMP_REG(BPF_JGT, BPF_REG_4, BPF_REG_3, 1),
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_XDP,
+	},
+	{
+		"meta access, test8",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+				    offsetof(struct xdp_md, data_meta)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+				    offsetof(struct xdp_md, data)),
+			BPF_MOV64_REG(BPF_REG_4, BPF_REG_2),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 0xFFFF),
+			BPF_JMP_REG(BPF_JGT, BPF_REG_4, BPF_REG_3, 1),
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_XDP,
+	},
+	{
+		"meta access, test9",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+				    offsetof(struct xdp_md, data_meta)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+				    offsetof(struct xdp_md, data)),
+			BPF_MOV64_REG(BPF_REG_4, BPF_REG_2),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 0xFFFF),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 1),
+			BPF_JMP_REG(BPF_JGT, BPF_REG_4, BPF_REG_3, 1),
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = REJECT,
+		.errstr = "invalid access to packet",
+		.prog_type = BPF_PROG_TYPE_XDP,
+	},
+	{
+		"meta access, test10",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+				    offsetof(struct xdp_md, data_meta)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+				    offsetof(struct xdp_md, data)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1,
+				    offsetof(struct xdp_md, data_end)),
+			BPF_MOV64_IMM(BPF_REG_5, 42),
+			BPF_MOV64_IMM(BPF_REG_6, 24),
+			BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_5, -8),
+			BPF_STX_XADD(BPF_DW, BPF_REG_10, BPF_REG_6, -8),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_5, BPF_REG_10, -8),
+			BPF_JMP_IMM(BPF_JGT, BPF_REG_5, 100, 6),
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_3, BPF_REG_5),
+			BPF_MOV64_REG(BPF_REG_5, BPF_REG_3),
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_2),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 8),
+			BPF_JMP_REG(BPF_JGT, BPF_REG_6, BPF_REG_5, 1),
+			BPF_LDX_MEM(BPF_B, BPF_REG_2, BPF_REG_2, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = REJECT,
+		.errstr = "invalid access to packet",
+		.prog_type = BPF_PROG_TYPE_XDP,
+	},
+	{
+		"meta access, test11",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+				    offsetof(struct xdp_md, data_meta)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+				    offsetof(struct xdp_md, data)),
+			BPF_MOV64_IMM(BPF_REG_5, 42),
+			BPF_MOV64_IMM(BPF_REG_6, 24),
+			BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_5, -8),
+			BPF_STX_XADD(BPF_DW, BPF_REG_10, BPF_REG_6, -8),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_5, BPF_REG_10, -8),
+			BPF_JMP_IMM(BPF_JGT, BPF_REG_5, 100, 6),
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_5),
+			BPF_MOV64_REG(BPF_REG_5, BPF_REG_2),
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_2),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 8),
+			BPF_JMP_REG(BPF_JGT, BPF_REG_6, BPF_REG_3, 1),
+			BPF_LDX_MEM(BPF_B, BPF_REG_5, BPF_REG_5, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_XDP,
+	},
+	{
+		"meta access, test12",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+				    offsetof(struct xdp_md, data_meta)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+				    offsetof(struct xdp_md, data)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1,
+				    offsetof(struct xdp_md, data_end)),
+			BPF_MOV64_REG(BPF_REG_5, BPF_REG_3),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 16),
+			BPF_JMP_REG(BPF_JGT, BPF_REG_5, BPF_REG_4, 5),
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_3, 0),
+			BPF_MOV64_REG(BPF_REG_5, BPF_REG_2),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 16),
+			BPF_JMP_REG(BPF_JGT, BPF_REG_5, BPF_REG_3, 1),
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_XDP,
+	},
 };
 
 static int probe_filter_length(const struct bpf_insn *fp)
diff --git a/tools/testing/selftests/bpf/test_xdp_meta.c b/tools/testing/selftests/bpf/test_xdp_meta.c
new file mode 100644
index 0000000..8d01826
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_xdp_meta.c
@@ -0,0 +1,53 @@
+#include <linux/bpf.h>
+#include <linux/if_ether.h>
+#include <linux/pkt_cls.h>
+
+#include "bpf_helpers.h"
+
+#define __round_mask(x, y) ((__typeof__(x))((y) - 1))
+#define round_up(x, y) ((((x) - 1) | __round_mask(x, y)) + 1)
+#define ctx_ptr(ctx, mem) (void *)(unsigned long)ctx->mem
+
+SEC("t")
+int ing_cls(struct __sk_buff *ctx)
+{
+	__u8 *data, *data_meta, *data_end;
+	__u32 diff = 0;
+
+	data_meta = ctx_ptr(ctx, data_meta);
+	data_end  = ctx_ptr(ctx, data_end);
+	data      = ctx_ptr(ctx, data);
+
+	if (data + ETH_ALEN > data_end ||
+	    data_meta + round_up(ETH_ALEN, 4) > data)
+		return TC_ACT_SHOT;
+
+	diff |= ((__u32 *)data_meta)[0] ^ ((__u32 *)data)[0];
+	diff |= ((__u16 *)data_meta)[2] ^ ((__u16 *)data)[2];
+
+	return diff ? TC_ACT_SHOT : TC_ACT_OK;
+}
+
+SEC("x")
+int ing_xdp(struct xdp_md *ctx)
+{
+	__u8 *data, *data_meta, *data_end;
+	int ret;
+
+	ret = bpf_xdp_adjust_meta(ctx, -round_up(ETH_ALEN, 4));
+	if (ret < 0)
+		return XDP_DROP;
+
+	data_meta = ctx_ptr(ctx, data_meta);
+	data_end  = ctx_ptr(ctx, data_end);
+	data      = ctx_ptr(ctx, data);
+
+	if (data + ETH_ALEN > data_end ||
+	    data_meta + round_up(ETH_ALEN, 4) > data)
+		return XDP_DROP;
+
+	__builtin_memcpy(data_meta, data, ETH_ALEN);
+	return XDP_PASS;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/test_xdp_meta.sh b/tools/testing/selftests/bpf/test_xdp_meta.sh
new file mode 100755
index 0000000..307aa85
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_xdp_meta.sh
@@ -0,0 +1,51 @@
+#!/bin/sh
+
+cleanup()
+{
+	if [ "$?" = "0" ]; then
+		echo "selftests: test_xdp_meta [PASS]";
+	else
+		echo "selftests: test_xdp_meta [FAILED]";
+	fi
+
+	set +e
+	ip netns del ns1 2> /dev/null
+	ip netns del ns2 2> /dev/null
+}
+
+ip link set dev lo xdp off 2>/dev/null > /dev/null
+if [ $? -ne 0 ];then
+	echo "selftests: [SKIP] Could not run test without the ip xdp support"
+	exit 0
+fi
+set -e
+
+ip netns add ns1
+ip netns add ns2
+
+trap cleanup 0 2 3 6 9
+
+ip link add veth1 type veth peer name veth2
+
+ip link set veth1 netns ns1
+ip link set veth2 netns ns2
+
+ip netns exec ns1 ip addr add 10.1.1.11/24 dev veth1
+ip netns exec ns2 ip addr add 10.1.1.22/24 dev veth2
+
+ip netns exec ns1 tc qdisc add dev veth1 clsact
+ip netns exec ns2 tc qdisc add dev veth2 clsact
+
+ip netns exec ns1 tc filter add dev veth1 ingress bpf da obj test_xdp_meta.o sec t
+ip netns exec ns2 tc filter add dev veth2 ingress bpf da obj test_xdp_meta.o sec t
+
+ip netns exec ns1 ip link set dev veth1 xdp obj test_xdp_meta.o sec x
+ip netns exec ns2 ip link set dev veth2 xdp obj test_xdp_meta.o sec x
+
+ip netns exec ns1 ip link set dev veth1 up
+ip netns exec ns2 ip link set dev veth2 up
+
+ip netns exec ns1 ping -c 1 10.1.1.22
+ip netns exec ns2 ping -c 1 10.1.1.11
+
+exit 0
-- 
1.9.3

^ permalink raw reply related

* [PATCH net-next 1/6] bpf: rename bpf_compute_data_end into bpf_compute_data_pointers
From: Daniel Borkmann @ 2017-09-25  0:25 UTC (permalink / raw)
  To: davem
  Cc: alexei.starovoitov, john.fastabend, peter.waskiewicz.jr,
	jakub.kicinski, netdev, Daniel Borkmann
In-Reply-To: <cover.1506297988.git.daniel@iogearbox.net>

Just do the rename into bpf_compute_data_pointers() as we'll add
one more pointer here to recompute.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
---
 include/linux/filter.h |  9 ++++++---
 kernel/bpf/sockmap.c   |  4 ++--
 net/bpf/test_run.c     |  2 +-
 net/core/filter.c      | 14 +++++++-------
 net/core/lwt_bpf.c     |  2 +-
 net/sched/act_bpf.c    |  4 ++--
 net/sched/cls_bpf.c    |  4 ++--
 7 files changed, 21 insertions(+), 18 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index d29e58f..052bab3 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -496,10 +496,13 @@ struct xdp_buff {
 	void *data_hard_start;
 };
 
-/* compute the linear packet data range [data, data_end) which
- * will be accessed by cls_bpf, act_bpf and lwt programs
+/* Compute the linear packet data range [data, data_end) which
+ * will be accessed by various program types (cls_bpf, act_bpf,
+ * lwt, ...). Subsystems allowing direct data access must (!)
+ * ensure that cb[] area can be written to when BPF program is
+ * invoked (otherwise cb[] save/restore is necessary).
  */
-static inline void bpf_compute_data_end(struct sk_buff *skb)
+static inline void bpf_compute_data_pointers(struct sk_buff *skb)
 {
 	struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb;
 
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 6424ce0..a298d66 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -102,7 +102,7 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
 
 	skb_orphan(skb);
 	skb->sk = psock->sock;
-	bpf_compute_data_end(skb);
+	bpf_compute_data_pointers(skb);
 	rc = (*prog->bpf_func)(skb, prog->insnsi);
 	skb->sk = NULL;
 
@@ -369,7 +369,7 @@ static int smap_parse_func_strparser(struct strparser *strp,
 	 * any socket yet.
 	 */
 	skb->sk = psock->sock;
-	bpf_compute_data_end(skb);
+	bpf_compute_data_pointers(skb);
 	rc = (*prog->bpf_func)(skb, prog->insnsi);
 	skb->sk = NULL;
 	rcu_read_unlock();
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 6be41a4..df67251 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -133,7 +133,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
 	if (is_l2)
 		__skb_push(skb, ETH_HLEN);
 	if (is_direct_pkt_access)
-		bpf_compute_data_end(skb);
+		bpf_compute_data_pointers(skb);
 	retval = bpf_test_run(prog, skb, repeat, &duration);
 	if (!is_l2)
 		__skb_push(skb, ETH_HLEN);
diff --git a/net/core/filter.c b/net/core/filter.c
index 82edad5..c468e7c 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1402,7 +1402,7 @@ static inline int bpf_try_make_writable(struct sk_buff *skb,
 {
 	int err = __bpf_try_make_writable(skb, write_len);
 
-	bpf_compute_data_end(skb);
+	bpf_compute_data_pointers(skb);
 	return err;
 }
 
@@ -1962,7 +1962,7 @@ struct sock *do_sk_redirect_map(void)
 	ret = skb_vlan_push(skb, vlan_proto, vlan_tci);
 	bpf_pull_mac_rcsum(skb);
 
-	bpf_compute_data_end(skb);
+	bpf_compute_data_pointers(skb);
 	return ret;
 }
 
@@ -1984,7 +1984,7 @@ struct sock *do_sk_redirect_map(void)
 	ret = skb_vlan_pop(skb);
 	bpf_pull_mac_rcsum(skb);
 
-	bpf_compute_data_end(skb);
+	bpf_compute_data_pointers(skb);
 	return ret;
 }
 
@@ -2178,7 +2178,7 @@ static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto)
 	 * need to be verified first.
 	 */
 	ret = bpf_skb_proto_xlat(skb, proto);
-	bpf_compute_data_end(skb);
+	bpf_compute_data_pointers(skb);
 	return ret;
 }
 
@@ -2303,7 +2303,7 @@ static int bpf_skb_adjust_net(struct sk_buff *skb, s32 len_diff)
 	ret = shrink ? bpf_skb_net_shrink(skb, len_diff_abs) :
 		       bpf_skb_net_grow(skb, len_diff_abs);
 
-	bpf_compute_data_end(skb);
+	bpf_compute_data_pointers(skb);
 	return ret;
 }
 
@@ -2394,7 +2394,7 @@ static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len)
 			skb_gso_reset(skb);
 	}
 
-	bpf_compute_data_end(skb);
+	bpf_compute_data_pointers(skb);
 	return ret;
 }
 
@@ -2434,7 +2434,7 @@ static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len)
 		skb_reset_mac_header(skb);
 	}
 
-	bpf_compute_data_end(skb);
+	bpf_compute_data_pointers(skb);
 	return 0;
 }
 
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index 1307731..e7e626f 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -51,7 +51,7 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
 	 */
 	preempt_disable();
 	rcu_read_lock();
-	bpf_compute_data_end(skb);
+	bpf_compute_data_pointers(skb);
 	ret = bpf_prog_run_save_cb(lwt->prog, skb);
 	rcu_read_unlock();
 
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index c0c707e..5ef8ce8 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -49,11 +49,11 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act,
 	filter = rcu_dereference(prog->filter);
 	if (at_ingress) {
 		__skb_push(skb, skb->mac_len);
-		bpf_compute_data_end(skb);
+		bpf_compute_data_pointers(skb);
 		filter_res = BPF_PROG_RUN(filter, skb);
 		__skb_pull(skb, skb->mac_len);
 	} else {
-		bpf_compute_data_end(skb);
+		bpf_compute_data_pointers(skb);
 		filter_res = BPF_PROG_RUN(filter, skb);
 	}
 	rcu_read_unlock();
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index 520c502..36671b0 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -99,11 +99,11 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 		} else if (at_ingress) {
 			/* It is safe to push/pull even if skb_shared() */
 			__skb_push(skb, skb->mac_len);
-			bpf_compute_data_end(skb);
+			bpf_compute_data_pointers(skb);
 			filter_res = BPF_PROG_RUN(prog->filter, skb);
 			__skb_pull(skb, skb->mac_len);
 		} else {
-			bpf_compute_data_end(skb);
+			bpf_compute_data_pointers(skb);
 			filter_res = BPF_PROG_RUN(prog->filter, skb);
 		}
 
-- 
1.9.3

^ permalink raw reply related

* [PATCH net-next 5/6] bpf, nfp: add meta data support
From: Daniel Borkmann @ 2017-09-25  0:25 UTC (permalink / raw)
  To: davem
  Cc: alexei.starovoitov, john.fastabend, peter.waskiewicz.jr,
	jakub.kicinski, netdev, Daniel Borkmann
In-Reply-To: <cover.1506297988.git.daniel@iogearbox.net>

Implement support for transferring XDP meta data into skb for
nfp driver; before calling into the program, xdp.data_meta points
to xdp.data, where on program return with pass verdict, we call
into skb_metadata_set().

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
---
 .../net/ethernet/netronome/nfp/nfp_net_common.c    | 40 ++++++++--------------
 1 file changed, 15 insertions(+), 25 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index e3a38be..d2f73fe 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -1574,27 +1574,6 @@ static void nfp_net_rx_csum(struct nfp_net_dp *dp,
 	return true;
 }
 
-static int nfp_net_run_xdp(struct bpf_prog *prog, void *data, void *hard_start,
-			   unsigned int *off, unsigned int *len)
-{
-	struct xdp_buff xdp;
-	void *orig_data;
-	int ret;
-
-	xdp.data_hard_start = hard_start;
-	xdp.data = data + *off;
-	xdp_set_data_meta_invalid(&xdp);
-	xdp.data_end = data + *off + *len;
-
-	orig_data = xdp.data;
-	ret = bpf_prog_run_xdp(prog, &xdp);
-
-	*len -= xdp.data - orig_data;
-	*off += xdp.data - orig_data;
-
-	return ret;
-}
-
 /**
  * nfp_net_rx() - receive up to @budget packets on @rx_ring
  * @rx_ring:   RX ring to receive from
@@ -1630,6 +1609,7 @@ static int nfp_net_rx(struct nfp_net_rx_ring *rx_ring, int budget)
 		struct nfp_meta_parsed meta;
 		struct net_device *netdev;
 		dma_addr_t new_dma_addr;
+		u32 meta_len_xdp = 0;
 		void *new_frag;
 
 		idx = D_IDX(rx_ring, rx_ring->rd_p);
@@ -1708,16 +1688,24 @@ static int nfp_net_rx(struct nfp_net_rx_ring *rx_ring, int budget)
 
 		if (xdp_prog && !(rxd->rxd.flags & PCIE_DESC_RX_BPF &&
 				  dp->bpf_offload_xdp) && !meta.portid) {
+			void *orig_data = rxbuf->frag + pkt_off;
 			unsigned int dma_off;
-			void *hard_start;
+			struct xdp_buff xdp;
 			int act;
 
-			hard_start = rxbuf->frag + NFP_NET_RX_BUF_HEADROOM;
+			xdp.data_hard_start = rxbuf->frag + NFP_NET_RX_BUF_HEADROOM;
+			xdp.data = orig_data;
+			xdp.data_meta = orig_data;
+			xdp.data_end = orig_data + pkt_len;
+
+			act = bpf_prog_run_xdp(xdp_prog, &xdp);
+
+			pkt_len -= xdp.data - orig_data;
+			pkt_off += xdp.data - orig_data;
 
-			act = nfp_net_run_xdp(xdp_prog, rxbuf->frag, hard_start,
-					      &pkt_off, &pkt_len);
 			switch (act) {
 			case XDP_PASS:
+				meta_len_xdp = xdp.data - xdp.data_meta;
 				break;
 			case XDP_TX:
 				dma_off = pkt_off - NFP_NET_RX_BUF_HEADROOM;
@@ -1785,6 +1773,8 @@ static int nfp_net_rx(struct nfp_net_rx_ring *rx_ring, int budget)
 		if (rxd->rxd.flags & PCIE_DESC_RX_VLAN)
 			__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q),
 					       le16_to_cpu(rxd->rxd.vlan));
+		if (meta_len_xdp)
+			skb_metadata_set(skb, meta_len_xdp);
 
 		napi_gro_receive(&rx_ring->r_vec->napi, skb);
 	}
-- 
1.9.3

^ permalink raw reply related

* [PATCH net-next 2/6] bpf: add meta pointer for direct access
From: Daniel Borkmann @ 2017-09-25  0:25 UTC (permalink / raw)
  To: davem
  Cc: alexei.starovoitov, john.fastabend, peter.waskiewicz.jr,
	jakub.kicinski, netdev, Daniel Borkmann
In-Reply-To: <cover.1506297988.git.daniel@iogearbox.net>

This work enables generic transfer of metadata from XDP into skb. The
basic idea is that we can make use of the fact that the resulting skb
must be linear and already comes with a larger headroom for supporting
bpf_xdp_adjust_head(), which mangles xdp->data. Here, we base our work
on a similar principle and introduce a small helper bpf_xdp_adjust_meta()
for adjusting a new pointer called xdp->data_meta. Thus, the packet has
a flexible and programmable room for meta data, followed by the actual
packet data. struct xdp_buff is therefore laid out that we first point
to data_hard_start, then data_meta directly prepended to data followed
by data_end marking the end of packet. bpf_xdp_adjust_head() takes into
account whether we have meta data already prepended and if so, memmove()s
this along with the given offset provided there's enough room.

xdp->data_meta is optional and programs are not required to use it. The
rationale is that when we process the packet in XDP (e.g. as DoS filter),
we can push further meta data along with it for the XDP_PASS case, and
give the guarantee that a clsact ingress BPF program on the same device
can pick this up for further post-processing. Since we work with skb
there, we can also set skb->mark, skb->priority or other skb meta data
out of BPF, thus having this scratch space generic and programmable
allows for more flexibility than defining a direct 1:1 transfer of
potentially new XDP members into skb (it's also more efficient as we
don't need to initialize/handle each of such new members). The facility
also works together with GRO aggregation. The scratch space at the head
of the packet can be multiple of 4 byte up to 32 byte large. Drivers not
yet supporting xdp->data_meta can simply be set up with xdp->data_meta
as xdp->data + 1 as bpf_xdp_adjust_meta() will detect this and bail out,
such that the subsequent match against xdp->data for later access is
guaranteed to fail.

The verifier treats xdp->data_meta/xdp->data the same way as we treat
xdp->data/xdp->data_end pointer comparisons. The requirement for doing
the compare against xdp->data is that it hasn't been modified from it's
original address we got from ctx access. It may have a range marking
already from prior successful xdp->data/xdp->data_end pointer comparisons
though.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c      |   1 +
 drivers/net/ethernet/cavium/thunder/nicvf_main.c   |   1 +
 drivers/net/ethernet/intel/i40e/i40e_txrx.c        |   1 +
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c      |   1 +
 drivers/net/ethernet/mellanox/mlx4/en_rx.c         |   1 +
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c    |   1 +
 .../net/ethernet/netronome/nfp/nfp_net_common.c    |   1 +
 drivers/net/ethernet/qlogic/qede/qede_fp.c         |   1 +
 drivers/net/tun.c                                  |   1 +
 drivers/net/virtio_net.c                           |   2 +
 include/linux/bpf.h                                |   1 +
 include/linux/filter.h                             |  21 +++-
 include/linux/skbuff.h                             |  68 +++++++++++-
 include/uapi/linux/bpf.h                           |  13 ++-
 kernel/bpf/verifier.c                              | 114 ++++++++++++++++-----
 net/bpf/test_run.c                                 |   1 +
 net/core/dev.c                                     |  31 +++++-
 net/core/filter.c                                  |  77 +++++++++++++-
 net/core/skbuff.c                                  |   2 +
 19 files changed, 297 insertions(+), 42 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c
index d8f0c83..06ce63c 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c
@@ -94,6 +94,7 @@ bool bnxt_rx_xdp(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, u16 cons,
 
 	xdp.data_hard_start = *data_ptr - offset;
 	xdp.data = *data_ptr;
+	xdp_set_data_meta_invalid(&xdp);
 	xdp.data_end = *data_ptr + *len;
 	orig_data = xdp.data;
 	mapping = rx_buf->mapping - bp->rx_dma_offset;
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
index 49b80da..d68478a 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
@@ -523,6 +523,7 @@ static inline bool nicvf_xdp_rx(struct nicvf *nic, struct bpf_prog *prog,
 
 	xdp.data_hard_start = page_address(page);
 	xdp.data = (void *)cpu_addr;
+	xdp_set_data_meta_invalid(&xdp);
 	xdp.data_end = xdp.data + len;
 	orig_data = xdp.data;
 
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 1519dfb..f426762 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -2107,6 +2107,7 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget)
 		if (!skb) {
 			xdp.data = page_address(rx_buffer->page) +
 				   rx_buffer->page_offset;
+			xdp_set_data_meta_invalid(&xdp);
 			xdp.data_hard_start = xdp.data -
 					      i40e_rx_offset(rx_ring);
 			xdp.data_end = xdp.data + size;
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index d962368..04bb03b 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -2326,6 +2326,7 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
 		if (!skb) {
 			xdp.data = page_address(rx_buffer->page) +
 				   rx_buffer->page_offset;
+			xdp_set_data_meta_invalid(&xdp);
 			xdp.data_hard_start = xdp.data -
 					      ixgbe_rx_offset(rx_ring);
 			xdp.data_end = xdp.data + size;
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index b97a55c8..8f9cb8a 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -762,6 +762,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
 
 			xdp.data_hard_start = va - frags[0].page_offset;
 			xdp.data = va;
+			xdp_set_data_meta_invalid(&xdp);
 			xdp.data_end = xdp.data + length;
 			orig_data = xdp.data;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index f1dd638..30b3f3f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -794,6 +794,7 @@ static inline int mlx5e_xdp_handle(struct mlx5e_rq *rq,
 		return false;
 
 	xdp.data = va + *rx_headroom;
+	xdp_set_data_meta_invalid(&xdp);
 	xdp.data_end = xdp.data + *len;
 	xdp.data_hard_start = va;
 
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 1c0187f..e3a38be 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -1583,6 +1583,7 @@ static int nfp_net_run_xdp(struct bpf_prog *prog, void *data, void *hard_start,
 
 	xdp.data_hard_start = hard_start;
 	xdp.data = data + *off;
+	xdp_set_data_meta_invalid(&xdp);
 	xdp.data_end = data + *off + *len;
 
 	orig_data = xdp.data;
diff --git a/drivers/net/ethernet/qlogic/qede/qede_fp.c b/drivers/net/ethernet/qlogic/qede/qede_fp.c
index 6fc854b..48ec4c5 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_fp.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_fp.c
@@ -1004,6 +1004,7 @@ static bool qede_rx_xdp(struct qede_dev *edev,
 
 	xdp.data_hard_start = page_address(bd->data);
 	xdp.data = xdp.data_hard_start + *data_offset;
+	xdp_set_data_meta_invalid(&xdp);
 	xdp.data_end = xdp.data + *len;
 
 	/* Queues always have a full reset currently, so for the time
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 3c9985f..1757fd7 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1314,6 +1314,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
 
 		xdp.data_hard_start = buf;
 		xdp.data = buf + pad;
+		xdp_set_data_meta_invalid(&xdp);
 		xdp.data_end = xdp.data + len;
 		orig_data = xdp.data;
 		act = bpf_prog_run_xdp(xdp_prog, &xdp);
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index dd14a45..fc059f1 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -554,6 +554,7 @@ static struct sk_buff *receive_small(struct net_device *dev,
 
 		xdp.data_hard_start = buf + VIRTNET_RX_PAD + vi->hdr_len;
 		xdp.data = xdp.data_hard_start + xdp_headroom;
+		xdp_set_data_meta_invalid(&xdp);
 		xdp.data_end = xdp.data + len;
 		orig_data = xdp.data;
 		act = bpf_prog_run_xdp(xdp_prog, &xdp);
@@ -686,6 +687,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 		data = page_address(xdp_page) + offset;
 		xdp.data_hard_start = data - VIRTIO_XDP_HEADROOM + vi->hdr_len;
 		xdp.data = data + vi->hdr_len;
+		xdp_set_data_meta_invalid(&xdp);
 		xdp.data_end = xdp.data + (len - vi->hdr_len);
 		act = bpf_prog_run_xdp(xdp_prog, &xdp);
 
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 8390859..2b672c5 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -137,6 +137,7 @@ enum bpf_reg_type {
 	PTR_TO_MAP_VALUE,	 /* reg points to map element value */
 	PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */
 	PTR_TO_STACK,		 /* reg == frame_pointer + offset */
+	PTR_TO_PACKET_META,	 /* skb->data - meta_len */
 	PTR_TO_PACKET,		 /* reg points to skb->data */
 	PTR_TO_PACKET_END,	 /* skb->data + headlen */
 };
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 052bab3..911d454 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -487,12 +487,14 @@ struct sk_filter {
 
 struct bpf_skb_data_end {
 	struct qdisc_skb_cb qdisc_cb;
+	void *data_meta;
 	void *data_end;
 };
 
 struct xdp_buff {
 	void *data;
 	void *data_end;
+	void *data_meta;
 	void *data_hard_start;
 };
 
@@ -507,7 +509,8 @@ static inline void bpf_compute_data_pointers(struct sk_buff *skb)
 	struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb;
 
 	BUILD_BUG_ON(sizeof(*cb) > FIELD_SIZEOF(struct sk_buff, cb));
-	cb->data_end = skb->data + skb_headlen(skb);
+	cb->data_meta = skb->data - skb_metadata_len(skb);
+	cb->data_end  = skb->data + skb_headlen(skb);
 }
 
 static inline u8 *bpf_skb_cb(struct sk_buff *skb)
@@ -728,8 +731,22 @@ int xdp_do_redirect(struct net_device *dev,
 		    struct bpf_prog *prog);
 void xdp_do_flush_map(void);
 
+/* Drivers not supporting XDP metadata can use this helper, which
+ * rejects any room expansion for metadata as a result.
+ */
+static __always_inline void
+xdp_set_data_meta_invalid(struct xdp_buff *xdp)
+{
+	xdp->data_meta = xdp->data + 1;
+}
+
+static __always_inline bool
+xdp_data_meta_unsupported(const struct xdp_buff *xdp)
+{
+	return unlikely(xdp->data_meta > xdp->data);
+}
+
 void bpf_warn_invalid_xdp_action(u32 act);
-void bpf_warn_invalid_xdp_redirect(u32 ifindex);
 
 struct sock *do_sk_redirect_map(void);
 
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f9db553..19e64bf 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -489,8 +489,9 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
  * the end of the header data, ie. at skb->end.
  */
 struct skb_shared_info {
-	unsigned short	_unused;
-	unsigned char	nr_frags;
+	__u8		__unused;
+	__u8		meta_len;
+	__u8		nr_frags;
 	__u8		tx_flags;
 	unsigned short	gso_size;
 	/* Warning: this field is not always filled in (UFO)! */
@@ -3400,6 +3401,69 @@ static inline ktime_t net_invalid_timestamp(void)
 	return 0;
 }
 
+static inline u8 skb_metadata_len(const struct sk_buff *skb)
+{
+	return skb_shinfo(skb)->meta_len;
+}
+
+static inline void *skb_metadata_end(const struct sk_buff *skb)
+{
+	return skb_mac_header(skb);
+}
+
+static inline bool __skb_metadata_differs(const struct sk_buff *skb_a,
+					  const struct sk_buff *skb_b,
+					  u8 meta_len)
+{
+	const void *a = skb_metadata_end(skb_a);
+	const void *b = skb_metadata_end(skb_b);
+	/* Using more efficient varaiant than plain call to memcmp(). */
+#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
+	u64 diffs = 0;
+
+	switch (meta_len) {
+#define __it(x, op) (x -= sizeof(u##op))
+#define __it_diff(a, b, op) (*(u##op *)__it(a, op)) ^ (*(u##op *)__it(b, op))
+	case 32: diffs |= __it_diff(a, b, 64);
+	case 24: diffs |= __it_diff(a, b, 64);
+	case 16: diffs |= __it_diff(a, b, 64);
+	case  8: diffs |= __it_diff(a, b, 64);
+		break;
+	case 28: diffs |= __it_diff(a, b, 64);
+	case 20: diffs |= __it_diff(a, b, 64);
+	case 12: diffs |= __it_diff(a, b, 64);
+	case  4: diffs |= __it_diff(a, b, 32);
+		break;
+	}
+	return diffs;
+#else
+	return memcmp(a - meta_len, b - meta_len, meta_len);
+#endif
+}
+
+static inline bool skb_metadata_differs(const struct sk_buff *skb_a,
+					const struct sk_buff *skb_b)
+{
+	u8 len_a = skb_metadata_len(skb_a);
+	u8 len_b = skb_metadata_len(skb_b);
+
+	if (!(len_a | len_b))
+		return false;
+
+	return len_a != len_b ?
+	       true : __skb_metadata_differs(skb_a, skb_b, len_a);
+}
+
+static inline void skb_metadata_set(struct sk_buff *skb, u8 meta_len)
+{
+	skb_shinfo(skb)->meta_len = meta_len;
+}
+
+static inline void skb_metadata_clear(struct sk_buff *skb)
+{
+	skb_metadata_set(skb, 0);
+}
+
 struct sk_buff *skb_clone_sk(struct sk_buff *skb);
 
 #ifdef CONFIG_NETWORK_PHY_TIMESTAMPING
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 43ab5c4..e43491a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -582,6 +582,12 @@ enum bpf_attach_type {
  *	@map: pointer to sockmap to update
  *	@key: key to insert/update sock in map
  *	@flags: same flags as map update elem
+ *
+ * int bpf_xdp_adjust_meta(xdp_md, delta)
+ *     Adjust the xdp_md.data_meta by delta
+ *     @xdp_md: pointer to xdp_md
+ *     @delta: An positive/negative integer to be added to xdp_md.data_meta
+ *     Return: 0 on success or negative on error
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -638,6 +644,7 @@ enum bpf_attach_type {
 	FN(redirect_map),		\
 	FN(sk_redirect_map),		\
 	FN(sock_map_update),		\
+	FN(xdp_adjust_meta),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -715,7 +722,7 @@ struct __sk_buff {
 	__u32 data_end;
 	__u32 napi_id;
 
-	/* accessed by BPF_PROG_TYPE_sk_skb types */
+	/* Accessed by BPF_PROG_TYPE_sk_skb types from here to ... */
 	__u32 family;
 	__u32 remote_ip4;	/* Stored in network byte order */
 	__u32 local_ip4;	/* Stored in network byte order */
@@ -723,6 +730,9 @@ struct __sk_buff {
 	__u32 local_ip6[4];	/* Stored in network byte order */
 	__u32 remote_port;	/* Stored in network byte order */
 	__u32 local_port;	/* stored in host byte order */
+	/* ... here. */
+
+	__u32 data_meta;
 };
 
 struct bpf_tunnel_key {
@@ -783,6 +793,7 @@ enum xdp_action {
 struct xdp_md {
 	__u32 data;
 	__u32 data_end;
+	__u32 data_meta;
 };
 
 enum sk_action {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b914fbe..f849eca 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -177,6 +177,12 @@ static __printf(1, 2) void verbose(const char *fmt, ...)
 	va_end(args);
 }
 
+static bool type_is_pkt_pointer(enum bpf_reg_type type)
+{
+	return type == PTR_TO_PACKET ||
+	       type == PTR_TO_PACKET_META;
+}
+
 /* string representation of 'enum bpf_reg_type' */
 static const char * const reg_type_str[] = {
 	[NOT_INIT]		= "?",
@@ -187,6 +193,7 @@ static __printf(1, 2) void verbose(const char *fmt, ...)
 	[PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null",
 	[PTR_TO_STACK]		= "fp",
 	[PTR_TO_PACKET]		= "pkt",
+	[PTR_TO_PACKET_META]	= "pkt_meta",
 	[PTR_TO_PACKET_END]	= "pkt_end",
 };
 
@@ -226,7 +233,7 @@ static void print_verifier_state(struct bpf_verifier_state *state)
 			verbose("(id=%d", reg->id);
 			if (t != SCALAR_VALUE)
 				verbose(",off=%d", reg->off);
-			if (t == PTR_TO_PACKET)
+			if (type_is_pkt_pointer(t))
 				verbose(",r=%d", reg->range);
 			else if (t == CONST_PTR_TO_MAP ||
 				 t == PTR_TO_MAP_VALUE ||
@@ -519,6 +526,31 @@ static void mark_reg_known_zero(struct bpf_reg_state *regs, u32 regno)
 	__mark_reg_known_zero(regs + regno);
 }
 
+static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg)
+{
+	return type_is_pkt_pointer(reg->type);
+}
+
+static bool reg_is_pkt_pointer_any(const struct bpf_reg_state *reg)
+{
+	return reg_is_pkt_pointer(reg) ||
+	       reg->type == PTR_TO_PACKET_END;
+}
+
+/* Unmodified PTR_TO_PACKET[_META,_END] register from ctx access. */
+static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg,
+				    enum bpf_reg_type which)
+{
+	/* The register can already have a range from prior markings.
+	 * This is fine as long as it hasn't been advanced from its
+	 * origin.
+	 */
+	return reg->type == which &&
+	       reg->id == 0 &&
+	       reg->off == 0 &&
+	       tnum_equals_const(reg->var_off, 0);
+}
+
 /* Attempts to improve min/max values based on var_off information */
 static void __update_reg_bounds(struct bpf_reg_state *reg)
 {
@@ -702,6 +734,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
 	case PTR_TO_STACK:
 	case PTR_TO_CTX:
 	case PTR_TO_PACKET:
+	case PTR_TO_PACKET_META:
 	case PTR_TO_PACKET_END:
 	case CONST_PTR_TO_MAP:
 		return true;
@@ -1047,7 +1080,10 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
 
 	switch (reg->type) {
 	case PTR_TO_PACKET:
-		/* special case, because of NET_IP_ALIGN */
+	case PTR_TO_PACKET_META:
+		/* Special case, because of NET_IP_ALIGN. Given metadata sits
+		 * right in front, treat it the very same way.
+		 */
 		return check_pkt_ptr_alignment(reg, off, size, strict);
 	case PTR_TO_MAP_VALUE:
 		pointer_desc = "value ";
@@ -1124,8 +1160,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 		err = check_ctx_access(env, insn_idx, off, size, t, &reg_type);
 		if (!err && t == BPF_READ && value_regno >= 0) {
 			/* ctx access returns either a scalar, or a
-			 * PTR_TO_PACKET[_END].  In the latter case, we know
-			 * the offset is zero.
+			 * PTR_TO_PACKET[_META,_END]. In the latter
+			 * case, we know the offset is zero.
 			 */
 			if (reg_type == SCALAR_VALUE)
 				mark_reg_unknown(state->regs, value_regno);
@@ -1170,7 +1206,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 		} else {
 			err = check_stack_read(state, off, size, value_regno);
 		}
-	} else if (reg->type == PTR_TO_PACKET) {
+	} else if (reg_is_pkt_pointer(reg)) {
 		if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) {
 			verbose("cannot write into packet\n");
 			return -EACCES;
@@ -1310,6 +1346,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
 
 	switch (reg->type) {
 	case PTR_TO_PACKET:
+	case PTR_TO_PACKET_META:
 		return check_packet_access(env, regno, reg->off, access_size);
 	case PTR_TO_MAP_VALUE:
 		return check_map_access(env, regno, reg->off, access_size);
@@ -1342,7 +1379,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		return 0;
 	}
 
-	if (type == PTR_TO_PACKET &&
+	if (type_is_pkt_pointer(type) &&
 	    !may_access_direct_pkt_data(env, meta, BPF_READ)) {
 		verbose("helper access to the packet is not allowed\n");
 		return -EACCES;
@@ -1351,7 +1388,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 	if (arg_type == ARG_PTR_TO_MAP_KEY ||
 	    arg_type == ARG_PTR_TO_MAP_VALUE) {
 		expected_type = PTR_TO_STACK;
-		if (type != PTR_TO_PACKET && type != expected_type)
+		if (!type_is_pkt_pointer(type) &&
+		    type != expected_type)
 			goto err_type;
 	} else if (arg_type == ARG_CONST_SIZE ||
 		   arg_type == ARG_CONST_SIZE_OR_ZERO) {
@@ -1375,7 +1413,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		 */
 		if (register_is_null(*reg))
 			/* final test in check_stack_boundary() */;
-		else if (type != PTR_TO_PACKET && type != PTR_TO_MAP_VALUE &&
+		else if (!type_is_pkt_pointer(type) &&
+			 type != PTR_TO_MAP_VALUE &&
 			 type != expected_type)
 			goto err_type;
 		meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM;
@@ -1401,7 +1440,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 			verbose("invalid map_ptr to access map->key\n");
 			return -EACCES;
 		}
-		if (type == PTR_TO_PACKET)
+		if (type_is_pkt_pointer(type))
 			err = check_packet_access(env, regno, reg->off,
 						  meta->map_ptr->key_size);
 		else
@@ -1417,7 +1456,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 			verbose("invalid map_ptr to access map->value\n");
 			return -EACCES;
 		}
-		if (type == PTR_TO_PACKET)
+		if (type_is_pkt_pointer(type))
 			err = check_packet_access(env, regno, reg->off,
 						  meta->map_ptr->value_size);
 		else
@@ -1590,8 +1629,8 @@ static int check_raw_mode(const struct bpf_func_proto *fn)
 	return count > 1 ? -EINVAL : 0;
 }
 
-/* Packet data might have moved, any old PTR_TO_PACKET[_END] are now invalid,
- * so turn them into unknown SCALAR_VALUE.
+/* Packet data might have moved, any old PTR_TO_PACKET[_META,_END]
+ * are now invalid, so turn them into unknown SCALAR_VALUE.
  */
 static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
 {
@@ -1600,18 +1639,15 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
 	int i;
 
 	for (i = 0; i < MAX_BPF_REG; i++)
-		if (regs[i].type == PTR_TO_PACKET ||
-		    regs[i].type == PTR_TO_PACKET_END)
+		if (reg_is_pkt_pointer_any(&regs[i]))
 			mark_reg_unknown(regs, i);
 
 	for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
 		if (state->stack_slot_type[i] != STACK_SPILL)
 			continue;
 		reg = &state->spilled_regs[i / BPF_REG_SIZE];
-		if (reg->type != PTR_TO_PACKET &&
-		    reg->type != PTR_TO_PACKET_END)
-			continue;
-		__mark_reg_unknown(reg);
+		if (reg_is_pkt_pointer_any(reg))
+			__mark_reg_unknown(reg);
 	}
 }
 
@@ -1871,7 +1907,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 		}
 		dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off);
 		dst_reg->off = ptr_reg->off;
-		if (ptr_reg->type == PTR_TO_PACKET) {
+		if (reg_is_pkt_pointer(ptr_reg)) {
 			dst_reg->id = ++env->id_gen;
 			/* something was added to pkt_ptr, set range to zero */
 			dst_reg->range = 0;
@@ -1931,7 +1967,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 		}
 		dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off);
 		dst_reg->off = ptr_reg->off;
-		if (ptr_reg->type == PTR_TO_PACKET) {
+		if (reg_is_pkt_pointer(ptr_reg)) {
 			dst_reg->id = ++env->id_gen;
 			/* something was added to pkt_ptr, set range to zero */
 			if (smin_val < 0)
@@ -2421,7 +2457,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 }
 
 static void find_good_pkt_pointers(struct bpf_verifier_state *state,
-				   struct bpf_reg_state *dst_reg)
+				   struct bpf_reg_state *dst_reg,
+				   enum bpf_reg_type type)
 {
 	struct bpf_reg_state *regs = state->regs, *reg;
 	int i;
@@ -2483,7 +2520,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
 	 * dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16.
 	 */
 	for (i = 0; i < MAX_BPF_REG; i++)
-		if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id)
+		if (regs[i].type == type && regs[i].id == dst_reg->id)
 			/* keep the maximum range already checked */
 			regs[i].range = max_t(u16, regs[i].range, dst_reg->off);
 
@@ -2491,7 +2528,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
 		if (state->stack_slot_type[i] != STACK_SPILL)
 			continue;
 		reg = &state->spilled_regs[i / BPF_REG_SIZE];
-		if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id)
+		if (reg->type == type && reg->id == dst_reg->id)
 			reg->range = max_t(u16, reg->range, dst_reg->off);
 	}
 }
@@ -2856,19 +2893,39 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
 		   dst_reg->type == PTR_TO_PACKET &&
 		   regs[insn->src_reg].type == PTR_TO_PACKET_END) {
-		find_good_pkt_pointers(this_branch, dst_reg);
+		find_good_pkt_pointers(this_branch, dst_reg, PTR_TO_PACKET);
 	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT &&
 		   dst_reg->type == PTR_TO_PACKET &&
 		   regs[insn->src_reg].type == PTR_TO_PACKET_END) {
-		find_good_pkt_pointers(other_branch, dst_reg);
+		find_good_pkt_pointers(other_branch, dst_reg, PTR_TO_PACKET);
 	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE &&
 		   dst_reg->type == PTR_TO_PACKET_END &&
 		   regs[insn->src_reg].type == PTR_TO_PACKET) {
-		find_good_pkt_pointers(other_branch, &regs[insn->src_reg]);
+		find_good_pkt_pointers(other_branch, &regs[insn->src_reg],
+				       PTR_TO_PACKET);
 	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE &&
 		   dst_reg->type == PTR_TO_PACKET_END &&
 		   regs[insn->src_reg].type == PTR_TO_PACKET) {
-		find_good_pkt_pointers(this_branch, &regs[insn->src_reg]);
+		find_good_pkt_pointers(this_branch, &regs[insn->src_reg],
+				       PTR_TO_PACKET);
+	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
+		   dst_reg->type == PTR_TO_PACKET_META &&
+		   reg_is_init_pkt_pointer(&regs[insn->src_reg], PTR_TO_PACKET)) {
+		find_good_pkt_pointers(this_branch, dst_reg, PTR_TO_PACKET_META);
+	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT &&
+		   dst_reg->type == PTR_TO_PACKET_META &&
+		   reg_is_init_pkt_pointer(&regs[insn->src_reg], PTR_TO_PACKET)) {
+		find_good_pkt_pointers(other_branch, dst_reg, PTR_TO_PACKET_META);
+	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE &&
+		   reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
+		   regs[insn->src_reg].type == PTR_TO_PACKET_META) {
+		find_good_pkt_pointers(other_branch, &regs[insn->src_reg],
+				       PTR_TO_PACKET_META);
+	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE &&
+		   reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
+		   regs[insn->src_reg].type == PTR_TO_PACKET_META) {
+		find_good_pkt_pointers(this_branch, &regs[insn->src_reg],
+				       PTR_TO_PACKET_META);
 	} else if (is_pointer_value(env, insn->dst_reg)) {
 		verbose("R%d pointer comparison prohibited\n", insn->dst_reg);
 		return -EACCES;
@@ -3298,8 +3355,9 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
 			return false;
 		/* Check our ids match any regs they're supposed to */
 		return check_ids(rold->id, rcur->id, idmap);
+	case PTR_TO_PACKET_META:
 	case PTR_TO_PACKET:
-		if (rcur->type != PTR_TO_PACKET)
+		if (rcur->type != rold->type)
 			return false;
 		/* We must have at least as much range as the old ptr
 		 * did, so that any accesses which were safe before are
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index df67251..a86e668 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -162,6 +162,7 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
 
 	xdp.data_hard_start = data;
 	xdp.data = data + XDP_PACKET_HEADROOM + NET_IP_ALIGN;
+	xdp.data_meta = xdp.data;
 	xdp.data_end = xdp.data + size;
 
 	retval = bpf_test_run(prog, &xdp, repeat, &duration);
diff --git a/net/core/dev.c b/net/core/dev.c
index 97abddd..e350c76 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3864,8 +3864,8 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
 static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 				     struct bpf_prog *xdp_prog)
 {
+	u32 metalen, act = XDP_DROP;
 	struct xdp_buff xdp;
-	u32 act = XDP_DROP;
 	void *orig_data;
 	int hlen, off;
 	u32 mac_len;
@@ -3876,8 +3876,25 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 	if (skb_cloned(skb))
 		return XDP_PASS;
 
-	if (skb_linearize(skb))
-		goto do_drop;
+	/* XDP packets must be linear and must have sufficient headroom
+	 * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
+	 * native XDP provides, thus we need to do it here as well.
+	 */
+	if (skb_is_nonlinear(skb) ||
+	    skb_headroom(skb) < XDP_PACKET_HEADROOM) {
+		int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
+		int troom = skb->tail + skb->data_len - skb->end;
+
+		/* In case we have to go down the path and also linearize,
+		 * then lets do the pskb_expand_head() work just once here.
+		 */
+		if (pskb_expand_head(skb,
+				     hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
+				     troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
+			goto do_drop;
+		if (troom > 0 && __skb_linearize(skb))
+			goto do_drop;
+	}
 
 	/* The XDP program wants to see the packet starting at the MAC
 	 * header.
@@ -3885,6 +3902,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 	mac_len = skb->data - skb_mac_header(skb);
 	hlen = skb_headlen(skb) + mac_len;
 	xdp.data = skb->data - mac_len;
+	xdp.data_meta = xdp.data;
 	xdp.data_end = xdp.data + hlen;
 	xdp.data_hard_start = skb->data - skb_headroom(skb);
 	orig_data = xdp.data;
@@ -3902,10 +3920,12 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 	case XDP_REDIRECT:
 	case XDP_TX:
 		__skb_push(skb, mac_len);
-		/* fall through */
+		break;
 	case XDP_PASS:
+		metalen = xdp.data - xdp.data_meta;
+		if (metalen)
+			skb_metadata_set(skb, metalen);
 		break;
-
 	default:
 		bpf_warn_invalid_xdp_action(act);
 		/* fall through */
@@ -4695,6 +4715,7 @@ static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
 		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
 		diffs |= p->vlan_tci ^ skb->vlan_tci;
 		diffs |= skb_metadata_dst_cmp(p, skb);
+		diffs |= skb_metadata_differs(p, skb);
 		if (maclen == ETH_HLEN)
 			diffs |= compare_ether_header(skb_mac_header(p),
 						      skb_mac_header(skb));
diff --git a/net/core/filter.c b/net/core/filter.c
index c468e7c..9b6e7e8 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2447,14 +2447,26 @@ static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len)
 	.arg3_type	= ARG_ANYTHING,
 };
 
+static unsigned long xdp_get_metalen(const struct xdp_buff *xdp)
+{
+	return xdp_data_meta_unsupported(xdp) ? 0 :
+	       xdp->data - xdp->data_meta;
+}
+
 BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)
 {
+	unsigned long metalen = xdp_get_metalen(xdp);
+	void *data_start = xdp->data_hard_start + metalen;
 	void *data = xdp->data + offset;
 
-	if (unlikely(data < xdp->data_hard_start ||
+	if (unlikely(data < data_start ||
 		     data > xdp->data_end - ETH_HLEN))
 		return -EINVAL;
 
+	if (metalen)
+		memmove(xdp->data_meta + offset,
+			xdp->data_meta, metalen);
+	xdp->data_meta += offset;
 	xdp->data = data;
 
 	return 0;
@@ -2468,6 +2480,33 @@ static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len)
 	.arg2_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset)
+{
+	void *meta = xdp->data_meta + offset;
+	unsigned long metalen = xdp->data - meta;
+
+	if (xdp_data_meta_unsupported(xdp))
+		return -ENOTSUPP;
+	if (unlikely(meta < xdp->data_hard_start ||
+		     meta > xdp->data))
+		return -EINVAL;
+	if (unlikely((metalen & (sizeof(__u32) - 1)) ||
+		     (metalen > 32)))
+		return -EACCES;
+
+	xdp->data_meta = meta;
+
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = {
+	.func		= bpf_xdp_adjust_meta,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+};
+
 static int __bpf_tx_xdp(struct net_device *dev,
 			struct bpf_map *map,
 			struct xdp_buff *xdp,
@@ -2692,7 +2731,8 @@ bool bpf_helper_changes_pkt_data(void *func)
 	    func == bpf_clone_redirect ||
 	    func == bpf_l3_csum_replace ||
 	    func == bpf_l4_csum_replace ||
-	    func == bpf_xdp_adjust_head)
+	    func == bpf_xdp_adjust_head ||
+	    func == bpf_xdp_adjust_meta)
 		return true;
 
 	return false;
@@ -3288,6 +3328,8 @@ static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,
 		return &bpf_get_smp_processor_id_proto;
 	case BPF_FUNC_xdp_adjust_head:
 		return &bpf_xdp_adjust_head_proto;
+	case BPF_FUNC_xdp_adjust_meta:
+		return &bpf_xdp_adjust_meta_proto;
 	case BPF_FUNC_redirect:
 		return &bpf_xdp_redirect_proto;
 	case BPF_FUNC_redirect_map:
@@ -3418,6 +3460,7 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type
 	case bpf_ctx_range_till(struct __sk_buff, remote_ip4, remote_ip4):
 	case bpf_ctx_range_till(struct __sk_buff, local_ip4, local_ip4):
 	case bpf_ctx_range(struct __sk_buff, data):
+	case bpf_ctx_range(struct __sk_buff, data_meta):
 	case bpf_ctx_range(struct __sk_buff, data_end):
 		if (size != size_default)
 			return false;
@@ -3444,6 +3487,7 @@ static bool sk_filter_is_valid_access(int off, int size,
 	switch (off) {
 	case bpf_ctx_range(struct __sk_buff, tc_classid):
 	case bpf_ctx_range(struct __sk_buff, data):
+	case bpf_ctx_range(struct __sk_buff, data_meta):
 	case bpf_ctx_range(struct __sk_buff, data_end):
 	case bpf_ctx_range_till(struct __sk_buff, family, local_port):
 		return false;
@@ -3468,6 +3512,7 @@ static bool lwt_is_valid_access(int off, int size,
 	switch (off) {
 	case bpf_ctx_range(struct __sk_buff, tc_classid):
 	case bpf_ctx_range_till(struct __sk_buff, family, local_port):
+	case bpf_ctx_range(struct __sk_buff, data_meta):
 		return false;
 	}
 
@@ -3586,6 +3631,9 @@ static bool tc_cls_act_is_valid_access(int off, int size,
 	case bpf_ctx_range(struct __sk_buff, data):
 		info->reg_type = PTR_TO_PACKET;
 		break;
+	case bpf_ctx_range(struct __sk_buff, data_meta):
+		info->reg_type = PTR_TO_PACKET_META;
+		break;
 	case bpf_ctx_range(struct __sk_buff, data_end):
 		info->reg_type = PTR_TO_PACKET_END;
 		break;
@@ -3619,6 +3667,9 @@ static bool xdp_is_valid_access(int off, int size,
 	case offsetof(struct xdp_md, data):
 		info->reg_type = PTR_TO_PACKET;
 		break;
+	case offsetof(struct xdp_md, data_meta):
+		info->reg_type = PTR_TO_PACKET_META;
+		break;
 	case offsetof(struct xdp_md, data_end):
 		info->reg_type = PTR_TO_PACKET_END;
 		break;
@@ -3677,6 +3728,12 @@ static bool sk_skb_is_valid_access(int off, int size,
 				   enum bpf_access_type type,
 				   struct bpf_insn_access_aux *info)
 {
+	switch (off) {
+	case bpf_ctx_range(struct __sk_buff, tc_classid):
+	case bpf_ctx_range(struct __sk_buff, data_meta):
+		return false;
+	}
+
 	if (type == BPF_WRITE) {
 		switch (off) {
 		case bpf_ctx_range(struct __sk_buff, mark):
@@ -3689,8 +3746,6 @@ static bool sk_skb_is_valid_access(int off, int size,
 	}
 
 	switch (off) {
-	case bpf_ctx_range(struct __sk_buff, tc_classid):
-		return false;
 	case bpf_ctx_range(struct __sk_buff, data):
 		info->reg_type = PTR_TO_PACKET;
 		break;
@@ -3847,6 +3902,15 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
 				      offsetof(struct sk_buff, data));
 		break;
 
+	case offsetof(struct __sk_buff, data_meta):
+		off  = si->off;
+		off -= offsetof(struct __sk_buff, data_meta);
+		off += offsetof(struct sk_buff, cb);
+		off += offsetof(struct bpf_skb_data_end, data_meta);
+		*insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
+				      si->src_reg, off);
+		break;
+
 	case offsetof(struct __sk_buff, data_end):
 		off  = si->off;
 		off -= offsetof(struct __sk_buff, data_end);
@@ -4095,6 +4159,11 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
 				      si->dst_reg, si->src_reg,
 				      offsetof(struct xdp_buff, data));
 		break;
+	case offsetof(struct xdp_md, data_meta):
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_meta),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct xdp_buff, data_meta));
+		break;
 	case offsetof(struct xdp_md, data_end):
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end),
 				      si->dst_reg, si->src_reg,
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 16982de..681177b 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1509,6 +1509,8 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 	skb->nohdr    = 0;
 	atomic_set(&skb_shinfo(skb)->dataref, 1);
 
+	skb_metadata_clear(skb);
+
 	/* It is not generally safe to change skb->truesize.
 	 * For the moment, we really care of rx path, or
 	 * when skb is orphaned (not attached to a socket).
-- 
1.9.3

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox