Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next v2 3/4] geneve: Remove redundant socket checks.
From: Pravin B Shelar @ 2016-11-19  2:10 UTC (permalink / raw)
  To: netdev; +Cc: Pravin B Shelar
In-Reply-To: <1479521411-53012-1-git-send-email-pshelar@ovn.org>

Geneve already has check for device socket in route
lookup function. So no need to check it in xmit
function.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
---
 drivers/net/geneve.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index d1759aa..f2912ca 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -785,14 +785,11 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 	struct geneve_sock *gs4 = rcu_dereference(geneve->sock4);
 	const struct ip_tunnel_key *key = &info->key;
 	struct rtable *rt;
-	int err = -EINVAL;
 	struct flowi4 fl4;
 	__u8 tos, ttl;
 	__be16 sport;
 	__be16 df;
-
-	if (!gs4)
-		return err;
+	int err;
 
 	rt = geneve_get_v4_rt(skb, dev, &fl4, info);
 	if (IS_ERR(rt))
@@ -828,13 +825,10 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 	struct geneve_sock *gs6 = rcu_dereference(geneve->sock6);
 	const struct ip_tunnel_key *key = &info->key;
 	struct dst_entry *dst = NULL;
-	int err = -EINVAL;
 	struct flowi6 fl6;
 	__u8 prio, ttl;
 	__be16 sport;
-
-	if (!gs6)
-		return err;
+	int err;
 
 	dst = geneve_get_v6_dst(skb, dev, &fl6, info);
 	if (IS_ERR(dst))
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net-next v2 4/4] geneve: Optimize geneve device lookup.
From: Pravin B Shelar @ 2016-11-19  2:10 UTC (permalink / raw)
  To: netdev; +Cc: Pravin B Shelar
In-Reply-To: <1479521411-53012-1-git-send-email-pshelar@ovn.org>

Rather than comparing 64-bit tunnel-id, compare tunnel vni
which is 24-bit id. This also save conversion from vni
to tunnel id on each tunnel packet receive.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
---
 drivers/net/geneve.c | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index f2912ca..930b1b0 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -103,6 +103,17 @@ static void tunnel_id_to_vni(__be64 tun_id, __u8 *vni)
 #endif
 }
 
+static bool eq_tun_id_and_vni(u8 *tun_id, u8 *vni)
+{
+#ifdef __BIG_ENDIAN
+	return (vni[0] == tun_id[2]) &&
+	       (vni[1] == tun_id[1]) &&
+	       (vni[2] == tun_id[0]);
+#else
+	return !memcmp(vni, &tun_id[5], 3);
+#endif
+}
+
 static sa_family_t geneve_get_sk_family(struct geneve_sock *gs)
 {
 	return gs->sock->sk->sk_family;
@@ -111,7 +122,6 @@ static sa_family_t geneve_get_sk_family(struct geneve_sock *gs)
 static struct geneve_dev *geneve_lookup(struct geneve_sock *gs,
 					__be32 addr, u8 vni[])
 {
-	__be64 id = vni_to_tunnel_id(vni);
 	struct hlist_head *vni_list_head;
 	struct geneve_dev *geneve;
 	__u32 hash;
@@ -120,7 +130,7 @@ static struct geneve_dev *geneve_lookup(struct geneve_sock *gs,
 	hash = geneve_net_vni_hash(vni);
 	vni_list_head = &gs->vni_list[hash];
 	hlist_for_each_entry_rcu(geneve, vni_list_head, hlist) {
-		if (!memcmp(&id, &geneve->info.key.tun_id, sizeof(id)) &&
+		if (eq_tun_id_and_vni((u8 *)&geneve->info.key.tun_id, vni) &&
 		    addr == geneve->info.key.u.ipv4.dst)
 			return geneve;
 	}
@@ -131,7 +141,6 @@ static struct geneve_dev *geneve_lookup(struct geneve_sock *gs,
 static struct geneve_dev *geneve6_lookup(struct geneve_sock *gs,
 					 struct in6_addr addr6, u8 vni[])
 {
-	__be64 id = vni_to_tunnel_id(vni);
 	struct hlist_head *vni_list_head;
 	struct geneve_dev *geneve;
 	__u32 hash;
@@ -140,7 +149,7 @@ static struct geneve_dev *geneve6_lookup(struct geneve_sock *gs,
 	hash = geneve_net_vni_hash(vni);
 	vni_list_head = &gs->vni_list[hash];
 	hlist_for_each_entry_rcu(geneve, vni_list_head, hlist) {
-		if (!memcmp(&id, &geneve->info.key.tun_id, sizeof(id)) &&
+		if (eq_tun_id_and_vni((u8 *)&geneve->info.key.tun_id, vni) &&
 		    ipv6_addr_equal(&addr6, &geneve->info.key.u.ipv6.dst))
 			return geneve;
 	}
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net-next v2 1/4] geneve: Unify LWT and netdev handling.
From: Pravin B Shelar @ 2016-11-19  2:10 UTC (permalink / raw)
  To: netdev; +Cc: Pravin B Shelar
In-Reply-To: <1479521411-53012-1-git-send-email-pshelar@ovn.org>

Current geneve implementation has two separate cases to handle.
1. netdev xmit
2. LWT xmit.

In case of netdev, geneve configuration is stored in various
struct geneve_dev members. For example geneve_addr, ttl, tos,
label, flags, dst_cache, etc. For LWT ip_tunnel_info is passed
to the device in ip_tunnel_info.

Following patch uses ip_tunnel_info struct to store almost all
of configuration of a geneve netdevice. This allows us to unify
most of geneve driver code around ip_tunnel_info struct.
This dramatically simplify geneve code, since it does not
need to handle two different configuration cases. Removes
duplicate code, single code path can handle either type
of geneve devices.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
---
 drivers/net/geneve.c | 612 ++++++++++++++++++++++-----------------------------
 1 file changed, 263 insertions(+), 349 deletions(-)

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 85a423a..b5e65cd 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -45,41 +45,22 @@ struct geneve_net {
 
 static int geneve_net_id;
 
-union geneve_addr {
-	struct sockaddr_in sin;
-	struct sockaddr_in6 sin6;
-	struct sockaddr sa;
-};
-
-static union geneve_addr geneve_remote_unspec = { .sa.sa_family = AF_UNSPEC, };
-
 /* Pseudo network device */
 struct geneve_dev {
 	struct hlist_node  hlist;	/* vni hash table */
 	struct net	   *net;	/* netns for packet i/o */
 	struct net_device  *dev;	/* netdev for geneve tunnel */
+	struct ip_tunnel_info info;
 	struct geneve_sock __rcu *sock4;	/* IPv4 socket used for geneve tunnel */
 #if IS_ENABLED(CONFIG_IPV6)
 	struct geneve_sock __rcu *sock6;	/* IPv6 socket used for geneve tunnel */
 #endif
-	u8                 vni[3];	/* virtual network ID for tunnel */
-	u8                 ttl;		/* TTL override */
-	u8                 tos;		/* TOS override */
-	union geneve_addr  remote;	/* IP address for link partner */
 	struct list_head   next;	/* geneve's per namespace list */
-	__be32		   label;	/* IPv6 flowlabel override */
-	__be16		   dst_port;
-	bool		   collect_md;
 	struct gro_cells   gro_cells;
-	u32		   flags;
-	struct dst_cache   dst_cache;
+	bool		   collect_md;
+	bool		   use_udp6_rx_checksums;
 };
 
-/* Geneve device flags */
-#define GENEVE_F_UDP_ZERO_CSUM_TX	BIT(0)
-#define GENEVE_F_UDP_ZERO_CSUM6_TX	BIT(1)
-#define GENEVE_F_UDP_ZERO_CSUM6_RX	BIT(2)
-
 struct geneve_sock {
 	bool			collect_md;
 	struct list_head	list;
@@ -87,7 +68,6 @@ struct geneve_sock {
 	struct rcu_head		rcu;
 	int			refcnt;
 	struct hlist_head	vni_list[VNI_HASH_SIZE];
-	u32			flags;
 };
 
 static inline __u32 geneve_net_vni_hash(u8 vni[3])
@@ -109,6 +89,20 @@ static __be64 vni_to_tunnel_id(const __u8 *vni)
 #endif
 }
 
+/* Convert 64 bit tunnel ID to 24 bit VNI. */
+static void tunnel_id_to_vni(__be64 tun_id, __u8 *vni)
+{
+#ifdef __BIG_ENDIAN
+	vni[0] = (__force __u8)(tun_id >> 16);
+	vni[1] = (__force __u8)(tun_id >> 8);
+	vni[2] = (__force __u8)tun_id;
+#else
+	vni[0] = (__force __u8)((__force u64)tun_id >> 40);
+	vni[1] = (__force __u8)((__force u64)tun_id >> 48);
+	vni[2] = (__force __u8)((__force u64)tun_id >> 56);
+#endif
+}
+
 static sa_family_t geneve_get_sk_family(struct geneve_sock *gs)
 {
 	return gs->sock->sk->sk_family;
@@ -117,6 +111,7 @@ static sa_family_t geneve_get_sk_family(struct geneve_sock *gs)
 static struct geneve_dev *geneve_lookup(struct geneve_sock *gs,
 					__be32 addr, u8 vni[])
 {
+	__be64 id = vni_to_tunnel_id(vni);
 	struct hlist_head *vni_list_head;
 	struct geneve_dev *geneve;
 	__u32 hash;
@@ -125,8 +120,8 @@ static struct geneve_dev *geneve_lookup(struct geneve_sock *gs,
 	hash = geneve_net_vni_hash(vni);
 	vni_list_head = &gs->vni_list[hash];
 	hlist_for_each_entry_rcu(geneve, vni_list_head, hlist) {
-		if (!memcmp(vni, geneve->vni, sizeof(geneve->vni)) &&
-		    addr == geneve->remote.sin.sin_addr.s_addr)
+		if (!memcmp(&id, &geneve->info.key.tun_id, sizeof(id)) &&
+		    addr == geneve->info.key.u.ipv4.dst)
 			return geneve;
 	}
 	return NULL;
@@ -136,6 +131,7 @@ static struct geneve_dev *geneve_lookup(struct geneve_sock *gs,
 static struct geneve_dev *geneve6_lookup(struct geneve_sock *gs,
 					 struct in6_addr addr6, u8 vni[])
 {
+	__be64 id = vni_to_tunnel_id(vni);
 	struct hlist_head *vni_list_head;
 	struct geneve_dev *geneve;
 	__u32 hash;
@@ -144,8 +140,8 @@ static struct geneve_dev *geneve6_lookup(struct geneve_sock *gs,
 	hash = geneve_net_vni_hash(vni);
 	vni_list_head = &gs->vni_list[hash];
 	hlist_for_each_entry_rcu(geneve, vni_list_head, hlist) {
-		if (!memcmp(vni, geneve->vni, sizeof(geneve->vni)) &&
-		    ipv6_addr_equal(&addr6, &geneve->remote.sin6.sin6_addr))
+		if (!memcmp(&id, &geneve->info.key.tun_id, sizeof(id)) &&
+		    ipv6_addr_equal(&addr6, &geneve->info.key.u.ipv6.dst))
 			return geneve;
 	}
 	return NULL;
@@ -160,15 +156,12 @@ static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb)
 static struct geneve_dev *geneve_lookup_skb(struct geneve_sock *gs,
 					    struct sk_buff *skb)
 {
-	u8 *vni;
-	__be32 addr;
 	static u8 zero_vni[3];
-#if IS_ENABLED(CONFIG_IPV6)
-	static struct in6_addr zero_addr6;
-#endif
+	u8 *vni;
 
 	if (geneve_get_sk_family(gs) == AF_INET) {
 		struct iphdr *iph;
+		__be32 addr;
 
 		iph = ip_hdr(skb); /* outer IP header... */
 
@@ -183,6 +176,7 @@ static struct geneve_dev *geneve_lookup_skb(struct geneve_sock *gs,
 		return geneve_lookup(gs, addr, vni);
 #if IS_ENABLED(CONFIG_IPV6)
 	} else if (geneve_get_sk_family(gs) == AF_INET6) {
+		static struct in6_addr zero_addr6;
 		struct ipv6hdr *ip6h;
 		struct in6_addr addr6;
 
@@ -305,13 +299,12 @@ static int geneve_init(struct net_device *dev)
 		return err;
 	}
 
-	err = dst_cache_init(&geneve->dst_cache, GFP_KERNEL);
+	err = dst_cache_init(&geneve->info.dst_cache, GFP_KERNEL);
 	if (err) {
 		free_percpu(dev->tstats);
 		gro_cells_destroy(&geneve->gro_cells);
 		return err;
 	}
-
 	return 0;
 }
 
@@ -319,7 +312,7 @@ static void geneve_uninit(struct net_device *dev)
 {
 	struct geneve_dev *geneve = netdev_priv(dev);
 
-	dst_cache_destroy(&geneve->dst_cache);
+	dst_cache_destroy(&geneve->info.dst_cache);
 	gro_cells_destroy(&geneve->gro_cells);
 	free_percpu(dev->tstats);
 }
@@ -368,7 +361,7 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 }
 
 static struct socket *geneve_create_sock(struct net *net, bool ipv6,
-					 __be16 port, u32 flags)
+					 __be16 port, bool ipv6_rx_csum)
 {
 	struct socket *sock;
 	struct udp_port_cfg udp_conf;
@@ -379,8 +372,7 @@ static struct socket *geneve_create_sock(struct net *net, bool ipv6,
 	if (ipv6) {
 		udp_conf.family = AF_INET6;
 		udp_conf.ipv6_v6only = 1;
-		udp_conf.use_udp6_rx_checksums =
-		    !(flags & GENEVE_F_UDP_ZERO_CSUM6_RX);
+		udp_conf.use_udp6_rx_checksums = ipv6_rx_csum;
 	} else {
 		udp_conf.family = AF_INET;
 		udp_conf.local_ip.s_addr = htonl(INADDR_ANY);
@@ -491,7 +483,7 @@ static int geneve_gro_complete(struct sock *sk, struct sk_buff *skb,
 
 /* Create new listen socket if needed */
 static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port,
-						bool ipv6, u32 flags)
+						bool ipv6, bool ipv6_rx_csum)
 {
 	struct geneve_net *gn = net_generic(net, geneve_net_id);
 	struct geneve_sock *gs;
@@ -503,7 +495,7 @@ static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port,
 	if (!gs)
 		return ERR_PTR(-ENOMEM);
 
-	sock = geneve_create_sock(net, ipv6, port, flags);
+	sock = geneve_create_sock(net, ipv6, port, ipv6_rx_csum);
 	if (IS_ERR(sock)) {
 		kfree(gs);
 		return ERR_CAST(sock);
@@ -579,21 +571,22 @@ static int geneve_sock_add(struct geneve_dev *geneve, bool ipv6)
 	struct net *net = geneve->net;
 	struct geneve_net *gn = net_generic(net, geneve_net_id);
 	struct geneve_sock *gs;
+	__u8 vni[3];
 	__u32 hash;
 
-	gs = geneve_find_sock(gn, ipv6 ? AF_INET6 : AF_INET, geneve->dst_port);
+	gs = geneve_find_sock(gn, ipv6 ? AF_INET6 : AF_INET, geneve->info.key.tp_dst);
 	if (gs) {
 		gs->refcnt++;
 		goto out;
 	}
 
-	gs = geneve_socket_create(net, geneve->dst_port, ipv6, geneve->flags);
+	gs = geneve_socket_create(net, geneve->info.key.tp_dst, ipv6,
+				  geneve->use_udp6_rx_checksums);
 	if (IS_ERR(gs))
 		return PTR_ERR(gs);
 
 out:
 	gs->collect_md = geneve->collect_md;
-	gs->flags = geneve->flags;
 #if IS_ENABLED(CONFIG_IPV6)
 	if (ipv6)
 		rcu_assign_pointer(geneve->sock6, gs);
@@ -601,7 +594,8 @@ static int geneve_sock_add(struct geneve_dev *geneve, bool ipv6)
 #endif
 		rcu_assign_pointer(geneve->sock4, gs);
 
-	hash = geneve_net_vni_hash(geneve->vni);
+	tunnel_id_to_vni(geneve->info.key.tun_id, vni);
+	hash = geneve_net_vni_hash(vni);
 	hlist_add_head_rcu(&geneve->hlist, &gs->vni_list[hash]);
 	return 0;
 }
@@ -609,7 +603,7 @@ static int geneve_sock_add(struct geneve_dev *geneve, bool ipv6)
 static int geneve_open(struct net_device *dev)
 {
 	struct geneve_dev *geneve = netdev_priv(dev);
-	bool ipv6 = geneve->remote.sa.sa_family == AF_INET6;
+	bool ipv6 = !!(geneve->info.mode & IP_TUNNEL_INFO_IPV6);
 	bool metadata = geneve->collect_md;
 	int ret = 0;
 
@@ -653,12 +647,12 @@ static void geneve_build_header(struct genevehdr *geneveh,
 
 static int geneve_build_skb(struct rtable *rt, struct sk_buff *skb,
 			    __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt,
-			    u32 flags, bool xnet)
+			    bool xnet)
 {
+	bool udp_sum = !!(tun_flags & TUNNEL_CSUM);
 	struct genevehdr *gnvh;
 	int min_headroom;
 	int err;
-	bool udp_sum = !(flags & GENEVE_F_UDP_ZERO_CSUM_TX);
 
 	skb_scrub_packet(skb, xnet);
 
@@ -686,12 +680,12 @@ static int geneve_build_skb(struct rtable *rt, struct sk_buff *skb,
 #if IS_ENABLED(CONFIG_IPV6)
 static int geneve6_build_skb(struct dst_entry *dst, struct sk_buff *skb,
 			     __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt,
-			     u32 flags, bool xnet)
+			     bool xnet)
 {
+	bool udp_sum = !!(tun_flags & TUNNEL_CSUM);
 	struct genevehdr *gnvh;
 	int min_headroom;
 	int err;
-	bool udp_sum = !(flags & GENEVE_F_UDP_ZERO_CSUM6_TX);
 
 	skb_scrub_packet(skb, xnet);
 
@@ -734,32 +728,22 @@ static struct rtable *geneve_get_v4_rt(struct sk_buff *skb,
 	memset(fl4, 0, sizeof(*fl4));
 	fl4->flowi4_mark = skb->mark;
 	fl4->flowi4_proto = IPPROTO_UDP;
+	fl4->daddr = info->key.u.ipv4.dst;
+	fl4->saddr = info->key.u.ipv4.src;
 
-	if (info) {
-		fl4->daddr = info->key.u.ipv4.dst;
-		fl4->saddr = info->key.u.ipv4.src;
-		fl4->flowi4_tos = RT_TOS(info->key.tos);
-		dst_cache = &info->dst_cache;
-	} else {
-		tos = geneve->tos;
-		if (tos == 1) {
-			const struct iphdr *iip = ip_hdr(skb);
-
-			tos = ip_tunnel_get_dsfield(iip, skb);
-			use_cache = false;
-		}
-
-		fl4->flowi4_tos = RT_TOS(tos);
-		fl4->daddr = geneve->remote.sin.sin_addr.s_addr;
-		dst_cache = &geneve->dst_cache;
+	tos = info->key.tos;
+	if (!geneve->collect_md && (tos == 1)) {
+		tos = ip_tunnel_get_dsfield(ip_hdr(skb), skb);
+		use_cache = false;
 	}
+	fl4->flowi4_tos = RT_TOS(tos);
 
+	dst_cache = &info->dst_cache;
 	if (use_cache) {
 		rt = dst_cache_get_ip4(dst_cache, &fl4->saddr);
 		if (rt)
 			return rt;
 	}
-
 	rt = ip_route_output_key(geneve->net, fl4);
 	if (IS_ERR(rt)) {
 		netdev_dbg(dev, "no route to %pI4\n", &fl4->daddr);
@@ -795,34 +779,22 @@ static struct dst_entry *geneve_get_v6_dst(struct sk_buff *skb,
 	memset(fl6, 0, sizeof(*fl6));
 	fl6->flowi6_mark = skb->mark;
 	fl6->flowi6_proto = IPPROTO_UDP;
-
-	if (info) {
-		fl6->daddr = info->key.u.ipv6.dst;
-		fl6->saddr = info->key.u.ipv6.src;
-		fl6->flowlabel = ip6_make_flowinfo(RT_TOS(info->key.tos),
-						   info->key.label);
-		dst_cache = &info->dst_cache;
-	} else {
-		prio = geneve->tos;
-		if (prio == 1) {
-			const struct iphdr *iip = ip_hdr(skb);
-
-			prio = ip_tunnel_get_dsfield(iip, skb);
-			use_cache = false;
-		}
-
-		fl6->flowlabel = ip6_make_flowinfo(RT_TOS(prio),
-						   geneve->label);
-		fl6->daddr = geneve->remote.sin6.sin6_addr;
-		dst_cache = &geneve->dst_cache;
+	fl6->daddr = info->key.u.ipv6.dst;
+	fl6->saddr = info->key.u.ipv6.src;
+	prio = info->key.tos;
+	if (!geneve->collect_md && (prio == 1)) {
+		prio = ip_tunnel_get_dsfield(ip_hdr(skb), skb);
+		use_cache = false;
 	}
 
+	fl6->flowlabel = ip6_make_flowinfo(RT_TOS(prio),
+					   info->key.label);
+	dst_cache = &info->dst_cache;
 	if (use_cache) {
 		dst = dst_cache_get_ip6(dst_cache, &fl6->saddr);
 		if (dst)
 			return dst;
 	}
-
 	if (ipv6_stub->ipv6_dst_lookup(geneve->net, gs6->sock->sk, &dst, fl6)) {
 		netdev_dbg(dev, "no route to %pI6\n", &fl6->daddr);
 		return ERR_PTR(-ENETUNREACH);
@@ -839,195 +811,130 @@ static struct dst_entry *geneve_get_v6_dst(struct sk_buff *skb,
 }
 #endif
 
-/* Convert 64 bit tunnel ID to 24 bit VNI. */
-static void tunnel_id_to_vni(__be64 tun_id, __u8 *vni)
-{
-#ifdef __BIG_ENDIAN
-	vni[0] = (__force __u8)(tun_id >> 16);
-	vni[1] = (__force __u8)(tun_id >> 8);
-	vni[2] = (__force __u8)tun_id;
-#else
-	vni[0] = (__force __u8)((__force u64)tun_id >> 40);
-	vni[1] = (__force __u8)((__force u64)tun_id >> 48);
-	vni[2] = (__force __u8)((__force u64)tun_id >> 56);
-#endif
-}
-
-static netdev_tx_t geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
-				   struct ip_tunnel_info *info)
+static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
+			   struct geneve_dev *geneve, struct ip_tunnel_info *info)
 {
-	struct geneve_dev *geneve = netdev_priv(dev);
-	struct geneve_sock *gs4;
-	struct rtable *rt = NULL;
-	const struct iphdr *iip; /* interior IP header */
+	bool xnet = !net_eq(geneve->net, dev_net(geneve->dev));
+	struct geneve_sock *gs4 = rcu_dereference(geneve->sock4);
+	const struct ip_tunnel_key *key = &info->key;
+	struct rtable *rt;
 	int err = -EINVAL;
 	struct flowi4 fl4;
+	u8 *opts = NULL;
 	__u8 tos, ttl;
 	__be16 sport;
 	__be16 df;
-	bool xnet = !net_eq(geneve->net, dev_net(geneve->dev));
-	u32 flags = geneve->flags;
+	u8 vni[3];
 
-	gs4 = rcu_dereference(geneve->sock4);
 	if (!gs4)
-		goto tx_error;
-
-	if (geneve->collect_md) {
-		if (unlikely(!info || !(info->mode & IP_TUNNEL_INFO_TX))) {
-			netdev_dbg(dev, "no tunnel metadata\n");
-			goto tx_error;
-		}
-		if (info && ip_tunnel_info_af(info) != AF_INET)
-			goto tx_error;
-	}
+		return err;
 
 	rt = geneve_get_v4_rt(skb, dev, &fl4, info);
-	if (IS_ERR(rt)) {
-		err = PTR_ERR(rt);
-		goto tx_error;
-	}
+	if (IS_ERR(rt))
+		return PTR_ERR(rt);
 
 	sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true);
-	skb_reset_mac_header(skb);
-
-	iip = ip_hdr(skb);
-
-	if (info) {
-		const struct ip_tunnel_key *key = &info->key;
-		u8 *opts = NULL;
-		u8 vni[3];
-
-		tunnel_id_to_vni(key->tun_id, vni);
-		if (info->options_len)
-			opts = ip_tunnel_info_opts(info);
-
-		if (key->tun_flags & TUNNEL_CSUM)
-			flags &= ~GENEVE_F_UDP_ZERO_CSUM_TX;
-		else
-			flags |= GENEVE_F_UDP_ZERO_CSUM_TX;
-
-		err = geneve_build_skb(rt, skb, key->tun_flags, vni,
-				       info->options_len, opts, flags, xnet);
-		if (unlikely(err))
-			goto tx_error;
-
-		tos = ip_tunnel_ecn_encap(key->tos, iip, skb);
+	if (geneve->collect_md) {
+		tos = ip_tunnel_ecn_encap(key->tos, ip_hdr(skb), skb);
 		ttl = key->ttl;
-		df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
 	} else {
-		err = geneve_build_skb(rt, skb, 0, geneve->vni,
-				       0, NULL, flags, xnet);
-		if (unlikely(err))
-			goto tx_error;
-
-		tos = ip_tunnel_ecn_encap(fl4.flowi4_tos, iip, skb);
-		ttl = geneve->ttl;
-		if (!ttl && IN_MULTICAST(ntohl(fl4.daddr)))
-			ttl = 1;
-		ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
-		df = 0;
+		tos = ip_tunnel_ecn_encap(fl4.flowi4_tos, ip_hdr(skb), skb);
+		ttl = key->ttl ? : ip4_dst_hoplimit(&rt->dst);
 	}
-	udp_tunnel_xmit_skb(rt, gs4->sock->sk, skb, fl4.saddr, fl4.daddr,
-			    tos, ttl, df, sport, geneve->dst_port,
-			    !net_eq(geneve->net, dev_net(geneve->dev)),
-			    !!(flags & GENEVE_F_UDP_ZERO_CSUM_TX));
+	df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
 
-	return NETDEV_TX_OK;
-
-tx_error:
-	dev_kfree_skb(skb);
+	tunnel_id_to_vni(key->tun_id, vni);
+	if (info->options_len)
+		opts = ip_tunnel_info_opts(info);
 
-	if (err == -ELOOP)
-		dev->stats.collisions++;
-	else if (err == -ENETUNREACH)
-		dev->stats.tx_carrier_errors++;
+	skb_reset_mac_header(skb);
+	err = geneve_build_skb(rt, skb, key->tun_flags, vni,
+			       info->options_len, opts, xnet);
+	if (unlikely(err))
+		return err;
 
-	dev->stats.tx_errors++;
-	return NETDEV_TX_OK;
+	udp_tunnel_xmit_skb(rt, gs4->sock->sk, skb, fl4.saddr, fl4.daddr,
+			    tos, ttl, df, sport, geneve->info.key.tp_dst,
+			    !net_eq(geneve->net, dev_net(geneve->dev)),
+			    !(info->key.tun_flags & TUNNEL_CSUM));
+	return 0;
 }
 
 #if IS_ENABLED(CONFIG_IPV6)
-static netdev_tx_t geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
-				    struct ip_tunnel_info *info)
+static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
+			    struct geneve_dev *geneve, struct ip_tunnel_info *info)
 {
-	struct geneve_dev *geneve = netdev_priv(dev);
+	bool xnet = !net_eq(geneve->net, dev_net(geneve->dev));
+	struct geneve_sock *gs6 = rcu_dereference(geneve->sock6);
+	const struct ip_tunnel_key *key = &info->key;
 	struct dst_entry *dst = NULL;
-	const struct iphdr *iip; /* interior IP header */
-	struct geneve_sock *gs6;
 	int err = -EINVAL;
 	struct flowi6 fl6;
+	u8 *opts = NULL;
 	__u8 prio, ttl;
 	__be16 sport;
-	__be32 label;
-	bool xnet = !net_eq(geneve->net, dev_net(geneve->dev));
-	u32 flags = geneve->flags;
+	u8 vni[3];
 
-	gs6 = rcu_dereference(geneve->sock6);
 	if (!gs6)
-		goto tx_error;
-
-	if (geneve->collect_md) {
-		if (unlikely(!info || !(info->mode & IP_TUNNEL_INFO_TX))) {
-			netdev_dbg(dev, "no tunnel metadata\n");
-			goto tx_error;
-		}
-	}
+		return err;
 
 	dst = geneve_get_v6_dst(skb, dev, &fl6, info);
-	if (IS_ERR(dst)) {
-		err = PTR_ERR(dst);
-		goto tx_error;
-	}
+	if (IS_ERR(dst))
+		return PTR_ERR(dst);
 
 	sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true);
-	skb_reset_mac_header(skb);
-
-	iip = ip_hdr(skb);
+	if (geneve->collect_md) {
+		prio = ip_tunnel_ecn_encap(key->tos, ip_hdr(skb), skb);
+		ttl = key->ttl;
+	} else {
+		prio = ip_tunnel_ecn_encap(ip6_tclass(fl6.flowlabel),
+					   ip_hdr(skb), skb);
+		ttl = key->ttl ? : ip6_dst_hoplimit(dst);
+	}
+	tunnel_id_to_vni(key->tun_id, vni);
+	if (info->options_len)
+		opts = ip_tunnel_info_opts(info);
 
-	if (info) {
-		const struct ip_tunnel_key *key = &info->key;
-		u8 *opts = NULL;
-		u8 vni[3];
+	skb_reset_mac_header(skb);
+	err = geneve6_build_skb(dst, skb, key->tun_flags, vni,
+				info->options_len, opts, xnet);
+	if (unlikely(err))
+		return err;
 
-		tunnel_id_to_vni(key->tun_id, vni);
-		if (info->options_len)
-			opts = ip_tunnel_info_opts(info);
+	udp_tunnel6_xmit_skb(dst, gs6->sock->sk, skb, dev,
+			     &fl6.saddr, &fl6.daddr, prio, ttl,
+			     info->key.label, sport, geneve->info.key.tp_dst,
+			     !(info->key.tun_flags & TUNNEL_CSUM));
+	return 0;
+}
+#endif
 
-		if (key->tun_flags & TUNNEL_CSUM)
-			flags &= ~GENEVE_F_UDP_ZERO_CSUM6_TX;
-		else
-			flags |= GENEVE_F_UDP_ZERO_CSUM6_TX;
+static netdev_tx_t geneve_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct geneve_dev *geneve = netdev_priv(dev);
+	struct ip_tunnel_info *info = NULL;
+	int err;
 
-		err = geneve6_build_skb(dst, skb, key->tun_flags, vni,
-					info->options_len, opts,
-					flags, xnet);
-		if (unlikely(err))
+	if (geneve->collect_md) {
+		info = skb_tunnel_info(skb);
+		if (unlikely(!info || !(info->mode & IP_TUNNEL_INFO_TX))) {
+			err = -EINVAL;
+			netdev_dbg(dev, "no tunnel metadata\n");
 			goto tx_error;
-
-		prio = ip_tunnel_ecn_encap(key->tos, iip, skb);
-		ttl = key->ttl;
-		label = info->key.label;
+		}
 	} else {
-		err = geneve6_build_skb(dst, skb, 0, geneve->vni,
-					0, NULL, flags, xnet);
-		if (unlikely(err))
-			goto tx_error;
-
-		prio = ip_tunnel_ecn_encap(ip6_tclass(fl6.flowlabel),
-					   iip, skb);
-		ttl = geneve->ttl;
-		if (!ttl && ipv6_addr_is_multicast(&fl6.daddr))
-			ttl = 1;
-		ttl = ttl ? : ip6_dst_hoplimit(dst);
-		label = geneve->label;
+		info = &geneve->info;
 	}
 
-	udp_tunnel6_xmit_skb(dst, gs6->sock->sk, skb, dev,
-			     &fl6.saddr, &fl6.daddr, prio, ttl, label,
-			     sport, geneve->dst_port,
-			     !!(flags & GENEVE_F_UDP_ZERO_CSUM6_TX));
-	return NETDEV_TX_OK;
+#if IS_ENABLED(CONFIG_IPV6)
+	if (info->mode & IP_TUNNEL_INFO_IPV6)
+		err = geneve6_xmit_skb(skb, dev, geneve, info);
+	else
+#endif
+		err = geneve_xmit_skb(skb, dev, geneve, info);
 
+	if (likely(!err))
+		return NETDEV_TX_OK;
 tx_error:
 	dev_kfree_skb(skb);
 
@@ -1039,23 +946,6 @@ static netdev_tx_t geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 	dev->stats.tx_errors++;
 	return NETDEV_TX_OK;
 }
-#endif
-
-static netdev_tx_t geneve_xmit(struct sk_buff *skb, struct net_device *dev)
-{
-	struct geneve_dev *geneve = netdev_priv(dev);
-	struct ip_tunnel_info *info = NULL;
-
-	if (geneve->collect_md)
-		info = skb_tunnel_info(skb);
-
-#if IS_ENABLED(CONFIG_IPV6)
-	if ((info && ip_tunnel_info_af(info) == AF_INET6) ||
-	    (!info && geneve->remote.sa.sa_family == AF_INET6))
-		return geneve6_xmit_skb(skb, dev, info);
-#endif
-	return geneve_xmit_skb(skb, dev, info);
-}
 
 static int geneve_change_mtu(struct net_device *dev, int new_mtu)
 {
@@ -1073,14 +963,11 @@ static int geneve_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
 {
 	struct ip_tunnel_info *info = skb_tunnel_info(skb);
 	struct geneve_dev *geneve = netdev_priv(dev);
-	struct rtable *rt;
-	struct flowi4 fl4;
-#if IS_ENABLED(CONFIG_IPV6)
-	struct dst_entry *dst;
-	struct flowi6 fl6;
-#endif
 
 	if (ip_tunnel_info_af(info) == AF_INET) {
+		struct rtable *rt;
+		struct flowi4 fl4;
+
 		rt = geneve_get_v4_rt(skb, dev, &fl4, info);
 		if (IS_ERR(rt))
 			return PTR_ERR(rt);
@@ -1089,6 +976,9 @@ static int geneve_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
 		info->key.u.ipv4.src = fl4.saddr;
 #if IS_ENABLED(CONFIG_IPV6)
 	} else if (ip_tunnel_info_af(info) == AF_INET6) {
+		struct dst_entry *dst;
+		struct flowi6 fl6;
+
 		dst = geneve_get_v6_dst(skb, dev, &fl6, info);
 		if (IS_ERR(dst))
 			return PTR_ERR(dst);
@@ -1102,7 +992,7 @@ static int geneve_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
 
 	info->key.tp_src = udp_flow_src_port(geneve->net, skb,
 					     1, USHRT_MAX, true);
-	info->key.tp_dst = geneve->dst_port;
+	info->key.tp_dst = geneve->info.key.tp_dst;
 	return 0;
 }
 
@@ -1224,78 +1114,69 @@ static int geneve_validate(struct nlattr *tb[], struct nlattr *data[])
 }
 
 static struct geneve_dev *geneve_find_dev(struct geneve_net *gn,
-					  __be16 dst_port,
-					  union geneve_addr *remote,
-					  u8 vni[],
+					  const struct ip_tunnel_info *info,
 					  bool *tun_on_same_port,
 					  bool *tun_collect_md)
 {
-	struct geneve_dev *geneve, *t;
+	struct geneve_dev *geneve, *t = NULL;
 
 	*tun_on_same_port = false;
 	*tun_collect_md = false;
-	t = NULL;
 	list_for_each_entry(geneve, &gn->geneve_list, next) {
-		if (geneve->dst_port == dst_port) {
+		if (info->key.tp_dst == geneve->info.key.tp_dst) {
 			*tun_collect_md = geneve->collect_md;
 			*tun_on_same_port = true;
 		}
-		if (!memcmp(vni, geneve->vni, sizeof(geneve->vni)) &&
-		    !memcmp(remote, &geneve->remote, sizeof(geneve->remote)) &&
-		    dst_port == geneve->dst_port)
+		if (info->key.tun_id == geneve->info.key.tun_id &&
+		    info->key.tp_dst == geneve->info.key.tp_dst &&
+		    !memcmp(&info->key.u, &geneve->info.key.u, sizeof(info->key.u)))
 			t = geneve;
 	}
 	return t;
 }
 
+static bool is_all_zero(const u8 *fp, size_t size)
+{
+	int i;
+
+	for (i = 0; i < size; i++)
+		if (fp[i])
+			return false;
+	return true;
+}
+
+static bool is_tnl_info_zero(const struct ip_tunnel_info *info)
+{
+	if (info->key.tun_id || info->key.tun_flags || info->key.tos ||
+	    info->key.ttl || info->key.label || info->key.tp_src ||
+	    !is_all_zero((const u8 *)&info->key.u, sizeof(info->key.u)))
+		return false;
+	else
+		return true;
+}
+
 static int geneve_configure(struct net *net, struct net_device *dev,
-			    union geneve_addr *remote,
-			    __u32 vni, __u8 ttl, __u8 tos, __be32 label,
-			    __be16 dst_port, bool metadata, u32 flags)
+			    const struct ip_tunnel_info *info,
+			    bool metadata, bool ipv6_rx_csum)
 {
 	struct geneve_net *gn = net_generic(net, geneve_net_id);
 	struct geneve_dev *t, *geneve = netdev_priv(dev);
 	bool tun_collect_md, tun_on_same_port;
 	int err, encap_len;
 
-	if (!remote)
-		return -EINVAL;
-	if (metadata &&
-	    (remote->sa.sa_family != AF_UNSPEC || vni || tos || ttl || label))
+	if (metadata && !is_tnl_info_zero(info))
 		return -EINVAL;
 
 	geneve->net = net;
 	geneve->dev = dev;
 
-	geneve->vni[0] = (vni & 0x00ff0000) >> 16;
-	geneve->vni[1] = (vni & 0x0000ff00) >> 8;
-	geneve->vni[2] =  vni & 0x000000ff;
-
-	if ((remote->sa.sa_family == AF_INET &&
-	     IN_MULTICAST(ntohl(remote->sin.sin_addr.s_addr))) ||
-	    (remote->sa.sa_family == AF_INET6 &&
-	     ipv6_addr_is_multicast(&remote->sin6.sin6_addr)))
-		return -EINVAL;
-	if (label && remote->sa.sa_family != AF_INET6)
-		return -EINVAL;
-
-	geneve->remote = *remote;
-
-	geneve->ttl = ttl;
-	geneve->tos = tos;
-	geneve->label = label;
-	geneve->dst_port = dst_port;
-	geneve->collect_md = metadata;
-	geneve->flags = flags;
-
-	t = geneve_find_dev(gn, dst_port, remote, geneve->vni,
-			    &tun_on_same_port, &tun_collect_md);
+	t = geneve_find_dev(gn, info, &tun_on_same_port, &tun_collect_md);
 	if (t)
 		return -EBUSY;
 
 	/* make enough headroom for basic scenario */
 	encap_len = GENEVE_BASE_HLEN + ETH_HLEN;
-	if (remote->sa.sa_family == AF_INET) {
+	if (ip_tunnel_info_af(info) == AF_INET) {
 		encap_len += sizeof(struct iphdr);
 		dev->max_mtu -= sizeof(struct iphdr);
 	} else {
@@ -1312,7 +1193,10 @@ static int geneve_configure(struct net *net, struct net_device *dev,
 			return -EPERM;
 	}
 
-	dst_cache_reset(&geneve->dst_cache);
+	dst_cache_reset(&geneve->info.dst_cache);
+	geneve->info = *info;
+	geneve->collect_md = metadata;
+	geneve->use_udp6_rx_checksums = ipv6_rx_csum;
 
 	err = register_netdevice(dev);
 	if (err)
@@ -1322,74 +1206,99 @@ static int geneve_configure(struct net *net, struct net_device *dev,
 	return 0;
 }
 
+static void init_tnl_info(struct ip_tunnel_info *info, __u16 dst_port)
+{
+	memset(info, 0, sizeof(*info));
+	info->key.tp_dst = htons(dst_port);
+}
+
 static int geneve_newlink(struct net *net, struct net_device *dev,
 			  struct nlattr *tb[], struct nlattr *data[])
 {
-	__be16 dst_port = htons(GENEVE_UDP_PORT);
-	__u8 ttl = 0, tos = 0;
+	bool use_udp6_rx_checksums = false;
+	struct ip_tunnel_info info;
 	bool metadata = false;
-	union geneve_addr remote = geneve_remote_unspec;
-	__be32 label = 0;
-	__u32 vni = 0;
-	u32 flags = 0;
+
+	init_tnl_info(&info, GENEVE_UDP_PORT);
 
 	if (data[IFLA_GENEVE_REMOTE] && data[IFLA_GENEVE_REMOTE6])
 		return -EINVAL;
 
 	if (data[IFLA_GENEVE_REMOTE]) {
-		remote.sa.sa_family = AF_INET;
-		remote.sin.sin_addr.s_addr =
+		info.key.u.ipv4.dst =
 			nla_get_in_addr(data[IFLA_GENEVE_REMOTE]);
+
+		if (IN_MULTICAST(ntohl(info.key.u.ipv4.dst))) {
+			netdev_dbg(dev, "multicast remote is unsupported\n");
+			return -EINVAL;
+		}
 	}
 
 	if (data[IFLA_GENEVE_REMOTE6]) {
-		if (!IS_ENABLED(CONFIG_IPV6))
-			return -EPFNOSUPPORT;
-
-		remote.sa.sa_family = AF_INET6;
-		remote.sin6.sin6_addr =
+ #if IS_ENABLED(CONFIG_IPV6)
+		info.mode = IP_TUNNEL_INFO_IPV6;
+		info.key.u.ipv6.dst =
 			nla_get_in6_addr(data[IFLA_GENEVE_REMOTE6]);
 
-		if (ipv6_addr_type(&remote.sin6.sin6_addr) &
+		if (ipv6_addr_type(&info.key.u.ipv6.dst) &
 		    IPV6_ADDR_LINKLOCAL) {
 			netdev_dbg(dev, "link-local remote is unsupported\n");
 			return -EINVAL;
 		}
+		if (ipv6_addr_is_multicast(&info.key.u.ipv6.dst)) {
+			netdev_dbg(dev, "multicast remote is unsupported\n");
+			return -EINVAL;
+		}
+		info.key.tun_flags |= TUNNEL_CSUM;
+		use_udp6_rx_checksums = true;
+#else
+		return -EPFNOSUPPORT;
+#endif
 	}
 
-	if (data[IFLA_GENEVE_ID])
+	if (data[IFLA_GENEVE_ID]) {
+		__u32 vni;
+		__u8 tvni[3];
+
 		vni = nla_get_u32(data[IFLA_GENEVE_ID]);
+		tvni[0] = (vni & 0x00ff0000) >> 16;
+		tvni[1] = (vni & 0x0000ff00) >> 8;
+		tvni[2] =  vni & 0x000000ff;
 
+		info.key.tun_id = vni_to_tunnel_id(tvni);
+	}
 	if (data[IFLA_GENEVE_TTL])
-		ttl = nla_get_u8(data[IFLA_GENEVE_TTL]);
+		info.key.ttl = nla_get_u8(data[IFLA_GENEVE_TTL]);
 
 	if (data[IFLA_GENEVE_TOS])
-		tos = nla_get_u8(data[IFLA_GENEVE_TOS]);
+		info.key.tos = nla_get_u8(data[IFLA_GENEVE_TOS]);
 
-	if (data[IFLA_GENEVE_LABEL])
-		label = nla_get_be32(data[IFLA_GENEVE_LABEL]) &
-			IPV6_FLOWLABEL_MASK;
+	if (data[IFLA_GENEVE_LABEL]) {
+		info.key.label = nla_get_be32(data[IFLA_GENEVE_LABEL]) &
+				  IPV6_FLOWLABEL_MASK;
+		if (info.key.label && (!(info.mode & IP_TUNNEL_INFO_IPV6)))
+			return -EINVAL;
+	}
 
 	if (data[IFLA_GENEVE_PORT])
-		dst_port = nla_get_be16(data[IFLA_GENEVE_PORT]);
+		info.key.tp_dst = nla_get_be16(data[IFLA_GENEVE_PORT]);
 
 	if (data[IFLA_GENEVE_COLLECT_METADATA])
 		metadata = true;
 
 	if (data[IFLA_GENEVE_UDP_CSUM] &&
 	    !nla_get_u8(data[IFLA_GENEVE_UDP_CSUM]))
-		flags |= GENEVE_F_UDP_ZERO_CSUM_TX;
+		info.key.tun_flags |= TUNNEL_CSUM;
 
 	if (data[IFLA_GENEVE_UDP_ZERO_CSUM6_TX] &&
 	    nla_get_u8(data[IFLA_GENEVE_UDP_ZERO_CSUM6_TX]))
-		flags |= GENEVE_F_UDP_ZERO_CSUM6_TX;
+		info.key.tun_flags &= ~TUNNEL_CSUM;
 
 	if (data[IFLA_GENEVE_UDP_ZERO_CSUM6_RX] &&
 	    nla_get_u8(data[IFLA_GENEVE_UDP_ZERO_CSUM6_RX]))
-		flags |= GENEVE_F_UDP_ZERO_CSUM6_RX;
+		use_udp6_rx_checksums = false;
 
-	return geneve_configure(net, dev, &remote, vni, ttl, tos, label,
-				dst_port, metadata, flags);
+	return geneve_configure(net, dev, &info, metadata, use_udp6_rx_checksums);
 }
 
 static void geneve_dellink(struct net_device *dev, struct list_head *head)
@@ -1418,45 +1327,52 @@ static size_t geneve_get_size(const struct net_device *dev)
 static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev)
 {
 	struct geneve_dev *geneve = netdev_priv(dev);
+	struct ip_tunnel_info *info = &geneve->info;
+	__u8 tmp_vni[3];
 	__u32 vni;
 
-	vni = (geneve->vni[0] << 16) | (geneve->vni[1] << 8) | geneve->vni[2];
+	tunnel_id_to_vni(info->key.tun_id, tmp_vni);
+	vni = (tmp_vni[0] << 16) | (tmp_vni[1] << 8) | tmp_vni[2];
 	if (nla_put_u32(skb, IFLA_GENEVE_ID, vni))
 		goto nla_put_failure;
 
-	if (geneve->remote.sa.sa_family == AF_INET) {
+	if (ip_tunnel_info_af(info) == AF_INET) {
 		if (nla_put_in_addr(skb, IFLA_GENEVE_REMOTE,
-				    geneve->remote.sin.sin_addr.s_addr))
+				    info->key.u.ipv4.dst))
+			goto nla_put_failure;
+
+		if (nla_put_u8(skb, IFLA_GENEVE_UDP_CSUM,
+			       !!(info->key.tun_flags & TUNNEL_CSUM)))
 			goto nla_put_failure;
+
 #if IS_ENABLED(CONFIG_IPV6)
 	} else {
 		if (nla_put_in6_addr(skb, IFLA_GENEVE_REMOTE6,
-				     &geneve->remote.sin6.sin6_addr))
+				     &info->key.u.ipv6.dst))
+			goto nla_put_failure;
+
+		if (nla_put_u8(skb, IFLA_GENEVE_UDP_ZERO_CSUM6_TX,
+			       !(info->key.tun_flags & TUNNEL_CSUM)))
+			goto nla_put_failure;
+
+		if (nla_put_u8(skb, IFLA_GENEVE_UDP_ZERO_CSUM6_RX,
+			       !geneve->use_udp6_rx_checksums))
 			goto nla_put_failure;
 #endif
 	}
 
-	if (nla_put_u8(skb, IFLA_GENEVE_TTL, geneve->ttl) ||
-	    nla_put_u8(skb, IFLA_GENEVE_TOS, geneve->tos) ||
-	    nla_put_be32(skb, IFLA_GENEVE_LABEL, geneve->label))
+	if (nla_put_u8(skb, IFLA_GENEVE_TTL, info->key.ttl) ||
+	    nla_put_u8(skb, IFLA_GENEVE_TOS, info->key.tos) ||
+	    nla_put_be32(skb, IFLA_GENEVE_LABEL, info->key.label))
 		goto nla_put_failure;
 
-	if (nla_put_be16(skb, IFLA_GENEVE_PORT, geneve->dst_port))
+	if (nla_put_be16(skb, IFLA_GENEVE_PORT, info->key.tp_dst))
 		goto nla_put_failure;
 
 	if (geneve->collect_md) {
 		if (nla_put_flag(skb, IFLA_GENEVE_COLLECT_METADATA))
 			goto nla_put_failure;
 	}
-
-	if (nla_put_u8(skb, IFLA_GENEVE_UDP_CSUM,
-		       !(geneve->flags & GENEVE_F_UDP_ZERO_CSUM_TX)) ||
-	    nla_put_u8(skb, IFLA_GENEVE_UDP_ZERO_CSUM6_TX,
-		       !!(geneve->flags & GENEVE_F_UDP_ZERO_CSUM6_TX)) ||
-	    nla_put_u8(skb, IFLA_GENEVE_UDP_ZERO_CSUM6_RX,
-		       !!(geneve->flags & GENEVE_F_UDP_ZERO_CSUM6_RX)))
-		goto nla_put_failure;
-
 	return 0;
 
 nla_put_failure:
@@ -1480,6 +1396,7 @@ struct net_device *geneve_dev_create_fb(struct net *net, const char *name,
 					u8 name_assign_type, u16 dst_port)
 {
 	struct nlattr *tb[IFLA_MAX + 1];
+	struct ip_tunnel_info info;
 	struct net_device *dev;
 	LIST_HEAD(list_kill);
 	int err;
@@ -1490,9 +1407,8 @@ struct net_device *geneve_dev_create_fb(struct net *net, const char *name,
 	if (IS_ERR(dev))
 		return dev;
 
-	err = geneve_configure(net, dev, &geneve_remote_unspec,
-			       0, 0, 0, 0, htons(dst_port), true,
-			       GENEVE_F_UDP_ZERO_CSUM6_RX);
+	init_tnl_info(&info, dst_port);
+	err = geneve_configure(net, dev, &info, true, true);
 	if (err) {
 		free_netdev(dev);
 		return ERR_PTR(err);
@@ -1510,8 +1426,7 @@ struct net_device *geneve_dev_create_fb(struct net *net, const char *name,
 		goto err;
 
 	return dev;
-
- err:
+err:
 	geneve_dellink(dev, &list_kill);
 	unregister_netdevice_many(&list_kill);
 	return ERR_PTR(err);
@@ -1594,7 +1509,6 @@ static int __init geneve_init_module(void)
 		goto out3;
 
 	return 0;
-
 out3:
 	unregister_netdevice_notifier(&geneve_notifier_block);
 out2:
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net-next v2 2/4] geneve: Merge ipv4 and ipv6 geneve_build_skb()
From: Pravin B Shelar @ 2016-11-19  2:10 UTC (permalink / raw)
  To: netdev; +Cc: Pravin B Shelar
In-Reply-To: <1479521411-53012-1-git-send-email-pshelar@ovn.org>

There are minimal difference in building Geneve header
between ipv4 and ipv6 geneve tunnels. Following patch
refactors code to unify it.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
---
 drivers/net/geneve.c | 100 ++++++++++++++-------------------------------------
 1 file changed, 26 insertions(+), 74 deletions(-)

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index b5e65cd..d1759aa 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -630,67 +630,34 @@ static int geneve_stop(struct net_device *dev)
 }
 
 static void geneve_build_header(struct genevehdr *geneveh,
-				__be16 tun_flags, u8 vni[3],
-				u8 options_len, u8 *options)
+				const struct ip_tunnel_info *info)
 {
 	geneveh->ver = GENEVE_VER;
-	geneveh->opt_len = options_len / 4;
-	geneveh->oam = !!(tun_flags & TUNNEL_OAM);
-	geneveh->critical = !!(tun_flags & TUNNEL_CRIT_OPT);
+	geneveh->opt_len = info->options_len / 4;
+	geneveh->oam = !!(info->key.tun_flags & TUNNEL_OAM);
+	geneveh->critical = !!(info->key.tun_flags & TUNNEL_CRIT_OPT);
 	geneveh->rsvd1 = 0;
-	memcpy(geneveh->vni, vni, 3);
+	tunnel_id_to_vni(info->key.tun_id, geneveh->vni);
 	geneveh->proto_type = htons(ETH_P_TEB);
 	geneveh->rsvd2 = 0;
 
-	memcpy(geneveh->options, options, options_len);
+	ip_tunnel_info_opts_get(geneveh->options, info);
 }
 
-static int geneve_build_skb(struct rtable *rt, struct sk_buff *skb,
-			    __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt,
-			    bool xnet)
-{
-	bool udp_sum = !!(tun_flags & TUNNEL_CSUM);
-	struct genevehdr *gnvh;
-	int min_headroom;
-	int err;
-
-	skb_scrub_packet(skb, xnet);
-
-	min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
-			+ GENEVE_BASE_HLEN + opt_len + sizeof(struct iphdr);
-	err = skb_cow_head(skb, min_headroom);
-	if (unlikely(err))
-		goto free_rt;
-
-	err = udp_tunnel_handle_offloads(skb, udp_sum);
-	if (err)
-		goto free_rt;
-
-	gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) + opt_len);
-	geneve_build_header(gnvh, tun_flags, vni, opt_len, opt);
-
-	skb_set_inner_protocol(skb, htons(ETH_P_TEB));
-	return 0;
-
-free_rt:
-	ip_rt_put(rt);
-	return err;
-}
-
-#if IS_ENABLED(CONFIG_IPV6)
-static int geneve6_build_skb(struct dst_entry *dst, struct sk_buff *skb,
-			     __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt,
-			     bool xnet)
+static int geneve_build_skb(struct dst_entry *dst, struct sk_buff *skb,
+			    const struct ip_tunnel_info *info,
+			    bool xnet, int ip_hdr_len)
 {
-	bool udp_sum = !!(tun_flags & TUNNEL_CSUM);
+	bool udp_sum = !!(info->key.tun_flags & TUNNEL_CSUM);
 	struct genevehdr *gnvh;
 	int min_headroom;
 	int err;
 
+	skb_reset_mac_header(skb);
 	skb_scrub_packet(skb, xnet);
 
-	min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len
-			+ GENEVE_BASE_HLEN + opt_len + sizeof(struct ipv6hdr);
+	min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len +
+		       GENEVE_BASE_HLEN + info->options_len + ip_hdr_len;
 	err = skb_cow_head(skb, min_headroom);
 	if (unlikely(err))
 		goto free_dst;
@@ -699,9 +666,9 @@ static int geneve6_build_skb(struct dst_entry *dst, struct sk_buff *skb,
 	if (err)
 		goto free_dst;
 
-	gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) + opt_len);
-	geneve_build_header(gnvh, tun_flags, vni, opt_len, opt);
-
+	gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) +
+						   info->options_len);
+	geneve_build_header(gnvh, info);
 	skb_set_inner_protocol(skb, htons(ETH_P_TEB));
 	return 0;
 
@@ -709,12 +676,11 @@ static int geneve6_build_skb(struct dst_entry *dst, struct sk_buff *skb,
 	dst_release(dst);
 	return err;
 }
-#endif
 
 static struct rtable *geneve_get_v4_rt(struct sk_buff *skb,
 				       struct net_device *dev,
 				       struct flowi4 *fl4,
-				       struct ip_tunnel_info *info)
+				       const struct ip_tunnel_info *info)
 {
 	bool use_cache = ip_tunnel_dst_cache_usable(skb, info);
 	struct geneve_dev *geneve = netdev_priv(dev);
@@ -738,7 +704,7 @@ static struct rtable *geneve_get_v4_rt(struct sk_buff *skb,
 	}
 	fl4->flowi4_tos = RT_TOS(tos);
 
-	dst_cache = &info->dst_cache;
+	dst_cache = (struct dst_cache *)&info->dst_cache;
 	if (use_cache) {
 		rt = dst_cache_get_ip4(dst_cache, &fl4->saddr);
 		if (rt)
@@ -763,7 +729,7 @@ static struct rtable *geneve_get_v4_rt(struct sk_buff *skb,
 static struct dst_entry *geneve_get_v6_dst(struct sk_buff *skb,
 					   struct net_device *dev,
 					   struct flowi6 *fl6,
-					   struct ip_tunnel_info *info)
+					   const struct ip_tunnel_info *info)
 {
 	bool use_cache = ip_tunnel_dst_cache_usable(skb, info);
 	struct geneve_dev *geneve = netdev_priv(dev);
@@ -789,7 +755,7 @@ static struct dst_entry *geneve_get_v6_dst(struct sk_buff *skb,
 
 	fl6->flowlabel = ip6_make_flowinfo(RT_TOS(prio),
 					   info->key.label);
-	dst_cache = &info->dst_cache;
+	dst_cache = (struct dst_cache *)&info->dst_cache;
 	if (use_cache) {
 		dst = dst_cache_get_ip6(dst_cache, &fl6->saddr);
 		if (dst)
@@ -812,7 +778,8 @@ static struct dst_entry *geneve_get_v6_dst(struct sk_buff *skb,
 #endif
 
 static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
-			   struct geneve_dev *geneve, struct ip_tunnel_info *info)
+			   struct geneve_dev *geneve,
+			   const struct ip_tunnel_info *info)
 {
 	bool xnet = !net_eq(geneve->net, dev_net(geneve->dev));
 	struct geneve_sock *gs4 = rcu_dereference(geneve->sock4);
@@ -820,11 +787,9 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 	struct rtable *rt;
 	int err = -EINVAL;
 	struct flowi4 fl4;
-	u8 *opts = NULL;
 	__u8 tos, ttl;
 	__be16 sport;
 	__be16 df;
-	u8 vni[3];
 
 	if (!gs4)
 		return err;
@@ -843,13 +808,7 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 	}
 	df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
 
-	tunnel_id_to_vni(key->tun_id, vni);
-	if (info->options_len)
-		opts = ip_tunnel_info_opts(info);
-
-	skb_reset_mac_header(skb);
-	err = geneve_build_skb(rt, skb, key->tun_flags, vni,
-			       info->options_len, opts, xnet);
+	err = geneve_build_skb(&rt->dst, skb, info, xnet, sizeof(struct iphdr));
 	if (unlikely(err))
 		return err;
 
@@ -862,7 +821,8 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 
 #if IS_ENABLED(CONFIG_IPV6)
 static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
-			    struct geneve_dev *geneve, struct ip_tunnel_info *info)
+			    struct geneve_dev *geneve,
+			    const struct ip_tunnel_info *info)
 {
 	bool xnet = !net_eq(geneve->net, dev_net(geneve->dev));
 	struct geneve_sock *gs6 = rcu_dereference(geneve->sock6);
@@ -870,10 +830,8 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 	struct dst_entry *dst = NULL;
 	int err = -EINVAL;
 	struct flowi6 fl6;
-	u8 *opts = NULL;
 	__u8 prio, ttl;
 	__be16 sport;
-	u8 vni[3];
 
 	if (!gs6)
 		return err;
@@ -891,13 +849,7 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 					   ip_hdr(skb), skb);
 		ttl = key->ttl ? : ip6_dst_hoplimit(dst);
 	}
-	tunnel_id_to_vni(key->tun_id, vni);
-	if (info->options_len)
-		opts = ip_tunnel_info_opts(info);
-
-	skb_reset_mac_header(skb);
-	err = geneve6_build_skb(dst, skb, key->tun_flags, vni,
-				info->options_len, opts, xnet);
+	err = geneve_build_skb(dst, skb, info, xnet, sizeof(struct iphdr));
 	if (unlikely(err))
 		return err;
 
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net-next v2 0/4] geneve: Use LWT more effectively.
From: Pravin B Shelar @ 2016-11-19  2:10 UTC (permalink / raw)
  To: netdev; +Cc: Pravin B Shelar

Following patch series make use of geneve LWT code path for
geneve netdev type of device.
This allows us to simplify geneve module.

v1-v2:
Fix warning reported by kbuild test robot.

Pravin B Shelar (4):
  geneve: Unify LWT and netdev handling.
  geneve: Merge ipv4 and ipv6 geneve_build_skb()
  geneve: Remove redundant socket checks.
  geneve: Optimize geneve device lookup.

 drivers/net/geneve.c | 679 +++++++++++++++++++++------------------------------
 1 file changed, 274 insertions(+), 405 deletions(-)

-- 
1.8.3.1

^ permalink raw reply

* [PATCH net-next] udp: avoid one cache line miss in recvmsg()
From: Eric Dumazet @ 2016-11-19  1:18 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

From: Eric Dumazet <edumazet@google.com>

UDP_SKB_CB(skb)->partial_cov is located at offset 66 in skb,
requesting a cold cache line being read in cpu cache.

We can avoid this cache line miss for UDP sockets,
as partial_cov has a meaning only for UDPLite.

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/ipv4/udp.c |    3 ++-
 net/ipv6/udp.c |    3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index e1fc0116e8d59d8185670c6e55d1219bde55610d..b949770fdc08398a10f3974505a50b2b4f4b2cf3 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1389,7 +1389,8 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
 	 * coverage checksum (UDP-Lite), do it before the copy.
 	 */
 
-	if (copied < ulen || UDP_SKB_CB(skb)->partial_cov || peeking) {
+	if (copied < ulen || peeking ||
+	    (is_udplite && UDP_SKB_CB(skb)->partial_cov)) {
 		checksum_valid = !udp_lib_checksum_complete(skb);
 		if (!checksum_valid)
 			goto csum_copy_err;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 4f99417d9b401f2a65c7828e7d6b86d1d6161794..8fd4d89380b86c8630f7fd27ce4e9958497a2b89 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -363,7 +363,8 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 	 * coverage checksum (UDP-Lite), do it before the copy.
 	 */
 
-	if (copied < ulen || UDP_SKB_CB(skb)->partial_cov || peeking) {
+	if (copied < ulen || peeking ||
+	    (is_udplite && UDP_SKB_CB(skb)->partial_cov)) {
 		checksum_valid = !udp_lib_checksum_complete(skb);
 		if (!checksum_valid)
 			goto csum_copy_err;

^ permalink raw reply related

* Re: [PATCH 3/5] virtio_net: Add XDP support
From: John Fastabend @ 2016-11-19  1:02 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: tgraf, shm, alexei.starovoitov, daniel, davem, john.r.fastabend,
	netdev, bblanco, brouer
In-Reply-To: <1479511422.8455.305.camel@edumazet-glaptop3.roam.corp.google.com>

On 16-11-18 03:23 PM, Eric Dumazet wrote:
> On Fri, 2016-11-18 at 11:00 -0800, John Fastabend wrote:
>> From: Shrijeet Mukherjee <shrijeet@gmail.com>
> 
> 
>>  #include <linux/slab.h>
>> @@ -81,6 +82,8 @@ struct receive_queue {
>>  
>>  	struct napi_struct napi;
>>  
>> +	struct bpf_prog *xdp_prog;
> 
> Please add proper sparse annotation, as in 
> 
> 	struct bpf_prog __rcu *xdp_prog;
> 
> And run sparse ;)
> 
> CONFIG_SPARSE_RCU_POINTER=y
> 
> make C=2 drivers/net/virtio_net.o
> 
> 
> 
> 

Yep will do thanks! And I will fix the other comment as well.

^ permalink raw reply

* [PATCH net-next v3 4/4] bpf: add __must_check attributes to refcount manipulating helpers
From: Daniel Borkmann @ 2016-11-19  0:45 UTC (permalink / raw)
  To: davem
  Cc: alexei.starovoitov, bblanco, zhiyisun, ranas, saeedm, netdev,
	Daniel Borkmann
In-Reply-To: <cover.1479514784.git.daniel@iogearbox.net>

Helpers like bpf_prog_add(), bpf_prog_inc(), bpf_map_inc() can fail
with an error, so make sure the caller properly checks their return
value and not just ignores it, which could worst-case lead to use
after free.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 01c1487..69d0a7f 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -233,14 +233,14 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
 
 struct bpf_prog *bpf_prog_get(u32 ufd);
 struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type);
-struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i);
+struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, int i);
 void bpf_prog_sub(struct bpf_prog *prog, int i);
-struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog);
+struct bpf_prog * __must_check bpf_prog_inc(struct bpf_prog *prog);
 void bpf_prog_put(struct bpf_prog *prog);
 
 struct bpf_map *bpf_map_get_with_uref(u32 ufd);
 struct bpf_map *__bpf_map_get(struct fd f);
-struct bpf_map *bpf_map_inc(struct bpf_map *map, bool uref);
+struct bpf_map * __must_check bpf_map_inc(struct bpf_map *map, bool uref);
 void bpf_map_put_with_uref(struct bpf_map *map);
 void bpf_map_put(struct bpf_map *map);
 int bpf_map_precharge_memlock(u32 pages);
@@ -299,7 +299,8 @@ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd,
 {
 	return ERR_PTR(-EOPNOTSUPP);
 }
-static inline struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i)
+static inline struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog,
+							  int i)
 {
 	return ERR_PTR(-EOPNOTSUPP);
 }
@@ -311,7 +312,8 @@ static inline void bpf_prog_sub(struct bpf_prog *prog, int i)
 static inline void bpf_prog_put(struct bpf_prog *prog)
 {
 }
-static inline struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
+
+static inline struct bpf_prog * __must_check bpf_prog_inc(struct bpf_prog *prog)
 {
 	return ERR_PTR(-EOPNOTSUPP);
 }
-- 
1.9.3

^ permalink raw reply related

* [PATCH net-next v3 2/4] bpf, mlx5: fix various refcount issues in mlx5e_xdp_set
From: Daniel Borkmann @ 2016-11-19  0:45 UTC (permalink / raw)
  To: davem
  Cc: alexei.starovoitov, bblanco, zhiyisun, ranas, saeedm, netdev,
	Daniel Borkmann
In-Reply-To: <cover.1479514784.git.daniel@iogearbox.net>

There are multiple issues in mlx5e_xdp_set():

1) The batched bpf_prog_add() is currently not checked for errors. When
   doing so, it should be done at an earlier point in time to makes sure
   that we cannot fail anymore at the time we want to set the program for
   each channel. The batched refs short-cut can only be performed when we
   don't need to perform a reset for changing the rq type and the device
   was in opened state. In case the device was not in opened state, then
   the next mlx5e_open_locked() will aquire the refs from the control prog
   via mlx5e_create_rq(), same when we need to perform a reset.

2) When swapping the priv->xdp_prog, then no extra reference count must be
   taken since we got that from call path via dev_change_xdp_fd() already.
   Otherwise, we'd never be able to release the program. Also, bpf_prog_add()
   without checking the return code could fail.

Fixes: 86994156c736 ("net/mlx5e: XDP fast RX drop bpf programs support")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 54bae79..491cff9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -3144,11 +3144,21 @@ static int mlx5e_xdp_set(struct net_device *netdev, struct bpf_prog *prog)

 	if (was_opened && reset)
 		mlx5e_close_locked(netdev);
+	if (was_opened && !reset) {
+		/* num_channels is invariant here, so we can take the
+		 * batched reference right upfront.
+		 */
+		prog = bpf_prog_add(prog, priv->params.num_channels);
+		if (IS_ERR(prog)) {
+			err = PTR_ERR(prog);
+			goto unlock;
+		}
+	}

-	/* exchange programs */
+	/* exchange programs, extra prog reference we got from caller
+	 * as long as we don't fail from this point onwards.
+	 */
 	old_prog = xchg(&priv->xdp_prog, prog);
-	if (prog)
-		bpf_prog_add(prog, 1);
 	if (old_prog)
 		bpf_prog_put(old_prog);

@@ -3164,7 +3174,6 @@ static int mlx5e_xdp_set(struct net_device *netdev, struct bpf_prog *prog)
 	/* exchanging programs w/o reset, we update ref counts on behalf
 	 * of the channels RQs here.
 	 */
-	bpf_prog_add(prog, priv->params.num_channels);
 	for (i = 0; i < priv->params.num_channels; i++) {
 		struct mlx5e_channel *c = priv->channel[i];

-- 
1.9.3

^ permalink raw reply related

* [PATCH net-next v3 0/4] Couple of BPF refcount fixes for mlx5
From: Daniel Borkmann @ 2016-11-19  0:44 UTC (permalink / raw)
  To: davem
  Cc: alexei.starovoitov, bblanco, zhiyisun, ranas, saeedm, netdev,
	Daniel Borkmann

Various mlx5 bugs on eBPF refcount handling found during review.
Last patch in series adds a __must_check to BPF helpers to make
sure we won't run into it again w/o compiler complaining first.

v2 -> v3:

 - Just reworked patch 2/4 so we don't need bpf_prog_sub().
 - Rebased, rest as is.

v1 -> v2:

 - After discussion with Alexei, we agreed upon rebasing the
   patches against net-next.
 - Since net-next, I've also added the __must_check to enforce
   future users to check for errors.
 - Fixed up commit message #2.
 - Simplify assignment from patch #1 based on Saeed's feedback
   on previous set.

Thanks a lot!

Daniel Borkmann (4):
  bpf, mlx5: fix mlx5e_create_rq taking reference on prog
  bpf, mlx5: fix various refcount issues in mlx5e_xdp_set
  bpf, mlx5: drop priv->xdp_prog reference on netdev cleanup
  bpf: add __must_check attributes to refcount manipulating helpers

 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 33 +++++++++++++++++------
 include/linux/bpf.h                               | 12 +++++----
 kernel/bpf/syscall.c                              |  1 +
 3 files changed, 33 insertions(+), 13 deletions(-)

-- 
1.9.3

^ permalink raw reply

* [PATCH net-next v3 3/4] bpf, mlx5: drop priv->xdp_prog reference on netdev cleanup
From: Daniel Borkmann @ 2016-11-19  0:45 UTC (permalink / raw)
  To: davem
  Cc: alexei.starovoitov, bblanco, zhiyisun, ranas, saeedm, netdev,
	Daniel Borkmann
In-Reply-To: <cover.1479514784.git.daniel@iogearbox.net>

mlx5e_xdp_set() is currently the only place where we drop reference on the
prog sitting in priv->xdp_prog when it's exchanged by a new one. We also
need to make sure that we eventually release that reference, for example,
in case the netdev is dismantled, otherwise we leak the program.

Fixes: 86994156c736 ("net/mlx5e: XDP fast RX drop bpf programs support")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 491cff9..6957608 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -3705,6 +3705,9 @@ static void mlx5e_nic_cleanup(struct mlx5e_priv *priv)
 
 	if (MLX5_CAP_GEN(mdev, vport_group_manager))
 		mlx5_eswitch_unregister_vport_rep(esw, 0);
+
+	if (priv->xdp_prog)
+		bpf_prog_put(priv->xdp_prog);
 }
 
 static int mlx5e_init_nic_rx(struct mlx5e_priv *priv)
-- 
1.9.3

^ permalink raw reply related

* [PATCH net-next v3 1/4] bpf, mlx5: fix mlx5e_create_rq taking reference on prog
From: Daniel Borkmann @ 2016-11-19  0:45 UTC (permalink / raw)
  To: davem
  Cc: alexei.starovoitov, bblanco, zhiyisun, ranas, saeedm, netdev,
	Daniel Borkmann
In-Reply-To: <cover.1479514784.git.daniel@iogearbox.net>

In mlx5e_create_rq(), when creating a new queue, we call bpf_prog_add() but
without checking the return value. bpf_prog_add() can fail since 92117d8443bc
("bpf: fix refcnt overflow"), so we really must check it. Take the reference
right when we assign it to the rq from priv->xdp_prog, and just drop the
reference on error path. Destruction in mlx5e_destroy_rq() looks good, though.

Fixes: 86994156c736 ("net/mlx5e: XDP fast RX drop bpf programs support")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 13 +++++++++----
 kernel/bpf/syscall.c                              |  1 +
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index bd0732d..54bae79 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -513,7 +513,13 @@ static int mlx5e_create_rq(struct mlx5e_channel *c,
 	rq->channel = c;
 	rq->ix      = c->ix;
 	rq->priv    = c->priv;
-	rq->xdp_prog = priv->xdp_prog;
+
+	rq->xdp_prog = priv->xdp_prog ? bpf_prog_inc(priv->xdp_prog) : NULL;
+	if (IS_ERR(rq->xdp_prog)) {
+		err = PTR_ERR(rq->xdp_prog);
+		rq->xdp_prog = NULL;
+		goto err_rq_wq_destroy;
+	}
 
 	rq->buff.map_dir = DMA_FROM_DEVICE;
 	if (rq->xdp_prog)
@@ -590,12 +596,11 @@ static int mlx5e_create_rq(struct mlx5e_channel *c,
 	rq->page_cache.head = 0;
 	rq->page_cache.tail = 0;
 
-	if (rq->xdp_prog)
-		bpf_prog_add(rq->xdp_prog, 1);
-
 	return 0;
 
 err_rq_wq_destroy:
+	if (rq->xdp_prog)
+		bpf_prog_put(rq->xdp_prog);
 	mlx5_wq_destroy(&rq->wq_ctrl);
 
 	return err;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ce1b7de..eb15498 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -696,6 +696,7 @@ struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
 {
 	return bpf_prog_add(prog, 1);
 }
+EXPORT_SYMBOL_GPL(bpf_prog_inc);
 
 static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type)
 {
-- 
1.9.3

^ permalink raw reply related

* Re: Long delays creating a netns after deleting one (possibly RCU related)
From: Eric Dumazet @ 2016-11-19  0:41 UTC (permalink / raw)
  To: Jarno Rajahalme
  Cc: Eric W. Biederman, Paul E. McKenney, Cong Wang, Rolf Neugebauer,
	LKML, Linux Kernel Network Developers, Justin Cormack,
	Ian Campbell, Eric Dumazet
In-Reply-To: <884D43D4-024E-4485-94E6-1E8DFF972483@gmail.com>

On Fri, 2016-11-18 at 16:38 -0800, Jarno Rajahalme wrote:

> This fixes the problem for me, so for whatever it’s worth:
> 
> Tested-by: Jarno Rajahalme <jarno@ovn.org>
> 

Thanks for testing !

https://git.kernel.org/cgit/linux/kernel/git/davem/net.git/commit/?id=e88a2766143a27bfe6704b4493b214de4094cf29

^ permalink raw reply

* Re: Long delays creating a netns after deleting one (possibly RCU related)
From: Jarno Rajahalme @ 2016-11-19  0:38 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Eric W. Biederman, Paul E. McKenney, Cong Wang, Rolf Neugebauer,
	LKML, Linux Kernel Network Developers, Justin Cormack,
	Ian Campbell, Eric Dumazet
In-Reply-To: <1479164967.8455.87.camel@edumazet-glaptop3.roam.corp.google.com>


> On Nov 14, 2016, at 3:09 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> 
> On Mon, 2016-11-14 at 14:46 -0800, Eric Dumazet wrote:
>> On Mon, 2016-11-14 at 16:12 -0600, Eric W. Biederman wrote:
>> 
>>> synchronize_rcu_expidited is not enough if you have multiple network
>>> devices in play.
>>> 
>>> Looking at the code it comes down to this commit, and it appears there
>>> is a promise add rcu grace period combining by Eric Dumazet.
>>> 
>>> Eric since people are hitting noticable stalls because of the rcu grace
>>> period taking a long time do you think you could look at this code path
>>> a bit more?
>>> 
>>> commit 93d05d4a320cb16712bb3d57a9658f395d8cecb9
>>> Author: Eric Dumazet <edumazet@google.com>
>>> Date:   Wed Nov 18 06:31:03 2015 -0800
>> 
>> Absolutely, I will take a loop asap.
> 
> The worst offender should be fixed by the following patch.
> 
> busy poll needs to poll the physical device, not a virtual one...
> 
> diff --git a/include/net/gro_cells.h b/include/net/gro_cells.h
> index d15214d673b2e8e08fd6437b572278fb1359f10d..2a1abbf8da74368cd01adc40cef6c0644e059ef2 100644
> --- a/include/net/gro_cells.h
> +++ b/include/net/gro_cells.h
> @@ -68,6 +68,9 @@ static inline int gro_cells_init(struct gro_cells *gcells, struct net_device *de
> 		struct gro_cell *cell = per_cpu_ptr(gcells->cells, i);
> 
> 		__skb_queue_head_init(&cell->napi_skbs);
> +
> +		set_bit(NAPI_STATE_NO_BUSY_POLL, &cell->napi.state);
> +
> 		netif_napi_add(dev, &cell->napi, gro_cell_poll, 64);
> 		napi_enable(&cell->napi);
> 	}
> 
> 
> 
> 
> 

This fixes the problem for me, so for whatever it’s worth:

Tested-by: Jarno Rajahalme <jarno@ovn.org>

^ permalink raw reply

* [PATCH net-next v2 5/5] af_packet: Use virtio_net_hdr_from_skb() directly.
From: Jarno Rajahalme @ 2016-11-18 23:40 UTC (permalink / raw)
  To: netdev; +Cc: jarno
In-Reply-To: <1479512442-61601-1-git-send-email-jarno@ovn.org>

Remove static function __packet_rcv_vnet(), which only called
virtio_net_hdr_from_skb() and BUG()ged out if an error code was
returned.  Instead, call virtio_net_hdr_from_skb() from the former
call sites of __packet_rcv_vnet() and actually use the error handling
code that is already there.

Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
---
 net/packet/af_packet.c | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 1816b77..fab9bbf 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -1967,15 +1967,6 @@ static unsigned int run_filter(struct sk_buff *skb,
 	return res;
 }
 
-static int __packet_rcv_vnet(const struct sk_buff *skb,
-			     struct virtio_net_hdr *vnet_hdr)
-{
-	if (virtio_net_hdr_from_skb(skb, vnet_hdr, vio_le()))
-		BUG();
-
-	return 0;
-}
-
 static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
 			   size_t *len)
 {
@@ -1985,7 +1976,7 @@ static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
 		return -EINVAL;
 	*len -= sizeof(vnet_hdr);
 
-	if (__packet_rcv_vnet(skb, &vnet_hdr))
+	if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le()))
 		return -EINVAL;
 
 	return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
@@ -2244,8 +2235,9 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 	spin_unlock(&sk->sk_receive_queue.lock);
 
 	if (po->has_vnet_hdr) {
-		if (__packet_rcv_vnet(skb, h.raw + macoff -
-					   sizeof(struct virtio_net_hdr))) {
+		if (virtio_net_hdr_from_skb(skb, h.raw + macoff -
+					    sizeof(struct virtio_net_hdr),
+					    vio_le())) {
 			spin_lock(&sk->sk_receive_queue.lock);
 			goto drop_n_account;
 		}
-- 
2.1.4

^ permalink raw reply related

* [PATCH net-next v2 4/5] af_packet: Use virtio_net_hdr_to_skb().
From: Jarno Rajahalme @ 2016-11-18 23:40 UTC (permalink / raw)
  To: netdev; +Cc: jarno
In-Reply-To: <1479512442-61601-1-git-send-email-jarno@ovn.org>

Use the common virtio_net_hdr_to_skb() instead of open coding it.
Other call sites were changed by commit fd2a0437dc, but this one was
missed, maybe because it is split in two parts of the source code.

Interim comparisons of 'vnet_hdr->gso_type' still work as both the
vnet_hdr and skb notion of gso_type is zero when there is no gso.

Fixes: fd2a0437dc ("virtio_net: introduce virtio_net_hdr_{from,to}_skb")
Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
---
 net/packet/af_packet.c | 51 +++-----------------------------------------------
 1 file changed, 3 insertions(+), 48 deletions(-)

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index abe6c0b..1816b77 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2388,8 +2388,6 @@ static void tpacket_set_protocol(const struct net_device *dev,
 
 static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
 {
-	unsigned short gso_type = 0;
-
 	if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
 	    (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
 	     __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 >
@@ -2401,29 +2399,6 @@ static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
 	if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len)
 		return -EINVAL;
 
-	if (vnet_hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
-		switch (vnet_hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
-		case VIRTIO_NET_HDR_GSO_TCPV4:
-			gso_type = SKB_GSO_TCPV4;
-			break;
-		case VIRTIO_NET_HDR_GSO_TCPV6:
-			gso_type = SKB_GSO_TCPV6;
-			break;
-		case VIRTIO_NET_HDR_GSO_UDP:
-			gso_type = SKB_GSO_UDP;
-			break;
-		default:
-			return -EINVAL;
-		}
-
-		if (vnet_hdr->gso_type & VIRTIO_NET_HDR_GSO_ECN)
-			gso_type |= SKB_GSO_TCP_ECN;
-
-		if (vnet_hdr->gso_size == 0)
-			return -EINVAL;
-	}
-
-	vnet_hdr->gso_type = gso_type;	/* changes type, temporary storage */
 	return 0;
 }
 
@@ -2443,27 +2418,6 @@ static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
 	return __packet_snd_vnet_parse(vnet_hdr, *len);
 }
 
-static int packet_snd_vnet_gso(struct sk_buff *skb,
-			       struct virtio_net_hdr *vnet_hdr)
-{
-	if (vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
-		u16 s = __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start);
-		u16 o = __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset);
-
-		if (!skb_partial_csum_set(skb, s, o))
-			return -EINVAL;
-	}
-
-	skb_shinfo(skb)->gso_size =
-		__virtio16_to_cpu(vio_le(), vnet_hdr->gso_size);
-	skb_shinfo(skb)->gso_type = vnet_hdr->gso_type;
-
-	/* Header must be checked, and gso_segs computed. */
-	skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
-	skb_shinfo(skb)->gso_segs = 0;
-	return 0;
-}
-
 static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
 		void *frame, struct net_device *dev, void *data, int tp_len,
 		__be16 proto, unsigned char *addr, int hlen, int copylen,
@@ -2723,7 +2677,8 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 			}
 		}
 
-		if (po->has_vnet_hdr && packet_snd_vnet_gso(skb, vnet_hdr)) {
+		if (po->has_vnet_hdr && virtio_net_hdr_to_skb(skb, vnet_hdr,
+							      vio_le())) {
 			tp_len = -EINVAL;
 			goto tpacket_error;
 		}
@@ -2914,7 +2869,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
 	packet_pick_tx_queue(dev, skb);
 
 	if (po->has_vnet_hdr) {
-		err = packet_snd_vnet_gso(skb, &vnet_hdr);
+		err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
 		if (err)
 			goto out_free;
 		len += sizeof(vnet_hdr);
-- 
2.1.4

^ permalink raw reply related

* [PATCH net-next v2 3/5] virtio_net: Do not clear memory for struct virtio_net_hdr twice.
From: Jarno Rajahalme @ 2016-11-18 23:40 UTC (permalink / raw)
  To: netdev; +Cc: jarno
In-Reply-To: <1479512442-61601-1-git-send-email-jarno@ovn.org>

virtio_net_hdr_from_skb() clears the memory for the header, so there
is no point for the callers to do the same.

Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
---
 drivers/net/tun.c          | 3 +--
 include/linux/virtio_net.h | 2 +-
 net/packet/af_packet.c     | 2 --
 3 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 3b8d8cc..64e694c 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1360,8 +1360,7 @@ static ssize_t tun_put_user(struct tun_struct *tun,
 	}
 
 	if (vnet_hdr_sz) {
-		struct virtio_net_hdr gso = { 0 }; /* no info leak */
-		int ret;
+		struct virtio_net_hdr gso;
 
 		if (iov_iter_count(iter) < vnet_hdr_sz)
 			return -EINVAL;
diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
index 74f1e33..6620400 100644
--- a/include/linux/virtio_net.h
+++ b/include/linux/virtio_net.h
@@ -58,7 +58,7 @@ static inline int virtio_net_hdr_from_skb(const struct sk_buff *skb,
 					  struct virtio_net_hdr *hdr,
 					  bool little_endian)
 {
-	memset(hdr, 0, sizeof(*hdr));
+	memset(hdr, 0, sizeof(*hdr));   /* no info leak */
 
 	if (skb_is_gso(skb)) {
 		struct skb_shared_info *sinfo = skb_shinfo(skb);
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index d2238b2..abe6c0b 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -1970,8 +1970,6 @@ static unsigned int run_filter(struct sk_buff *skb,
 static int __packet_rcv_vnet(const struct sk_buff *skb,
 			     struct virtio_net_hdr *vnet_hdr)
 {
-	*vnet_hdr = (const struct virtio_net_hdr) { 0 };
-
 	if (virtio_net_hdr_from_skb(skb, vnet_hdr, vio_le()))
 		BUG();
 
-- 
2.1.4

^ permalink raw reply related

* [PATCH net-next v2 2/5] virtio_net.h: Fix comment.
From: Jarno Rajahalme @ 2016-11-18 23:40 UTC (permalink / raw)
  To: netdev; +Cc: jarno
In-Reply-To: <1479512442-61601-1-git-send-email-jarno@ovn.org>

Fix incorrent comment after the final #endif.

Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
---
 include/linux/virtio_net.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
index 1c912f8..74f1e33 100644
--- a/include/linux/virtio_net.h
+++ b/include/linux/virtio_net.h
@@ -98,4 +98,4 @@ static inline int virtio_net_hdr_from_skb(const struct sk_buff *skb,
 	return 0;
 }
 
-#endif /* _LINUX_VIRTIO_BYTEORDER */
+#endif /* _LINUX_VIRTIO_NET_H */
-- 
2.1.4

^ permalink raw reply related

* [PATCH net-next v2 1/5] virtio_net: Simplify call sites for virtio_net_hdr_{from,to}_skb().
From: Jarno Rajahalme @ 2016-11-18 23:40 UTC (permalink / raw)
  To: netdev; +Cc: jarno

No point storing the return value of virtio_net_hdr_to_skb() or
virtio_net_hdr_from_skb() to a variable when the value is used only
once as a boolean in an immediately following if statement.

Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
---
 drivers/net/macvtap.c | 5 ++---
 drivers/net/tun.c     | 8 +++-----
 2 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index 070e329..5da9861 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -821,9 +821,8 @@ static ssize_t macvtap_put_user(struct macvtap_queue *q,
 		if (iov_iter_count(iter) < vnet_hdr_len)
 			return -EINVAL;
 
-		ret = virtio_net_hdr_from_skb(skb, &vnet_hdr,
-					      macvtap_is_little_endian(q));
-		if (ret)
+		if (virtio_net_hdr_from_skb(skb, &vnet_hdr,
+					    macvtap_is_little_endian(q)))
 			BUG();
 
 		if (copy_to_iter(&vnet_hdr, sizeof(vnet_hdr), iter) !=
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 1588469..3b8d8cc 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1252,8 +1252,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 		return -EFAULT;
 	}
 
-	err = virtio_net_hdr_to_skb(skb, &gso, tun_is_little_endian(tun));
-	if (err) {
+	if (virtio_net_hdr_to_skb(skb, &gso, tun_is_little_endian(tun))) {
 		this_cpu_inc(tun->pcpu_stats->rx_frame_errors);
 		kfree_skb(skb);
 		return -EINVAL;
@@ -1367,9 +1366,8 @@ static ssize_t tun_put_user(struct tun_struct *tun,
 		if (iov_iter_count(iter) < vnet_hdr_sz)
 			return -EINVAL;
 
-		ret = virtio_net_hdr_from_skb(skb, &gso,
-					      tun_is_little_endian(tun));
-		if (ret) {
+		if (virtio_net_hdr_from_skb(skb, &gso,
+					    tun_is_little_endian(tun))) {
 			struct skb_shared_info *sinfo = skb_shinfo(skb);
 			pr_err("unexpected GSO type: "
 			       "0x%x, gso_size %d, hdr_len %d\n",
-- 
2.1.4

^ permalink raw reply related

* Re: [mm PATCH v3 21/23] mm: Add support for releasing multiple instances of a page
From: Andrew Morton @ 2016-11-18 23:27 UTC (permalink / raw)
  To: Alexander Duyck; +Cc: linux-mm, netdev, linux-kernel
In-Reply-To: <20161110113606.76501.70752.stgit@ahduyck-blue-test.jf.intel.com>

On Thu, 10 Nov 2016 06:36:06 -0500 Alexander Duyck <alexander.h.duyck@intel.com> wrote:

> This patch adds a function that allows us to batch free a page that has
> multiple references outstanding.  Specifically this function can be used to
> drop a page being used in the page frag alloc cache.  With this drivers can
> make use of functionality similar to the page frag alloc cache without
> having to do any workarounds for the fact that there is no function that
> frees multiple references.
> 
> ...
>
> --- a/include/linux/gfp.h
> +++ b/include/linux/gfp.h
> @@ -506,6 +506,8 @@ extern void free_hot_cold_page(struct page *page, bool cold);
>  extern void free_hot_cold_page_list(struct list_head *list, bool cold);
>  
>  struct page_frag_cache;
> +extern void __page_frag_drain(struct page *page, unsigned int order,
> +			      unsigned int count);
>  extern void *__alloc_page_frag(struct page_frag_cache *nc,
>  			       unsigned int fragsz, gfp_t gfp_mask);
>  extern void __free_page_frag(void *addr);
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 0fbfead..54fea40 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -3912,6 +3912,20 @@ static struct page *__page_frag_refill(struct page_frag_cache *nc,
>  	return page;
>  }
>  
> +void __page_frag_drain(struct page *page, unsigned int order,
> +		       unsigned int count)
> +{
> +	VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
> +
> +	if (page_ref_sub_and_test(page, count)) {
> +		if (order == 0)
> +			free_hot_cold_page(page, false);
> +		else
> +			__free_pages_ok(page, order);
> +	}
> +}
> +EXPORT_SYMBOL(__page_frag_drain);

It's an exported-to-modules library function.  It should be documented,
please?  The page-frag API is only partially documented, but that's no
excuse.

And perhaps documentation will help explain the naming choice.  Why
"drain"?  I'd have expected "put"?

And why the leading underscores.  The page-frag API is pretty weird :(

And inconsistent.  __alloc_page_frag -> page_frag_alloc,
__free_page_frag -> page_frag_free(), etc.  I must have been asleep
when I let that lot through.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH 3/5] virtio_net: Add XDP support
From: Eric Dumazet @ 2016-11-18 23:23 UTC (permalink / raw)
  To: John Fastabend
  Cc: tgraf, shm, alexei.starovoitov, daniel, davem, john.r.fastabend,
	netdev, bblanco, brouer
In-Reply-To: <20161118190017.16137.13910.stgit@john-Precision-Tower-5810>

On Fri, 2016-11-18 at 11:00 -0800, John Fastabend wrote:
> From: Shrijeet Mukherjee <shrijeet@gmail.com>


>  #include <linux/slab.h>
> @@ -81,6 +82,8 @@ struct receive_queue {
>  
>  	struct napi_struct napi;
>  
> +	struct bpf_prog *xdp_prog;

Please add proper sparse annotation, as in 

	struct bpf_prog __rcu *xdp_prog;

And run sparse ;)

CONFIG_SPARSE_RCU_POINTER=y

make C=2 drivers/net/virtio_net.o

^ permalink raw reply

* Re: [PATCH 3/5] virtio_net: Add XDP support
From: Eric Dumazet @ 2016-11-18 23:21 UTC (permalink / raw)
  To: John Fastabend
  Cc: tgraf, shm, alexei.starovoitov, daniel, davem, john.r.fastabend,
	netdev, bblanco, brouer
In-Reply-To: <20161118190017.16137.13910.stgit@john-Precision-Tower-5810>

On Fri, 2016-11-18 at 11:00 -0800, John Fastabend wrote:


>  static void free_receive_bufs(struct virtnet_info *vi)
>  {
> +	struct bpf_prog *old_prog;
>  	int i;
>  
>  	for (i = 0; i < vi->max_queue_pairs; i++) {
>  		while (vi->rq[i].pages)
>  			__free_pages(get_a_page(&vi->rq[i], GFP_KERNEL), 0);
> +
> +		old_prog = rcu_dereference(vi->rq[i].xdp_prog);

Seems wrong to me.

Are you sure lockdep (with CONFIG_PROVE_RCU=y) was happy with this ?

> +		RCU_INIT_POINTER(vi->rq[i].xdp_prog, NULL);
> +		if (old_prog)
> +			bpf_prog_put(old_prog);
>  	}
>  }
>  
> 

^ permalink raw reply

* [PATCH] net: fix bogus cast in skb_pagelen() and use unsigned variables
From: Alexey Dobriyan @ 2016-11-19  1:08 UTC (permalink / raw)
  To: davem; +Cc: netdev

1) cast to "int" is unnecessary:
   u8 will be promoted to int before decrementing,
   small positive numbers fit into "int", so their values won't be changed
   during promotion.

   Once everything is int including loop counters, signedness doesn't
   matter: 32-bit operations will stay 32-bit operations.

   But! Someone tried to make this loop smart by making everything of
   the same type apparently in an attempt to optimise it.
   Do the optimization, just differently.
   Do the cast where it matters. :^)

2) frag size is unsigned entity and sum of fragments sizes is also
   unsigned.

Make everything unsigned, leave no MOVSX instruction behind.


	add/remove: 0/0 grow/shrink: 0/3 up/down: 0/-4 (-4)
	function                                     old     new   delta
	skb_cow_data                                 835     834      -1
	ip_do_fragment                              2549    2548      -1
	ip6_fragment                                3130    3128      -2
	Total: Before=154865032, After=154865028, chg -0.00%


Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---

 include/linux/skbuff.h |    6 +++---
 net/ipv4/ip_output.c   |    2 +-
 net/ipv6/ip6_output.c  |    2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1799,11 +1799,11 @@ static inline unsigned int skb_headlen(const struct sk_buff *skb)
 	return skb->len - skb->data_len;
 }
 
-static inline int skb_pagelen(const struct sk_buff *skb)
+static inline unsigned int skb_pagelen(const struct sk_buff *skb)
 {
-	int i, len = 0;
+	unsigned int i, len = 0;
 
-	for (i = (int)skb_shinfo(skb)->nr_frags - 1; i >= 0; i--)
+	for (i = skb_shinfo(skb)->nr_frags - 1; (int)i >= 0; i--)
 		len += skb_frag_size(&skb_shinfo(skb)->frags[i]);
 	return len + skb_headlen(skb);
 }
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -581,7 +581,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 	 */
 	if (skb_has_frag_list(skb)) {
 		struct sk_buff *frag, *frag2;
-		int first_len = skb_pagelen(skb);
+		unsigned int first_len = skb_pagelen(skb);
 
 		if (first_len - hlen > mtu ||
 		    ((first_len - hlen) & 7) ||
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -625,7 +625,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 
 	hroom = LL_RESERVED_SPACE(rt->dst.dev);
 	if (skb_has_frag_list(skb)) {
-		int first_len = skb_pagelen(skb);
+		unsigned int first_len = skb_pagelen(skb);
 		struct sk_buff *frag2;
 
 		if (first_len - hlen > mtu ||

^ permalink raw reply

* [PATCH] netlink: smaller nla_attr_minlen table
From: Alexey Dobriyan @ 2016-11-19  0:59 UTC (permalink / raw)
  To: davem; +Cc: netdev

Length of a netlink attribute may be u16 but lengths of basic attributes
are much smaller, so small we can save 16 bytes of .rodata and pocket
change inside .text.

16-bit is worse on x86-64 than 8-bit because of operand size override prefix.

	add/remove: 0/0 grow/shrink: 0/3 up/down: 0/-19 (-19)
	function                                     old     new   delta
	validate_nla                                 418     417      -1
	nla_policy_len                                66      64      -2
	nla_attr_minlen                               32      16     -16
	Total: Before=154865051, After=154865032, chg -0.00%


Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---

 lib/nlattr.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

--- a/lib/nlattr.c
+++ b/lib/nlattr.c
@@ -14,7 +14,7 @@
 #include <linux/types.h>
 #include <net/netlink.h>
 
-static const u16 nla_attr_minlen[NLA_TYPE_MAX+1] = {
+static const u8 nla_attr_minlen[NLA_TYPE_MAX+1] = {
 	[NLA_U8]	= sizeof(u8),
 	[NLA_U16]	= sizeof(u16),
 	[NLA_U32]	= sizeof(u32),

^ permalink raw reply

* Re: [net-next] af_packet: Use virtio_net_hdr_to_skb().
From: Jarno Rajahalme @ 2016-11-18 22:58 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20161118.113527.2164504813195182869.davem@davemloft.net>

Sorry for my transgressions and wasting your time. I’ll send a v2 in a moment.

  Jarno
 
> On Nov 18, 2016, at 8:35 AM, David Miller <davem@davemloft.net> wrote:
> 
> From: Jarno Rajahalme <jarno@ovn.org>
> Date: Wed, 16 Nov 2016 18:06:42 -0800
> 
>> Use the common virtio_net_hdr_to_skb() instead of open coding it.
>> Other call sites were changed by commit fd2a0437dc, but this one was
>> missed, maybe because it is split in two parts of the source code.
>> 
>> Also fix other call sites to be more uniform.
>> 
>> Fixes: fd2a0437dc ("virtio_net: introduce virtio_net_hdr_{from,to}_skb")
>> Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
> 
> This patch is doing many more things that just this.
> 
> Do not mix unrelated changes together:
> 
>> @@ -821,9 +821,8 @@ static ssize_t macvtap_put_user(struct macvtap_queue *q,
>> 		if (iov_iter_count(iter) < vnet_hdr_len)
>> 			return -EINVAL;
>> 
>> -		ret = virtio_net_hdr_from_skb(skb, &vnet_hdr,
>> -					      macvtap_is_little_endian(q));
>> -		if (ret)
>> +		if (virtio_net_hdr_from_skb(skb, &vnet_hdr,
>> +					    macvtap_is_little_endian(q)))
>> 			BUG();
>> 
>> 		if (copy_to_iter(&vnet_hdr, sizeof(vnet_hdr), iter) !=
> 
> This has nothing to do with modifying code to use vrtio_net_hdr_to_skb(), it
> doesn't belong in this patch.
> 
>> @@ -1361,15 +1360,12 @@ static ssize_t tun_put_user(struct tun_struct *tun,
>> 	}
>> 
>> 	if (vnet_hdr_sz) {
>> -		struct virtio_net_hdr gso = { 0 }; /* no info leak */
>> -		int ret;
>> -
>> +		struct virtio_net_hdr gso;
> 
> This is _extremely_ opaque.  The initializer is trying to prevent kernel memory
> info leaks onto the network or into user space.
> 
> Maybe this transformation is valid but:
> 
> 1) YOU DON'T EVEN MENTION IT IN YOUR COMMIT MESSAGE.
> 
> 2) It's unrelated to this specific change, therefore it belongs in
>   a separate change.
> 
> 3) You don't explain that it is a valid transformation, not why.
> 
> It is extremely disappointing to catch unrelated, potentially far
> reaching things embedded in a patch when I review it.
> 
> Please do not ever do this.
> 
>> @@ -98,4 +98,4 @@ static inline int virtio_net_hdr_from_skb(const struct sk_buff *skb,
>> 	return 0;
>> }
>> 
>> -#endif /* _LINUX_VIRTIO_BYTEORDER */
>> +#endif /* _LINUX_VIRTIO_NET_H */
> 
> Another unrelated change.
> 
>> diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
>> index 11db0d6..09abb88 100644
>> --- a/net/packet/af_packet.c
>> +++ b/net/packet/af_packet.c
>> @@ -1971,8 +1971,6 @@ static unsigned int run_filter(struct sk_buff *skb,
>> static int __packet_rcv_vnet(const struct sk_buff *skb,
>> 			     struct virtio_net_hdr *vnet_hdr)
>> {
>> -	*vnet_hdr = (const struct virtio_net_hdr) { 0 };
>> -
> 
> There is no way this belongs in this patch, and again you do not explain
> why removing this initializer is valid.

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox