[PATCH net] openvswitch: Fix egress tunnel info.

netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH net] openvswitch: Fix egress tunnel info.
@ 2015-10-05 17:58 Pravin B Shelar
  2015-10-05 18:40 ` Jiri Benc
  2015-10-06 15:42 ` Jiri Benc
  0 siblings, 2 replies; 19+ messages in thread
From: Pravin B Shelar @ 2015-10-05 17:58 UTC (permalink / raw)
  To: netdev; +Cc: Pravin B Shelar

While transitioning to netdev based vport we broke OVS
feature which allows user to retrieve tunnel packet egress
information for lwtunnel devices.  Following patch fixes it
by introducing ndo operation to get the tunnel egress info.
Same ndo operation can be used for lwtunnel devices and compat
ovs-tnl-vport devices. After adding the device operation
There is no need for similar ovs-vport operation. So the vport
API is removed.

Fixes: 614732eaa12d ("openvswitch: Use regular VXLAN net_device device").

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
---
 drivers/net/geneve.c           |   27 ++++++++++++++++++
 drivers/net/vxlan.c            |   46 +++++++++++++++++++++++++++++++
 include/linux/netdevice.h      |   13 +++++++++
 include/net/ip_tunnels.h       |   11 +++++++
 net/core/dev.c                 |   30 ++++++++++++++++++++
 net/ipv4/ip_gre.c              |   51 +++++++++++++++++++++++++++++------
 net/ipv4/ip_tunnel_core.c      |   37 +++++++++++++++++++++++++
 net/openvswitch/actions.c      |    6 +++-
 net/openvswitch/vport-geneve.c |   13 ---------
 net/openvswitch/vport-gre.c    |    8 -----
 net/openvswitch/vport-vxlan.c  |   19 -------------
 net/openvswitch/vport.c        |   58 ----------------------------------------
 net/openvswitch/vport.h        |   35 ------------------------
 13 files changed, 210 insertions(+), 144 deletions(-)

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 8f5c02e..97d5b33 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -703,6 +703,32 @@ err:
 	return NETDEV_TX_OK;
 }
 
+static int geneve_egress_tun_info(struct net_device *dev, struct sk_buff *skb,
+				  struct ip_tunnel_info *egress_tun_info,
+				  const void **egress_tun_opts)
+{
+	struct geneve_dev *geneve = netdev_priv(dev);
+	struct ip_tunnel_info *info;
+	struct rtable *rt;
+	struct flowi4 fl4;
+	__be16 sport;
+
+	info = skb_tunnel_info(skb);
+	if (ip_tunnel_info_af(info) != AF_INET)
+		return -EINVAL;
+
+	rt = geneve_get_rt(skb, dev, &fl4, info);
+	if (IS_ERR(rt))
+		return PTR_ERR(rt);
+
+	ip_rt_put(rt);
+
+	sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true);
+	ipv4_egress_info_init(egress_tun_info, egress_tun_opts, info,
+			      fl4.saddr, sport, geneve->dst_port);
+	return 0;
+}
+
 static const struct net_device_ops geneve_netdev_ops = {
 	.ndo_init		= geneve_init,
 	.ndo_uninit		= geneve_uninit,
@@ -713,6 +739,7 @@ static const struct net_device_ops geneve_netdev_ops = {
 	.ndo_change_mtu		= eth_change_mtu,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_mac_address	= eth_mac_addr,
+	.ndo_get_egress_info	= geneve_egress_tun_info,
 };
 
 static void geneve_get_drvinfo(struct net_device *dev,
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index bbac1d3..17a2da0 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -2337,6 +2337,51 @@ static int vxlan_change_mtu(struct net_device *dev, int new_mtu)
 	return 0;
 }
 
+static int egress_ipv4_tun_info(struct net_device *dev, struct sk_buff *skb,
+				struct ip_tunnel_info *info,
+				__be16 sport, __be16 dport,
+				struct ip_tunnel_info *egress_tun_info,
+				const void **egress_tun_opts)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct rtable *rt;
+	struct flowi4 fl4;
+
+	memset(&fl4, 0, sizeof(fl4));
+	fl4.flowi4_tos = RT_TOS(info->key.tos);
+	fl4.flowi4_mark = skb->mark;
+	fl4.flowi4_proto = IPPROTO_UDP;
+	fl4.daddr = info->key.u.ipv4.dst;
+
+	rt = ip_route_output_key(vxlan->net, &fl4);
+	if (IS_ERR(rt))
+		return PTR_ERR(rt);
+	ip_rt_put(rt);
+
+	ipv4_egress_info_init(egress_tun_info, egress_tun_opts, info,
+			      fl4.saddr, sport, dport);
+	return 0;
+}
+
+static int vxlan_egress_tun_info(struct net_device *dev, struct sk_buff *skb,
+				 struct ip_tunnel_info *egress_tun_info,
+				 const void **egress_tun_opts)
+{
+	struct ip_tunnel_info *info = skb_tunnel_info(skb);
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	__be16 sport, dport;
+
+	sport = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min,
+				  vxlan->cfg.port_max, true);
+	dport = info->key.tp_dst ? : vxlan->cfg.dst_port;
+
+	if (ip_tunnel_info_af(info) == AF_INET)
+		return egress_ipv4_tun_info(dev, skb, info, sport, dport,
+					    egress_tun_info,
+					    egress_tun_opts);
+	return -EINVAL;
+}
+
 static const struct net_device_ops vxlan_netdev_ops = {
 	.ndo_init		= vxlan_init,
 	.ndo_uninit		= vxlan_uninit,
@@ -2351,6 +2396,7 @@ static const struct net_device_ops vxlan_netdev_ops = {
 	.ndo_fdb_add		= vxlan_fdb_add,
 	.ndo_fdb_del		= vxlan_fdb_delete,
 	.ndo_fdb_dump		= vxlan_fdb_dump,
+	.ndo_get_egress_info	= vxlan_egress_tun_info,
 };
 
 /* Info for udev, that this is a virtual tunnel endpoint */
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2d15e38..e44c076 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -60,6 +60,7 @@ struct wireless_dev;
 /* 802.15.4 specific */
 struct wpan_dev;
 struct mpls_dev;
+struct ip_tunnel_info;
 
 void netdev_set_default_ethtool_ops(struct net_device *dev,
 				    const struct ethtool_ops *ops);
@@ -1054,6 +1055,11 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev,
  *	This function is used to pass protocol port error state information
  *	to the switch driver. The switch driver can react to the proto_down
  *      by doing a phys down on the associated switch port.
+ * int (*ndo_get_egress_info)(struct net_device *dev, struct sk_buff *skb,
+ *			      __be32 *saddr, __be16 *sport, __be16 *dport);
+ *	This function is used to get egress tunnel information for given skb.
+ *	This is useful for retrieving outer tunnel header parameters while
+ *	sampling packet.
  *
  */
 struct net_device_ops {
@@ -1227,6 +1233,10 @@ struct net_device_ops {
 	int			(*ndo_get_iflink)(const struct net_device *dev);
 	int			(*ndo_change_proto_down)(struct net_device *dev,
 							 bool proto_down);
+	int			(*ndo_get_egress_info)(struct net_device *dev,
+						       struct sk_buff *skb,
+						       struct ip_tunnel_info *egress_tun_info,
+						       const void **egress_tun_opts);
 };
 
 /**
@@ -2203,6 +2213,9 @@ void dev_add_offload(struct packet_offload *po);
 void dev_remove_offload(struct packet_offload *po);
 
 int dev_get_iflink(const struct net_device *dev);
+int dev_get_egress_info(struct net_device *dev, struct sk_buff *skb,
+			struct ip_tunnel_info *egress_info,
+			const void **egress_opts);
 struct net_device *__dev_get_by_flags(struct net *net, unsigned short flags,
 				      unsigned short mask);
 struct net_device *dev_get_by_name(struct net *net, const char *name);
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index f6dafec..8e3f336 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -337,6 +337,11 @@ void __init ip_tunnel_core_init(void);
 void ip_tunnel_need_metadata(void);
 void ip_tunnel_unneed_metadata(void);
 
+void ipv4_egress_info_init(struct ip_tunnel_info *egress_tun_info,
+			   const void **egress_tun_opts,
+			   struct ip_tunnel_info *info, __be32 saddr,
+			   __be16 sport, __be16 dport);
+
 #else /* CONFIG_INET */
 
 static inline struct ip_tunnel_info *lwt_tun_info(struct lwtunnel_state *lwtstate)
@@ -351,6 +356,12 @@ static inline void ip_tunnel_need_metadata(void)
 static inline void ip_tunnel_unneed_metadata(void)
 {
 }
+static inline void ipv4_egress_info_init(struct ip_tunnel_info *egress_tun_info,
+					 const void **egress_tun_opts,
+					 struct ip_tunnel_info *info, __be32 saddr,
+					 __be16 sport, __be16 dport)
+{
+}
 
 #endif /* CONFIG_INET */
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 6bb6470..b764ffc 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -99,6 +99,7 @@
 #include <linux/rtnetlink.h>
 #include <linux/stat.h>
 #include <net/dst.h>
+#include <net/dst_metadata.h>
 #include <net/pkt_sched.h>
 #include <net/checksum.h>
 #include <net/xfrm.h>
@@ -682,6 +683,35 @@ int dev_get_iflink(const struct net_device *dev)
 EXPORT_SYMBOL(dev_get_iflink);
 
 /**
+ *	dev_egress_tun_info - Retrieve tunnel egress information.
+ *	@dev: targeted interface
+ *	@skb: The packet.
+ *	@egress_info: Storage for egress info.
+ *	@egress_opts: Storage for egress options.
+ *
+ *	For better visibility of tunnel traffic OVS needs to retrieve
+ *	egress tunnel information for a packet. Following API allows
+ *	user to get this info.
+ */
+int dev_get_egress_info(struct net_device *dev, struct sk_buff *skb,
+			struct ip_tunnel_info *egress_info,
+			const void **egress_opts)
+{
+	struct ip_tunnel_info *info;
+
+	if (!dev->netdev_ops  || !dev->netdev_ops->ndo_get_egress_info)
+		return -EINVAL;
+
+	info = skb_tunnel_info(skb);
+	if (unlikely(info && !(info->mode & IP_TUNNEL_INFO_TX)))
+		return -EINVAL;
+
+	return dev->netdev_ops->ndo_get_egress_info(dev, skb,
+						    egress_info, egress_opts);
+}
+EXPORT_SYMBOL_GPL(dev_get_egress_info);
+
+/**
  *	__dev_get_by_name	- find a device by its name
  *	@net: the applicable net namespace
  *	@name: name to find
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index bd0679d..f08370e 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -498,10 +498,26 @@ static struct sk_buff *gre_handle_offloads(struct sk_buff *skb,
 					csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE);
 }
 
+static struct rtable *gre_get_rt(struct sk_buff *skb,
+				 struct net_device *dev,
+				 struct flowi4 *fl,
+				 const struct ip_tunnel_key *key)
+{
+	struct net *net = dev_net(dev);
+
+	memset(fl, 0, sizeof(*fl));
+	fl->daddr = key->u.ipv4.dst;
+	fl->saddr = key->u.ipv4.src;
+	fl->flowi4_tos = RT_TOS(key->tos);
+	fl->flowi4_mark = skb->mark;
+	fl->flowi4_proto = IPPROTO_GRE;
+
+	return ip_route_output_key(net, fl);
+}
+
 static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct ip_tunnel_info *tun_info;
-	struct net *net = dev_net(dev);
 	const struct ip_tunnel_key *key;
 	struct flowi4 fl;
 	struct rtable *rt;
@@ -516,14 +532,7 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev)
 		goto err_free_skb;
 
 	key = &tun_info->key;
-	memset(&fl, 0, sizeof(fl));
-	fl.daddr = key->u.ipv4.dst;
-	fl.saddr = key->u.ipv4.src;
-	fl.flowi4_tos = RT_TOS(key->tos);
-	fl.flowi4_mark = skb->mark;
-	fl.flowi4_proto = IPPROTO_GRE;
-
-	rt = ip_route_output_key(net, &fl);
+	rt = gre_get_rt(skb, dev, &fl, key);
 	if (IS_ERR(rt))
 		goto err_free_skb;
 
@@ -566,6 +575,29 @@ err_free_skb:
 	dev->stats.tx_dropped++;
 }
 
+static int gre_egress_tun_info(struct net_device *dev, struct sk_buff *skb,
+			       struct ip_tunnel_info *egress_tun_info,
+			       const void **egress_tun_opts)
+{
+	struct ip_tunnel_info *info;
+	struct rtable *rt;
+	struct flowi4 fl4;
+
+	info = skb_tunnel_info(skb);
+	if (ip_tunnel_info_af(info) != AF_INET)
+		return -EINVAL;
+
+	rt = gre_get_rt(skb, dev, &fl4, &info->key);
+	if (IS_ERR(rt))
+		return PTR_ERR(rt);
+
+	ip_rt_put(rt);
+
+	ipv4_egress_info_init(egress_tun_info, egress_tun_opts, info,
+			      fl4.saddr, 0, 0);
+	return 0;
+}
+
 static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
 			      struct net_device *dev)
 {
@@ -1023,6 +1055,7 @@ static const struct net_device_ops gre_tap_netdev_ops = {
 	.ndo_change_mtu		= ip_tunnel_change_mtu,
 	.ndo_get_stats64	= ip_tunnel_get_stats64,
 	.ndo_get_iflink		= ip_tunnel_get_iflink,
+	.ndo_get_egress_info	= gre_egress_tun_info,
 };
 
 static void ipgre_tap_setup(struct net_device *dev)
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 84dce6a..ac742de 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -424,3 +424,40 @@ void ip_tunnel_unneed_metadata(void)
 	static_key_slow_dec(&ip_tunnel_metadata_cnt);
 }
 EXPORT_SYMBOL_GPL(ip_tunnel_unneed_metadata);
+
+static void tnl_egress_opts_init(struct ip_tunnel_info *egress_tun_info,
+				 const void **egress_tun_opts,
+				 struct ip_tunnel_info *info)
+{
+	egress_tun_info->options_len = info->options_len;
+	egress_tun_info->mode = info->mode;
+
+	/* Tunnel options. */
+	if (info->options_len)
+		*egress_tun_opts = ip_tunnel_info_opts(info);
+	else
+		*egress_tun_opts = NULL;
+}
+
+void ipv4_egress_info_init(struct ip_tunnel_info *egress_tun_info,
+			   const void **egress_tun_opts,
+			   struct ip_tunnel_info *info, __be32 saddr,
+			   __be16 sport, __be16 dport)
+{
+	const struct ip_tunnel_key *tun_key;
+
+	/* Generate egress_tun_info based on tun_info,
+	 * saddr, tp_src and tp_dst
+	 */
+	tun_key = &egress_tun_info->key;
+	ip_tunnel_key_init(&egress_tun_info->key,
+			   saddr, tun_key->u.ipv4.dst,
+			   tun_key->tos,
+			   tun_key->ttl,
+			   sport, dport,
+			   tun_key->tun_id,
+			   tun_key->tun_flags);
+
+	tnl_egress_opts_init(egress_tun_info, egress_tun_opts, info);
+}
+EXPORT_SYMBOL_GPL(ipv4_egress_info_init);
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index e23a61c..17762ba 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -794,8 +794,10 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb,
 				int err;
 
 				upcall.egress_tun_info = &info;
-				err = ovs_vport_get_egress_tun_info(vport, skb,
-								    &upcall);
+				err = dev_get_egress_info(vport->dev, skb,
+							  upcall.egress_tun_info,
+							  &upcall.egress_tun_opts);
+
 				if (err)
 					upcall.egress_tun_info = NULL;
 			}
diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c
index 2735e9c..5f8aaaa 100644
--- a/net/openvswitch/vport-geneve.c
+++ b/net/openvswitch/vport-geneve.c
@@ -52,18 +52,6 @@ static int geneve_get_options(const struct vport *vport,
 	return 0;
 }
 
-static int geneve_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
-				      struct dp_upcall_info *upcall)
-{
-	struct geneve_port *geneve_port = geneve_vport(vport);
-	struct net *net = ovs_dp_get_net(vport->dp);
-	__be16 dport = htons(geneve_port->port_no);
-	__be16 sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true);
-
-	return ovs_tunnel_get_egress_info(upcall, ovs_dp_get_net(vport->dp),
-					  skb, IPPROTO_UDP, sport, dport);
-}
-
 static struct vport *geneve_tnl_create(const struct vport_parms *parms)
 {
 	struct net *net = ovs_dp_get_net(parms->dp);
@@ -130,7 +118,6 @@ static struct vport_ops ovs_geneve_vport_ops = {
 	.get_options	= geneve_get_options,
 	.send		= ovs_netdev_send,
 	.owner          = THIS_MODULE,
-	.get_egress_tun_info	= geneve_get_egress_tun_info,
 };
 
 static int __init ovs_geneve_tnl_init(void)
diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c
index 4d24481..64225bf 100644
--- a/net/openvswitch/vport-gre.c
+++ b/net/openvswitch/vport-gre.c
@@ -84,18 +84,10 @@ static struct vport *gre_create(const struct vport_parms *parms)
 	return ovs_netdev_link(vport, parms->name);
 }
 
-static int gre_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
-				   struct dp_upcall_info *upcall)
-{
-	return ovs_tunnel_get_egress_info(upcall, ovs_dp_get_net(vport->dp),
-					  skb, IPPROTO_GRE, 0, 0);
-}
-
 static struct vport_ops ovs_gre_vport_ops = {
 	.type		= OVS_VPORT_TYPE_GRE,
 	.create		= gre_create,
 	.send		= ovs_netdev_send,
-	.get_egress_tun_info	= gre_get_egress_tun_info,
 	.destroy	= ovs_netdev_tunnel_destroy,
 	.owner		= THIS_MODULE,
 };
diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c
index c11413d..e1c9c08 100644
--- a/net/openvswitch/vport-vxlan.c
+++ b/net/openvswitch/vport-vxlan.c
@@ -146,31 +146,12 @@ static struct vport *vxlan_create(const struct vport_parms *parms)
 	return ovs_netdev_link(vport, parms->name);
 }
 
-static int vxlan_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
-				     struct dp_upcall_info *upcall)
-{
-	struct vxlan_dev *vxlan = netdev_priv(vport->dev);
-	struct net *net = ovs_dp_get_net(vport->dp);
-	__be16 dst_port = vxlan_dev_dst_port(vxlan);
-	__be16 src_port;
-	int port_min;
-	int port_max;
-
-	inet_get_local_port_range(net, &port_min, &port_max);
-	src_port = udp_flow_src_port(net, skb, 0, 0, true);
-
-	return ovs_tunnel_get_egress_info(upcall, net,
-					  skb, IPPROTO_UDP,
-					  src_port, dst_port);
-}
-
 static struct vport_ops ovs_vxlan_netdev_vport_ops = {
 	.type			= OVS_VPORT_TYPE_VXLAN,
 	.create			= vxlan_create,
 	.destroy		= ovs_netdev_tunnel_destroy,
 	.get_options		= vxlan_get_options,
 	.send			= ovs_netdev_send,
-	.get_egress_tun_info	= vxlan_get_egress_tun_info,
 };
 
 static int __init ovs_vxlan_tnl_init(void)
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
index fc5c0b9..0ddbc2e 100644
--- a/net/openvswitch/vport.c
+++ b/net/openvswitch/vport.c
@@ -470,61 +470,3 @@ void ovs_vport_deferred_free(struct vport *vport)
 	call_rcu(&vport->rcu, free_vport_rcu);
 }
 EXPORT_SYMBOL_GPL(ovs_vport_deferred_free);
-
-int ovs_tunnel_get_egress_info(struct dp_upcall_info *upcall,
-			       struct net *net,
-			       struct sk_buff *skb,
-			       u8 ipproto,
-			       __be16 tp_src,
-			       __be16 tp_dst)
-{
-	struct ip_tunnel_info *egress_tun_info = upcall->egress_tun_info;
-	const struct ip_tunnel_info *tun_info = skb_tunnel_info(skb);
-	const struct ip_tunnel_key *tun_key;
-	u32 skb_mark = skb->mark;
-	struct rtable *rt;
-	struct flowi4 fl;
-
-	if (unlikely(!tun_info))
-		return -EINVAL;
-	if (ip_tunnel_info_af(tun_info) != AF_INET)
-		return -EINVAL;
-
-	tun_key = &tun_info->key;
-
-	/* Route lookup to get srouce IP address.
-	 * The process may need to be changed if the corresponding process
-	 * in vports ops changed.
-	 */
-	rt = ovs_tunnel_route_lookup(net, tun_key, skb_mark, &fl, ipproto);
-	if (IS_ERR(rt))
-		return PTR_ERR(rt);
-
-	ip_rt_put(rt);
-
-	/* Generate egress_tun_info based on tun_info,
-	 * saddr, tp_src and tp_dst
-	 */
-	ip_tunnel_key_init(&egress_tun_info->key,
-			   fl.saddr, tun_key->u.ipv4.dst,
-			   tun_key->tos,
-			   tun_key->ttl,
-			   tp_src, tp_dst,
-			   tun_key->tun_id,
-			   tun_key->tun_flags);
-	egress_tun_info->options_len = tun_info->options_len;
-	egress_tun_info->mode = tun_info->mode;
-	upcall->egress_tun_opts = ip_tunnel_info_opts(egress_tun_info);
-	return 0;
-}
-EXPORT_SYMBOL_GPL(ovs_tunnel_get_egress_info);
-
-int ovs_vport_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
-				  struct dp_upcall_info *upcall)
-{
-	/* get_egress_tun_info() is only implemented on tunnel ports. */
-	if (unlikely(!vport->ops->get_egress_tun_info))
-		return -EINVAL;
-
-	return vport->ops->get_egress_tun_info(vport, skb, upcall);
-}
diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h
index a413f3a..d341ad6 100644
--- a/net/openvswitch/vport.h
+++ b/net/openvswitch/vport.h
@@ -27,7 +27,6 @@
 #include <linux/skbuff.h>
 #include <linux/spinlock.h>
 #include <linux/u64_stats_sync.h>
-#include <net/route.h>
 
 #include "datapath.h"
 
@@ -53,16 +52,6 @@ int ovs_vport_set_upcall_portids(struct vport *, const struct nlattr *pids);
 int ovs_vport_get_upcall_portids(const struct vport *, struct sk_buff *);
 u32 ovs_vport_find_upcall_portid(const struct vport *, struct sk_buff *);
 
-int ovs_tunnel_get_egress_info(struct dp_upcall_info *upcall,
-			       struct net *net,
-			       struct sk_buff *,
-			       u8 ipproto,
-			       __be16 tp_src,
-			       __be16 tp_dst);
-
-int ovs_vport_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
-				  struct dp_upcall_info *upcall);
-
 /**
  * struct vport_portids - array of netlink portids of a vport.
  *                        must be protected by rcu.
@@ -140,8 +129,6 @@ struct vport_parms {
  * have any configuration.
  * @send: Send a packet on the device.
  * zero for dropped packets or negative for error.
- * @get_egress_tun_info: Get the egress tunnel 5-tuple and other info for
- * a packet.
  */
 struct vport_ops {
 	enum ovs_vport_type type;
@@ -154,9 +141,6 @@ struct vport_ops {
 	int (*get_options)(const struct vport *, struct sk_buff *);
 
 	void (*send)(struct vport *, struct sk_buff *);
-	int (*get_egress_tun_info)(struct vport *, struct sk_buff *,
-				   struct dp_upcall_info *upcall);
-
 	struct module *owner;
 	struct list_head list;
 };
@@ -215,25 +199,6 @@ static inline const char *ovs_vport_name(struct vport *vport)
 int ovs_vport_ops_register(struct vport_ops *ops);
 void ovs_vport_ops_unregister(struct vport_ops *ops);
 
-static inline struct rtable *ovs_tunnel_route_lookup(struct net *net,
-						     const struct ip_tunnel_key *key,
-						     u32 mark,
-						     struct flowi4 *fl,
-						     u8 protocol)
-{
-	struct rtable *rt;
-
-	memset(fl, 0, sizeof(*fl));
-	fl->daddr = key->u.ipv4.dst;
-	fl->saddr = key->u.ipv4.src;
-	fl->flowi4_tos = RT_TOS(key->tos);
-	fl->flowi4_mark = mark;
-	fl->flowi4_proto = protocol;
-
-	rt = ip_route_output_key(net, fl);
-	return rt;
-}
-
 static inline void ovs_vport_send(struct vport *vport, struct sk_buff *skb)
 {
 	vport->ops->send(vport, skb);
-- 
1.7.1

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [PATCH net] openvswitch: Fix egress tunnel info.
  2015-10-05 17:58 [PATCH net] openvswitch: Fix egress tunnel info Pravin B Shelar
@ 2015-10-05 18:40 ` Jiri Benc
  2015-10-05 19:37   ` Pravin Shelar
  2015-10-06 15:42 ` Jiri Benc
  1 sibling, 1 reply; 19+ messages in thread
From: Jiri Benc @ 2015-10-05 18:40 UTC (permalink / raw)
  To: Pravin B Shelar; +Cc: netdev

On Mon,  5 Oct 2015 10:58:17 -0700, Pravin B Shelar wrote:
> --- a/drivers/net/vxlan.c
> +++ b/drivers/net/vxlan.c
> @@ -2337,6 +2337,51 @@ static int vxlan_change_mtu(struct net_device *dev, int new_mtu)
>  	return 0;
>  }
>  
> +static int egress_ipv4_tun_info(struct net_device *dev, struct sk_buff *skb,
> +				struct ip_tunnel_info *info,
> +				__be16 sport, __be16 dport,
> +				struct ip_tunnel_info *egress_tun_info,
> +				const void **egress_tun_opts)
> +{
> +	struct vxlan_dev *vxlan = netdev_priv(dev);
> +	struct rtable *rt;
> +	struct flowi4 fl4;
> +
> +	memset(&fl4, 0, sizeof(fl4));
> +	fl4.flowi4_tos = RT_TOS(info->key.tos);
> +	fl4.flowi4_mark = skb->mark;
> +	fl4.flowi4_proto = IPPROTO_UDP;
> +	fl4.daddr = info->key.u.ipv4.dst;
> +
> +	rt = ip_route_output_key(vxlan->net, &fl4);
> +	if (IS_ERR(rt))
> +		return PTR_ERR(rt);
> +	ip_rt_put(rt);
> +
> +	ipv4_egress_info_init(egress_tun_info, egress_tun_opts, info,
> +			      fl4.saddr, sport, dport);
> +	return 0;
> +}
> +
> +static int vxlan_egress_tun_info(struct net_device *dev, struct sk_buff *skb,
> +				 struct ip_tunnel_info *egress_tun_info,
> +				 const void **egress_tun_opts)
> +{
> +	struct ip_tunnel_info *info = skb_tunnel_info(skb);
> +	struct vxlan_dev *vxlan = netdev_priv(dev);
> +	__be16 sport, dport;
> +
> +	sport = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min,
> +				  vxlan->cfg.port_max, true);
> +	dport = info->key.tp_dst ? : vxlan->cfg.dst_port;
> +
> +	if (ip_tunnel_info_af(info) == AF_INET)
> +		return egress_ipv4_tun_info(dev, skb, info, sport, dport,
> +					    egress_tun_info,
> +					    egress_tun_opts);
> +	return -EINVAL;
> +}

This duplicates a lot of code from vxlan_xmit_one. I think we want a
common function (or functions) used by both this and xmit.

 Jiri

-- 
Jiri Benc

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net] openvswitch: Fix egress tunnel info.
  2015-10-05 18:40 ` Jiri Benc
@ 2015-10-05 19:37   ` Pravin Shelar
  2015-10-06 15:56     ` Jiri Benc
  0 siblings, 1 reply; 19+ messages in thread
From: Pravin Shelar @ 2015-10-05 19:37 UTC (permalink / raw)
  To: Jiri Benc; +Cc: netdev

On Mon, Oct 5, 2015 at 11:40 AM, Jiri Benc <jbenc@redhat.com> wrote:
> On Mon,  5 Oct 2015 10:58:17 -0700, Pravin B Shelar wrote:
>> --- a/drivers/net/vxlan.c
>> +++ b/drivers/net/vxlan.c
>> @@ -2337,6 +2337,51 @@ static int vxlan_change_mtu(struct net_device *dev, int new_mtu)
>>       return 0;
>>  }
>>
>> +static int egress_ipv4_tun_info(struct net_device *dev, struct sk_buff *skb,
>> +                             struct ip_tunnel_info *info,
>> +                             __be16 sport, __be16 dport,
>> +                             struct ip_tunnel_info *egress_tun_info,
>> +                             const void **egress_tun_opts)
>> +{
>> +     struct vxlan_dev *vxlan = netdev_priv(dev);
>> +     struct rtable *rt;
>> +     struct flowi4 fl4;
>> +
>> +     memset(&fl4, 0, sizeof(fl4));
>> +     fl4.flowi4_tos = RT_TOS(info->key.tos);
>> +     fl4.flowi4_mark = skb->mark;
>> +     fl4.flowi4_proto = IPPROTO_UDP;
>> +     fl4.daddr = info->key.u.ipv4.dst;
>> +
>> +     rt = ip_route_output_key(vxlan->net, &fl4);
>> +     if (IS_ERR(rt))
>> +             return PTR_ERR(rt);
>> +     ip_rt_put(rt);
>> +
>> +     ipv4_egress_info_init(egress_tun_info, egress_tun_opts, info,
>> +                           fl4.saddr, sport, dport);
>> +     return 0;
>> +}
>> +
>> +static int vxlan_egress_tun_info(struct net_device *dev, struct sk_buff *skb,
>> +                              struct ip_tunnel_info *egress_tun_info,
>> +                              const void **egress_tun_opts)
>> +{
>> +     struct ip_tunnel_info *info = skb_tunnel_info(skb);
>> +     struct vxlan_dev *vxlan = netdev_priv(dev);
>> +     __be16 sport, dport;
>> +
>> +     sport = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min,
>> +                               vxlan->cfg.port_max, true);
>> +     dport = info->key.tp_dst ? : vxlan->cfg.dst_port;
>> +
>> +     if (ip_tunnel_info_af(info) == AF_INET)
>> +             return egress_ipv4_tun_info(dev, skb, info, sport, dport,
>> +                                         egress_tun_info,
>> +                                         egress_tun_opts);
>> +     return -EINVAL;
>> +}
>
> This duplicates a lot of code from vxlan_xmit_one. I think we want a
> common function (or functions) used by both this and xmit.
>

That would involve lot of refactoring vxlan xmit code. I do not want
to make such change in net tree at such late stage.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net] openvswitch: Fix egress tunnel info.
  2015-10-05 19:37   ` Pravin Shelar
@ 2015-10-06 15:56     ` Jiri Benc
  2015-10-06 18:28       ` Pravin Shelar
  0 siblings, 1 reply; 19+ messages in thread
From: Jiri Benc @ 2015-10-06 15:56 UTC (permalink / raw)
  To: Pravin Shelar; +Cc: netdev

On Mon, 5 Oct 2015 12:37:50 -0700, Pravin Shelar wrote:
> That would involve lot of refactoring vxlan xmit code. I do not want
> to make such change in net tree at such late stage.

Fair enough. But then, the minimal fix for net.git would be just
modifying the current get_egress_tun_info handlers in vport-*.c,
wouldn't it? I'd say let's go this way and fix this properly for
net-next. See my other mail for my comments on the patch.

 Jiri

-- 
Jiri Benc

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net] openvswitch: Fix egress tunnel info.
  2015-10-06 15:56     ` Jiri Benc
@ 2015-10-06 18:28       ` Pravin Shelar
  2015-10-06 18:45         ` Jiri Benc
  0 siblings, 1 reply; 19+ messages in thread
From: Pravin Shelar @ 2015-10-06 18:28 UTC (permalink / raw)
  To: Jiri Benc; +Cc: netdev

On Tue, Oct 6, 2015 at 8:56 AM, Jiri Benc <jbenc@redhat.com> wrote:
> On Mon, 5 Oct 2015 12:37:50 -0700, Pravin Shelar wrote:
>> That would involve lot of refactoring vxlan xmit code. I do not want
>> to make such change in net tree at such late stage.
>
> Fair enough. But then, the minimal fix for net.git would be just
> modifying the current get_egress_tun_info handlers in vport-*.c,
> wouldn't it? I'd say let's go this way and fix this properly for
> net-next. See my other mail for my comments on the patch.
>
What do you have in mind? I do not see way to fix this issue in vport-*.c.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net] openvswitch: Fix egress tunnel info.
  2015-10-06 18:28       ` Pravin Shelar
@ 2015-10-06 18:45         ` Jiri Benc
  2015-10-06 18:55           ` Pravin Shelar
  0 siblings, 1 reply; 19+ messages in thread
From: Jiri Benc @ 2015-10-06 18:45 UTC (permalink / raw)
  To: Pravin Shelar; +Cc: netdev

On Tue, 6 Oct 2015 11:28:08 -0700, Pravin Shelar wrote:
> What do you have in mind? I do not see way to fix this issue in vport-*.c.

It seems to me that e.g. the code you have in vxlan_egress_tun_info in
drivers/net/vxlan.c can be put into vxlan_get_egress_tun_info in
net/openvswitch/vport-vxlan.c. vport->dev is guaranteed to be vxlan,
and the current code accesses netdev_priv(dev) as struct vxlan_dev
anyway.

This would of course not work if we created lwtunnel interface from the
ovs user space. But that's not going to happen with kernel 4.3, we'll
need a way to query datapath for features it supports for this to work
- there's currently no useful way to determine whether the kernel
supports metadata based vxlan or not. I'm working on a patch to query
the datapath for the supported features but that's for net-next. Thus
I think we're safe.

 Jiri

-- 
Jiri Benc

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net] openvswitch: Fix egress tunnel info.
  2015-10-06 18:45         ` Jiri Benc
@ 2015-10-06 18:55           ` Pravin Shelar
  2015-10-06 19:03             ` Jiri Benc
  0 siblings, 1 reply; 19+ messages in thread
From: Pravin Shelar @ 2015-10-06 18:55 UTC (permalink / raw)
  To: Jiri Benc; +Cc: netdev

On Tue, Oct 6, 2015 at 11:45 AM, Jiri Benc <jbenc@redhat.com> wrote:
> On Tue, 6 Oct 2015 11:28:08 -0700, Pravin Shelar wrote:
>> What do you have in mind? I do not see way to fix this issue in vport-*.c.
>
> It seems to me that e.g. the code you have in vxlan_egress_tun_info in
> drivers/net/vxlan.c can be put into vxlan_get_egress_tun_info in
> net/openvswitch/vport-vxlan.c. vport->dev is guaranteed to be vxlan,
> and the current code accesses netdev_priv(dev) as struct vxlan_dev
> anyway.
>
> This would of course not work if we created lwtunnel interface from the
> ovs user space. But that's not going to happen with kernel 4.3, we'll
> need a way to query datapath for features it supports for this to work

This issue exist for lwtunnel based devices, for vport based tunnels
there is no bug.

> - there's currently no useful way to determine whether the kernel
> supports metadata based vxlan or not. I'm working on a patch to query
> the datapath for the supported features but that's for net-next. Thus
> I think we're safe.
>
We should be able to use lwtunnel devices on 4.3. How about using
tunnel device parameters to detect lwtunnel support. For example in
case of vxlan tunnel IFLA_VXLAN_COLLECT_METADATA flags can confirm
lwtunnel support.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net] openvswitch: Fix egress tunnel info.
  2015-10-06 18:55           ` Pravin Shelar
@ 2015-10-06 19:03             ` Jiri Benc
  2015-10-06 19:21               ` Pravin Shelar
  0 siblings, 1 reply; 19+ messages in thread
From: Jiri Benc @ 2015-10-06 19:03 UTC (permalink / raw)
  To: Pravin Shelar; +Cc: netdev

On Tue, 6 Oct 2015 11:55:35 -0700, Pravin Shelar wrote:
> We should be able to use lwtunnel devices on 4.3. How about using
> tunnel device parameters to detect lwtunnel support. For example in
> case of vxlan tunnel IFLA_VXLAN_COLLECT_METADATA flags can confirm
> lwtunnel support.

You would have to create the interface first using that flag and then
check whether the created interface has the flag set or not. If not,
delete the interface.

Unfortunately, old kernels will just ignore the flag when creating the
interface. That's why I wrote there's no useful way to check it.

 Jiri

-- 
Jiri Benc

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net] openvswitch: Fix egress tunnel info.
  2015-10-06 19:03             ` Jiri Benc
@ 2015-10-06 19:21               ` Pravin Shelar
  2015-10-06 19:32                 ` Jiri Benc
  0 siblings, 1 reply; 19+ messages in thread
From: Pravin Shelar @ 2015-10-06 19:21 UTC (permalink / raw)
  To: Jiri Benc; +Cc: netdev

On Tue, Oct 6, 2015 at 12:03 PM, Jiri Benc <jbenc@redhat.com> wrote:
> On Tue, 6 Oct 2015 11:55:35 -0700, Pravin Shelar wrote:
>> We should be able to use lwtunnel devices on 4.3. How about using
>> tunnel device parameters to detect lwtunnel support. For example in
>> case of vxlan tunnel IFLA_VXLAN_COLLECT_METADATA flags can confirm
>> lwtunnel support.
>
> You would have to create the interface first using that flag and then
> check whether the created interface has the flag set or not. If not,
> delete the interface.
>
> Unfortunately, old kernels will just ignore the flag when creating the
> interface. That's why I wrote there's no useful way to check it.
>

It should report the flag when device parameters are requested by
userspace if there is support for lwtunnel. So check for the flag
should be good enough test for lwtunnel support. Why is that not
reliable check?

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net] openvswitch: Fix egress tunnel info.
  2015-10-06 19:21               ` Pravin Shelar
@ 2015-10-06 19:32                 ` Jiri Benc
  2015-10-06 21:16                   ` Pravin Shelar
  0 siblings, 1 reply; 19+ messages in thread
From: Jiri Benc @ 2015-10-06 19:32 UTC (permalink / raw)
  To: Pravin Shelar; +Cc: netdev

On Tue, 6 Oct 2015 12:21:24 -0700, Pravin Shelar wrote:
> It should report the flag when device parameters are requested by
> userspace if there is support for lwtunnel. So check for the flag
> should be good enough test for lwtunnel support. Why is that not
> reliable check?

Which device? You'll need to create one before querying it. Which won't
work so nicely for older kernels. Basically, what the user space would
have to do is trying to create a vxlan interface and then deleting it
and going the tunnel vport way. It can be quite confusing, I'm afraid.

 Jiri

-- 
Jiri Benc

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net] openvswitch: Fix egress tunnel info.
  2015-10-06 19:32                 ` Jiri Benc
@ 2015-10-06 21:16                   ` Pravin Shelar
  2015-10-07  8:09                     ` Jiri Benc
  0 siblings, 1 reply; 19+ messages in thread
From: Pravin Shelar @ 2015-10-06 21:16 UTC (permalink / raw)
  To: Jiri Benc; +Cc: netdev

On Tue, Oct 6, 2015 at 12:32 PM, Jiri Benc <jbenc@redhat.com> wrote:
> On Tue, 6 Oct 2015 12:21:24 -0700, Pravin Shelar wrote:
>> It should report the flag when device parameters are requested by
>> userspace if there is support for lwtunnel. So check for the flag
>> should be good enough test for lwtunnel support. Why is that not
>> reliable check?
>
> Which device? You'll need to create one before querying it. Which won't
> work so nicely for older kernels. Basically, what the user space would
> have to do is trying to create a vxlan interface and then deleting it
> and going the tunnel vport way. It can be quite confusing, I'm afraid.
>

When OVS needs to add a vxlan tunnel device, OVS can create the vxlan
device and check the parameters for the flag. If the flag does not
exist then fallback to vport-create. This only needs to be done for
very first tunnel device. Thats how most of features are supported in
OVS.
Sorry, but I do not see where is the confusion.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net] openvswitch: Fix egress tunnel info.
  2015-10-06 21:16                   ` Pravin Shelar
@ 2015-10-07  8:09                     ` Jiri Benc
  2015-10-07  9:34                       ` Thomas Graf
  2015-10-07 17:19                       ` Pravin Shelar
  0 siblings, 2 replies; 19+ messages in thread
From: Jiri Benc @ 2015-10-07  8:09 UTC (permalink / raw)
  To: Pravin Shelar; +Cc: netdev

On Tue, 6 Oct 2015 14:16:02 -0700, Pravin Shelar wrote:
> When OVS needs to add a vxlan tunnel device, OVS can create the vxlan
> device and check the parameters for the flag. If the flag does not
> exist then fallback to vport-create.

You omitted the very important step, delete the device before falling
back.

This means that with older kernels, there will be an interface created
and destroyed shortly after. With all netlink notifications sent, all
tools listening for netlink events seeing the interface appear and go,
with this being logged in various places, etc. Sounds very hacky and
confusing to anyone watching their logs or output of such tools. That
is the confusion I was talking about.

> This only needs to be done for
> very first tunnel device. Thats how most of features are supported in
> OVS.

The big difference to the other features is this cannot be detected
until half way through the setup.

What I'm proposing instead is to introduce a way to clearly and
unambiguously detect whether lwtunnels are supported or not. We'll need
this anyway: kernel 4.3 won't really support IPv6 tunneling with ovs,
yet there's currently no way to determine whether it's supported or not
(and, unlike with lwtunnel detection, there's not even a hacky way).
Querying the datapath for the supported features is needed
nevertheless; it's only logical to use it for the lwtunnel vs. old
vport decision, too.

I don't understand why you're opposed to this: it's much cleaner and
there's no problem with lwtunnels not being used with the 4.3 kernel,
everything should work just fine.

 Jiri

-- 
Jiri Benc

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net] openvswitch: Fix egress tunnel info.
  2015-10-07  8:09                     ` Jiri Benc
@ 2015-10-07  9:34                       ` Thomas Graf
  2015-10-07  9:53                         ` Jiri Benc
  2015-10-07 17:19                       ` Pravin Shelar
  1 sibling, 1 reply; 19+ messages in thread
From: Thomas Graf @ 2015-10-07  9:34 UTC (permalink / raw)
  To: Jiri Benc; +Cc: Pravin Shelar, netdev

On 10/07/15 at 10:09am, Jiri Benc wrote:
> The big difference to the other features is this cannot be detected
> until half way through the setup.
> 
> What I'm proposing instead is to introduce a way to clearly and
> unambiguously detect whether lwtunnels are supported or not. We'll need
> this anyway: kernel 4.3 won't really support IPv6 tunneling with ovs,
> yet there's currently no way to determine whether it's supported or not
> (and, unlike with lwtunnel detection, there's not even a hacky way).
> Querying the datapath for the supported features is needed
> nevertheless; it's only logical to use it for the lwtunnel vs. old
> vport decision, too.
> 
> I don't understand why you're opposed to this: it's much cleaner and
> there's no problem with lwtunnels not being used with the 4.3 kernel,
> everything should work just fine.

Extending ovs_dp_cmd_fill_info() to dump a new dp->kernel_features
via a new Netlink attribtue which signals these capabilities looks
like a straight forward way to solve this.

OVS just needs to set NLM_F_ECHO when creating the initial datapath.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net] openvswitch: Fix egress tunnel info.
  2015-10-07  9:34                       ` Thomas Graf
@ 2015-10-07  9:53                         ` Jiri Benc
  0 siblings, 0 replies; 19+ messages in thread
From: Jiri Benc @ 2015-10-07  9:53 UTC (permalink / raw)
  To: Thomas Graf; +Cc: Pravin Shelar, netdev

On Wed, 7 Oct 2015 11:34:01 +0200, Thomas Graf wrote:
> Extending ovs_dp_cmd_fill_info() to dump a new dp->kernel_features
> via a new Netlink attribtue which signals these capabilities looks
> like a straight forward way to solve this.
> 
> OVS just needs to set NLM_F_ECHO when creating the initial datapath.

Agreed. That's exactly what I'm trying to implement just now.

 Jiri

-- 
Jiri Benc

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net] openvswitch: Fix egress tunnel info.
  2015-10-07  8:09                     ` Jiri Benc
  2015-10-07  9:34                       ` Thomas Graf
@ 2015-10-07 17:19                       ` Pravin Shelar
  1 sibling, 0 replies; 19+ messages in thread
From: Pravin Shelar @ 2015-10-07 17:19 UTC (permalink / raw)
  To: Jiri Benc; +Cc: netdev

On Wed, Oct 7, 2015 at 1:09 AM, Jiri Benc <jbenc@redhat.com> wrote:
> On Tue, 6 Oct 2015 14:16:02 -0700, Pravin Shelar wrote:
>> When OVS needs to add a vxlan tunnel device, OVS can create the vxlan
>> device and check the parameters for the flag. If the flag does not
>> exist then fallback to vport-create.
>
> You omitted the very important step, delete the device before falling
> back.
>
> This means that with older kernels, there will be an interface created
> and destroyed shortly after. With all netlink notifications sent, all
> tools listening for netlink events seeing the interface appear and go,
> with this being logged in various places, etc. Sounds very hacky and
> confusing to anyone watching their logs or output of such tools. That
> is the confusion I was talking about.
>
>> This only needs to be done for
>> very first tunnel device. Thats how most of features are supported in
>> OVS.
>
> The big difference to the other features is this cannot be detected
> until half way through the setup.
>
> What I'm proposing instead is to introduce a way to clearly and
> unambiguously detect whether lwtunnels are supported or not. We'll need
> this anyway: kernel 4.3 won't really support IPv6 tunneling with ovs,
> yet there's currently no way to determine whether it's supported or not
> (and, unlike with lwtunnel detection, there's not even a hacky way).
> Querying the datapath for the supported features is needed
> nevertheless; it's only logical to use it for the lwtunnel vs. old
> vport decision, too.
>
> I don't understand why you're opposed to this: it's much cleaner and
> there's no problem with lwtunnels not being used with the 4.3 kernel,
> everything should work just fine.
>

I don't want to add more flags to OVS for features which can be
derived from other APIs. But if creating and destroying single virtual
device is such big deal then I am fine with the alternative approach.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net] openvswitch: Fix egress tunnel info.
  2015-10-05 17:58 [PATCH net] openvswitch: Fix egress tunnel info Pravin B Shelar
  2015-10-05 18:40 ` Jiri Benc
@ 2015-10-06 15:42 ` Jiri Benc
  2015-10-06 18:26   ` Pravin Shelar
  1 sibling, 1 reply; 19+ messages in thread
From: Jiri Benc @ 2015-10-06 15:42 UTC (permalink / raw)
  To: Pravin B Shelar; +Cc: netdev

Some more feedback after doing a deeper review.

On Mon,  5 Oct 2015 10:58:17 -0700, Pravin B Shelar wrote:
> --- a/drivers/net/geneve.c
> +++ b/drivers/net/geneve.c
> @@ -703,6 +703,32 @@ err:
>  	return NETDEV_TX_OK;
>  }
>  
> +static int geneve_egress_tun_info(struct net_device *dev, struct sk_buff *skb,
> +				  struct ip_tunnel_info *egress_tun_info,
> +				  const void **egress_tun_opts)
> +{
> +	struct geneve_dev *geneve = netdev_priv(dev);
> +	struct ip_tunnel_info *info;
> +	struct rtable *rt;
> +	struct flowi4 fl4;
> +	__be16 sport;
> +
> +	info = skb_tunnel_info(skb);
> +	if (ip_tunnel_info_af(info) != AF_INET)
> +		return -EINVAL;
> +
> +	rt = geneve_get_rt(skb, dev, &fl4, info);

This will increase dev tx error stats in case the lookup fails which is
probably something we don't want.

[...]
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -60,6 +60,7 @@ struct wireless_dev;
>  /* 802.15.4 specific */
>  struct wpan_dev;
>  struct mpls_dev;
> +struct ip_tunnel_info;
>  
>  void netdev_set_default_ethtool_ops(struct net_device *dev,
>  				    const struct ethtool_ops *ops);
> @@ -1054,6 +1055,11 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev,
>   *	This function is used to pass protocol port error state information
>   *	to the switch driver. The switch driver can react to the proto_down
>   *      by doing a phys down on the associated switch port.
> + * int (*ndo_get_egress_info)(struct net_device *dev, struct sk_buff *skb,
> + *			      __be32 *saddr, __be16 *sport, __be16 *dport);
> + *	This function is used to get egress tunnel information for given skb.
> + *	This is useful for retrieving outer tunnel header parameters while
> + *	sampling packet.
>   *
>   */
>  struct net_device_ops {
> @@ -1227,6 +1233,10 @@ struct net_device_ops {
>  	int			(*ndo_get_iflink)(const struct net_device *dev);
>  	int			(*ndo_change_proto_down)(struct net_device *dev,
>  							 bool proto_down);
> +	int			(*ndo_get_egress_info)(struct net_device *dev,
> +						       struct sk_buff *skb,
> +						       struct ip_tunnel_info *egress_tun_info,
> +						       const void **egress_tun_opts);

This should have at least a better name to reflect it is about IP
tunnels.

But I don't like having an IP tunnel specific ndo, that doesn't sound
right. The real thing that is wanted here is to complete the dst
metadata. What about:

int (*ndo_fill_metadata_dst)(struct net_device *dev, struct sk_buff *skb);

The function will use skb_tunnel_info to get the template info, then
skb_dst_drop and allocate and attach a fully populated metadata_dst.
The egress_tun_info in struct dp_upcall_info then can be completely
dropped, as all the necessary tunnel information will be available
through skb_tunnel_info(skb). Also, when implemented correctly, such
skb will be just sent out without route lookups etc. if afterwards
handed to ndo_start_xmit.

[...]
> --- a/include/net/ip_tunnels.h
> +++ b/include/net/ip_tunnels.h
> @@ -337,6 +337,11 @@ void __init ip_tunnel_core_init(void);
>  void ip_tunnel_need_metadata(void);
>  void ip_tunnel_unneed_metadata(void);
>  
> +void ipv4_egress_info_init(struct ip_tunnel_info *egress_tun_info,
> +			   const void **egress_tun_opts,
> +			   struct ip_tunnel_info *info, __be32 saddr,
> +			   __be16 sport, __be16 dport);

Please use the ip_tunnel prefix as the rest of the functions, this is
not ipv4 egress info but ip *tunnel* egress info.

Also, it's not clear what the difference between "egress_tun_info" and
"info" is. I'd suggest to use "dst_info" and "src_info" or something
similar.

[...]
> --- a/net/ipv4/ip_tunnel_core.c
> +++ b/net/ipv4/ip_tunnel_core.c
> @@ -424,3 +424,40 @@ void ip_tunnel_unneed_metadata(void)
>  	static_key_slow_dec(&ip_tunnel_metadata_cnt);
>  }
>  EXPORT_SYMBOL_GPL(ip_tunnel_unneed_metadata);
> +
> +static void tnl_egress_opts_init(struct ip_tunnel_info *egress_tun_info,
> +				 const void **egress_tun_opts,
> +				 struct ip_tunnel_info *info)
> +{
> +	egress_tun_info->options_len = info->options_len;
> +	egress_tun_info->mode = info->mode;
> +
> +	/* Tunnel options. */
> +	if (info->options_len)
> +		*egress_tun_opts = ip_tunnel_info_opts(info);
> +	else
> +		*egress_tun_opts = NULL;
> +}
> +
> +void ipv4_egress_info_init(struct ip_tunnel_info *egress_tun_info,
> +			   const void **egress_tun_opts,
> +			   struct ip_tunnel_info *info, __be32 saddr,
> +			   __be16 sport, __be16 dport)
> +{
> +	const struct ip_tunnel_key *tun_key;
> +
> +	/* Generate egress_tun_info based on tun_info,
> +	 * saddr, tp_src and tp_dst
> +	 */
> +	tun_key = &egress_tun_info->key;
> +	ip_tunnel_key_init(&egress_tun_info->key,

Just pass tun_key as the first parameter, it will be clearer what's
going on. (I believe the assignments that have no effect will be
optimized out by the compiler, since ip_tunnel_key_init is an inline
function.)

> +			   saddr, tun_key->u.ipv4.dst,
> +			   tun_key->tos,
> +			   tun_key->ttl,
> +			   sport, dport,
> +			   tun_key->tun_id,
> +			   tun_key->tun_flags);

-- 
Jiri Benc

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net] openvswitch: Fix egress tunnel info.
  2015-10-06 15:42 ` Jiri Benc
@ 2015-10-06 18:26   ` Pravin Shelar
  2015-10-06 18:55     ` Jiri Benc
  0 siblings, 1 reply; 19+ messages in thread
From: Pravin Shelar @ 2015-10-06 18:26 UTC (permalink / raw)
  To: Jiri Benc; +Cc: netdev

On Tue, Oct 6, 2015 at 8:42 AM, Jiri Benc <jbenc@redhat.com> wrote:
> Some more feedback after doing a deeper review.
>
Thanks for the review.

> On Mon,  5 Oct 2015 10:58:17 -0700, Pravin B Shelar wrote:
>> --- a/drivers/net/geneve.c
>> +++ b/drivers/net/geneve.c
>> @@ -703,6 +703,32 @@ err:
>>       return NETDEV_TX_OK;
>>  }
>>
>> +static int geneve_egress_tun_info(struct net_device *dev, struct sk_buff *skb,
>> +                               struct ip_tunnel_info *egress_tun_info,
>> +                               const void **egress_tun_opts)
>> +{
>> +     struct geneve_dev *geneve = netdev_priv(dev);
>> +     struct ip_tunnel_info *info;
>> +     struct rtable *rt;
>> +     struct flowi4 fl4;
>> +     __be16 sport;
>> +
>> +     info = skb_tunnel_info(skb);
>> +     if (ip_tunnel_info_af(info) != AF_INET)
>> +             return -EINVAL;
>> +
>> +     rt = geneve_get_rt(skb, dev, &fl4, info);
>
> This will increase dev tx error stats in case the lookup fails which is
> probably something we don't want.
>
right, I will fix it.
> [...]
>> --- a/include/linux/netdevice.h
>> +++ b/include/linux/netdevice.h
>> @@ -60,6 +60,7 @@ struct wireless_dev;
>>  /* 802.15.4 specific */
>>  struct wpan_dev;
>>  struct mpls_dev;
>> +struct ip_tunnel_info;
>>
>>  void netdev_set_default_ethtool_ops(struct net_device *dev,
>>                                   const struct ethtool_ops *ops);
>> @@ -1054,6 +1055,11 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev,
>>   *   This function is used to pass protocol port error state information
>>   *   to the switch driver. The switch driver can react to the proto_down
>>   *      by doing a phys down on the associated switch port.
>> + * int (*ndo_get_egress_info)(struct net_device *dev, struct sk_buff *skb,
>> + *                         __be32 *saddr, __be16 *sport, __be16 *dport);
>> + *   This function is used to get egress tunnel information for given skb.
>> + *   This is useful for retrieving outer tunnel header parameters while
>> + *   sampling packet.
>>   *
>>   */
>>  struct net_device_ops {
>> @@ -1227,6 +1233,10 @@ struct net_device_ops {
>>       int                     (*ndo_get_iflink)(const struct net_device *dev);
>>       int                     (*ndo_change_proto_down)(struct net_device *dev,
>>                                                        bool proto_down);
>> +     int                     (*ndo_get_egress_info)(struct net_device *dev,
>> +                                                    struct sk_buff *skb,
>> +                                                    struct ip_tunnel_info *egress_tun_info,
>> +                                                    const void **egress_tun_opts);
>
> This should have at least a better name to reflect it is about IP
> tunnels.
>
> But I don't like having an IP tunnel specific ndo, that doesn't sound
> right. The real thing that is wanted here is to complete the dst
> metadata. What about:
>
> int (*ndo_fill_metadata_dst)(struct net_device *dev, struct sk_buff *skb);
>
This is nicer, I will change it.

> The function will use skb_tunnel_info to get the template info, then
> skb_dst_drop and allocate and attach a fully populated metadata_dst.
> The egress_tun_info in struct dp_upcall_info then can be completely
> dropped, as all the necessary tunnel information will be available
> through skb_tunnel_info(skb). Also, when implemented correctly, such
> skb will be just sent out without route lookups etc. if afterwards
> handed to ndo_start_xmit.
>
I do not see need to drop and reallocate dst in this operation. I just
need to set source IP address and source and dst port in the metadata
dst already set in skb.
This fill_metadata function is not called for every packet so
ndo_start_xmit() still needs to do route lookup.
egress_tun_info in dp_upcall_info is required to check for failure. It
would be only set on successful fill_metadata operation.

> [...]
>> --- a/include/net/ip_tunnels.h
>> +++ b/include/net/ip_tunnels.h
>> @@ -337,6 +337,11 @@ void __init ip_tunnel_core_init(void);
>>  void ip_tunnel_need_metadata(void);
>>  void ip_tunnel_unneed_metadata(void);
>>
>> +void ipv4_egress_info_init(struct ip_tunnel_info *egress_tun_info,
>> +                        const void **egress_tun_opts,
>> +                        struct ip_tunnel_info *info, __be32 saddr,
>> +                        __be16 sport, __be16 dport);
>
> Please use the ip_tunnel prefix as the rest of the functions, this is
> not ipv4 egress info but ip *tunnel* egress info.
>
> Also, it's not clear what the difference between "egress_tun_info" and
> "info" is. I'd suggest to use "dst_info" and "src_info" or something
> similar.
>
src and dst prefix is confusing here, since it is nothing to do tunnel
endpoints. Anyways its moot point after changes above there is no need
to have these functions.

> [...]
>> --- a/net/ipv4/ip_tunnel_core.c
>> +++ b/net/ipv4/ip_tunnel_core.c
>> @@ -424,3 +424,40 @@ void ip_tunnel_unneed_metadata(void)
>>       static_key_slow_dec(&ip_tunnel_metadata_cnt);
>>  }
>>  EXPORT_SYMBOL_GPL(ip_tunnel_unneed_metadata);
>> +
>> +static void tnl_egress_opts_init(struct ip_tunnel_info *egress_tun_info,
>> +                              const void **egress_tun_opts,
>> +                              struct ip_tunnel_info *info)
>> +{
>> +     egress_tun_info->options_len = info->options_len;
>> +     egress_tun_info->mode = info->mode;
>> +
>> +     /* Tunnel options. */
>> +     if (info->options_len)
>> +             *egress_tun_opts = ip_tunnel_info_opts(info);
>> +     else
>> +             *egress_tun_opts = NULL;
>> +}
>> +
>> +void ipv4_egress_info_init(struct ip_tunnel_info *egress_tun_info,
>> +                        const void **egress_tun_opts,
>> +                        struct ip_tunnel_info *info, __be32 saddr,
>> +                        __be16 sport, __be16 dport)
>> +{
>> +     const struct ip_tunnel_key *tun_key;
>> +
>> +     /* Generate egress_tun_info based on tun_info,
>> +      * saddr, tp_src and tp_dst
>> +      */
>> +     tun_key = &egress_tun_info->key;
>> +     ip_tunnel_key_init(&egress_tun_info->key,
>
> Just pass tun_key as the first parameter, it will be clearer what's
> going on. (I believe the assignments that have no effect will be
> optimized out by the compiler, since ip_tunnel_key_init is an inline
> function.)
>
Same as above.
>> +                        saddr, tun_key->u.ipv4.dst,
>> +                        tun_key->tos,
>> +                        tun_key->ttl,
>> +                        sport, dport,
>> +                        tun_key->tun_id,
>> +                        tun_key->tun_flags);
>
> --
> Jiri Benc

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net] openvswitch: Fix egress tunnel info.
  2015-10-06 18:26   ` Pravin Shelar
@ 2015-10-06 18:55     ` Jiri Benc
  2015-10-06 19:11       ` Pravin Shelar
  0 siblings, 1 reply; 19+ messages in thread
From: Jiri Benc @ 2015-10-06 18:55 UTC (permalink / raw)
  To: Pravin Shelar; +Cc: netdev

On Tue, 6 Oct 2015 11:26:31 -0700, Pravin Shelar wrote:
> I do not see need to drop and reallocate dst in this operation. I just
> need to set source IP address and source and dst port in the metadata
> dst already set in skb.

If I'm looking at the code correctly, metadata_dst is stored in the
action and each skb gets only a reference to it. Modifying it would
modify the shared metadata_dst (see execute_set_action).

> This fill_metadata function is not called for every packet so
> ndo_start_xmit() still needs to do route lookup.

Yes. I meant that in those cases where the fill_metadata function was
called, we may skip the lookup. Just an optimization and not an
important one. I'm not even sure it can currently happen as the skb is
cloned for each action.

> egress_tun_info in dp_upcall_info is required to check for failure. It
> would be only set on successful fill_metadata operation.

Or you can just set dst to NULL on failure.

 Jiri

-- 
Jiri Benc

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net] openvswitch: Fix egress tunnel info.
  2015-10-06 18:55     ` Jiri Benc
@ 2015-10-06 19:11       ` Pravin Shelar
  0 siblings, 0 replies; 19+ messages in thread
From: Pravin Shelar @ 2015-10-06 19:11 UTC (permalink / raw)
  To: Jiri Benc; +Cc: netdev

On Tue, Oct 6, 2015 at 11:55 AM, Jiri Benc <jbenc@redhat.com> wrote:
> On Tue, 6 Oct 2015 11:26:31 -0700, Pravin Shelar wrote:
>> I do not see need to drop and reallocate dst in this operation. I just
>> need to set source IP address and source and dst port in the metadata
>> dst already set in skb.
>
> If I'm looking at the code correctly, metadata_dst is stored in the
> action and each skb gets only a reference to it. Modifying it would
> modify the shared metadata_dst (see execute_set_action).
>
right, I was thinking about updating just the source IPv4 address
which should be same for given tunnel metadata. But that can get
complicated for updates to ipv6 address.

>> This fill_metadata function is not called for every packet so
>> ndo_start_xmit() still needs to do route lookup.
>
> Yes. I meant that in those cases where the fill_metadata function was
> called, we may skip the lookup. Just an optimization and not an
> important one. I'm not even sure it can currently happen as the skb is
> cloned for each action.
>
The function is called for sampling which is rare operation. So I do
not want to complicate code to optimize such case.

>> egress_tun_info in dp_upcall_info is required to check for failure. It
>> would be only set on successful fill_metadata operation.
>
> Or you can just set dst to NULL on failure.
>

In that case we would need to extra logic to update error stats
correctly device xmit operation. For example geneve_xmit() updates
different error stats for route lookup.

^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2015-10-07 17:19 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2015-10-05 17:58 [PATCH net] openvswitch: Fix egress tunnel info Pravin B Shelar
2015-10-05 18:40 ` Jiri Benc
2015-10-05 19:37   ` Pravin Shelar
2015-10-06 15:56     ` Jiri Benc
2015-10-06 18:28       ` Pravin Shelar
2015-10-06 18:45         ` Jiri Benc
2015-10-06 18:55           ` Pravin Shelar
2015-10-06 19:03             ` Jiri Benc
2015-10-06 19:21               ` Pravin Shelar
2015-10-06 19:32                 ` Jiri Benc
2015-10-06 21:16                   ` Pravin Shelar
2015-10-07  8:09                     ` Jiri Benc
2015-10-07  9:34                       ` Thomas Graf
2015-10-07  9:53                         ` Jiri Benc
2015-10-07 17:19                       ` Pravin Shelar
2015-10-06 15:42 ` Jiri Benc
2015-10-06 18:26   ` Pravin Shelar
2015-10-06 18:55     ` Jiri Benc
2015-10-06 19:11       ` Pravin Shelar

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).