netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH net-next 1/2] net: Refactor path selection in __ip_route_output_key
@ 2015-09-09 21:57 David Ahern
  2015-09-09 21:57 ` [PATCH 2/2 v2] net: Remove VRF change to udp_sendmsg David Ahern
  2015-09-09 22:56 ` [PATCH net-next 1/2] net: Refactor path selection in __ip_route_output_key David Ahern
  0 siblings, 2 replies; 10+ messages in thread
From: David Ahern @ 2015-09-09 21:57 UTC (permalink / raw)
  To: netdev; +Cc: tom, David Ahern

VRF device needs same path selection following lookup to set source
address. Rather than duplicating code, move existing code into a
function that is exported to modules.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
---
 include/net/ip_fib.h     |  2 ++
 net/ipv4/fib_semantics.c | 18 ++++++++++++++++++
 net/ipv4/route.c         | 13 +------------
 3 files changed, 21 insertions(+), 12 deletions(-)

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index a37d0432bebd..9d8fe37dacbf 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -313,6 +313,8 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event);
 int fib_sync_down_addr(struct net *net, __be32 local);
 int fib_sync_up(struct net_device *dev, unsigned int nh_flags);
 void fib_select_multipath(struct fib_result *res);
+void fib_select_path(struct net *net, struct fib_result *res,
+		     struct flowi4 *fl4);
 
 /* Exported by fib_trie.c */
 void fib_trie_init(void);
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 064bd3caaa4f..4ca172223ecb 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1547,3 +1547,21 @@ void fib_select_multipath(struct fib_result *res)
 	spin_unlock_bh(&fib_multipath_lock);
 }
 #endif
+
+void fib_select_path(struct net *net, struct fib_result *res,
+		     struct flowi4 *fl4)
+{
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+	if (res->fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
+		fib_select_multipath(res);
+	else
+#endif
+	if (!res->prefixlen &&
+	    res->table->tb_num_default > 1 &&
+	    res->type == RTN_UNICAST && !fl4->flowi4_oif)
+		fib_select_default(fl4, res);
+
+	if (!fl4->saddr)
+		fl4->saddr = FIB_RES_PREFSRC(net, *res);
+}
+EXPORT_SYMBOL(fib_select_path);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 5f4a5565ad8b..16d62635b484 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2198,18 +2198,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
 		goto make_route;
 	}
 
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
-	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
-		fib_select_multipath(&res);
-	else
-#endif
-	if (!res.prefixlen &&
-	    res.table->tb_num_default > 1 &&
-	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
-		fib_select_default(fl4, &res);
-
-	if (!fl4->saddr)
-		fl4->saddr = FIB_RES_PREFSRC(net, res);
+	fib_select_path(net, &res, fl4);
 
 	dev_out = FIB_RES_DEV(res);
 	fl4->flowi4_oif = dev_out->ifindex;
-- 
2.3.2 (Apple Git-55)

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH 2/2 v2] net: Remove VRF change to udp_sendmsg
  2015-09-09 21:57 [PATCH net-next 1/2] net: Refactor path selection in __ip_route_output_key David Ahern
@ 2015-09-09 21:57 ` David Ahern
  2015-09-10  0:04   ` Tom Herbert
  2015-09-09 22:56 ` [PATCH net-next 1/2] net: Refactor path selection in __ip_route_output_key David Ahern
  1 sibling, 1 reply; 10+ messages in thread
From: David Ahern @ 2015-09-09 21:57 UTC (permalink / raw)
  To: netdev; +Cc: tom, David Ahern

Remove the VRF change in udp_sendmsg to set the source address. The VRF
driver already has access to the packet on the TX path via the dst. It
can be used to update the source address in the header. Since the VRF
device is directly associated with a table use fib_table_lookup rather
than the ip_route_output lookup functions.

Function to update source address based on similar code in OVS.

Cc: Tom Herbert <tom@herbertland.com>
Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
---
v2
- use fib_table_lookup over __ip_route_output_key since VRF device
  is associated with a table

Dave: not sure if you wanted this for net or wait until net-next.

 drivers/net/vrf.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++----
 net/ipv4/udp.c    | 18 --------------
 2 files changed, 66 insertions(+), 23 deletions(-)

diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index e7094fbd7568..4ae0295d4c63 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -160,6 +160,65 @@ static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb,
 	return NET_XMIT_DROP;
 }
 
+static void update_ipv4_saddr(struct sk_buff *skb, struct iphdr *iph,
+			      __be32 new_addr)
+{
+	int tlen = skb->len - skb_transport_offset(skb);
+
+	if (iph->protocol == IPPROTO_TCP) {
+		if (likely(tlen >= sizeof(struct tcphdr))) {
+			inet_proto_csum_replace4(&tcp_hdr(skb)->check, skb,
+						 iph->saddr, new_addr, 1);
+		}
+	} else if (iph->protocol == IPPROTO_UDP) {
+		if (likely(tlen >= sizeof(struct udphdr))) {
+			struct udphdr *uh = udp_hdr(skb);
+
+			if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) {
+				inet_proto_csum_replace4(&uh->check, skb,
+							 iph->saddr, new_addr,
+							 1);
+			if (!uh->check)
+				uh->check = CSUM_MANGLED_0;
+			}
+		}
+	}
+
+	csum_replace4(&iph->check, iph->saddr, new_addr);
+	skb_clear_hash(skb);
+	iph->saddr = new_addr;
+}
+
+static int vrf_set_ip_saddr(struct sk_buff *skb, struct net_device *dev)
+{
+	struct iphdr *ip4h = ip_hdr(skb);
+	struct flowi4 fl4 = {
+		.flowi4_oif = dev->ifindex,
+		.flowi4_iif = LOOPBACK_IFINDEX,
+		.flowi4_tos = RT_TOS(ip4h->tos),
+		.flowi4_flags = FLOWI_FLAG_ANYSRC | FLOWI_FLAG_VRFSRC,
+		.daddr = ip4h->daddr,
+	};
+	struct net_vrf *vrf = netdev_priv(dev);
+	struct net *net = dev_net(dev);
+	struct fib_result res;
+	struct fib_table *tb;
+
+	res.tclassid = 0;
+
+	rcu_read_lock();
+
+	tb = fib_get_table(net, vrf->tb_id);
+	if (tb && !fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF)) {
+		fib_select_path(net, &res, &fl4);
+		update_ipv4_saddr(skb, ip4h, fl4.saddr);
+	}
+
+	rcu_read_unlock();
+
+	return 0;
+}
+
 static int vrf_send_v4_prep(struct sk_buff *skb, struct flowi4 *fl4,
 			    struct net_device *vrf_dev)
 {
@@ -195,16 +254,12 @@ static netdev_tx_t vrf_process_v4_outbound(struct sk_buff *skb,
 		.flowi4_tos = RT_TOS(ip4h->tos),
 		.flowi4_flags = FLOWI_FLAG_ANYSRC | FLOWI_FLAG_VRFSRC,
 		.daddr = ip4h->daddr,
+		.saddr = ip4h->saddr,
 	};
 
 	if (vrf_send_v4_prep(skb, &fl4, vrf_dev))
 		goto err;
 
-	if (!ip4h->saddr) {
-		ip4h->saddr = inet_select_addr(skb_dst(skb)->dev, 0,
-					       RT_SCOPE_LINK);
-	}
-
 	ret = ip_local_out(skb);
 	if (unlikely(net_xmit_eval(ret)))
 		vrf_dev->stats.tx_errors++;
@@ -298,12 +353,18 @@ static int vrf_finish_output(struct sock *sk, struct sk_buff *skb)
 static int vrf_output(struct sock *sk, struct sk_buff *skb)
 {
 	struct net_device *dev = skb_dst(skb)->dev;
+	struct iphdr *iph = ip_hdr(skb);
 
 	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
 
 	skb->dev = dev;
 	skb->protocol = htons(ETH_P_IP);
 
+	if (!iph->saddr && vrf_set_ip_saddr(skb, dev)) {
+		vrf_tx_error(dev, skb);
+		return -EINVAL;
+	}
+
 	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, skb,
 			    NULL, dev,
 			    vrf_finish_output,
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index c0a15e7f359f..ee3ba30f1ca5 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1017,24 +1017,6 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 
 		fl4 = &fl4_stack;
 
-		/* unconnected socket. If output device is enslaved to a VRF
-		 * device lookup source address from VRF table. This mimics
-		 * behavior of ip_route_connect{_init}.
-		 */
-		if (netif_index_is_vrf(net, ipc.oif)) {
-			flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos,
-					   RT_SCOPE_UNIVERSE, sk->sk_protocol,
-					   (flow_flags | FLOWI_FLAG_VRFSRC),
-					   faddr, saddr, dport,
-					   inet->inet_sport);
-
-			rt = ip_route_output_flow(net, fl4, sk);
-			if (!IS_ERR(rt)) {
-				saddr = fl4->saddr;
-				ip_rt_put(rt);
-			}
-		}
-
 		flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos,
 				   RT_SCOPE_UNIVERSE, sk->sk_protocol,
 				   flow_flags,
-- 
2.3.2 (Apple Git-55)

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH net-next 1/2] net: Refactor path selection in __ip_route_output_key
  2015-09-09 21:57 [PATCH net-next 1/2] net: Refactor path selection in __ip_route_output_key David Ahern
  2015-09-09 21:57 ` [PATCH 2/2 v2] net: Remove VRF change to udp_sendmsg David Ahern
@ 2015-09-09 22:56 ` David Ahern
  2015-09-10  0:00   ` David Miller
  1 sibling, 1 reply; 10+ messages in thread
From: David Ahern @ 2015-09-09 22:56 UTC (permalink / raw)
  To: netdev, David Miller; +Cc: tom

arg... I goofed the subject line (default is net-next via git config), 
but this one is required for 2/2 so if you want 2/2 for net this one 
should be net as well.


On 9/9/15 3:57 PM, David Ahern wrote:
> VRF device needs same path selection following lookup to set source
> address. Rather than duplicating code, move existing code into a
> function that is exported to modules.
>
> Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
> ---
>   include/net/ip_fib.h     |  2 ++
>   net/ipv4/fib_semantics.c | 18 ++++++++++++++++++
>   net/ipv4/route.c         | 13 +------------
>   3 files changed, 21 insertions(+), 12 deletions(-)
>
> diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
> index a37d0432bebd..9d8fe37dacbf 100644
> --- a/include/net/ip_fib.h
> +++ b/include/net/ip_fib.h
> @@ -313,6 +313,8 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event);
>   int fib_sync_down_addr(struct net *net, __be32 local);
>   int fib_sync_up(struct net_device *dev, unsigned int nh_flags);
>   void fib_select_multipath(struct fib_result *res);
> +void fib_select_path(struct net *net, struct fib_result *res,
> +		     struct flowi4 *fl4);
>
>   /* Exported by fib_trie.c */
>   void fib_trie_init(void);
> diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
> index 064bd3caaa4f..4ca172223ecb 100644
> --- a/net/ipv4/fib_semantics.c
> +++ b/net/ipv4/fib_semantics.c
> @@ -1547,3 +1547,21 @@ void fib_select_multipath(struct fib_result *res)
>   	spin_unlock_bh(&fib_multipath_lock);
>   }
>   #endif
> +
> +void fib_select_path(struct net *net, struct fib_result *res,
> +		     struct flowi4 *fl4)
> +{
> +#ifdef CONFIG_IP_ROUTE_MULTIPATH
> +	if (res->fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
> +		fib_select_multipath(res);
> +	else
> +#endif
> +	if (!res->prefixlen &&
> +	    res->table->tb_num_default > 1 &&
> +	    res->type == RTN_UNICAST && !fl4->flowi4_oif)
> +		fib_select_default(fl4, res);
> +
> +	if (!fl4->saddr)
> +		fl4->saddr = FIB_RES_PREFSRC(net, *res);
> +}
> +EXPORT_SYMBOL(fib_select_path);
> diff --git a/net/ipv4/route.c b/net/ipv4/route.c
> index 5f4a5565ad8b..16d62635b484 100644
> --- a/net/ipv4/route.c
> +++ b/net/ipv4/route.c
> @@ -2198,18 +2198,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
>   		goto make_route;
>   	}
>
> -#ifdef CONFIG_IP_ROUTE_MULTIPATH
> -	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
> -		fib_select_multipath(&res);
> -	else
> -#endif
> -	if (!res.prefixlen &&
> -	    res.table->tb_num_default > 1 &&
> -	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
> -		fib_select_default(fl4, &res);
> -
> -	if (!fl4->saddr)
> -		fl4->saddr = FIB_RES_PREFSRC(net, res);
> +	fib_select_path(net, &res, fl4);
>
>   	dev_out = FIB_RES_DEV(res);
>   	fl4->flowi4_oif = dev_out->ifindex;
>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH net-next 1/2] net: Refactor path selection in __ip_route_output_key
  2015-09-09 22:56 ` [PATCH net-next 1/2] net: Refactor path selection in __ip_route_output_key David Ahern
@ 2015-09-10  0:00   ` David Miller
  0 siblings, 0 replies; 10+ messages in thread
From: David Miller @ 2015-09-10  0:00 UTC (permalink / raw)
  To: dsa; +Cc: netdev, tom

From: David Ahern <dsa@cumulusnetworks.com>
Date: Wed, 9 Sep 2015 16:56:16 -0600

> arg... I goofed the subject line (default is net-next via git config),
> but this one is required for 2/2 so if you want 2/2 for net this one
> should be net as well.

Understood.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 2/2 v2] net: Remove VRF change to udp_sendmsg
  2015-09-09 21:57 ` [PATCH 2/2 v2] net: Remove VRF change to udp_sendmsg David Ahern
@ 2015-09-10  0:04   ` Tom Herbert
  2015-09-10  0:23     ` David Ahern
  0 siblings, 1 reply; 10+ messages in thread
From: Tom Herbert @ 2015-09-10  0:04 UTC (permalink / raw)
  To: David Ahern; +Cc: Linux Kernel Network Developers

On Wed, Sep 9, 2015 at 2:57 PM, David Ahern <dsa@cumulusnetworks.com> wrote:
> Remove the VRF change in udp_sendmsg to set the source address. The VRF
> driver already has access to the packet on the TX path via the dst. It
> can be used to update the source address in the header. Since the VRF
> device is directly associated with a table use fib_table_lookup rather
> than the ip_route_output lookup functions.
>
> Function to update source address based on similar code in OVS.
>
I have the same comment as in v1 of this patch. Implementing address
selection by doing SNAT is not the right approach.

Tom

> Cc: Tom Herbert <tom@herbertland.com>
> Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
> ---
> v2
> - use fib_table_lookup over __ip_route_output_key since VRF device
>   is associated with a table
>
> Dave: not sure if you wanted this for net or wait until net-next.
>
>  drivers/net/vrf.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++----
>  net/ipv4/udp.c    | 18 --------------
>  2 files changed, 66 insertions(+), 23 deletions(-)
>
> diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
> index e7094fbd7568..4ae0295d4c63 100644
> --- a/drivers/net/vrf.c
> +++ b/drivers/net/vrf.c
> @@ -160,6 +160,65 @@ static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb,
>         return NET_XMIT_DROP;
>  }
>
> +static void update_ipv4_saddr(struct sk_buff *skb, struct iphdr *iph,
> +                             __be32 new_addr)
> +{
> +       int tlen = skb->len - skb_transport_offset(skb);
> +
> +       if (iph->protocol == IPPROTO_TCP) {
> +               if (likely(tlen >= sizeof(struct tcphdr))) {
> +                       inet_proto_csum_replace4(&tcp_hdr(skb)->check, skb,
> +                                                iph->saddr, new_addr, 1);
> +               }
> +       } else if (iph->protocol == IPPROTO_UDP) {
> +               if (likely(tlen >= sizeof(struct udphdr))) {
> +                       struct udphdr *uh = udp_hdr(skb);
> +
> +                       if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) {
> +                               inet_proto_csum_replace4(&uh->check, skb,
> +                                                        iph->saddr, new_addr,
> +                                                        1);
> +                       if (!uh->check)
> +                               uh->check = CSUM_MANGLED_0;
> +                       }
> +               }
> +       }
> +
> +       csum_replace4(&iph->check, iph->saddr, new_addr);
> +       skb_clear_hash(skb);
> +       iph->saddr = new_addr;
> +}
> +
> +static int vrf_set_ip_saddr(struct sk_buff *skb, struct net_device *dev)
> +{
> +       struct iphdr *ip4h = ip_hdr(skb);
> +       struct flowi4 fl4 = {
> +               .flowi4_oif = dev->ifindex,
> +               .flowi4_iif = LOOPBACK_IFINDEX,
> +               .flowi4_tos = RT_TOS(ip4h->tos),
> +               .flowi4_flags = FLOWI_FLAG_ANYSRC | FLOWI_FLAG_VRFSRC,
> +               .daddr = ip4h->daddr,
> +       };
> +       struct net_vrf *vrf = netdev_priv(dev);
> +       struct net *net = dev_net(dev);
> +       struct fib_result res;
> +       struct fib_table *tb;
> +
> +       res.tclassid = 0;
> +
> +       rcu_read_lock();
> +
> +       tb = fib_get_table(net, vrf->tb_id);
> +       if (tb && !fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF)) {
> +               fib_select_path(net, &res, &fl4);
> +               update_ipv4_saddr(skb, ip4h, fl4.saddr);
> +       }
> +
> +       rcu_read_unlock();
> +
> +       return 0;
> +}
> +
>  static int vrf_send_v4_prep(struct sk_buff *skb, struct flowi4 *fl4,
>                             struct net_device *vrf_dev)
>  {
> @@ -195,16 +254,12 @@ static netdev_tx_t vrf_process_v4_outbound(struct sk_buff *skb,
>                 .flowi4_tos = RT_TOS(ip4h->tos),
>                 .flowi4_flags = FLOWI_FLAG_ANYSRC | FLOWI_FLAG_VRFSRC,
>                 .daddr = ip4h->daddr,
> +               .saddr = ip4h->saddr,
>         };
>
>         if (vrf_send_v4_prep(skb, &fl4, vrf_dev))
>                 goto err;
>
> -       if (!ip4h->saddr) {
> -               ip4h->saddr = inet_select_addr(skb_dst(skb)->dev, 0,
> -                                              RT_SCOPE_LINK);
> -       }
> -
>         ret = ip_local_out(skb);
>         if (unlikely(net_xmit_eval(ret)))
>                 vrf_dev->stats.tx_errors++;
> @@ -298,12 +353,18 @@ static int vrf_finish_output(struct sock *sk, struct sk_buff *skb)
>  static int vrf_output(struct sock *sk, struct sk_buff *skb)
>  {
>         struct net_device *dev = skb_dst(skb)->dev;
> +       struct iphdr *iph = ip_hdr(skb);
>
>         IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
>
>         skb->dev = dev;
>         skb->protocol = htons(ETH_P_IP);
>
> +       if (!iph->saddr && vrf_set_ip_saddr(skb, dev)) {
> +               vrf_tx_error(dev, skb);
> +               return -EINVAL;
> +       }
> +
>         return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, skb,
>                             NULL, dev,
>                             vrf_finish_output,
> diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
> index c0a15e7f359f..ee3ba30f1ca5 100644
> --- a/net/ipv4/udp.c
> +++ b/net/ipv4/udp.c
> @@ -1017,24 +1017,6 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
>
>                 fl4 = &fl4_stack;
>
> -               /* unconnected socket. If output device is enslaved to a VRF
> -                * device lookup source address from VRF table. This mimics
> -                * behavior of ip_route_connect{_init}.
> -                */
> -               if (netif_index_is_vrf(net, ipc.oif)) {
> -                       flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos,
> -                                          RT_SCOPE_UNIVERSE, sk->sk_protocol,
> -                                          (flow_flags | FLOWI_FLAG_VRFSRC),
> -                                          faddr, saddr, dport,
> -                                          inet->inet_sport);
> -
> -                       rt = ip_route_output_flow(net, fl4, sk);
> -                       if (!IS_ERR(rt)) {
> -                               saddr = fl4->saddr;
> -                               ip_rt_put(rt);
> -                       }
> -               }
> -
>                 flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos,
>                                    RT_SCOPE_UNIVERSE, sk->sk_protocol,
>                                    flow_flags,
> --
> 2.3.2 (Apple Git-55)
>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 2/2 v2] net: Remove VRF change to udp_sendmsg
  2015-09-10  0:04   ` Tom Herbert
@ 2015-09-10  0:23     ` David Ahern
  2015-09-10  0:51       ` Tom Herbert
  0 siblings, 1 reply; 10+ messages in thread
From: David Ahern @ 2015-09-10  0:23 UTC (permalink / raw)
  To: Tom Herbert; +Cc: Linux Kernel Network Developers

On 9/9/15 6:04 PM, Tom Herbert wrote:
> On Wed, Sep 9, 2015 at 2:57 PM, David Ahern <dsa@cumulusnetworks.com> wrote:
>> Remove the VRF change in udp_sendmsg to set the source address. The VRF
>> driver already has access to the packet on the TX path via the dst. It
>> can be used to update the source address in the header. Since the VRF
>> device is directly associated with a table use fib_table_lookup rather
>> than the ip_route_output lookup functions.
>>
>> Function to update source address based on similar code in OVS.
>>
> I have the same comment as in v1 of this patch. Implementing address
> selection by doing SNAT is not the right approach.

Hi Tom:

As I mentioned before this is not SNAT. The source address is being done 
at L3 just as it is in the non-VRF case, and it is only set if the prior 
layers have not.

vrf_set_ip_saddr is called by vrf_output. Setting a probe on a test case 
shows:

root@vm-wheezy:~# perf probe vrf_output
Added new event:
   probe:vrf_output     (on vrf_output)

You can now use it in all perf tools, such as:

	perf record -e probe:vrf_output -aR sleep 1

root@vm-wheezy:~# perf record -e probe:vrf_output -a -g -- vrf-test -t 
dgram -I vrf10 -r 10.2.1.254
09/09/2015 11:19:40 Sent message:
09/09/2015 11:19:40     Hello world!
09/09/2015 11:19:40 Message from: 10.2.1.254:12345
09/09/2015 11:19:40     Hello world!
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.050 MB perf.data (1 samples) ]

root@vm-wheezy:~# perf script --kallsyms /tmp/kallsyms
vrf-test  2773 [002]   207.598817: probe:vrf_output: (ffffffff813a5959)
         ffffffff813a595a vrf_output ([kernel.kallsyms])
         ffffffff81451dd7 ip_local_out_sk ([kernel.kallsyms])
         ffffffff81452cd7 ip_send_skb ([kernel.kallsyms])
         ffffffff8147571e udp_send_skb ([kernel.kallsyms])
         ffffffff81475f6f udp_sendmsg ([kernel.kallsyms])
         ffffffff8147feec inet_sendmsg ([kernel.kallsyms])
         ffffffff813ffc18 sock_sendmsg_nosec ([kernel.kallsyms])
         ffffffff81401414 SYSC_sendto ([kernel.kallsyms])
         ffffffff814015dd sys_sendto ([kernel.kallsyms])
         ffffffff81526572 entry_SYSCALL_64_fastpath ([kernel.kallsyms])
                    dc9d3 sendto (/lib/x86_64-linux-gnu/libc-2.13.so)
                     3217 main (/root/bin/vrf-test)
                    1eead __libc_start_main 
(/lib/x86_64-linux-gnu/libc-2.13.so)

Packets are diverted to the VRF device via a static/custom dst which has 
the output operation set to vrf_output.

David

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 2/2 v2] net: Remove VRF change to udp_sendmsg
  2015-09-10  0:23     ` David Ahern
@ 2015-09-10  0:51       ` Tom Herbert
  2015-09-10  1:10         ` David Ahern
  2015-09-10  3:20         ` David Miller
  0 siblings, 2 replies; 10+ messages in thread
From: Tom Herbert @ 2015-09-10  0:51 UTC (permalink / raw)
  To: David Ahern; +Cc: Linux Kernel Network Developers

On Wed, Sep 9, 2015 at 5:23 PM, David Ahern <dsa@cumulusnetworks.com> wrote:
> On 9/9/15 6:04 PM, Tom Herbert wrote:
>>
>> On Wed, Sep 9, 2015 at 2:57 PM, David Ahern <dsa@cumulusnetworks.com>
>> wrote:
>>>
>>> Remove the VRF change in udp_sendmsg to set the source address. The VRF
>>> driver already has access to the packet on the TX path via the dst. It
>>> can be used to update the source address in the header. Since the VRF
>>> device is directly associated with a table use fib_table_lookup rather
>>> than the ip_route_output lookup functions.
>>>
>>> Function to update source address based on similar code in OVS.
>>>
>> I have the same comment as in v1 of this patch. Implementing address
>> selection by doing SNAT is not the right approach.
>
>
> Hi Tom:
>
> As I mentioned before this is not SNAT. The source address is being done at
> L3 just as it is in the non-VRF case, and it is only set if the prior layers
> have not.
>
It is NAT since you are changing the source address and modifying the
transport protocol checksum below IP and transport layer. There are a
bunch of side effects that you would need to consider. This is
creating custom APIs changing the semantics of address selection, and
also creates inconsistency between how addresses may be selected
between a connected and unconnected sockets. Consider that
ip_local_out_sk calls netfilter NF_INET_LOCAL_OUT hook before
dst->output, so then netfilter would start seeing packets with zero
source address???

A lot of design in the stack is predicated on inet_select_addr
returning the source address to use for sending a packet. This should
always return a reasonable address as an invariant, if someone wishes
to rewrite addresses at a lower layer that's fine, but that should be
defined as a NAT operation. If a device wants to weigh in on address
selection then we can define an ndo function for that as I mentioned
before.

Tom

> vrf_set_ip_saddr is called by vrf_output. Setting a probe on a test case
> shows:
>
> root@vm-wheezy:~# perf probe vrf_output
> Added new event:
>   probe:vrf_output     (on vrf_output)
>
> You can now use it in all perf tools, such as:
>
>         perf record -e probe:vrf_output -aR sleep 1
>
> root@vm-wheezy:~# perf record -e probe:vrf_output -a -g -- vrf-test -t dgram
> -I vrf10 -r 10.2.1.254
> 09/09/2015 11:19:40 Sent message:
> 09/09/2015 11:19:40     Hello world!
> 09/09/2015 11:19:40 Message from: 10.2.1.254:12345
> 09/09/2015 11:19:40     Hello world!
> [ perf record: Woken up 1 times to write data ]
> [ perf record: Captured and wrote 0.050 MB perf.data (1 samples) ]
>
> root@vm-wheezy:~# perf script --kallsyms /tmp/kallsyms
> vrf-test  2773 [002]   207.598817: probe:vrf_output: (ffffffff813a5959)
>         ffffffff813a595a vrf_output ([kernel.kallsyms])
>         ffffffff81451dd7 ip_local_out_sk ([kernel.kallsyms])
>         ffffffff81452cd7 ip_send_skb ([kernel.kallsyms])
>         ffffffff8147571e udp_send_skb ([kernel.kallsyms])
>         ffffffff81475f6f udp_sendmsg ([kernel.kallsyms])
>         ffffffff8147feec inet_sendmsg ([kernel.kallsyms])
>         ffffffff813ffc18 sock_sendmsg_nosec ([kernel.kallsyms])
>         ffffffff81401414 SYSC_sendto ([kernel.kallsyms])
>         ffffffff814015dd sys_sendto ([kernel.kallsyms])
>         ffffffff81526572 entry_SYSCALL_64_fastpath ([kernel.kallsyms])
>                    dc9d3 sendto (/lib/x86_64-linux-gnu/libc-2.13.so)
>                     3217 main (/root/bin/vrf-test)
>                    1eead __libc_start_main
> (/lib/x86_64-linux-gnu/libc-2.13.so)
>
> Packets are diverted to the VRF device via a static/custom dst which has the
> output operation set to vrf_output.
>
> David
>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 2/2 v2] net: Remove VRF change to udp_sendmsg
  2015-09-10  0:51       ` Tom Herbert
@ 2015-09-10  1:10         ` David Ahern
  2015-09-10  3:20         ` David Miller
  1 sibling, 0 replies; 10+ messages in thread
From: David Ahern @ 2015-09-10  1:10 UTC (permalink / raw)
  To: Tom Herbert; +Cc: Linux Kernel Network Developers

On 9/9/15 6:51 PM, Tom Herbert wrote:
> It is NAT since you are changing the source address and modifying the
> transport protocol checksum below IP and transport layer. There are a
> bunch of side effects that you would need to consider. This is
> creating custom APIs changing the semantics of address selection, and
> also creates inconsistency between how addresses may be selected
> between a connected and unconnected sockets. Consider that
> ip_local_out_sk calls netfilter NF_INET_LOCAL_OUT hook before
> dst->output, so then netfilter would start seeing packets with zero
> source address???

understood.

>
> A lot of design in the stack is predicated on inet_select_addr
> returning the source address to use for sending a packet. This should
> always return a reasonable address as an invariant, if someone wishes
> to rewrite addresses at a lower layer that's fine, but that should be
> defined as a NAT operation. If a device wants to weigh in on address
> selection then we can define an ndo function for that as I mentioned
> before.

I am floating an idea internally that re-implements how VRF impacts the 
stack. It's 4.4 material and essentially adds dev_xxxx() / ndo functions 
for the intrusions. With net-next closed no since throwing them out yet 
and Nikolay always has good comments on my wild ass ideas.

David

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 2/2 v2] net: Remove VRF change to udp_sendmsg
  2015-09-10  0:51       ` Tom Herbert
  2015-09-10  1:10         ` David Ahern
@ 2015-09-10  3:20         ` David Miller
  2015-09-10  3:32           ` David Ahern
  1 sibling, 1 reply; 10+ messages in thread
From: David Miller @ 2015-09-10  3:20 UTC (permalink / raw)
  To: tom; +Cc: dsa, netdev

From: Tom Herbert <tom@herbertland.com>
Date: Wed, 9 Sep 2015 17:51:19 -0700

> Consider that ip_local_out_sk calls netfilter NF_INET_LOCAL_OUT hook
> before dst->output, so then netfilter would start seeing packets
> with zero source address???

I have to agree with Tom here, this is fatal.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 2/2 v2] net: Remove VRF change to udp_sendmsg
  2015-09-10  3:20         ` David Miller
@ 2015-09-10  3:32           ` David Ahern
  0 siblings, 0 replies; 10+ messages in thread
From: David Ahern @ 2015-09-10  3:32 UTC (permalink / raw)
  To: David Miller, tom; +Cc: netdev

On 9/9/15 9:20 PM, David Miller wrote:
> From: Tom Herbert <tom@herbertland.com>
> Date: Wed, 9 Sep 2015 17:51:19 -0700
>
>> Consider that ip_local_out_sk calls netfilter NF_INET_LOCAL_OUT hook
>> before dst->output, so then netfilter would start seeing packets
>> with zero source address???
>
> I have to agree with Tom here, this is fatal.
>

Ok. I will drop this topic for 4.3; revisit for 4.4.

David

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2015-09-10  3:32 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2015-09-09 21:57 [PATCH net-next 1/2] net: Refactor path selection in __ip_route_output_key David Ahern
2015-09-09 21:57 ` [PATCH 2/2 v2] net: Remove VRF change to udp_sendmsg David Ahern
2015-09-10  0:04   ` Tom Herbert
2015-09-10  0:23     ` David Ahern
2015-09-10  0:51       ` Tom Herbert
2015-09-10  1:10         ` David Ahern
2015-09-10  3:20         ` David Miller
2015-09-10  3:32           ` David Ahern
2015-09-09 22:56 ` [PATCH net-next 1/2] net: Refactor path selection in __ip_route_output_key David Ahern
2015-09-10  0:00   ` David Miller

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).