Re: [patch v2] ipvs: IPv6 tunnel mode

All of lore.kernel.org
 help / color / mirror / Atom feed

* Re: [patch v2] ipvs: IPv6 tunnel mode
@ 2010-09-27 13:59 ` Simon Horman
  0 siblings, 0 replies; 13+ messages in thread
From: Simon Horman @ 2010-09-27 13:59 UTC (permalink / raw)
  To: netfilter-devel, netdev
  Cc: Hans Schillstrom, Julian Anastasov, lvs-devel, Julius Volz,
	Wensong Zhang, Patrick McHardy

From: Julian Anastasov <ja@ssi.bg>

Tunnel mode for IPv6 doesn't work.

IPv6 encapsulation uses a bad source address for the tunnel.
i.e. VIP will be used as local-addr and encap. dst addr.
Decapsulation will not accept this.

Example
LVS (eth1 2003::2:0:1/96, VIP 2003::2:0:100)
   (eth0 2003::1:0:1/96)
RS  (ethX 2003::1:0:5/96)

tcpdump
2003::2:0:100 > 2003::1:0:5:
IP6 (hlim 63, next-header TCP (6) payload length: 40)
 2003::3:0:10.50991 > 2003::2:0:100.http: Flags [S], cksum 0x7312
(correct), seq 3006460279, win 5760, options [mss 1440,sackOK,TS val
1904932 ecr 0,nop,wscale 3], length 0

In Linux IPv6 impl. you can't have a tunnel with an any cast address
receiving packets (I have not tried to interpret RFC 2473)
To have receive capabilities the tunnel must have:
 - Local address set as a multicast addr or a unicast addr.
 - Remote address set as a unicast addr.
 - Loopback or link-local addresses are not allowed.

This causes us to set up a tunnel in the Real Server with the
LVS as the remote address, here you can't use the VIP address since it's
used inside the tunnel.

Solution
Use outgoing interface IPv6 address (match against the destination).
i.e. use ip6_route_output() to look up the route cache and
then use ipv6_dev_get_saddr(...) to set the source address of the
encapsulated packet.

Additionally, cache the results in new destination
fields: dst_cookie and dst_saddr and properly check the
returned dst from ip6_route_output. We now add xfrm_lookup
call only for the tunneling method where the source address
is a local one.

Original patch by Hans Schillstrom.
Check dst state and cache results for IPv6 by Julian Anastasov.

Tested-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>

--- 

* v1

  This is Julian's patch with a slightly edited version of the description
  from Hans's original patch.

* v2

  Updated changelog as per comments from Julian

Patrick, please consider this for nf-next.

diff -urp net-next-2.6-e548833-nfct_snat_reroute/linux/include/net/ip_vs.h linux/include/net/ip_vs.h
--- net-next-2.6-e548833-nfct_snat_reroute/linux/include/net/ip_vs.h	2010-09-16 09:03:48.000000000 +0300
+++ linux/include/net/ip_vs.h	2010-09-22 10:50:18.548963467 +0300
@@ -509,6 +509,10 @@ struct ip_vs_dest {
 	spinlock_t		dst_lock;	/* lock of dst_cache */
 	struct dst_entry	*dst_cache;	/* destination cache entry */
 	u32			dst_rtos;	/* RT_TOS(tos) for dst */
+	u32			dst_cookie;
+#ifdef CONFIG_IP_VS_IPV6
+	struct in6_addr		dst_saddr;
+#endif

 	/* for virtual service */
 	struct ip_vs_service	*svc;		/* service it belongs to */
diff -urp net-next-2.6-e548833-nfct_snat_reroute/linux/net/netfilter/ipvs/ip_vs_xmit.c linux/net/netfilter/ipvs/ip_vs_xmit.c
--- net-next-2.6-e548833-nfct_snat_reroute/linux/net/netfilter/ipvs/ip_vs_xmit.c	2010-09-16 09:02:25.000000000 +0300
+++ linux/net/netfilter/ipvs/ip_vs_xmit.c	2010-09-22 16:29:43.271964521 +0300
@@ -26,6 +26,7 @@
 #include <net/route.h>                  /* for ip_route_output */
 #include <net/ipv6.h>
 #include <net/ip6_route.h>
+#include <net/addrconf.h>
 #include <linux/icmpv6.h>
 #include <linux/netfilter.h>
 #include <linux/netfilter_ipv4.h>
@@ -37,26 +38,27 @@
  *      Destination cache to speed up outgoing route lookup
  */
 static inline void
-__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst)
+__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst,
+		u32 dst_cookie)
 {
 	struct dst_entry *old_dst;

 	old_dst = dest->dst_cache;
 	dest->dst_cache = dst;
 	dest->dst_rtos = rtos;
+	dest->dst_cookie = dst_cookie;
 	dst_release(old_dst);
 }

 static inline struct dst_entry *
-__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
+__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos)
 {
 	struct dst_entry *dst = dest->dst_cache;

 	if (!dst)
 		return NULL;
-	if ((dst->obsolete
-	     || (dest->af == AF_INET && rtos != dest->dst_rtos)) &&
-	    dst->ops->check(dst, cookie) == NULL) {
+	if ((dst->obsolete || rtos != dest->dst_rtos) &&
+	    dst->ops->check(dst, dest->dst_cookie) == NULL) {
 		dest->dst_cache = NULL;
 		dst_release(dst);
 		return NULL;
@@ -66,15 +68,16 @@ __ip_vs_dst_check(struct ip_vs_dest *des
 }

 static struct rtable *
-__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
+__ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_conn *cp, u32 rtos)
 {
+	struct net *net = dev_net(skb->dev);
 	struct rtable *rt;			/* Route to the other host */
 	struct ip_vs_dest *dest = cp->dest;

 	if (dest) {
 		spin_lock(&dest->dst_lock);
 		if (!(rt = (struct rtable *)
-		      __ip_vs_dst_check(dest, rtos, 0))) {
+		      __ip_vs_dst_check(dest, rtos))) {
 			struct flowi fl = {
 				.oif = 0,
 				.nl_u = {
@@ -84,13 +87,13 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp
 						.tos = rtos, } },
 			};

-			if (ip_route_output_key(&init_net, &rt, &fl)) {
+			if (ip_route_output_key(net, &rt, &fl)) {
 				spin_unlock(&dest->dst_lock);
 				IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
 					     &dest->addr.ip);
 				return NULL;
 			}
-			__ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst));
+			__ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst), 0);
 			IP_VS_DBG(10, "new dst %pI4, refcnt=%d, rtos=%X\n",
 				  &dest->addr.ip,
 				  atomic_read(&rt->dst.__refcnt), rtos);
@@ -106,7 +109,7 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp
 					.tos = rtos, } },
 		};

-		if (ip_route_output_key(&init_net, &rt, &fl)) {
+		if (ip_route_output_key(net, &rt, &fl)) {
 			IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
 				     &cp->daddr.ip);
 			return NULL;
@@ -117,62 +120,79 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp
 }

 #ifdef CONFIG_IP_VS_IPV6
+
+static struct dst_entry *
+__ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr,
+			struct in6_addr *ret_saddr, int do_xfrm)
+{
+	struct dst_entry *dst;
+	struct flowi fl = {
+		.oif = 0,
+		.nl_u = {
+			.ip6_u = {
+				.daddr = *daddr,
+			},
+		},
+	};
+
+	dst = ip6_route_output(net, NULL, &fl);
+	if (dst->error)
+		goto out_err;
+	if (!ret_saddr)
+		return dst;
+	if (ipv6_addr_any(&fl.fl6_src) &&
+	    ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev,
+			       &fl.fl6_dst, 0, &fl.fl6_src) < 0)
+		goto out_err;
+	if (do_xfrm && xfrm_lookup(net, &dst, &fl, NULL, 0) < 0)
+		goto out_err;
+	ipv6_addr_copy(ret_saddr, &fl.fl6_src);
+	return dst;
+
+out_err:
+	dst_release(dst);
+	IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", daddr);
+	return NULL;
+}
+
 static struct rt6_info *
-__ip_vs_get_out_rt_v6(struct ip_vs_conn *cp)
+__ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
+		      struct in6_addr *ret_saddr, int do_xfrm)
 {
+	struct net *net = dev_net(skb->dev);
 	struct rt6_info *rt;			/* Route to the other host */
 	struct ip_vs_dest *dest = cp->dest;
+	struct dst_entry *dst;

 	if (dest) {
 		spin_lock(&dest->dst_lock);
-		rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0, 0);
+		rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0);
 		if (!rt) {
-			struct flowi fl = {
-				.oif = 0,
-				.nl_u = {
-					.ip6_u = {
-						.daddr = dest->addr.in6,
-						.saddr = {
-							.s6_addr32 =
-								{ 0, 0, 0, 0 },
-						},
-					},
-				},
-			};
+			u32 cookie;

-			rt = (struct rt6_info *)ip6_route_output(&init_net,
-								 NULL, &fl);
-			if (!rt) {
+			dst = __ip_vs_route_output_v6(net, &dest->addr.in6,
+						      &dest->dst_saddr,
+						      do_xfrm);
+			if (!dst) {
 				spin_unlock(&dest->dst_lock);
-				IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n",
-					     &dest->addr.in6);
 				return NULL;
 			}
-			__ip_vs_dst_set(dest, 0, dst_clone(&rt->dst));
-			IP_VS_DBG(10, "new dst %pI6, refcnt=%d\n",
-				  &dest->addr.in6,
+			rt = (struct rt6_info *) dst;
+			cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
+			__ip_vs_dst_set(dest, 0, dst_clone(&rt->dst), cookie);
+			IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
+				  &dest->addr.in6, &dest->dst_saddr,
 				  atomic_read(&rt->dst.__refcnt));
 		}
+		if (ret_saddr)
+			ipv6_addr_copy(ret_saddr, &dest->dst_saddr);
 		spin_unlock(&dest->dst_lock);
 	} else {
-		struct flowi fl = {
-			.oif = 0,
-			.nl_u = {
-				.ip6_u = {
-					.daddr = cp->daddr.in6,
-					.saddr = {
-						.s6_addr32 = { 0, 0, 0, 0 },
-					},
-				},
-			},
-		};
-
-		rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
-		if (!rt) {
-			IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n",
-				     &cp->daddr.in6);
+		dst = __ip_vs_route_output_v6(net, &cp->daddr.in6, ret_saddr,
+					      do_xfrm);
+		if (!dst)
 			return NULL;
-		}
+		rt = (struct rt6_info *) dst;
 	}

 	return rt;
@@ -248,6 +268,7 @@ int
 ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 		  struct ip_vs_protocol *pp)
 {
+	struct net *net = dev_net(skb->dev);
 	struct rtable *rt;			/* Route to the other host */
 	struct iphdr  *iph = ip_hdr(skb);
 	u8     tos = iph->tos;
@@ -263,7 +284,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, s

 	EnterFunction(10);

-	if (ip_route_output_key(&init_net, &rt, &fl)) {
+	if (ip_route_output_key(net, &rt, &fl)) {
 		IP_VS_DBG_RL("%s(): ip_route_output error, dest: %pI4\n",
 			     __func__, &iph->daddr);
 		goto tx_error_icmp;
@@ -313,25 +334,18 @@ int
 ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 		     struct ip_vs_protocol *pp)
 {
+	struct net *net = dev_net(skb->dev);
+	struct dst_entry *dst;
 	struct rt6_info *rt;			/* Route to the other host */
 	struct ipv6hdr  *iph = ipv6_hdr(skb);
 	int    mtu;
-	struct flowi fl = {
-		.oif = 0,
-		.nl_u = {
-			.ip6_u = {
-				.daddr = iph->daddr,
-				.saddr = { .s6_addr32 = {0, 0, 0, 0} }, } },
-	};

 	EnterFunction(10);

-	rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
-	if (!rt) {
-		IP_VS_DBG_RL("%s(): ip6_route_output error, dest: %pI6\n",
-			     __func__, &iph->daddr);
+	dst = __ip_vs_route_output_v6(net, &iph->daddr, NULL, 0);
+	if (!dst)
 		goto tx_error_icmp;
-	}
+	rt = (struct rt6_info *) dst;

 	/* MTU checking */
 	mtu = dst_mtu(&rt->dst);
@@ -397,7 +411,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, stru
 		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
 	}

-	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
+	if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(iph->tos))))
 		goto tx_error_icmp;

 	/* MTU checking */
@@ -472,7 +486,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, s
 		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
 	}

-	rt = __ip_vs_get_out_rt_v6(cp);
+	rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0);
 	if (!rt)
 		goto tx_error_icmp;

@@ -557,7 +571,6 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, s
 	struct iphdr  *old_iph = ip_hdr(skb);
 	u8     tos = old_iph->tos;
 	__be16 df = old_iph->frag_off;
-	sk_buff_data_t old_transport_header = skb->transport_header;
 	struct iphdr  *iph;			/* Our new IP header */
 	unsigned int max_headroom;		/* The extra header space needed */
 	int    mtu;
@@ -572,7 +585,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, s
 		goto tx_error;
 	}

-	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos))))
+	if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(tos))))
 		goto tx_error_icmp;

 	tdev = rt->dst.dev;
@@ -616,7 +629,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, s
 		old_iph = ip_hdr(skb);
 	}

-	skb->transport_header = old_transport_header;
+	skb->transport_header = skb->network_header;

 	/* fix old IP header checksum */
 	ip_send_check(old_iph);
@@ -670,9 +683,9 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb
 		     struct ip_vs_protocol *pp)
 {
 	struct rt6_info *rt;		/* Route to the other host */
+	struct in6_addr saddr;		/* Source for tunnel */
 	struct net_device *tdev;	/* Device to other host */
 	struct ipv6hdr  *old_iph = ipv6_hdr(skb);
-	sk_buff_data_t old_transport_header = skb->transport_header;
 	struct ipv6hdr  *iph;		/* Our new IP header */
 	unsigned int max_headroom;	/* The extra header space needed */
 	int    mtu;
@@ -687,17 +700,17 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb
 		goto tx_error;
 	}

-	rt = __ip_vs_get_out_rt_v6(cp);
+	rt = __ip_vs_get_out_rt_v6(skb, cp, &saddr, 1);
 	if (!rt)
 		goto tx_error_icmp;

 	tdev = rt->dst.dev;

 	mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
-	/* TODO IPv6: do we need this check in IPv6? */
-	if (mtu < 1280) {
+	if (mtu < IPV6_MIN_MTU) {
 		dst_release(&rt->dst);
-		IP_VS_DBG_RL("%s(): mtu less than 1280\n", __func__);
+		IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
+			     IPV6_MIN_MTU);
 		goto tx_error;
 	}
 	if (skb_dst(skb))
@@ -730,7 +743,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb
 		old_iph = ipv6_hdr(skb);
 	}

-	skb->transport_header = old_transport_header;
+	skb->transport_header = skb->network_header;

 	skb_push(skb, sizeof(struct ipv6hdr));
 	skb_reset_network_header(skb);
@@ -750,8 +763,8 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb
 	be16_add_cpu(&iph->payload_len, sizeof(*old_iph));
 	iph->priority		=	old_iph->priority;
 	memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl));
-	iph->daddr		=	rt->rt6i_dst.addr;
-	iph->saddr		=	cp->vaddr.in6; /* rt->rt6i_src.addr; */
+	ipv6_addr_copy(&iph->daddr, &rt->rt6i_dst.addr);
+	ipv6_addr_copy(&iph->saddr, &saddr);
 	iph->hop_limit		=	old_iph->hop_limit;

 	/* Another hack: avoid icmp_send in ip_fragment */
@@ -791,7 +804,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struc

 	EnterFunction(10);

-	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
+	if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(iph->tos))))
 		goto tx_error_icmp;

 	/* MTU checking */
@@ -843,7 +856,7 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, st

 	EnterFunction(10);

-	rt = __ip_vs_get_out_rt_v6(cp);
+	rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0);
 	if (!rt)
 		goto tx_error_icmp;

@@ -919,7 +932,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, str
 	 * mangle and send the packet here (only for VS/NAT)
 	 */

-	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(ip_hdr(skb)->tos))))
+	if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(ip_hdr(skb)->tos))))
 		goto tx_error_icmp;

 	/* MTU checking */
@@ -993,7 +1006,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb,
 	 * mangle and send the packet here (only for VS/NAT)
 	 */

-	rt = __ip_vs_get_out_rt_v6(cp);
+	rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0);
 	if (!rt)
 		goto tx_error_icmp;


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [patch v2] ipvs: IPv6 tunnel mode
@ 2010-09-27 13:59 ` Simon Horman
  0 siblings, 0 replies; 13+ messages in thread
From: Simon Horman @ 2010-09-27 13:59 UTC (permalink / raw)
  To: lvs-devel, netfilter-devel, netdev
  Cc: Hans Schillstrom, Julian Anastasov, lvs-devel, Julius Volz,
	Wensong Zhang, Patrick McHardy

From: Julian Anastasov <ja@ssi.bg>

Tunnel mode for IPv6 doesn't work.

IPv6 encapsulation uses a bad source address for the tunnel.
i.e. VIP will be used as local-addr and encap. dst addr.
Decapsulation will not accept this.

Example
LVS (eth1 2003::2:0:1/96, VIP 2003::2:0:100)
   (eth0 2003::1:0:1/96)
RS  (ethX 2003::1:0:5/96)

tcpdump
2003::2:0:100 > 2003::1:0:5:
IP6 (hlim 63, next-header TCP (6) payload length: 40)
 2003::3:0:10.50991 > 2003::2:0:100.http: Flags [S], cksum 0x7312
(correct), seq 3006460279, win 5760, options [mss 1440,sackOK,TS val
1904932 ecr 0,nop,wscale 3], length 0

In Linux IPv6 impl. you can't have a tunnel with an any cast address
receiving packets (I have not tried to interpret RFC 2473)
To have receive capabilities the tunnel must have:
 - Local address set as a multicast addr or a unicast addr.
 - Remote address set as a unicast addr.
 - Loopback or link-local addresses are not allowed.

This causes us to set up a tunnel in the Real Server with the
LVS as the remote address, here you can't use the VIP address since it's
used inside the tunnel.

Solution
Use outgoing interface IPv6 address (match against the destination).
i.e. use ip6_route_output() to look up the route cache and
then use ipv6_dev_get_saddr(...) to set the source address of the
encapsulated packet.

Additionally, cache the results in new destination
fields: dst_cookie and dst_saddr and properly check the
returned dst from ip6_route_output. We now add xfrm_lookup
call only for the tunneling method where the source address
is a local one.

Original patch by Hans Schillstrom.
Check dst state and cache results for IPv6 by Julian Anastasov.

Tested-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>

--- 

* v1

  This is Julian's patch with a slightly edited version of the description
  from Hans's original patch.

* v2

  Updated changelog as per comments from Julian

Patrick, please consider this for nf-next.

diff -urp net-next-2.6-e548833-nfct_snat_reroute/linux/include/net/ip_vs.h linux/include/net/ip_vs.h
--- net-next-2.6-e548833-nfct_snat_reroute/linux/include/net/ip_vs.h	2010-09-16 09:03:48.000000000 +0300
+++ linux/include/net/ip_vs.h	2010-09-22 10:50:18.548963467 +0300
@@ -509,6 +509,10 @@ struct ip_vs_dest {
 	spinlock_t		dst_lock;	/* lock of dst_cache */
 	struct dst_entry	*dst_cache;	/* destination cache entry */
 	u32			dst_rtos;	/* RT_TOS(tos) for dst */
+	u32			dst_cookie;
+#ifdef CONFIG_IP_VS_IPV6
+	struct in6_addr		dst_saddr;
+#endif

 	/* for virtual service */
 	struct ip_vs_service	*svc;		/* service it belongs to */
diff -urp net-next-2.6-e548833-nfct_snat_reroute/linux/net/netfilter/ipvs/ip_vs_xmit.c linux/net/netfilter/ipvs/ip_vs_xmit.c
--- net-next-2.6-e548833-nfct_snat_reroute/linux/net/netfilter/ipvs/ip_vs_xmit.c	2010-09-16 09:02:25.000000000 +0300
+++ linux/net/netfilter/ipvs/ip_vs_xmit.c	2010-09-22 16:29:43.271964521 +0300
@@ -26,6 +26,7 @@
 #include <net/route.h>                  /* for ip_route_output */
 #include <net/ipv6.h>
 #include <net/ip6_route.h>
+#include <net/addrconf.h>
 #include <linux/icmpv6.h>
 #include <linux/netfilter.h>
 #include <linux/netfilter_ipv4.h>
@@ -37,26 +38,27 @@
  *      Destination cache to speed up outgoing route lookup
  */
 static inline void
-__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst)
+__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst,
+		u32 dst_cookie)
 {
 	struct dst_entry *old_dst;

 	old_dst = dest->dst_cache;
 	dest->dst_cache = dst;
 	dest->dst_rtos = rtos;
+	dest->dst_cookie = dst_cookie;
 	dst_release(old_dst);
 }

 static inline struct dst_entry *
-__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
+__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos)
 {
 	struct dst_entry *dst = dest->dst_cache;

 	if (!dst)
 		return NULL;
-	if ((dst->obsolete
-	     || (dest->af == AF_INET && rtos != dest->dst_rtos)) &&
-	    dst->ops->check(dst, cookie) == NULL) {
+	if ((dst->obsolete || rtos != dest->dst_rtos) &&
+	    dst->ops->check(dst, dest->dst_cookie) == NULL) {
 		dest->dst_cache = NULL;
 		dst_release(dst);
 		return NULL;
@@ -66,15 +68,16 @@ __ip_vs_dst_check(struct ip_vs_dest *des
 }

 static struct rtable *
-__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
+__ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_conn *cp, u32 rtos)
 {
+	struct net *net = dev_net(skb->dev);
 	struct rtable *rt;			/* Route to the other host */
 	struct ip_vs_dest *dest = cp->dest;

 	if (dest) {
 		spin_lock(&dest->dst_lock);
 		if (!(rt = (struct rtable *)
-		      __ip_vs_dst_check(dest, rtos, 0))) {
+		      __ip_vs_dst_check(dest, rtos))) {
 			struct flowi fl = {
 				.oif = 0,
 				.nl_u = {
@@ -84,13 +87,13 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp
 						.tos = rtos, } },
 			};

-			if (ip_route_output_key(&init_net, &rt, &fl)) {
+			if (ip_route_output_key(net, &rt, &fl)) {
 				spin_unlock(&dest->dst_lock);
 				IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
 					     &dest->addr.ip);
 				return NULL;
 			}
-			__ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst));
+			__ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst), 0);
 			IP_VS_DBG(10, "new dst %pI4, refcnt=%d, rtos=%X\n",
 				  &dest->addr.ip,
 				  atomic_read(&rt->dst.__refcnt), rtos);
@@ -106,7 +109,7 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp
 					.tos = rtos, } },
 		};

-		if (ip_route_output_key(&init_net, &rt, &fl)) {
+		if (ip_route_output_key(net, &rt, &fl)) {
 			IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
 				     &cp->daddr.ip);
 			return NULL;
@@ -117,62 +120,79 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp
 }

 #ifdef CONFIG_IP_VS_IPV6
+
+static struct dst_entry *
+__ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr,
+			struct in6_addr *ret_saddr, int do_xfrm)
+{
+	struct dst_entry *dst;
+	struct flowi fl = {
+		.oif = 0,
+		.nl_u = {
+			.ip6_u = {
+				.daddr = *daddr,
+			},
+		},
+	};
+
+	dst = ip6_route_output(net, NULL, &fl);
+	if (dst->error)
+		goto out_err;
+	if (!ret_saddr)
+		return dst;
+	if (ipv6_addr_any(&fl.fl6_src) &&
+	    ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev,
+			       &fl.fl6_dst, 0, &fl.fl6_src) < 0)
+		goto out_err;
+	if (do_xfrm && xfrm_lookup(net, &dst, &fl, NULL, 0) < 0)
+		goto out_err;
+	ipv6_addr_copy(ret_saddr, &fl.fl6_src);
+	return dst;
+
+out_err:
+	dst_release(dst);
+	IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", daddr);
+	return NULL;
+}
+
 static struct rt6_info *
-__ip_vs_get_out_rt_v6(struct ip_vs_conn *cp)
+__ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
+		      struct in6_addr *ret_saddr, int do_xfrm)
 {
+	struct net *net = dev_net(skb->dev);
 	struct rt6_info *rt;			/* Route to the other host */
 	struct ip_vs_dest *dest = cp->dest;
+	struct dst_entry *dst;

 	if (dest) {
 		spin_lock(&dest->dst_lock);
-		rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0, 0);
+		rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0);
 		if (!rt) {
-			struct flowi fl = {
-				.oif = 0,
-				.nl_u = {
-					.ip6_u = {
-						.daddr = dest->addr.in6,
-						.saddr = {
-							.s6_addr32 =
-								{ 0, 0, 0, 0 },
-						},
-					},
-				},
-			};
+			u32 cookie;

-			rt = (struct rt6_info *)ip6_route_output(&init_net,
-								 NULL, &fl);
-			if (!rt) {
+			dst = __ip_vs_route_output_v6(net, &dest->addr.in6,
+						      &dest->dst_saddr,
+						      do_xfrm);
+			if (!dst) {
 				spin_unlock(&dest->dst_lock);
-				IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n",
-					     &dest->addr.in6);
 				return NULL;
 			}
-			__ip_vs_dst_set(dest, 0, dst_clone(&rt->dst));
-			IP_VS_DBG(10, "new dst %pI6, refcnt=%d\n",
-				  &dest->addr.in6,
+			rt = (struct rt6_info *) dst;
+			cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
+			__ip_vs_dst_set(dest, 0, dst_clone(&rt->dst), cookie);
+			IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
+				  &dest->addr.in6, &dest->dst_saddr,
 				  atomic_read(&rt->dst.__refcnt));
 		}
+		if (ret_saddr)
+			ipv6_addr_copy(ret_saddr, &dest->dst_saddr);
 		spin_unlock(&dest->dst_lock);
 	} else {
-		struct flowi fl = {
-			.oif = 0,
-			.nl_u = {
-				.ip6_u = {
-					.daddr = cp->daddr.in6,
-					.saddr = {
-						.s6_addr32 = { 0, 0, 0, 0 },
-					},
-				},
-			},
-		};
-
-		rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
-		if (!rt) {
-			IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n",
-				     &cp->daddr.in6);
+		dst = __ip_vs_route_output_v6(net, &cp->daddr.in6, ret_saddr,
+					      do_xfrm);
+		if (!dst)
 			return NULL;
-		}
+		rt = (struct rt6_info *) dst;
 	}

 	return rt;
@@ -248,6 +268,7 @@ int
 ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 		  struct ip_vs_protocol *pp)
 {
+	struct net *net = dev_net(skb->dev);
 	struct rtable *rt;			/* Route to the other host */
 	struct iphdr  *iph = ip_hdr(skb);
 	u8     tos = iph->tos;
@@ -263,7 +284,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, s

 	EnterFunction(10);

-	if (ip_route_output_key(&init_net, &rt, &fl)) {
+	if (ip_route_output_key(net, &rt, &fl)) {
 		IP_VS_DBG_RL("%s(): ip_route_output error, dest: %pI4\n",
 			     __func__, &iph->daddr);
 		goto tx_error_icmp;
@@ -313,25 +334,18 @@ int
 ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 		     struct ip_vs_protocol *pp)
 {
+	struct net *net = dev_net(skb->dev);
+	struct dst_entry *dst;
 	struct rt6_info *rt;			/* Route to the other host */
 	struct ipv6hdr  *iph = ipv6_hdr(skb);
 	int    mtu;
-	struct flowi fl = {
-		.oif = 0,
-		.nl_u = {
-			.ip6_u = {
-				.daddr = iph->daddr,
-				.saddr = { .s6_addr32 = {0, 0, 0, 0} }, } },
-	};

 	EnterFunction(10);

-	rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
-	if (!rt) {
-		IP_VS_DBG_RL("%s(): ip6_route_output error, dest: %pI6\n",
-			     __func__, &iph->daddr);
+	dst = __ip_vs_route_output_v6(net, &iph->daddr, NULL, 0);
+	if (!dst)
 		goto tx_error_icmp;
-	}
+	rt = (struct rt6_info *) dst;

 	/* MTU checking */
 	mtu = dst_mtu(&rt->dst);
@@ -397,7 +411,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, stru
 		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
 	}

-	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
+	if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(iph->tos))))
 		goto tx_error_icmp;

 	/* MTU checking */
@@ -472,7 +486,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, s
 		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
 	}

-	rt = __ip_vs_get_out_rt_v6(cp);
+	rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0);
 	if (!rt)
 		goto tx_error_icmp;

@@ -557,7 +571,6 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, s
 	struct iphdr  *old_iph = ip_hdr(skb);
 	u8     tos = old_iph->tos;
 	__be16 df = old_iph->frag_off;
-	sk_buff_data_t old_transport_header = skb->transport_header;
 	struct iphdr  *iph;			/* Our new IP header */
 	unsigned int max_headroom;		/* The extra header space needed */
 	int    mtu;
@@ -572,7 +585,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, s
 		goto tx_error;
 	}

-	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos))))
+	if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(tos))))
 		goto tx_error_icmp;

 	tdev = rt->dst.dev;
@@ -616,7 +629,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, s
 		old_iph = ip_hdr(skb);
 	}

-	skb->transport_header = old_transport_header;
+	skb->transport_header = skb->network_header;

 	/* fix old IP header checksum */
 	ip_send_check(old_iph);
@@ -670,9 +683,9 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb
 		     struct ip_vs_protocol *pp)
 {
 	struct rt6_info *rt;		/* Route to the other host */
+	struct in6_addr saddr;		/* Source for tunnel */
 	struct net_device *tdev;	/* Device to other host */
 	struct ipv6hdr  *old_iph = ipv6_hdr(skb);
-	sk_buff_data_t old_transport_header = skb->transport_header;
 	struct ipv6hdr  *iph;		/* Our new IP header */
 	unsigned int max_headroom;	/* The extra header space needed */
 	int    mtu;
@@ -687,17 +700,17 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb
 		goto tx_error;
 	}

-	rt = __ip_vs_get_out_rt_v6(cp);
+	rt = __ip_vs_get_out_rt_v6(skb, cp, &saddr, 1);
 	if (!rt)
 		goto tx_error_icmp;

 	tdev = rt->dst.dev;

 	mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
-	/* TODO IPv6: do we need this check in IPv6? */
-	if (mtu < 1280) {
+	if (mtu < IPV6_MIN_MTU) {
 		dst_release(&rt->dst);
-		IP_VS_DBG_RL("%s(): mtu less than 1280\n", __func__);
+		IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
+			     IPV6_MIN_MTU);
 		goto tx_error;
 	}
 	if (skb_dst(skb))
@@ -730,7 +743,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb
 		old_iph = ipv6_hdr(skb);
 	}

-	skb->transport_header = old_transport_header;
+	skb->transport_header = skb->network_header;

 	skb_push(skb, sizeof(struct ipv6hdr));
 	skb_reset_network_header(skb);
@@ -750,8 +763,8 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb
 	be16_add_cpu(&iph->payload_len, sizeof(*old_iph));
 	iph->priority		=	old_iph->priority;
 	memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl));
-	iph->daddr		=	rt->rt6i_dst.addr;
-	iph->saddr		=	cp->vaddr.in6; /* rt->rt6i_src.addr; */
+	ipv6_addr_copy(&iph->daddr, &rt->rt6i_dst.addr);
+	ipv6_addr_copy(&iph->saddr, &saddr);
 	iph->hop_limit		=	old_iph->hop_limit;

 	/* Another hack: avoid icmp_send in ip_fragment */
@@ -791,7 +804,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struc

 	EnterFunction(10);

-	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
+	if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(iph->tos))))
 		goto tx_error_icmp;

 	/* MTU checking */
@@ -843,7 +856,7 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, st

 	EnterFunction(10);

-	rt = __ip_vs_get_out_rt_v6(cp);
+	rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0);
 	if (!rt)
 		goto tx_error_icmp;

@@ -919,7 +932,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, str
 	 * mangle and send the packet here (only for VS/NAT)
 	 */

-	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(ip_hdr(skb)->tos))))
+	if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(ip_hdr(skb)->tos))))
 		goto tx_error_icmp;

 	/* MTU checking */
@@ -993,7 +1006,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb,
 	 * mangle and send the packet here (only for VS/NAT)
 	 */

-	rt = __ip_vs_get_out_rt_v6(cp);
+	rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0);
 	if (!rt)
 		goto tx_error_icmp;


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [patch v2] ipvs: IPv6 tunnel mode
  2010-09-27 13:59 ` Simon Horman
  (?)
@ 2010-09-30  1:22 ` Simon Horman
  2010-10-04 19:06   ` Patrick McHardy
  -1 siblings, 1 reply; 13+ messages in thread
From: Simon Horman @ 2010-09-30  1:22 UTC (permalink / raw)
  To: lvs-devel, netfilter-devel, netdev
  Cc: Hans Schillstrom, Julian Anastasov, Julius Volz, Wensong Zhang,
	Patrick McHardy

On Mon, Sep 27, 2010 at 10:59:14PM +0900, Simon Horman wrote:
> From: Julian Anastasov <ja@ssi.bg>
> 
> Tunnel mode for IPv6 doesn't work.

Patrick, can you please drop this patch for now.
Hans has found some problems with it.

http://archive.linuxvirtualserver.org/html/lvs-devel/2010-09/msg00073.html

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [patch v2] ipvs: IPv6 tunnel mode
  2010-09-30  1:22 ` Simon Horman
@ 2010-10-04 19:06   ` Patrick McHardy
  0 siblings, 0 replies; 13+ messages in thread
From: Patrick McHardy @ 2010-10-04 19:06 UTC (permalink / raw)
  To: Simon Horman
  Cc: lvs-devel, netfilter-devel, netdev, Hans Schillstrom,
	Julian Anastasov, Julius Volz, Wensong Zhang

Am 30.09.2010 03:22, schrieb Simon Horman:
> On Mon, Sep 27, 2010 at 10:59:14PM +0900, Simon Horman wrote:
>> From: Julian Anastasov <ja@ssi.bg>
>>
>> Tunnel mode for IPv6 doesn't work.
> 
> Patrick, can you please drop this patch for now.
> Hans has found some problems with it.
> 
> http://archive.linuxvirtualserver.org/html/lvs-devel/2010-09/msg00073.html

Sure, just resend once you consider it ready for inclusion.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [patch v3] ipvs: IPv6 tunnel mode
  2010-09-27 13:59 ` Simon Horman
  (?)
  (?)
@ 2010-10-05  5:52 ` Hans Schillstrom
  2010-10-05  7:11   ` Julian Anastasov
  -1 siblings, 1 reply; 13+ messages in thread
From: Hans Schillstrom @ 2010-10-05  5:52 UTC (permalink / raw)
  To: Simon Horman
  Cc: lvs-devel@vger.kernel.org, Julian Anastasov, Julius Volz,
	Wensong Zhang

Hi Simon
Finally I got time to test this, and it works now with change of dest.

 
Tunnel mode for IPv6 doesn't work.

IPv6 encapsulation uses a bad source address for the tunnel.
i.e. VIP will be used as local-addr and encap. dst addr.
Decapsulation will not accept this.

Example
LVS (eth1 2003::2:0:1/96, VIP 2003::2:0:100)
   (eth0 2003::1:0:1/96)
RS  (ethX 2003::1:0:5/96)

tcpdump
2003::2:0:100 > 2003::1:0:5:
IP6 (hlim 63, next-header TCP (6) payload length: 40)
 2003::3:0:10.50991 > 2003::2:0:100.http: Flags [S], cksum 0x7312
(correct), seq 3006460279, win 5760, options [mss 1440,sackOK,TS val
1904932 ecr 0,nop,wscale 3], length 0

In the Linux IPv6 implementation you can't have a tunnel with an anycast
address receiving packets (I have not tried to interpret RFC 2473).
To have receive capabilities the tunnel must have:
 - Local address set as a multicast address or a unicast address.
 - Remote address set as a unicast address.
 - Loopback address or link-local address are not allowed.

This causes us to setup a tunnel in the Real Server with the
LVS as the remote address, here you can't use the VIP address since it's
used inside the tunnel.

Solution
Use outgoing interface IPv6 address (match against the destination).
i.e. use ip6_route_output() to look up the route cache and
then use ipv6_dev_get_saddr(...) to set the source address of the
encapsulated packet.

Additionally, cache the results in new destination
fields: dst_cookie and dst_saddr and properly check the
returned dst from ip6_route_output. We now add xfrm_lookup
call only for the tunneling method where the source address
is a local one.

Original patch by Hans Schillstrom.
Check dst state and cache results for IPv6 by Julian Anastasov.


Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
Tested-by: Hans Schillstrom <hans.schillstrom@ericsson.com>

---

* v1

  This is Julian's patch with a slightly edited version of the
description
  from Hans's original patch.

* v2

  Updated changelog as per comments from Julian

* v3 

  Flowi dest address is used as destination instead of rt6_info in
  ip_vs_tunnel_xmit_v6(); rt6_info sometimes contains a network
  address instead of a tunnel address.


diff -urp
net-next-2.6-e548833-nfct_snat_reroute/linux/include/net/ip_vs.h
linux/include/net/ip_vs.h
--- net-next-2.6-e548833-nfct_snat_reroute/linux/include/net/ip_vs.h
2010-09-16 09:03:48.000000000 +0300
+++ linux/include/net/ip_vs.h   2010-09-22 10:50:18.548963467 +0300
@@ -509,6 +509,10 @@ struct ip_vs_dest {
        spinlock_t              dst_lock;       /* lock of dst_cache */
        struct dst_entry        *dst_cache;     /* destination cache
entry */
        u32                     dst_rtos;       /* RT_TOS(tos) for dst
*/
+       u32                     dst_cookie;
+#ifdef CONFIG_IP_VS_IPV6
+       struct in6_addr         dst_saddr;
+#endif

        /* for virtual service */
        struct ip_vs_service    *svc;           /* service it belongs to
*/
diff -urp
net-next-2.6-e548833-nfct_snat_reroute/linux/net/netfilter/ipvs/ip_vs_xmit.c linux/net/netfilter/ipvs/ip_vs_xmit.c
---
net-next-2.6-e548833-nfct_snat_reroute/linux/net/netfilter/ipvs/ip_vs_xmit.c        2010-09-16 09:02:25.000000000 +0300
+++ linux/net/netfilter/ipvs/ip_vs_xmit.c       2010-09-22
16:29:43.271964521 +0300
@@ -26,6 +26,7 @@
 #include <net/route.h>                  /* for ip_route_output */
 #include <net/ipv6.h>
 #include <net/ip6_route.h>
+#include <net/addrconf.h>
 #include <linux/icmpv6.h>
 #include <linux/netfilter.h>
 #include <linux/netfilter_ipv4.h>
@@ -37,26 +38,27 @@
  *      Destination cache to speed up outgoing route lookup
  */
 static inline void
-__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry
*dst)
+__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry
*dst,
+               u32 dst_cookie)
 {
        struct dst_entry *old_dst;

        old_dst = dest->dst_cache;
        dest->dst_cache = dst;
        dest->dst_rtos = rtos;
+       dest->dst_cookie = dst_cookie;
        dst_release(old_dst);
 }

 static inline struct dst_entry *
-__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
+__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos)
 {
        struct dst_entry *dst = dest->dst_cache;

        if (!dst)
                return NULL;
-       if ((dst->obsolete
-            || (dest->af == AF_INET && rtos != dest->dst_rtos)) &&
-           dst->ops->check(dst, cookie) == NULL) {
+       if ((dst->obsolete || rtos != dest->dst_rtos) &&
+           dst->ops->check(dst, dest->dst_cookie) == NULL) {
                dest->dst_cache = NULL;
                dst_release(dst);
                return NULL;
@@ -66,15 +68,16 @@ __ip_vs_dst_check(struct ip_vs_dest *des
 }

 static struct rtable *
-__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
+__ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_conn *cp, u32
rtos)
 {
+       struct net *net = dev_net(skb->dev);
        struct rtable *rt;                      /* Route to the other
host */
        struct ip_vs_dest *dest = cp->dest;

        if (dest) {
                spin_lock(&dest->dst_lock);
                if (!(rt = (struct rtable *)
-                     __ip_vs_dst_check(dest, rtos, 0))) {
+                     __ip_vs_dst_check(dest, rtos))) {
                        struct flowi fl = {
                                .oif = 0,
                                .nl_u = {
@@ -84,13 +87,13 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp
                                                .tos = rtos, } },
                        };

-                       if (ip_route_output_key(&init_net, &rt, &fl)) {
+                       if (ip_route_output_key(net, &rt, &fl)) {
                                spin_unlock(&dest->dst_lock);
                                IP_VS_DBG_RL("ip_route_output error,
dest: %pI4\n",
                                             &dest->addr.ip);
                                return NULL;
                        }
-                       __ip_vs_dst_set(dest, rtos,
dst_clone(&rt->dst));
+                       __ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst),
0);
                        IP_VS_DBG(10, "new dst %pI4, refcnt=%d, rtos=%X
\n",
                                  &dest->addr.ip,
                                  atomic_read(&rt->dst.__refcnt), rtos);
@@ -106,7 +109,7 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp
                                        .tos = rtos, } },
                };

-               if (ip_route_output_key(&init_net, &rt, &fl)) {
+               if (ip_route_output_key(net, &rt, &fl)) {
                        IP_VS_DBG_RL("ip_route_output error, dest: %pI4
\n",
                                     &cp->daddr.ip);
                        return NULL;
@@ -117,62 +120,79 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp
 }

 #ifdef CONFIG_IP_VS_IPV6
+
+static struct dst_entry *
+__ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr,
+                       struct in6_addr *ret_saddr, int do_xfrm)
+{
+       struct dst_entry *dst;
+       struct flowi fl = {
+               .oif = 0,
+               .nl_u = {
+                       .ip6_u = {
+                               .daddr = *daddr,
+                       },
+               },
+       };
+
+       dst = ip6_route_output(net, NULL, &fl);
+       if (dst->error)
+               goto out_err;
+       if (!ret_saddr)
+               return dst;
+       if (ipv6_addr_any(&fl.fl6_src) &&
+           ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev,
+                              &fl.fl6_dst, 0, &fl.fl6_src) < 0)
+               goto out_err;
+       if (do_xfrm && xfrm_lookup(net, &dst, &fl, NULL, 0) < 0)
+               goto out_err;
+       ipv6_addr_copy(ret_saddr, &fl.fl6_src);
+       return dst;
+
+out_err:
+       dst_release(dst);
+       IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", daddr);
+       return NULL;
+}
+
 static struct rt6_info *
-__ip_vs_get_out_rt_v6(struct ip_vs_conn *cp)
+__ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
+                     struct in6_addr *ret_saddr, int do_xfrm)
 {
+       struct net *net = dev_net(skb->dev);
        struct rt6_info *rt;                    /* Route to the other
host */
        struct ip_vs_dest *dest = cp->dest;
+       struct dst_entry *dst;

        if (dest) {
                spin_lock(&dest->dst_lock);
-               rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0, 0);
+               rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0);
                if (!rt) {
-                       struct flowi fl = {
-                               .oif = 0,
-                               .nl_u = {
-                                       .ip6_u = {
-                                               .daddr = dest->addr.in6,
-                                               .saddr = {
-                                                       .s6_addr32 =
-                                                               { 0, 0,
0, 0 },
-                                               },
-                                       },
-                               },
-                       };
+                       u32 cookie;

-                       rt = (struct rt6_info
*)ip6_route_output(&init_net,
-                                                                NULL,
&fl);
-                       if (!rt) {
+                       dst = __ip_vs_route_output_v6(net,
&dest->addr.in6,
+                                                     &dest->dst_saddr,
+                                                     do_xfrm);
+                       if (!dst) {
                                spin_unlock(&dest->dst_lock);
-                               IP_VS_DBG_RL("ip6_route_output error,
dest: %pI6\n",
-                                            &dest->addr.in6);
                                return NULL;
                        }
-                       __ip_vs_dst_set(dest, 0, dst_clone(&rt->dst));
-                       IP_VS_DBG(10, "new dst %pI6, refcnt=%d\n",
-                                 &dest->addr.in6,
+                       rt = (struct rt6_info *) dst;
+                       cookie = rt->rt6i_node ?
rt->rt6i_node->fn_sernum : 0;
+                       __ip_vs_dst_set(dest, 0, dst_clone(&rt->dst),
cookie);
+                       IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d
\n",
+                                 &dest->addr.in6, &dest->dst_saddr,
                                  atomic_read(&rt->dst.__refcnt));
                }
+               if (ret_saddr)
+                       ipv6_addr_copy(ret_saddr, &dest->dst_saddr);
                spin_unlock(&dest->dst_lock);
        } else {
-               struct flowi fl = {
-                       .oif = 0,
-                       .nl_u = {
-                               .ip6_u = {
-                                       .daddr = cp->daddr.in6,
-                                       .saddr = {
-                                               .s6_addr32 = { 0, 0, 0,
0 },
-                                       },
-                               },
-                       },
-               };
-
-               rt = (struct rt6_info *)ip6_route_output(&init_net,
NULL, &fl);
-               if (!rt) {
-                       IP_VS_DBG_RL("ip6_route_output error, dest: %pI6
\n",
-                                    &cp->daddr.in6);
+               dst = __ip_vs_route_output_v6(net, &cp->daddr.in6,
ret_saddr,
+                                             do_xfrm);
+               if (!dst)
                        return NULL;
-               }
+               rt = (struct rt6_info *) dst;
        }

        return rt;
@@ -248,6 +268,7 @@ int
 ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
                  struct ip_vs_protocol *pp)
 {
+       struct net *net = dev_net(skb->dev);
        struct rtable *rt;                      /* Route to the other
host */
        struct iphdr  *iph = ip_hdr(skb);
        u8     tos = iph->tos;
@@ -263,7 +284,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, s

        EnterFunction(10);

-       if (ip_route_output_key(&init_net, &rt, &fl)) {
+       if (ip_route_output_key(net, &rt, &fl)) {
                IP_VS_DBG_RL("%s(): ip_route_output error, dest: %pI4
\n",
                             __func__, &iph->daddr);
                goto tx_error_icmp;
@@ -313,25 +334,18 @@ int
 ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
                     struct ip_vs_protocol *pp)
 {
+       struct net *net = dev_net(skb->dev);
+       struct dst_entry *dst;
        struct rt6_info *rt;                    /* Route to the other
host */
        struct ipv6hdr  *iph = ipv6_hdr(skb);
        int    mtu;
-       struct flowi fl = {
-               .oif = 0,
-               .nl_u = {
-                       .ip6_u = {
-                               .daddr = iph->daddr,
-                               .saddr = { .s6_addr32 = {0, 0, 0,
0} }, } },
-       };

        EnterFunction(10);

-       rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
-       if (!rt) {
-               IP_VS_DBG_RL("%s(): ip6_route_output error, dest: %pI6
\n",
-                            __func__, &iph->daddr);
+       dst = __ip_vs_route_output_v6(net, &iph->daddr, NULL, 0);
+       if (!dst)
                goto tx_error_icmp;
-       }
+       rt = (struct rt6_info *) dst;

        /* MTU checking */
        mtu = dst_mtu(&rt->dst);
@@ -397,7 +411,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, stru
                IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
        }

-       if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
+       if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(iph->tos))))
                goto tx_error_icmp;

        /* MTU checking */
@@ -472,7 +486,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, s
                IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
        }

-       rt = __ip_vs_get_out_rt_v6(cp);
+       rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0);
        if (!rt)
                goto tx_error_icmp;

@@ -557,7 +571,6 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, s
        struct iphdr  *old_iph = ip_hdr(skb);
        u8     tos = old_iph->tos;
        __be16 df = old_iph->frag_off;
-       sk_buff_data_t old_transport_header = skb->transport_header;
        struct iphdr  *iph;                     /* Our new IP header */
        unsigned int max_headroom;              /* The extra header
space needed */
        int    mtu;
@@ -572,7 +585,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, s
                goto tx_error;
        }

-       if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos))))
+       if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(tos))))
                goto tx_error_icmp;

        tdev = rt->dst.dev;
@@ -616,7 +629,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, s
                old_iph = ip_hdr(skb);
        }

-       skb->transport_header = old_transport_header;
+       skb->transport_header = skb->network_header;

        /* fix old IP header checksum */
        ip_send_check(old_iph);
@@ -670,9 +683,9 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb
                     struct ip_vs_protocol *pp)
 {
        struct rt6_info *rt;            /* Route to the other host */
+       struct in6_addr saddr;          /* Source for tunnel */
        struct net_device *tdev;        /* Device to other host */
        struct ipv6hdr  *old_iph = ipv6_hdr(skb);
-       sk_buff_data_t old_transport_header = skb->transport_header;
        struct ipv6hdr  *iph;           /* Our new IP header */
        unsigned int max_headroom;      /* The extra header space needed
*/
        int    mtu;
@@ -687,17 +700,17 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb
                goto tx_error;
        }

-       rt = __ip_vs_get_out_rt_v6(cp);
+       rt = __ip_vs_get_out_rt_v6(skb, cp, &saddr, 1);
        if (!rt)
                goto tx_error_icmp;

        tdev = rt->dst.dev;

        mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
-       /* TODO IPv6: do we need this check in IPv6? */
-       if (mtu < 1280) {
+       if (mtu < IPV6_MIN_MTU) {
                dst_release(&rt->dst);
-               IP_VS_DBG_RL("%s(): mtu less than 1280\n", __func__);
+               IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
+                            IPV6_MIN_MTU);
                goto tx_error;
        }
        if (skb_dst(skb))
@@ -730,7 +743,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb
                old_iph = ipv6_hdr(skb);
        }

-       skb->transport_header = old_transport_header;
+       skb->transport_header = skb->network_header;

        skb_push(skb, sizeof(struct ipv6hdr));
        skb_reset_network_header(skb);
@@ -750,8 +763,8 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb
        be16_add_cpu(&iph->payload_len, sizeof(*old_iph));
        iph->priority           =       old_iph->priority;
        memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl));
-       iph->daddr              =       rt->rt6i_dst.addr;
-       iph->saddr              =       cp->vaddr.in6; /*
rt->rt6i_src.addr; */
+       ipv6_addr_copy(&iph->daddr, &cp->dest->addr.in6);
+       ipv6_addr_copy(&iph->saddr, &saddr);
        iph->hop_limit          =       old_iph->hop_limit;

        /* Another hack: avoid icmp_send in ip_fragment */
@@ -791,7 +804,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struc

        EnterFunction(10);

-       if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
+       if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(iph->tos))))
                goto tx_error_icmp;

        /* MTU checking */
@@ -843,7 +856,7 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, st

        EnterFunction(10);

-       rt = __ip_vs_get_out_rt_v6(cp);
+       rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0);
        if (!rt)
                goto tx_error_icmp;

@@ -919,7 +932,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, str
         * mangle and send the packet here (only for VS/NAT)
         */

-       if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(ip_hdr(skb)->tos))))
+       if (!(rt = __ip_vs_get_out_rt(skb, cp,
RT_TOS(ip_hdr(skb)->tos))))
                goto tx_error_icmp;

        /* MTU checking */
@@ -993,7 +1006,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb,
         * mangle and send the packet here (only for VS/NAT)
         */

-       rt = __ip_vs_get_out_rt_v6(cp);
+       rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0);
        if (!rt)
                goto tx_error_icmp;




^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [patch v3] ipvs: IPv6 tunnel mode
  2010-10-05  5:52 ` [patch v3] " Hans Schillstrom
@ 2010-10-05  7:11   ` Julian Anastasov
  0 siblings, 0 replies; 13+ messages in thread
From: Julian Anastasov @ 2010-10-05  7:11 UTC (permalink / raw)
  To: Hans Schillstrom
  Cc: Simon Horman, lvs-devel@vger.kernel.org, Julius Volz,
	Wensong Zhang


 	Hello,

On Tue, 5 Oct 2010, Hans Schillstrom wrote:

> Hi Simon
> Finally I got time to test this, and it works now with change of dest.

 	You are using an editor that changes tabs into spaces
and wraps long lines. The final patch does not look good.
Also, do not take the risk to use cp->dest->addr. Even if
one day IPv6 sync is supported, there is no rule that
says backup to have all real servers that are present
in master. NULL value for cp->dest is still supported.
Use &cp->daddr.in6 instead, it is perfectly valid for
TUN method.

> @@ -750,8 +763,8 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb
>        be16_add_cpu(&iph->payload_len, sizeof(*old_iph));
>        iph->priority           =       old_iph->priority;
>        memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl));
> -       iph->daddr              =       rt->rt6i_dst.addr;
> -       iph->saddr              =       cp->vaddr.in6; /*
> rt->rt6i_src.addr; */
> +       ipv6_addr_copy(&iph->daddr, &cp->dest->addr.in6);
> +       ipv6_addr_copy(&iph->saddr, &saddr);
>        iph->hop_limit          =       old_iph->hop_limit;
>
>        /* Another hack: avoid icmp_send in ip_fragment */

Regards

--
Julian Anastasov <ja@ssi.bg>

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [patch v2] ipvs: IPv6 tunnel mode
@ 2010-09-26 13:33 Simon Horman
  2010-09-26 14:21 ` Julian Anastasov
  2010-09-27  6:00 ` Hans Schillstrom
  0 siblings, 2 replies; 13+ messages in thread
From: Simon Horman @ 2010-09-26 13:33 UTC (permalink / raw)
  To: Hans Schillstrom; +Cc: Julian Anastasov, lvs-devel, Julius Volz

From: Julian Anastasov <ja@ssi.bg>

Tunnel mode for IPv6 doesn't work.

IPv6 encapsulation uses a bad source address for the tunnel.
i.e. VIP will be used as local-addr and encap. dst addr.
Decapsulation will not accept this.

Example
LVS (eth1 2003::2:0:1/96, VIP 2003::2:0:100)
   (eth0 2003::1:0:1/96)
RS  (ethX 2003::1:0:5/96)

tcpdump
2003::2:0:100 > 2003::1:0:5:
IP6 (hlim 63, next-header TCP (6) payload length: 40)
 2003::3:0:10.50991 > 2003::2:0:100.http: Flags [S], cksum 0x7312
(correct), seq 3006460279, win 5760, options [mss 1440,sackOK,TS val
1904932 ecr 0,nop,wscale 3], length 0

In the Linux IPv6 implementation you can't have a tunnel with an anycast
address receiving packets (I have not tried to interpret RFC 2473).
To have receive capabilities the tunnel must have:
 - Local address set as a multicast address or a unicast address.
 - Remote address set as a unicast address.
 - Loopback address or link-local address are not allowed.

This causes us to setup a tunnel in the Real Server with the
LVS as the remote address, here you can't use the VIP address since it's
used inside the tunnel.

Solution
Use outgoing interface IPv6 address (match against the destination).
i.e. use ip6_route_output() to look up the route cache and
then use ipv6_dev_get_saddr(...) to set the source address of the
encapsulated packet.

Additionally, cache the results in new destination
fields: dst_cookie and dst_saddr and properly check the
returned dst from ip6_route_output. We now add xfrm_lookup
call only for the tunneling method where the source address
is a local one.

Original patch by Hans Schillstrom.
Check dst state and cache results for IPv6 by Julian Anastasov.

Tested-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>

--- 

* v1

  This is Julian's patch with a slightly edited version of the description
  from Hans's original patch.

* v2

  Updated changelog as per comments from Julian

Is everyone ok with pushing this?

diff -urp net-next-2.6-e548833-nfct_snat_reroute/linux/include/net/ip_vs.h linux/include/net/ip_vs.h
--- net-next-2.6-e548833-nfct_snat_reroute/linux/include/net/ip_vs.h	2010-09-16 09:03:48.000000000 +0300
+++ linux/include/net/ip_vs.h	2010-09-22 10:50:18.548963467 +0300
@@ -509,6 +509,10 @@ struct ip_vs_dest {
 	spinlock_t		dst_lock;	/* lock of dst_cache */
 	struct dst_entry	*dst_cache;	/* destination cache entry */
 	u32			dst_rtos;	/* RT_TOS(tos) for dst */
+	u32			dst_cookie;
+#ifdef CONFIG_IP_VS_IPV6
+	struct in6_addr		dst_saddr;
+#endif

 	/* for virtual service */
 	struct ip_vs_service	*svc;		/* service it belongs to */
diff -urp net-next-2.6-e548833-nfct_snat_reroute/linux/net/netfilter/ipvs/ip_vs_xmit.c linux/net/netfilter/ipvs/ip_vs_xmit.c
--- net-next-2.6-e548833-nfct_snat_reroute/linux/net/netfilter/ipvs/ip_vs_xmit.c	2010-09-16 09:02:25.000000000 +0300
+++ linux/net/netfilter/ipvs/ip_vs_xmit.c	2010-09-22 16:29:43.271964521 +0300
@@ -26,6 +26,7 @@
 #include <net/route.h>                  /* for ip_route_output */
 #include <net/ipv6.h>
 #include <net/ip6_route.h>
+#include <net/addrconf.h>
 #include <linux/icmpv6.h>
 #include <linux/netfilter.h>
 #include <linux/netfilter_ipv4.h>
@@ -37,26 +38,27 @@
  *      Destination cache to speed up outgoing route lookup
  */
 static inline void
-__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst)
+__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst,
+		u32 dst_cookie)
 {
 	struct dst_entry *old_dst;

 	old_dst = dest->dst_cache;
 	dest->dst_cache = dst;
 	dest->dst_rtos = rtos;
+	dest->dst_cookie = dst_cookie;
 	dst_release(old_dst);
 }

 static inline struct dst_entry *
-__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
+__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos)
 {
 	struct dst_entry *dst = dest->dst_cache;

 	if (!dst)
 		return NULL;
-	if ((dst->obsolete
-	     || (dest->af == AF_INET && rtos != dest->dst_rtos)) &&
-	    dst->ops->check(dst, cookie) == NULL) {
+	if ((dst->obsolete || rtos != dest->dst_rtos) &&
+	    dst->ops->check(dst, dest->dst_cookie) == NULL) {
 		dest->dst_cache = NULL;
 		dst_release(dst);
 		return NULL;
@@ -66,15 +68,16 @@ __ip_vs_dst_check(struct ip_vs_dest *des
 }

 static struct rtable *
-__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
+__ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_conn *cp, u32 rtos)
 {
+	struct net *net = dev_net(skb->dev);
 	struct rtable *rt;			/* Route to the other host */
 	struct ip_vs_dest *dest = cp->dest;

 	if (dest) {
 		spin_lock(&dest->dst_lock);
 		if (!(rt = (struct rtable *)
-		      __ip_vs_dst_check(dest, rtos, 0))) {
+		      __ip_vs_dst_check(dest, rtos))) {
 			struct flowi fl = {
 				.oif = 0,
 				.nl_u = {
@@ -84,13 +87,13 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp
 						.tos = rtos, } },
 			};

-			if (ip_route_output_key(&init_net, &rt, &fl)) {
+			if (ip_route_output_key(net, &rt, &fl)) {
 				spin_unlock(&dest->dst_lock);
 				IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
 					     &dest->addr.ip);
 				return NULL;
 			}
-			__ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst));
+			__ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst), 0);
 			IP_VS_DBG(10, "new dst %pI4, refcnt=%d, rtos=%X\n",
 				  &dest->addr.ip,
 				  atomic_read(&rt->dst.__refcnt), rtos);
@@ -106,7 +109,7 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp
 					.tos = rtos, } },
 		};

-		if (ip_route_output_key(&init_net, &rt, &fl)) {
+		if (ip_route_output_key(net, &rt, &fl)) {
 			IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
 				     &cp->daddr.ip);
 			return NULL;
@@ -117,62 +120,79 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp
 }

 #ifdef CONFIG_IP_VS_IPV6
+
+static struct dst_entry *
+__ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr,
+			struct in6_addr *ret_saddr, int do_xfrm)
+{
+	struct dst_entry *dst;
+	struct flowi fl = {
+		.oif = 0,
+		.nl_u = {
+			.ip6_u = {
+				.daddr = *daddr,
+			},
+		},
+	};
+
+	dst = ip6_route_output(net, NULL, &fl);
+	if (dst->error)
+		goto out_err;
+	if (!ret_saddr)
+		return dst;
+	if (ipv6_addr_any(&fl.fl6_src) &&
+	    ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev,
+			       &fl.fl6_dst, 0, &fl.fl6_src) < 0)
+		goto out_err;
+	if (do_xfrm && xfrm_lookup(net, &dst, &fl, NULL, 0) < 0)
+		goto out_err;
+	ipv6_addr_copy(ret_saddr, &fl.fl6_src);
+	return dst;
+
+out_err:
+	dst_release(dst);
+	IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", daddr);
+	return NULL;
+}
+
 static struct rt6_info *
-__ip_vs_get_out_rt_v6(struct ip_vs_conn *cp)
+__ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
+		      struct in6_addr *ret_saddr, int do_xfrm)
 {
+	struct net *net = dev_net(skb->dev);
 	struct rt6_info *rt;			/* Route to the other host */
 	struct ip_vs_dest *dest = cp->dest;
+	struct dst_entry *dst;

 	if (dest) {
 		spin_lock(&dest->dst_lock);
-		rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0, 0);
+		rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0);
 		if (!rt) {
-			struct flowi fl = {
-				.oif = 0,
-				.nl_u = {
-					.ip6_u = {
-						.daddr = dest->addr.in6,
-						.saddr = {
-							.s6_addr32 =
-								{ 0, 0, 0, 0 },
-						},
-					},
-				},
-			};
+			u32 cookie;

-			rt = (struct rt6_info *)ip6_route_output(&init_net,
-								 NULL, &fl);
-			if (!rt) {
+			dst = __ip_vs_route_output_v6(net, &dest->addr.in6,
+						      &dest->dst_saddr,
+						      do_xfrm);
+			if (!dst) {
 				spin_unlock(&dest->dst_lock);
-				IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n",
-					     &dest->addr.in6);
 				return NULL;
 			}
-			__ip_vs_dst_set(dest, 0, dst_clone(&rt->dst));
-			IP_VS_DBG(10, "new dst %pI6, refcnt=%d\n",
-				  &dest->addr.in6,
+			rt = (struct rt6_info *) dst;
+			cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
+			__ip_vs_dst_set(dest, 0, dst_clone(&rt->dst), cookie);
+			IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
+				  &dest->addr.in6, &dest->dst_saddr,
 				  atomic_read(&rt->dst.__refcnt));
 		}
+		if (ret_saddr)
+			ipv6_addr_copy(ret_saddr, &dest->dst_saddr);
 		spin_unlock(&dest->dst_lock);
 	} else {
-		struct flowi fl = {
-			.oif = 0,
-			.nl_u = {
-				.ip6_u = {
-					.daddr = cp->daddr.in6,
-					.saddr = {
-						.s6_addr32 = { 0, 0, 0, 0 },
-					},
-				},
-			},
-		};
-
-		rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
-		if (!rt) {
-			IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n",
-				     &cp->daddr.in6);
+		dst = __ip_vs_route_output_v6(net, &cp->daddr.in6, ret_saddr,
+					      do_xfrm);
+		if (!dst)
 			return NULL;
-		}
+		rt = (struct rt6_info *) dst;
 	}

 	return rt;
@@ -248,6 +268,7 @@ int
 ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 		  struct ip_vs_protocol *pp)
 {
+	struct net *net = dev_net(skb->dev);
 	struct rtable *rt;			/* Route to the other host */
 	struct iphdr  *iph = ip_hdr(skb);
 	u8     tos = iph->tos;
@@ -263,7 +284,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, s

 	EnterFunction(10);

-	if (ip_route_output_key(&init_net, &rt, &fl)) {
+	if (ip_route_output_key(net, &rt, &fl)) {
 		IP_VS_DBG_RL("%s(): ip_route_output error, dest: %pI4\n",
 			     __func__, &iph->daddr);
 		goto tx_error_icmp;
@@ -313,25 +334,18 @@ int
 ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 		     struct ip_vs_protocol *pp)
 {
+	struct net *net = dev_net(skb->dev);
+	struct dst_entry *dst;
 	struct rt6_info *rt;			/* Route to the other host */
 	struct ipv6hdr  *iph = ipv6_hdr(skb);
 	int    mtu;
-	struct flowi fl = {
-		.oif = 0,
-		.nl_u = {
-			.ip6_u = {
-				.daddr = iph->daddr,
-				.saddr = { .s6_addr32 = {0, 0, 0, 0} }, } },
-	};

 	EnterFunction(10);

-	rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
-	if (!rt) {
-		IP_VS_DBG_RL("%s(): ip6_route_output error, dest: %pI6\n",
-			     __func__, &iph->daddr);
+	dst = __ip_vs_route_output_v6(net, &iph->daddr, NULL, 0);
+	if (!dst)
 		goto tx_error_icmp;
-	}
+	rt = (struct rt6_info *) dst;

 	/* MTU checking */
 	mtu = dst_mtu(&rt->dst);
@@ -397,7 +411,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, stru
 		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
 	}

-	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
+	if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(iph->tos))))
 		goto tx_error_icmp;

 	/* MTU checking */
@@ -472,7 +486,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, s
 		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
 	}

-	rt = __ip_vs_get_out_rt_v6(cp);
+	rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0);
 	if (!rt)
 		goto tx_error_icmp;

@@ -557,7 +571,6 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, s
 	struct iphdr  *old_iph = ip_hdr(skb);
 	u8     tos = old_iph->tos;
 	__be16 df = old_iph->frag_off;
-	sk_buff_data_t old_transport_header = skb->transport_header;
 	struct iphdr  *iph;			/* Our new IP header */
 	unsigned int max_headroom;		/* The extra header space needed */
 	int    mtu;
@@ -572,7 +585,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, s
 		goto tx_error;
 	}

-	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos))))
+	if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(tos))))
 		goto tx_error_icmp;

 	tdev = rt->dst.dev;
@@ -616,7 +629,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, s
 		old_iph = ip_hdr(skb);
 	}

-	skb->transport_header = old_transport_header;
+	skb->transport_header = skb->network_header;

 	/* fix old IP header checksum */
 	ip_send_check(old_iph);
@@ -670,9 +683,9 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb
 		     struct ip_vs_protocol *pp)
 {
 	struct rt6_info *rt;		/* Route to the other host */
+	struct in6_addr saddr;		/* Source for tunnel */
 	struct net_device *tdev;	/* Device to other host */
 	struct ipv6hdr  *old_iph = ipv6_hdr(skb);
-	sk_buff_data_t old_transport_header = skb->transport_header;
 	struct ipv6hdr  *iph;		/* Our new IP header */
 	unsigned int max_headroom;	/* The extra header space needed */
 	int    mtu;
@@ -687,17 +700,17 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb
 		goto tx_error;
 	}

-	rt = __ip_vs_get_out_rt_v6(cp);
+	rt = __ip_vs_get_out_rt_v6(skb, cp, &saddr, 1);
 	if (!rt)
 		goto tx_error_icmp;

 	tdev = rt->dst.dev;

 	mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
-	/* TODO IPv6: do we need this check in IPv6? */
-	if (mtu < 1280) {
+	if (mtu < IPV6_MIN_MTU) {
 		dst_release(&rt->dst);
-		IP_VS_DBG_RL("%s(): mtu less than 1280\n", __func__);
+		IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
+			     IPV6_MIN_MTU);
 		goto tx_error;
 	}
 	if (skb_dst(skb))
@@ -730,7 +743,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb
 		old_iph = ipv6_hdr(skb);
 	}

-	skb->transport_header = old_transport_header;
+	skb->transport_header = skb->network_header;

 	skb_push(skb, sizeof(struct ipv6hdr));
 	skb_reset_network_header(skb);
@@ -750,8 +763,8 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb
 	be16_add_cpu(&iph->payload_len, sizeof(*old_iph));
 	iph->priority		=	old_iph->priority;
 	memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl));
-	iph->daddr		=	rt->rt6i_dst.addr;
-	iph->saddr		=	cp->vaddr.in6; /* rt->rt6i_src.addr; */
+	ipv6_addr_copy(&iph->daddr, &rt->rt6i_dst.addr);
+	ipv6_addr_copy(&iph->saddr, &saddr);
 	iph->hop_limit		=	old_iph->hop_limit;

 	/* Another hack: avoid icmp_send in ip_fragment */
@@ -791,7 +804,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struc

 	EnterFunction(10);

-	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
+	if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(iph->tos))))
 		goto tx_error_icmp;

 	/* MTU checking */
@@ -843,7 +856,7 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, st

 	EnterFunction(10);

-	rt = __ip_vs_get_out_rt_v6(cp);
+	rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0);
 	if (!rt)
 		goto tx_error_icmp;

@@ -919,7 +932,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, str
 	 * mangle and send the packet here (only for VS/NAT)
 	 */

-	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(ip_hdr(skb)->tos))))
+	if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(ip_hdr(skb)->tos))))
 		goto tx_error_icmp;

 	/* MTU checking */
@@ -993,7 +1006,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb,
 	 * mangle and send the packet here (only for VS/NAT)
 	 */

-	rt = __ip_vs_get_out_rt_v6(cp);
+	rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0);
 	if (!rt)
 		goto tx_error_icmp;


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [patch v2] ipvs: IPv6 tunnel mode
  2010-09-26 13:33 [patch v2] " Simon Horman
@ 2010-09-26 14:21 ` Julian Anastasov
  2010-09-29 13:30   ` Hans Schillstrom
  2010-09-27  6:00 ` Hans Schillstrom
  1 sibling, 1 reply; 13+ messages in thread
From: Julian Anastasov @ 2010-09-26 14:21 UTC (permalink / raw)
  To: Simon Horman; +Cc: Hans Schillstrom, lvs-devel, Julius Volz


 	Hello,

On Sun, 26 Sep 2010, Simon Horman wrote:

> From: Julian Anastasov <ja@ssi.bg>
>
> Tunnel mode for IPv6 doesn't work.
>
> IPv6 encapsulation uses a bad source address for the tunnel.
> i.e. VIP will be used as local-addr and encap. dst addr.
> Decapsulation will not accept this.
>
> Example
> LVS (eth1 2003::2:0:1/96, VIP 2003::2:0:100)
>   (eth0 2003::1:0:1/96)
> RS  (ethX 2003::1:0:5/96)
>
> tcpdump
> 2003::2:0:100 > 2003::1:0:5:
> IP6 (hlim 63, next-header TCP (6) payload length: 40)
> 2003::3:0:10.50991 > 2003::2:0:100.http: Flags [S], cksum 0x7312
> (correct), seq 3006460279, win 5760, options [mss 1440,sackOK,TS val
> 1904932 ecr 0,nop,wscale 3], length 0
>
> In Linux IPv6 impl. you can't have a tunnel with an any cast address
> receiving packets (I have not tried to interpret RFC 2473)
> To have receive capabilities the tunnel must have:
> - Local address set as multicast addr or an unicast addr
> - Remote address set as an unicast addr.
> - Loop back addres or Link local address are not allowed.
>
> This causes us to setup a tunnel in the Real Server with the
> LVS as the remote address, here you can't use the VIP address since it's
> used inside the tunnel.
>
> Solution
> Use outgoing interface IPv6 address (match against the destination).
> i.e. use ip6_route_output() to look up the route cache and
> then use ipv6_dev_get_saddr(...) to set the source address of the
> encapsulated packet.
>
> Additionally, cache the results in new destination
> fields: dst_cookie and dst_saddr and properly check the
> returned dst from ip6_route_output. We now add xfrm_lookup
> call only for the tunneling method where the source address
> is a local one.
>
> Original patch by Hans Schillstrom.
> Check dst state and cache results for IPv6 by Julian Anastasov.
>
> Tested-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
> Signed-off-by: Julian Anastasov <ja@ssi.bg>
> Signed-off-by: Simon Horman <horms@verge.net.au>
>
> ---
>
> * v1
>
>  This is Julian's patch with a slightly edited version of the description
>  from Hans's original patch.
>
> * v2
>
>  Updated changelog as per commends from Julian
>
> Is everyone ok with pushing this?
>
> diff -urp net-next-2.6-e548833-nfct_snat_reroute/linux/include/net/ip_vs.h linux/include/net/ip_vs.h
> --- net-next-2.6-e548833-nfct_snat_reroute/linux/include/net/ip_vs.h	2010-09-16 09:03:48.000000000 +0300
> +++ linux/include/net/ip_vs.h	2010-09-22 10:50:18.548963467 +0300
> @@ -509,6 +509,10 @@ struct ip_vs_dest {
> 	spinlock_t		dst_lock;	/* lock of dst_cache */
> 	struct dst_entry	*dst_cache;	/* destination cache entry */
> 	u32			dst_rtos;	/* RT_TOS(tos) for dst */
> +	u32			dst_cookie;
> +#ifdef CONFIG_IP_VS_IPV6
> +	struct in6_addr		dst_saddr;
> +#endif
>
> 	/* for virtual service */
> 	struct ip_vs_service	*svc;		/* service it belongs to */
> diff -urp net-next-2.6-e548833-nfct_snat_reroute/linux/net/netfilter/ipvs/ip_vs_xmit.c linux/net/netfilter/ipvs/ip_vs_xmit.c
> --- net-next-2.6-e548833-nfct_snat_reroute/linux/net/netfilter/ipvs/ip_vs_xmit.c	2010-09-16 09:02:25.000000000 +0300
> +++ linux/net/netfilter/ipvs/ip_vs_xmit.c	2010-09-22 16:29:43.271964521 +0300
> @@ -26,6 +26,7 @@
> #include <net/route.h>                  /* for ip_route_output */
> #include <net/ipv6.h>
> #include <net/ip6_route.h>
> +#include <net/addrconf.h>
> #include <linux/icmpv6.h>
> #include <linux/netfilter.h>
> #include <linux/netfilter_ipv4.h>
> @@ -37,26 +38,27 @@
>  *      Destination cache to speed up outgoing route lookup
>  */
> static inline void
> -__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst)
> +__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst,
> +		u32 dst_cookie)
> {
> 	struct dst_entry *old_dst;
>
> 	old_dst = dest->dst_cache;
> 	dest->dst_cache = dst;
> 	dest->dst_rtos = rtos;
> +	dest->dst_cookie = dst_cookie;
> 	dst_release(old_dst);
> }
>
> static inline struct dst_entry *
> -__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
> +__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos)
> {
> 	struct dst_entry *dst = dest->dst_cache;
>
> 	if (!dst)
> 		return NULL;
> -	if ((dst->obsolete
> -	     || (dest->af == AF_INET && rtos != dest->dst_rtos)) &&
> -	    dst->ops->check(dst, cookie) == NULL) {
> +	if ((dst->obsolete || rtos != dest->dst_rtos) &&
> +	    dst->ops->check(dst, dest->dst_cookie) == NULL) {
> 		dest->dst_cache = NULL;
> 		dst_release(dst);
> 		return NULL;
> @@ -66,15 +68,16 @@ __ip_vs_dst_check(struct ip_vs_dest *des
> }
>
> static struct rtable *
> -__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
> +__ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_conn *cp, u32 rtos)
> {
> +	struct net *net = dev_net(skb->dev);
> 	struct rtable *rt;			/* Route to the other host */
> 	struct ip_vs_dest *dest = cp->dest;
>
> 	if (dest) {
> 		spin_lock(&dest->dst_lock);
> 		if (!(rt = (struct rtable *)
> -		      __ip_vs_dst_check(dest, rtos, 0))) {
> +		      __ip_vs_dst_check(dest, rtos))) {
> 			struct flowi fl = {
> 				.oif = 0,
> 				.nl_u = {
> @@ -84,13 +87,13 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp
> 						.tos = rtos, } },
> 			};
>
> -			if (ip_route_output_key(&init_net, &rt, &fl)) {
> +			if (ip_route_output_key(net, &rt, &fl)) {
> 				spin_unlock(&dest->dst_lock);
> 				IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
> 					     &dest->addr.ip);
> 				return NULL;
> 			}
> -			__ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst));
> +			__ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst), 0);
> 			IP_VS_DBG(10, "new dst %pI4, refcnt=%d, rtos=%X\n",
> 				  &dest->addr.ip,
> 				  atomic_read(&rt->dst.__refcnt), rtos);
> @@ -106,7 +109,7 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp
> 					.tos = rtos, } },
> 		};
>
> -		if (ip_route_output_key(&init_net, &rt, &fl)) {
> +		if (ip_route_output_key(net, &rt, &fl)) {
> 			IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
> 				     &cp->daddr.ip);
> 			return NULL;
> @@ -117,62 +120,79 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp
> }
>
> #ifdef CONFIG_IP_VS_IPV6
> +
> +static struct dst_entry *
> +__ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr,
> +			struct in6_addr *ret_saddr, int do_xfrm)
> +{
> +	struct dst_entry *dst;
> +	struct flowi fl = {
> +		.oif = 0,
> +		.nl_u = {
> +			.ip6_u = {
> +				.daddr = *daddr,
> +			},
> +		},
> +	};
> +
> +	dst = ip6_route_output(net, NULL, &fl);
> +	if (dst->error)
> +		goto out_err;
> +	if (!ret_saddr)
> +		return dst;
> +	if (ipv6_addr_any(&fl.fl6_src) &&
> +	    ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev,
> +			       &fl.fl6_dst, 0, &fl.fl6_src) < 0)
> +		goto out_err;
> +	if (do_xfrm && xfrm_lookup(net, &dst, &fl, NULL, 0) < 0)
> +		goto out_err;
> +	ipv6_addr_copy(ret_saddr, &fl.fl6_src);
> +	return dst;
> +
> +out_err:
> +	dst_release(dst);
> +	IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", daddr);
> +	return NULL;
> +}
> +
> static struct rt6_info *
> -__ip_vs_get_out_rt_v6(struct ip_vs_conn *cp)
> +__ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
> +		      struct in6_addr *ret_saddr, int do_xfrm)
> {
> +	struct net *net = dev_net(skb->dev);
> 	struct rt6_info *rt;			/* Route to the other host */
> 	struct ip_vs_dest *dest = cp->dest;
> +	struct dst_entry *dst;
>
> 	if (dest) {
> 		spin_lock(&dest->dst_lock);
> -		rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0, 0);
> +		rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0);
> 		if (!rt) {
> -			struct flowi fl = {
> -				.oif = 0,
> -				.nl_u = {
> -					.ip6_u = {
> -						.daddr = dest->addr.in6,
> -						.saddr = {
> -							.s6_addr32 =
> -								{ 0, 0, 0, 0 },
> -						},
> -					},
> -				},
> -			};
> +			u32 cookie;
>
> -			rt = (struct rt6_info *)ip6_route_output(&init_net,
> -								 NULL, &fl);
> -			if (!rt) {
> +			dst = __ip_vs_route_output_v6(net, &dest->addr.in6,
> +						      &dest->dst_saddr,
> +						      do_xfrm);
> +			if (!dst) {
> 				spin_unlock(&dest->dst_lock);
> -				IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n",
> -					     &dest->addr.in6);
> 				return NULL;
> 			}
> -			__ip_vs_dst_set(dest, 0, dst_clone(&rt->dst));
> -			IP_VS_DBG(10, "new dst %pI6, refcnt=%d\n",
> -				  &dest->addr.in6,
> +			rt = (struct rt6_info *) dst;
> +			cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
> +			__ip_vs_dst_set(dest, 0, dst_clone(&rt->dst), cookie);
> +			IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
> +				  &dest->addr.in6, &dest->dst_saddr,
> 				  atomic_read(&rt->dst.__refcnt));
> 		}
> +		if (ret_saddr)
> +			ipv6_addr_copy(ret_saddr, &dest->dst_saddr);
> 		spin_unlock(&dest->dst_lock);
> 	} else {
> -		struct flowi fl = {
> -			.oif = 0,
> -			.nl_u = {
> -				.ip6_u = {
> -					.daddr = cp->daddr.in6,
> -					.saddr = {
> -						.s6_addr32 = { 0, 0, 0, 0 },
> -					},
> -				},
> -			},
> -		};
> -
> -		rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
> -		if (!rt) {
> -			IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n",
> -				     &cp->daddr.in6);
> +		dst = __ip_vs_route_output_v6(net, &cp->daddr.in6, ret_saddr,
> +					      do_xfrm);
> +		if (!dst)
> 			return NULL;
> -		}
> +		rt = (struct rt6_info *) dst;
> 	}
>
> 	return rt;
> @@ -248,6 +268,7 @@ int
> ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
> 		  struct ip_vs_protocol *pp)
> {
> +	struct net *net = dev_net(skb->dev);
> 	struct rtable *rt;			/* Route to the other host */
> 	struct iphdr  *iph = ip_hdr(skb);
> 	u8     tos = iph->tos;
> @@ -263,7 +284,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, s
>
> 	EnterFunction(10);
>
> -	if (ip_route_output_key(&init_net, &rt, &fl)) {
> +	if (ip_route_output_key(net, &rt, &fl)) {
> 		IP_VS_DBG_RL("%s(): ip_route_output error, dest: %pI4\n",
> 			     __func__, &iph->daddr);
> 		goto tx_error_icmp;
> @@ -313,25 +334,18 @@ int
> ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
> 		     struct ip_vs_protocol *pp)
> {
> +	struct net *net = dev_net(skb->dev);
> +	struct dst_entry *dst;
> 	struct rt6_info *rt;			/* Route to the other host */
> 	struct ipv6hdr  *iph = ipv6_hdr(skb);
> 	int    mtu;
> -	struct flowi fl = {
> -		.oif = 0,
> -		.nl_u = {
> -			.ip6_u = {
> -				.daddr = iph->daddr,
> -				.saddr = { .s6_addr32 = {0, 0, 0, 0} }, } },
> -	};
>
> 	EnterFunction(10);
>
> -	rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
> -	if (!rt) {
> -		IP_VS_DBG_RL("%s(): ip6_route_output error, dest: %pI6\n",
> -			     __func__, &iph->daddr);
> +	dst = __ip_vs_route_output_v6(net, &iph->daddr, NULL, 0);
> +	if (!dst)
> 		goto tx_error_icmp;
> -	}
> +	rt = (struct rt6_info *) dst;
>
> 	/* MTU checking */
> 	mtu = dst_mtu(&rt->dst);
> @@ -397,7 +411,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, stru
> 		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
> 	}
>
> -	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
> +	if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(iph->tos))))
> 		goto tx_error_icmp;
>
> 	/* MTU checking */
> @@ -472,7 +486,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, s
> 		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
> 	}
>
> -	rt = __ip_vs_get_out_rt_v6(cp);
> +	rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0);
> 	if (!rt)
> 		goto tx_error_icmp;
>
> @@ -557,7 +571,6 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, s
> 	struct iphdr  *old_iph = ip_hdr(skb);
> 	u8     tos = old_iph->tos;
> 	__be16 df = old_iph->frag_off;
> -	sk_buff_data_t old_transport_header = skb->transport_header;
> 	struct iphdr  *iph;			/* Our new IP header */
> 	unsigned int max_headroom;		/* The extra header space needed */
> 	int    mtu;
> @@ -572,7 +585,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, s
> 		goto tx_error;
> 	}
>
> -	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos))))
> +	if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(tos))))
> 		goto tx_error_icmp;
>
> 	tdev = rt->dst.dev;
> @@ -616,7 +629,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, s
> 		old_iph = ip_hdr(skb);
> 	}
>
> -	skb->transport_header = old_transport_header;
> +	skb->transport_header = skb->network_header;
>
> 	/* fix old IP header checksum */
> 	ip_send_check(old_iph);
> @@ -670,9 +683,9 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb
> 		     struct ip_vs_protocol *pp)
> {
> 	struct rt6_info *rt;		/* Route to the other host */
> +	struct in6_addr saddr;		/* Source for tunnel */
> 	struct net_device *tdev;	/* Device to other host */
> 	struct ipv6hdr  *old_iph = ipv6_hdr(skb);
> -	sk_buff_data_t old_transport_header = skb->transport_header;
> 	struct ipv6hdr  *iph;		/* Our new IP header */
> 	unsigned int max_headroom;	/* The extra header space needed */
> 	int    mtu;
> @@ -687,17 +700,17 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb
> 		goto tx_error;
> 	}
>
> -	rt = __ip_vs_get_out_rt_v6(cp);
> +	rt = __ip_vs_get_out_rt_v6(skb, cp, &saddr, 1);
> 	if (!rt)
> 		goto tx_error_icmp;
>
> 	tdev = rt->dst.dev;
>
> 	mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
> -	/* TODO IPv6: do we need this check in IPv6? */
> -	if (mtu < 1280) {
> +	if (mtu < IPV6_MIN_MTU) {
> 		dst_release(&rt->dst);
> -		IP_VS_DBG_RL("%s(): mtu less than 1280\n", __func__);
> +		IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
> +			     IPV6_MIN_MTU);
> 		goto tx_error;
> 	}
> 	if (skb_dst(skb))
> @@ -730,7 +743,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb
> 		old_iph = ipv6_hdr(skb);
> 	}
>
> -	skb->transport_header = old_transport_header;
> +	skb->transport_header = skb->network_header;
>
> 	skb_push(skb, sizeof(struct ipv6hdr));
> 	skb_reset_network_header(skb);
> @@ -750,8 +763,8 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb
> 	be16_add_cpu(&iph->payload_len, sizeof(*old_iph));
> 	iph->priority		=	old_iph->priority;
> 	memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl));
> -	iph->daddr		=	rt->rt6i_dst.addr;
> -	iph->saddr		=	cp->vaddr.in6; /* rt->rt6i_src.addr; */
> +	ipv6_addr_copy(&iph->daddr, &rt->rt6i_dst.addr);
> +	ipv6_addr_copy(&iph->saddr, &saddr);
> 	iph->hop_limit		=	old_iph->hop_limit;
>
> 	/* Another hack: avoid icmp_send in ip_fragment */
> @@ -791,7 +804,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struc
>
> 	EnterFunction(10);
>
> -	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
> +	if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(iph->tos))))
> 		goto tx_error_icmp;
>
> 	/* MTU checking */
> @@ -843,7 +856,7 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, st
>
> 	EnterFunction(10);
>
> -	rt = __ip_vs_get_out_rt_v6(cp);
> +	rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0);
> 	if (!rt)
> 		goto tx_error_icmp;
>
> @@ -919,7 +932,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, str
> 	 * mangle and send the packet here (only for VS/NAT)
> 	 */
>
> -	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(ip_hdr(skb)->tos))))
> +	if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(ip_hdr(skb)->tos))))
> 		goto tx_error_icmp;
>
> 	/* MTU checking */
> @@ -993,7 +1006,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb,
> 	 * mangle and send the packet here (only for VS/NAT)
> 	 */
>
> -	rt = __ip_vs_get_out_rt_v6(cp);
> +	rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0);
> 	if (!rt)
> 		goto tx_error_icmp;
>

 	Looks good to me

Regards

--
Julian Anastasov <ja@ssi.bg>

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [patch v2] ipvs: IPv6 tunnel mode
  2010-09-26 14:21 ` Julian Anastasov
@ 2010-09-29 13:30   ` Hans Schillstrom
  2010-09-30 22:55     ` Julian Anastasov
  0 siblings, 1 reply; 13+ messages in thread
From: Hans Schillstrom @ 2010-09-29 13:30 UTC (permalink / raw)
  To: Julian Anastasov; +Cc: Simon Horman, lvs-devel@vger.kernel.org, Julius Volz

Hello
I think this patch should be stopped since it doesn't work in some
cases. 
There is also a bug in the patch.
ip6_route_output() returns a route whose key is then used as the dest
address; that key can be a network, and that's not a good dest address.
(This can be seen when the tunnel remote endpoint has to be reached
through a router/gateway.)

@@ -750,8 +763,8 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb
...
...
-        ipv6_addr_copy(&iph->daddr, &rt->rt6i_dst.addr);
+        ipv6_addr_copy(&iph->daddr, &cp->dest->addr.in6);



The principle behind it is wrong: depending on your network topology,
it's more or less impossible to get the expected address from
ipv6_get_saddr_eval().

Since there must be an address set on the remote/local pair for an IPv6
tunnel, and you can't predict what source address you will get, the
function is more or less useless.

After spending a day single stepping through the IPv6 stack I'm more or
less sure that:
 - The only way to get IPv6 tunnels to work in a predictable way is to
add a "tunnel source address" to the ipvsadm command.

What to do ?
- add a --tunsrc switch to ipvsadm  -i command ?
 or
 - Disable the IPv6 tunnel mode ?
 or
 - Leave it as is ?

Regards
Hans Schillstrom <hans.schillstrom@ericsson.com>


On Sun, 2010-09-26 at 16:21 +0200, Julian Anastasov wrote:
> Hello,
> 
> On Sun, 26 Sep 2010, Simon Horman wrote:
> 
> > From: Julian Anastasov <ja@ssi.bg>
> >
> > Tunnel mode for IPv6 doesn't work.
> >
> > IPv6 encapsulation uses a bad source address for the tunnel.
> > i.e. VIP will be used as local-addr and encap. dst addr.
> > Decapsulation will not accept this.
> >
> > Example
> > LVS (eth1 2003::2:0:1/96, VIP 2003::2:0:100)
> >   (eth0 2003::1:0:1/96)
> > RS  (ethX 2003::1:0:5/96)
> >
> > tcpdump
> > 2003::2:0:100 > 2003::1:0:5:
> > IP6 (hlim 63, next-header TCP (6) payload length: 40)
> > 2003::3:0:10.50991 > 2003::2:0:100.http: Flags [S], cksum 0x7312
> > (correct), seq 3006460279, win 5760, options [mss 1440,sackOK,TS val
> > 1904932 ecr 0,nop,wscale 3], length 0
> >
> > In Linux IPv6 impl. you can't have a tunnel with an any cast address
> > receiving packets (I have not tried to interpret RFC 2473)
> > To have receive capabilities the tunnel must have:
> > - Local address set as multicast addr or an unicast addr
> > - Remote address set as an unicast addr.
> > - Loop back addres or Link local address are not allowed.
> >
> > This causes us to setup a tunnel in the Real Server with the
> > LVS as the remote address, here you can't use the VIP address since it's
> > used inside the tunnel.
> >
> > Solution
> > Use outgoing interface IPv6 address (match against the destination).
> > i.e. use ip6_route_output() to look up the route cache and
> > then use ipv6_dev_get_saddr(...) to set the source address of the
> > encapsulated packet.
> >
> > Additionally, cache the results in new destination
> > fields: dst_cookie and dst_saddr and properly check the
> > returned dst from ip6_route_output. We now add xfrm_lookup
> > call only for the tunneling method where the source address
> > is a local one.
> >
> > Original patch by Hans Schillstrom.
> > Check dst state and cache results for IPv6 by Julian Anastasov.
> >
> > Tested-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
> > Signed-off-by: Julian Anastasov <ja@ssi.bg>
> > Signed-off-by: Simon Horman <horms@verge.net.au>
> >
> > ---
> >
> > * v1
> >
> >  This is Julian's patch with a slightly edited version of the description
> >  from Hans's original patch.
> >
> > * v2
> >
> >  Updated changelog as per commends from Julian
> >
> > Is everyone ok with pushing this?
> >
> > diff -urp net-next-2.6-e548833-nfct_snat_reroute/linux/include/net/ip_vs.h linux/include/net/ip_vs.h
> > --- net-next-2.6-e548833-nfct_snat_reroute/linux/include/net/ip_vs.h  2010-09-16 09:03:48.000000000 +0300
> > +++ linux/include/net/ip_vs.h 2010-09-22 10:50:18.548963467 +0300
> > @@ -509,6 +509,10 @@ struct ip_vs_dest {
> >       spinlock_t              dst_lock;       /* lock of dst_cache */
> >       struct dst_entry        *dst_cache;     /* destination cache entry */
> >       u32                     dst_rtos;       /* RT_TOS(tos) for dst */
> > +     u32                     dst_cookie;
> > +#ifdef CONFIG_IP_VS_IPV6
> > +     struct in6_addr         dst_saddr;
> > +#endif
> >
> >       /* for virtual service */
> >       struct ip_vs_service    *svc;           /* service it belongs to */
> > diff -urp net-next-2.6-e548833-nfct_snat_reroute/linux/net/netfilter/ipvs/ip_vs_xmit.c linux/net/netfilter/ipvs/ip_vs_xmit.c
> > --- net-next-2.6-e548833-nfct_snat_reroute/linux/net/netfilter/ipvs/ip_vs_xmit.c      2010-09-16 09:02:25.000000000 +0300
> > +++ linux/net/netfilter/ipvs/ip_vs_xmit.c     2010-09-22 16:29:43.271964521 +0300
> > @@ -26,6 +26,7 @@
> > #include <net/route.h>                  /* for ip_route_output */
> > #include <net/ipv6.h>
> > #include <net/ip6_route.h>
> > +#include <net/addrconf.h>
> > #include <linux/icmpv6.h>
> > #include <linux/netfilter.h>
> > #include <linux/netfilter_ipv4.h>
> > @@ -37,26 +38,27 @@
> >  *      Destination cache to speed up outgoing route lookup
> >  */
> > static inline void
> > -__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst)
> > +__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst,
> > +             u32 dst_cookie)
> > {
> >       struct dst_entry *old_dst;
> >
> >       old_dst = dest->dst_cache;
> >       dest->dst_cache = dst;
> >       dest->dst_rtos = rtos;
> > +     dest->dst_cookie = dst_cookie;
> >       dst_release(old_dst);
> > }
> >
> > static inline struct dst_entry *
> > -__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
> > +__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos)
> > {
> >       struct dst_entry *dst = dest->dst_cache;
> >
> >       if (!dst)
> >               return NULL;
> > -     if ((dst->obsolete
> > -          || (dest->af == AF_INET && rtos != dest->dst_rtos)) &&
> > -         dst->ops->check(dst, cookie) == NULL) {
> > +     if ((dst->obsolete || rtos != dest->dst_rtos) &&
> > +         dst->ops->check(dst, dest->dst_cookie) == NULL) {
> >               dest->dst_cache = NULL;
> >               dst_release(dst);
> >               return NULL;
> > @@ -66,15 +68,16 @@ __ip_vs_dst_check(struct ip_vs_dest *des
> > }
> >
> > static struct rtable *
> > -__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
> > +__ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_conn *cp, u32 rtos)
> > {
> > +     struct net *net = dev_net(skb->dev);
> >       struct rtable *rt;                      /* Route to the other host */
> >       struct ip_vs_dest *dest = cp->dest;
> >
> >       if (dest) {
> >               spin_lock(&dest->dst_lock);
> >               if (!(rt = (struct rtable *)
> > -                   __ip_vs_dst_check(dest, rtos, 0))) {
> > +                   __ip_vs_dst_check(dest, rtos))) {
> >                       struct flowi fl = {
> >                               .oif = 0,
> >                               .nl_u = {
> > @@ -84,13 +87,13 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp
> >                                               .tos = rtos, } },
> >                       };
> >
> > -                     if (ip_route_output_key(&init_net, &rt, &fl)) {
> > +                     if (ip_route_output_key(net, &rt, &fl)) {
> >                               spin_unlock(&dest->dst_lock);
> >                               IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
> >                                            &dest->addr.ip);
> >                               return NULL;
> >                       }
> > -                     __ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst));
> > +                     __ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst), 0);
> >                       IP_VS_DBG(10, "new dst %pI4, refcnt=%d, rtos=%X\n",
> >                                 &dest->addr.ip,
> >                                 atomic_read(&rt->dst.__refcnt), rtos);
> > @@ -106,7 +109,7 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp
> >                                       .tos = rtos, } },
> >               };
> >
> > -             if (ip_route_output_key(&init_net, &rt, &fl)) {
> > +             if (ip_route_output_key(net, &rt, &fl)) {
> >                       IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
> >                                    &cp->daddr.ip);
> >                       return NULL;
> > @@ -117,62 +120,79 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp
> > }
> >
> > #ifdef CONFIG_IP_VS_IPV6
> > +
> > +static struct dst_entry *
> > +__ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr,
> > +                     struct in6_addr *ret_saddr, int do_xfrm)
> > +{
> > +     struct dst_entry *dst;
> > +     struct flowi fl = {
> > +             .oif = 0,
> > +             .nl_u = {
> > +                     .ip6_u = {
> > +                             .daddr = *daddr,
> > +                     },
> > +             },
> > +     };
> > +
> > +     dst = ip6_route_output(net, NULL, &fl);
> > +     if (dst->error)
> > +             goto out_err;
> > +     if (!ret_saddr)
> > +             return dst;
> > +     if (ipv6_addr_any(&fl.fl6_src) &&
> > +         ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev,
> > +                            &fl.fl6_dst, 0, &fl.fl6_src) < 0)
> > +             goto out_err;
> > +     if (do_xfrm && xfrm_lookup(net, &dst, &fl, NULL, 0) < 0)
> > +             goto out_err;
> > +     ipv6_addr_copy(ret_saddr, &fl.fl6_src);
> > +     return dst;
> > +
> > +out_err:
> > +     dst_release(dst);
> > +     IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", daddr);
> > +     return NULL;
> > +}
> > +
> > static struct rt6_info *
> > -__ip_vs_get_out_rt_v6(struct ip_vs_conn *cp)
> > +__ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
> > +                   struct in6_addr *ret_saddr, int do_xfrm)
> > {
> > +     struct net *net = dev_net(skb->dev);
> >       struct rt6_info *rt;                    /* Route to the other host */
> >       struct ip_vs_dest *dest = cp->dest;
> > +     struct dst_entry *dst;
> >
> >       if (dest) {
> >               spin_lock(&dest->dst_lock);
> > -             rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0, 0);
> > +             rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0);
> >               if (!rt) {
> > -                     struct flowi fl = {
> > -                             .oif = 0,
> > -                             .nl_u = {
> > -                                     .ip6_u = {
> > -                                             .daddr = dest->addr.in6,
> > -                                             .saddr = {
> > -                                                     .s6_addr32 =
> > -                                                             { 0, 0, 0, 0 },
> > -                                             },
> > -                                     },
> > -                             },
> > -                     };
> > +                     u32 cookie;
> >
> > -                     rt = (struct rt6_info *)ip6_route_output(&init_net,
> > -                                                              NULL, &fl);
> > -                     if (!rt) {
> > +                     dst = __ip_vs_route_output_v6(net, &dest->addr.in6,
> > +                                                   &dest->dst_saddr,
> > +                                                   do_xfrm);
> > +                     if (!dst) {
> >                               spin_unlock(&dest->dst_lock);
> > -                             IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n",
> > -                                          &dest->addr.in6);
> >                               return NULL;
> >                       }
> > -                     __ip_vs_dst_set(dest, 0, dst_clone(&rt->dst));
> > -                     IP_VS_DBG(10, "new dst %pI6, refcnt=%d\n",
> > -                               &dest->addr.in6,
> > +                     rt = (struct rt6_info *) dst;
> > +                     cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
> > +                     __ip_vs_dst_set(dest, 0, dst_clone(&rt->dst), cookie);
> > +                     IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
> > +                               &dest->addr.in6, &dest->dst_saddr,
> >                                 atomic_read(&rt->dst.__refcnt));
> >               }
> > +             if (ret_saddr)
> > +                     ipv6_addr_copy(ret_saddr, &dest->dst_saddr);
> >               spin_unlock(&dest->dst_lock);
> >       } else {
> > -             struct flowi fl = {
> > -                     .oif = 0,
> > -                     .nl_u = {
> > -                             .ip6_u = {
> > -                                     .daddr = cp->daddr.in6,
> > -                                     .saddr = {
> > -                                             .s6_addr32 = { 0, 0, 0, 0 },
> > -                                     },
> > -                             },
> > -                     },
> > -             };
> > -
> > -             rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
> > -             if (!rt) {
> > -                     IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n",
> > -                                  &cp->daddr.in6);
> > +             dst = __ip_vs_route_output_v6(net, &cp->daddr.in6, ret_saddr,
> > +                                           do_xfrm);
> > +             if (!dst)
> >                       return NULL;
> > -             }
> > +             rt = (struct rt6_info *) dst;
> >       }
> >
> >       return rt;
> > @@ -248,6 +268,7 @@ int
> > ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
> >                 struct ip_vs_protocol *pp)
> > {
> > +     struct net *net = dev_net(skb->dev);
> >       struct rtable *rt;                      /* Route to the other host */
> >       struct iphdr  *iph = ip_hdr(skb);
> >       u8     tos = iph->tos;
> > @@ -263,7 +284,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, s
> >
> >       EnterFunction(10);
> >
> > -     if (ip_route_output_key(&init_net, &rt, &fl)) {
> > +     if (ip_route_output_key(net, &rt, &fl)) {
> >               IP_VS_DBG_RL("%s(): ip_route_output error, dest: %pI4\n",
> >                            __func__, &iph->daddr);
> >               goto tx_error_icmp;
> > @@ -313,25 +334,18 @@ int
> > ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
> >                    struct ip_vs_protocol *pp)
> > {
> > +     struct net *net = dev_net(skb->dev);
> > +     struct dst_entry *dst;
> >       struct rt6_info *rt;                    /* Route to the other host */
> >       struct ipv6hdr  *iph = ipv6_hdr(skb);
> >       int    mtu;
> > -     struct flowi fl = {
> > -             .oif = 0,
> > -             .nl_u = {
> > -                     .ip6_u = {
> > -                             .daddr = iph->daddr,
> > -                             .saddr = { .s6_addr32 = {0, 0, 0, 0} }, } },
> > -     };
> >
> >       EnterFunction(10);
> >
> > -     rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
> > -     if (!rt) {
> > -             IP_VS_DBG_RL("%s(): ip6_route_output error, dest: %pI6\n",
> > -                          __func__, &iph->daddr);
> > +     dst = __ip_vs_route_output_v6(net, &iph->daddr, NULL, 0);
> > +     if (!dst)
> >               goto tx_error_icmp;
> > -     }
> > +     rt = (struct rt6_info *) dst;
> >
> >       /* MTU checking */
> >       mtu = dst_mtu(&rt->dst);
> > @@ -397,7 +411,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, stru
> >               IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
> >       }
> >
> > -     if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
> > +     if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(iph->tos))))
> >               goto tx_error_icmp;
> >
> >       /* MTU checking */
> > @@ -472,7 +486,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, s
> >               IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
> >       }
> >
> > -     rt = __ip_vs_get_out_rt_v6(cp);
> > +     rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0);
> >       if (!rt)
> >               goto tx_error_icmp;
> >
> > @@ -557,7 +571,6 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, s
> >       struct iphdr  *old_iph = ip_hdr(skb);
> >       u8     tos = old_iph->tos;
> >       __be16 df = old_iph->frag_off;
> > -     sk_buff_data_t old_transport_header = skb->transport_header;
> >       struct iphdr  *iph;                     /* Our new IP header */
> >       unsigned int max_headroom;              /* The extra header space needed */
> >       int    mtu;
> > @@ -572,7 +585,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, s
> >               goto tx_error;
> >       }
> >
> > -     if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos))))
> > +     if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(tos))))
> >               goto tx_error_icmp;
> >
> >       tdev = rt->dst.dev;
> > @@ -616,7 +629,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, s
> >               old_iph = ip_hdr(skb);
> >       }
> >
> > -     skb->transport_header = old_transport_header;
> > +     skb->transport_header = skb->network_header;
> >
> >       /* fix old IP header checksum */
> >       ip_send_check(old_iph);
> > @@ -670,9 +683,9 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb
> >                    struct ip_vs_protocol *pp)
> > {
> >       struct rt6_info *rt;            /* Route to the other host */
> > +     struct in6_addr saddr;          /* Source for tunnel */
> >       struct net_device *tdev;        /* Device to other host */
> >       struct ipv6hdr  *old_iph = ipv6_hdr(skb);
> > -     sk_buff_data_t old_transport_header = skb->transport_header;
> >       struct ipv6hdr  *iph;           /* Our new IP header */
> >       unsigned int max_headroom;      /* The extra header space needed */
> >       int    mtu;
> > @@ -687,17 +700,17 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb
> >               goto tx_error;
> >       }
> >
> > -     rt = __ip_vs_get_out_rt_v6(cp);
> > +     rt = __ip_vs_get_out_rt_v6(skb, cp, &saddr, 1);
> >       if (!rt)
> >               goto tx_error_icmp;
> >
> >       tdev = rt->dst.dev;
> >
> >       mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
> > -     /* TODO IPv6: do we need this check in IPv6? */
> > -     if (mtu < 1280) {
> > +     if (mtu < IPV6_MIN_MTU) {
> >               dst_release(&rt->dst);
> > -             IP_VS_DBG_RL("%s(): mtu less than 1280\n", __func__);
> > +             IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
> > +                          IPV6_MIN_MTU);
> >               goto tx_error;
> >       }
> >       if (skb_dst(skb))
> > @@ -730,7 +743,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb
> >               old_iph = ipv6_hdr(skb);
> >       }
> >
> > -     skb->transport_header = old_transport_header;
> > +     skb->transport_header = skb->network_header;
> >
> >       skb_push(skb, sizeof(struct ipv6hdr));
> >       skb_reset_network_header(skb);
> > @@ -750,8 +763,8 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb
> >       be16_add_cpu(&iph->payload_len, sizeof(*old_iph));
> >       iph->priority           =       old_iph->priority;
> >       memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl));
> > -     iph->daddr              =       rt->rt6i_dst.addr;
> > -     iph->saddr              =       cp->vaddr.in6; /* rt->rt6i_src.addr; */
> > +     ipv6_addr_copy(&iph->daddr, &rt->rt6i_dst.addr);
> > +     ipv6_addr_copy(&iph->saddr, &saddr);
> >       iph->hop_limit          =       old_iph->hop_limit;
> >
> >       /* Another hack: avoid icmp_send in ip_fragment */
> > @@ -791,7 +804,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struc
> >
> >       EnterFunction(10);
> >
> > -     if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
> > +     if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(iph->tos))))
> >               goto tx_error_icmp;
> >
> >       /* MTU checking */
> > @@ -843,7 +856,7 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, st
> >
> >       EnterFunction(10);
> >
> > -     rt = __ip_vs_get_out_rt_v6(cp);
> > +     rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0);
> >       if (!rt)
> >               goto tx_error_icmp;
> >
> > @@ -919,7 +932,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, str
> >        * mangle and send the packet here (only for VS/NAT)
> >        */
> >
> > -     if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(ip_hdr(skb)->tos))))
> > +     if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(ip_hdr(skb)->tos))))
> >               goto tx_error_icmp;
> >
> >       /* MTU checking */
> > @@ -993,7 +1006,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb,
> >        * mangle and send the packet here (only for VS/NAT)
> >        */
> >
> > -     rt = __ip_vs_get_out_rt_v6(cp);
> > +     rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0);
> >       if (!rt)
> >               goto tx_error_icmp;
> >
> 
>         Looks good to me
> 
> Regards
> 
> --
> Julian Anastasov <ja@ssi.bg>


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [patch v2] ipvs: IPv6 tunnel mode
  2010-09-29 13:30   ` Hans Schillstrom
@ 2010-09-30 22:55     ` Julian Anastasov
  2010-10-01  6:08       ` Hans Schillstrom
  0 siblings, 1 reply; 13+ messages in thread
From: Julian Anastasov @ 2010-09-30 22:55 UTC (permalink / raw)
  To: Hans Schillstrom; +Cc: Simon Horman, lvs-devel@vger.kernel.org, Julius Volz

 	Hello,

On Wed, 29 Sep 2010, Hans Schillstrom wrote:

> Hello
> I think this patch should be stopped since it doesn't work in some
> cases.

 	I think the patch is still correct. Maybe we
just need to create a new patch to fix the remaining problems,
which are old (rt6i_dst use).

> There is also a bug in the patch.
> ip6_route_output() returns a key as dest address
> which can be a network and that's not a good dest address.
> (It can be seen if the tunnel remote endpoint has to pass through an
> router/gateway)

 	You are referring to rt6i_dst?

> @@ -750,8 +763,8 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb
> ...
> ...
> -        ipv6_addr_copy(&iph->daddr, &rt->rt6i_dst.addr);
> +        ipv6_addr_copy(&iph->daddr, &cp->dest->addr.in6);

 	I prefer not to take a risk with cp->dest, see below.
Maybe ipv6_addr_copy(&iph->daddr, &cp->daddr.in6);
is safer. But we can still add more checks for the
daddr type one day when backup supports IPv6.

- both rt6_alloc_cow and rt6_alloc_clone copy our fl->fl6_dst
into rt6i_dst. However, CLONE_OFFLINK_ROUTE is not 1 and
I'm not sure what happens with rt6i_dst in this
case (ip6_pol_route). Maybe you see the route prefix there
as set by ip6_route_add?

> The principle for it is wrong, it's more or less impossible to get the
> expected address from ipv6_get_saddr_eval(), depending on your network
> topology.
>
> Since there must be an address set on the remote/local pair for a IPv6
> tunnel and you can't predict what source address you will get the
> function is more or less useless.

 	Yes, for IPv6 I don't see an option in the tunnel to accept
traffic from any remote address. This was not the case with
IPv4. But the source address selection is rich enough.
I don't know of any example settings for IPVS on IPv6,
but routes in IPv6 can have a source address if CONFIG_IPV6_SUBTREES
is defined (ip6_route_add). Still, the routing should be
able to return a valid source address. I don't know what
settings you are using, but looking at the code I think the
options are:

- assign source address for route

- add VIPs as deprecated addresses (prefered_lft=0). This
way we don't risk them being autoselected as source in
output routes, because they are ignored in ipv6_get_saddr_eval
at step IPV6_SADDR_RULE_PREFERRED

 	As a result, the source address should be properly
selected and our patch looks ok for the saddr part.

> After spending a day single stepping through the IPv6 stack I'm more or
> less sure that:
> - The only way to get IPv6 tunnels to work in a predictable way is to
> add a "tunnel source address" to the ipvsadm command.

 	This is not possible for the case when cp->dest is
NULL, e.g. when a connection using fwmarks is moved to the backup
server. But this does not happen because IPv6 is not supported
for sync. If one day we add support in a new sync format, I'm
sure there will also be fwmark, and cp->dest will be valid
for the tunneling method.

> What to do ?
> - add a --tunsrc switch to ipvsadm  -i command ?
> or
> - Disable the IPv6 tunnel mode ?
> or
> - Leave it as is ?

 	This 3rd option is better for the moment. I think
saddr is selected correctly, see ip6_dst_lookup() and
ip6_dst_lookup_tail() in net/ipv6/ip6_output.c
But if you think using ipv6_addr_copy(&iph->daddr, &cp->daddr.in6);
works, then feel free to post a patch on top of
the previous patch, because other code in the kernel uses the
resulting fl.fl6_dst, not rt6i_dst.

Regards

--
Julian Anastasov <ja@ssi.bg>

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [patch v2] ipvs: IPv6 tunnel mode
  2010-09-30 22:55     ` Julian Anastasov
@ 2010-10-01  6:08       ` Hans Schillstrom
  2010-10-01  7:48         ` Julian Anastasov
  0 siblings, 1 reply; 13+ messages in thread
From: Hans Schillstrom @ 2010-10-01  6:08 UTC (permalink / raw)
  To: Julian Anastasov; +Cc: Simon Horman, lvs-devel@vger.kernel.org, Julius Volz

Hello

On Fri, 2010-10-01 at 00:55 +0200, Julian Anastasov wrote:
> 	Hello,
> 
> On Wed, 29 Sep 2010, Hans Schillstrom wrote:
> 
> > Hello
> > I think this patch should be stopped since it doesn't work in some
> > cases.
> 
>  	I think, the patch is still correct. May be we
> just need to create new patch to fix the remaining problems
> which are old (rt6i_dst use).
> 
> > There is also a bug in the patch.
> > ip6_route_output() returns a key as dest address
> > which can be a network and that's not a good dest address.
> > (It can be seen if the tunnel remote endpoint has to pass through an
> > router/gateway)
> 
>  	You are referring to rt6i_dst?

Yes

> 
> > @@ -750,8 +763,8 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb
> > ...
> > ...
> > -        ipv6_addr_copy(&iph->daddr, &rt->rt6i_dst.addr);
> > +        ipv6_addr_copy(&iph->daddr, &cp->dest->addr.in6);
> 
>  	I prefer not to risk with cp->dest, see below.
> May be ipv6_addr_copy(&iph->daddr, &cp->daddr.in6);
> is safer. But we still can add more checks for the
> daddr type one day when backup supports IPv6.
> 
> - both rt6_alloc_cow and rt6_alloc_clone copy our fl->fl6_dst
> into rt6i_dst. Only that CLONE_OFFLINK_ROUTE is not 1 and
> I'm not sure what happens with rt6i_dst in this
> case (ip6_pol_route). May be you see the route prefix there
> as set by ip6_route_add?

Yes, the destination network  

> 
> > The principle for it is wrong, it's more or less impossible to get the
> > expected address from ipv6_get_saddr_eval(), depending on your network
> > topology.
> >
> > Since there must be an address set on the remote/local pair for a IPv6
> > tunnel and you can't predict what source address you will get the
> > function is more or less useless.
> 
>  	Yes, for IPv6 I don't see option in tunnel to accept
> traffic from any remote address. This was not the case with
> IPv4. But the source address selection is rich enough.
> I don't know for any example settings for IPVS on IPv6
> but routes in IPv6 can have source address if CONFIG_IPV6_SUBTREES
> is defined (ip6_route_add). Still, the routing should be
> able to return valid source address. I don't know what
> settings you are using but looking at code I think the
> options are:
> 
> - assign source address for route
> 
> - add VIPs as deprecated addresses (prefered_lft=0). By this
> way we don't risk they to be autoselected as source in
> output routes because they are ignored in ipv6_get_saddr_eval
> at step IPV6_SADDR_RULE_PREFERRED
> 
>  	As result, the source address should be properly
> selected and our patch looks ok for the saddr part.

Yes you're right as usual :-) 
With "prefered_lft=0" src selection works in most cases now.
So this is not an issue any more.

> 
> > After spending a day single stepping through the IPv6 stack I'm more or
> > less sure that:
> > - The only way to get IPv6 tunnels to work in a predictable way is to
> > add a "tunnel source address" to the ipvsadm command.
> 
>  	This is not possible for the case when cp->dest is
> NULL, eg. when connection using fwmarks is moved to backup
> server. But this does not happen because IPv6 is not supported
> for sync. If one day we add support in new sync format I'm
> sure there will be also fwmark and cp->dest will be valid
> for the tunneling method.
> 
> > What to do ?
> > - add a --tunsrc switch to ipvsadm  -i command ?
> > or
> > - Disable the IPv6 tunnel mode ?
> > or
> > - Leave it as is ?
> 
>  	This 3th option is better for the moment. I think,
> saddr is selected correctly, see ip6_dst_lookup() and
> ip6_dst_lookup_tail() in net/ipv6/ip6_output.c
> But if you think using ipv6_addr_copy(&iph->daddr, &cp->daddr.in6);
> is working then feel free to post a patch on top of
> the previous patch. Because other code in kernel uses
> resulting fl.fl6_dst, not rt6i_dst.

I prefer fl.fl6_dst right now since rt6i_dst gives a bad destination in
many cases.

I will do some more testing before posting the patch.
> 
> Regards
> 
> --
> Julian Anastasov <ja@ssi.bg>


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [patch v2] ipvs: IPv6 tunnel mode
  2010-10-01  6:08       ` Hans Schillstrom
@ 2010-10-01  7:48         ` Julian Anastasov
  0 siblings, 0 replies; 13+ messages in thread
From: Julian Anastasov @ 2010-10-01  7:48 UTC (permalink / raw)
  To: Hans Schillstrom; +Cc: Simon Horman, lvs-devel@vger.kernel.org, Julius Volz


 	Hello,

On Fri, 1 Oct 2010, Hans Schillstrom wrote:

>>> - Leave it as is ?
>>
>>  	This 3th option is better for the moment. I think,
>> saddr is selected correctly, see ip6_dst_lookup() and
>> ip6_dst_lookup_tail() in net/ipv6/ip6_output.c
>> But if you think using ipv6_addr_copy(&iph->daddr, &cp->daddr.in6);
>> is working then feel free to post a patch on top of
>> the previous patch. Because other code in kernel uses
>> resulting fl.fl6_dst, not rt6i_dst.
>
> I prefer fl.fl6_dst right now since rt6i_dst gives a bad destination in
> many cases.
>
> I will do some more testing before posting the patch.

 	OK, but note that 'fl' is now hidden inside the function
and not cached. I think fl.fl6_dst is read-only, so it
should contain cp->daddr.in6 in all cases.

Regards

--
Julian Anastasov <ja@ssi.bg>

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [patch v2] ipvs: IPv6 tunnel mode
  2010-09-26 13:33 [patch v2] " Simon Horman
  2010-09-26 14:21 ` Julian Anastasov
@ 2010-09-27  6:00 ` Hans Schillstrom
  1 sibling, 0 replies; 13+ messages in thread
From: Hans Schillstrom @ 2010-09-27  6:00 UTC (permalink / raw)
  To: Simon Horman; +Cc: Julian Anastasov, lvs-devel@vger.kernel.org, Julius Volz

Hello,

On Sun, 2010-09-26 at 15:33 +0200, Simon Horman wrote:
> From: Julian Anastasov <ja@ssi.bg>
> 
> Tunnel mode for IPv6 doesn't work.
> 
> IPv6 encapsulation uses a bad source address for the tunnel.
> i.e. VIP will be used as local-addr and encap. dst addr.
> Decapsulation will not accept this.
> 
> Example
> LVS (eth1 2003::2:0:1/96, VIP 2003::2:0:100)
>    (eth0 2003::1:0:1/96)
> RS  (ethX 2003::1:0:5/96)
> 
> tcpdump
> 2003::2:0:100 > 2003::1:0:5:
> IP6 (hlim 63, next-header TCP (6) payload length: 40)
>  2003::3:0:10.50991 > 2003::2:0:100.http: Flags [S], cksum 0x7312
> (correct), seq 3006460279, win 5760, options [mss 1440,sackOK,TS val
> 1904932 ecr 0,nop,wscale 3], length 0
> 
> In Linux IPv6 impl. you can't have a tunnel with an any cast address
> receiving packets (I have not tried to interpret RFC 2473)
> To have receive capabilities the tunnel must have:
>  - Local address set as multicast addr or an unicast addr
>  - Remote address set as an unicast addr.
>  - Loopback address or link-local address are not allowed.
> 
> This causes us to setup a tunnel in the Real Server with the
> LVS as the remote address, here you can't use the VIP address since it's
> used inside the tunnel.
> 
> Solution
> Use outgoing interface IPv6 address (match against the destination).
> i.e. use ip6_route_output() to look up the route cache and
> then use ipv6_dev_get_saddr(...) to set the source address of the
> encapsulated packet.
> 
> Additionally, cache the results in new destination
> fields: dst_cookie and dst_saddr and properly check the
> returned dst from ip6_route_output. We now add xfrm_lookup
> call only for the tunneling method where the source address
> is a local one.
> 
> Original patch by Hans Schillstrom.
> Check dst state and cache results for IPv6 by Julian Anastasov.
> 
> Tested-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
> Signed-off-by: Julian Anastasov <ja@ssi.bg>
> Signed-off-by: Simon Horman <horms@verge.net.au>
> 
> ---
> 
> * v1
> 
>   This is Julian's patch with a slightly edited version of the description
>   from Hans's original patch.
> 
> * v2
> 
>   Updated changelog as per comments from Julian
> 
> Is everyone ok with pushing this?
> 
> diff -urp net-next-2.6-e548833-nfct_snat_reroute/linux/include/net/ip_vs.h linux/include/net/ip_vs.h
> --- net-next-2.6-e548833-nfct_snat_reroute/linux/include/net/ip_vs.h    2010-09-16 09:03:48.000000000 +0300
> +++ linux/include/net/ip_vs.h   2010-09-22 10:50:18.548963467 +0300
> @@ -509,6 +509,10 @@ struct ip_vs_dest {
>         spinlock_t              dst_lock;       /* lock of dst_cache */
>         struct dst_entry        *dst_cache;     /* destination cache entry */
>         u32                     dst_rtos;       /* RT_TOS(tos) for dst */
> +       u32                     dst_cookie;
> +#ifdef CONFIG_IP_VS_IPV6
> +       struct in6_addr         dst_saddr;
> +#endif
> 
>         /* for virtual service */
>         struct ip_vs_service    *svc;           /* service it belongs to */
> diff -urp net-next-2.6-e548833-nfct_snat_reroute/linux/net/netfilter/ipvs/ip_vs_xmit.c linux/net/netfilter/ipvs/ip_vs_xmit.c
> --- net-next-2.6-e548833-nfct_snat_reroute/linux/net/netfilter/ipvs/ip_vs_xmit.c        2010-09-16 09:02:25.000000000 +0300
> +++ linux/net/netfilter/ipvs/ip_vs_xmit.c       2010-09-22 16:29:43.271964521 +0300
> @@ -26,6 +26,7 @@
>  #include <net/route.h>                  /* for ip_route_output */
>  #include <net/ipv6.h>
>  #include <net/ip6_route.h>
> +#include <net/addrconf.h>
>  #include <linux/icmpv6.h>
>  #include <linux/netfilter.h>
>  #include <linux/netfilter_ipv4.h>
> @@ -37,26 +38,27 @@
>   *      Destination cache to speed up outgoing route lookup
>   */
>  static inline void
> -__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst)
> +__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst,
> +               u32 dst_cookie)
>  {
>         struct dst_entry *old_dst;
> 
>         old_dst = dest->dst_cache;
>         dest->dst_cache = dst;
>         dest->dst_rtos = rtos;
> +       dest->dst_cookie = dst_cookie;
>         dst_release(old_dst);
>  }
> 
>  static inline struct dst_entry *
> -__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
> +__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos)
>  {
>         struct dst_entry *dst = dest->dst_cache;
> 
>         if (!dst)
>                 return NULL;
> -       if ((dst->obsolete
> -            || (dest->af == AF_INET && rtos != dest->dst_rtos)) &&
> -           dst->ops->check(dst, cookie) == NULL) {
> +       if ((dst->obsolete || rtos != dest->dst_rtos) &&
> +           dst->ops->check(dst, dest->dst_cookie) == NULL) {
>                 dest->dst_cache = NULL;
>                 dst_release(dst);
>                 return NULL;
> @@ -66,15 +68,16 @@ __ip_vs_dst_check(struct ip_vs_dest *des
>  }
> 
>  static struct rtable *
> -__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
> +__ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_conn *cp, u32 rtos)
>  {
> +       struct net *net = dev_net(skb->dev);
>         struct rtable *rt;                      /* Route to the other host */
>         struct ip_vs_dest *dest = cp->dest;
> 
>         if (dest) {
>                 spin_lock(&dest->dst_lock);
>                 if (!(rt = (struct rtable *)
> -                     __ip_vs_dst_check(dest, rtos, 0))) {
> +                     __ip_vs_dst_check(dest, rtos))) {
>                         struct flowi fl = {
>                                 .oif = 0,
>                                 .nl_u = {
> @@ -84,13 +87,13 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp
>                                                 .tos = rtos, } },
>                         };
> 
> -                       if (ip_route_output_key(&init_net, &rt, &fl)) {
> +                       if (ip_route_output_key(net, &rt, &fl)) {
>                                 spin_unlock(&dest->dst_lock);
>                                 IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
>                                              &dest->addr.ip);
>                                 return NULL;
>                         }
> -                       __ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst));
> +                       __ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst), 0);
>                         IP_VS_DBG(10, "new dst %pI4, refcnt=%d, rtos=%X\n",
>                                   &dest->addr.ip,
>                                   atomic_read(&rt->dst.__refcnt), rtos);
> @@ -106,7 +109,7 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp
>                                         .tos = rtos, } },
>                 };
> 
> -               if (ip_route_output_key(&init_net, &rt, &fl)) {
> +               if (ip_route_output_key(net, &rt, &fl)) {
>                         IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
>                                      &cp->daddr.ip);
>                         return NULL;
> @@ -117,62 +120,79 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp
>  }
> 
>  #ifdef CONFIG_IP_VS_IPV6
> +
> +static struct dst_entry *
> +__ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr,
> +                       struct in6_addr *ret_saddr, int do_xfrm)
> +{
> +       struct dst_entry *dst;
> +       struct flowi fl = {
> +               .oif = 0,
> +               .nl_u = {
> +                       .ip6_u = {
> +                               .daddr = *daddr,
> +                       },
> +               },
> +       };
> +
> +       dst = ip6_route_output(net, NULL, &fl);
> +       if (dst->error)
> +               goto out_err;
> +       if (!ret_saddr)
> +               return dst;
> +       if (ipv6_addr_any(&fl.fl6_src) &&
> +           ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev,
> +                              &fl.fl6_dst, 0, &fl.fl6_src) < 0)
> +               goto out_err;
> +       if (do_xfrm && xfrm_lookup(net, &dst, &fl, NULL, 0) < 0)
> +               goto out_err;
> +       ipv6_addr_copy(ret_saddr, &fl.fl6_src);
> +       return dst;
> +
> +out_err:
> +       dst_release(dst);
> +       IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", daddr);
> +       return NULL;
> +}
> +
>  static struct rt6_info *
> -__ip_vs_get_out_rt_v6(struct ip_vs_conn *cp)
> +__ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
> +                     struct in6_addr *ret_saddr, int do_xfrm)
>  {
> +       struct net *net = dev_net(skb->dev);
>         struct rt6_info *rt;                    /* Route to the other host */
>         struct ip_vs_dest *dest = cp->dest;
> +       struct dst_entry *dst;
> 
>         if (dest) {
>                 spin_lock(&dest->dst_lock);
> -               rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0, 0);
> +               rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0);
>                 if (!rt) {
> -                       struct flowi fl = {
> -                               .oif = 0,
> -                               .nl_u = {
> -                                       .ip6_u = {
> -                                               .daddr = dest->addr.in6,
> -                                               .saddr = {
> -                                                       .s6_addr32 =
> -                                                               { 0, 0, 0, 0 },
> -                                               },
> -                                       },
> -                               },
> -                       };
> +                       u32 cookie;
> 
> -                       rt = (struct rt6_info *)ip6_route_output(&init_net,
> -                                                                NULL, &fl);
> -                       if (!rt) {
> +                       dst = __ip_vs_route_output_v6(net, &dest->addr.in6,
> +                                                     &dest->dst_saddr,
> +                                                     do_xfrm);
> +                       if (!dst) {
>                                 spin_unlock(&dest->dst_lock);
> -                               IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n",
> -                                            &dest->addr.in6);
>                                 return NULL;
>                         }
> -                       __ip_vs_dst_set(dest, 0, dst_clone(&rt->dst));
> -                       IP_VS_DBG(10, "new dst %pI6, refcnt=%d\n",
> -                                 &dest->addr.in6,
> +                       rt = (struct rt6_info *) dst;
> +                       cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
> +                       __ip_vs_dst_set(dest, 0, dst_clone(&rt->dst), cookie);
> +                       IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
> +                                 &dest->addr.in6, &dest->dst_saddr,
>                                   atomic_read(&rt->dst.__refcnt));
>                 }
> +               if (ret_saddr)
> +                       ipv6_addr_copy(ret_saddr, &dest->dst_saddr);
>                 spin_unlock(&dest->dst_lock);
>         } else {
> -               struct flowi fl = {
> -                       .oif = 0,
> -                       .nl_u = {
> -                               .ip6_u = {
> -                                       .daddr = cp->daddr.in6,
> -                                       .saddr = {
> -                                               .s6_addr32 = { 0, 0, 0, 0 },
> -                                       },
> -                               },
> -                       },
> -               };
> -
> -               rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
> -               if (!rt) {
> -                       IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n",
> -                                    &cp->daddr.in6);
> +               dst = __ip_vs_route_output_v6(net, &cp->daddr.in6, ret_saddr,
> +                                             do_xfrm);
> +               if (!dst)
>                         return NULL;
> -               }
> +               rt = (struct rt6_info *) dst;
>         }
> 
>         return rt;
> @@ -248,6 +268,7 @@ int
>  ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
>                   struct ip_vs_protocol *pp)
>  {
> +       struct net *net = dev_net(skb->dev);
>         struct rtable *rt;                      /* Route to the other host */
>         struct iphdr  *iph = ip_hdr(skb);
>         u8     tos = iph->tos;
> @@ -263,7 +284,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, s
> 
>         EnterFunction(10);
> 
> -       if (ip_route_output_key(&init_net, &rt, &fl)) {
> +       if (ip_route_output_key(net, &rt, &fl)) {
>                 IP_VS_DBG_RL("%s(): ip_route_output error, dest: %pI4\n",
>                              __func__, &iph->daddr);
>                 goto tx_error_icmp;
> @@ -313,25 +334,18 @@ int
>  ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
>                      struct ip_vs_protocol *pp)
>  {
> +       struct net *net = dev_net(skb->dev);
> +       struct dst_entry *dst;
>         struct rt6_info *rt;                    /* Route to the other host */
>         struct ipv6hdr  *iph = ipv6_hdr(skb);
>         int    mtu;
> -       struct flowi fl = {
> -               .oif = 0,
> -               .nl_u = {
> -                       .ip6_u = {
> -                               .daddr = iph->daddr,
> -                               .saddr = { .s6_addr32 = {0, 0, 0, 0} }, } },
> -       };
> 
>         EnterFunction(10);
> 
> -       rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
> -       if (!rt) {
> -               IP_VS_DBG_RL("%s(): ip6_route_output error, dest: %pI6\n",
> -                            __func__, &iph->daddr);
> +       dst = __ip_vs_route_output_v6(net, &iph->daddr, NULL, 0);
> +       if (!dst)
>                 goto tx_error_icmp;
> -       }
> +       rt = (struct rt6_info *) dst;
> 
>         /* MTU checking */
>         mtu = dst_mtu(&rt->dst);
> @@ -397,7 +411,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, stru
>                 IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
>         }
> 
> -       if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
> +       if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(iph->tos))))
>                 goto tx_error_icmp;
> 
>         /* MTU checking */
> @@ -472,7 +486,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, s
>                 IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
>         }
> 
> -       rt = __ip_vs_get_out_rt_v6(cp);
> +       rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0);
>         if (!rt)
>                 goto tx_error_icmp;
> 
> @@ -557,7 +571,6 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, s
>         struct iphdr  *old_iph = ip_hdr(skb);
>         u8     tos = old_iph->tos;
>         __be16 df = old_iph->frag_off;
> -       sk_buff_data_t old_transport_header = skb->transport_header;
>         struct iphdr  *iph;                     /* Our new IP header */
>         unsigned int max_headroom;              /* The extra header space needed */
>         int    mtu;
> @@ -572,7 +585,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, s
>                 goto tx_error;
>         }
> 
> -       if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos))))
> +       if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(tos))))
>                 goto tx_error_icmp;
> 
>         tdev = rt->dst.dev;
> @@ -616,7 +629,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, s
>                 old_iph = ip_hdr(skb);
>         }
> 
> -       skb->transport_header = old_transport_header;
> +       skb->transport_header = skb->network_header;
> 
>         /* fix old IP header checksum */
>         ip_send_check(old_iph);
> @@ -670,9 +683,9 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb
>                      struct ip_vs_protocol *pp)
>  {
>         struct rt6_info *rt;            /* Route to the other host */
> +       struct in6_addr saddr;          /* Source for tunnel */
>         struct net_device *tdev;        /* Device to other host */
>         struct ipv6hdr  *old_iph = ipv6_hdr(skb);
> -       sk_buff_data_t old_transport_header = skb->transport_header;
>         struct ipv6hdr  *iph;           /* Our new IP header */
>         unsigned int max_headroom;      /* The extra header space needed */
>         int    mtu;
> @@ -687,17 +700,17 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb
>                 goto tx_error;
>         }
> 
> -       rt = __ip_vs_get_out_rt_v6(cp);
> +       rt = __ip_vs_get_out_rt_v6(skb, cp, &saddr, 1);
>         if (!rt)
>                 goto tx_error_icmp;
> 
>         tdev = rt->dst.dev;
> 
>         mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
> -       /* TODO IPv6: do we need this check in IPv6? */
> -       if (mtu < 1280) {
> +       if (mtu < IPV6_MIN_MTU) {
>                 dst_release(&rt->dst);
> -               IP_VS_DBG_RL("%s(): mtu less than 1280\n", __func__);
> +               IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
> +                            IPV6_MIN_MTU);
>                 goto tx_error;
>         }
>         if (skb_dst(skb))
> @@ -730,7 +743,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb
>                 old_iph = ipv6_hdr(skb);
>         }
> 
> -       skb->transport_header = old_transport_header;
> +       skb->transport_header = skb->network_header;
> 
>         skb_push(skb, sizeof(struct ipv6hdr));
>         skb_reset_network_header(skb);
> @@ -750,8 +763,8 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb
>         be16_add_cpu(&iph->payload_len, sizeof(*old_iph));
>         iph->priority           =       old_iph->priority;
>         memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl));
> -       iph->daddr              =       rt->rt6i_dst.addr;
> -       iph->saddr              =       cp->vaddr.in6; /* rt->rt6i_src.addr; */
> +       ipv6_addr_copy(&iph->daddr, &rt->rt6i_dst.addr);
> +       ipv6_addr_copy(&iph->saddr, &saddr);
>         iph->hop_limit          =       old_iph->hop_limit;
> 
>         /* Another hack: avoid icmp_send in ip_fragment */
> @@ -791,7 +804,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struc
> 
>         EnterFunction(10);
> 
> -       if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
> +       if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(iph->tos))))
>                 goto tx_error_icmp;
> 
>         /* MTU checking */
> @@ -843,7 +856,7 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, st
> 
>         EnterFunction(10);
> 
> -       rt = __ip_vs_get_out_rt_v6(cp);
> +       rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0);
>         if (!rt)
>                 goto tx_error_icmp;
> 
> @@ -919,7 +932,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, str
>          * mangle and send the packet here (only for VS/NAT)
>          */
> 
> -       if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(ip_hdr(skb)->tos))))
> +       if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(ip_hdr(skb)->tos))))
>                 goto tx_error_icmp;
> 
>         /* MTU checking */
> @@ -993,7 +1006,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb,
>          * mangle and send the packet here (only for VS/NAT)
>          */
> 
> -       rt = __ip_vs_get_out_rt_v6(cp);
> +       rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0);
>         if (!rt)
>                 goto tx_error_icmp;
> 
Looks good, go ahead
--
Regards
Hans Schillstrom <hans.schillstrom@ericsson.com>


^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2010-10-05  7:11 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-09-27 13:59 [patch v2] ipvs: IPv6 tunnel mode Simon Horman
2010-09-27 13:59 ` Simon Horman
2010-09-30  1:22 ` Simon Horman
2010-10-04 19:06   ` Patrick McHardy
2010-10-05  5:52 ` [patch v3] " Hans Schillstrom
2010-10-05  7:11   ` Julian Anastasov
  -- strict thread matches above, loose matches on Subject: below --
2010-09-26 13:33 [patch v2] " Simon Horman
2010-09-26 14:21 ` Julian Anastasov
2010-09-29 13:30   ` Hans Schillstrom
2010-09-30 22:55     ` Julian Anastasov
2010-10-01  6:08       ` Hans Schillstrom
2010-10-01  7:48         ` Julian Anastasov
2010-09-27  6:00 ` Hans Schillstrom

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.