* [PATCH net-next 0/3] ipv4: Hash-based multipath routing
From: Peter Nørlund @ 2015-06-17 20:08 UTC
To: netdev@vger.kernel.org
Cc: David S. Miller, Alexey Kuznetsov, James Morris,
Hideaki YOSHIFUJI, Patrick McHardy,
linux-api@vger.kernel.org
This patch series enhances the IPv4 multipath code, adding support for
hash-based multipath.
The multipath algorithm is selected with a per-route attribute (RTA_MP_ALGO).
The attribute values retain some degree of binary compatibility with the old
implementation (2.6.12 - 2.6.22), but not source-level compatibility, since the
constants have different names (the numeric mapping is sketched after the list):
RT_MP_ALG_L3_HASH:
L3 hash-based distribution. This replaces IP_MP_ALG_NONE, which, combined with
the route cache, behaved somewhat like L3-based distribution. This is now the
default.
RT_MP_ALG_PER_PACKET:
Per-packet distribution. Was IP_MP_ALG_RR. Uses round-robin.
RT_MP_ALG_DRR, RT_MP_ALG_RANDOM, RT_MP_ALG_WRANDOM:
Unsupported values, but reserved because they existed in 2.6.12 - 2.6.22.
RT_MP_ALG_L4_HASH:
L4 hash-based distribution. This is new.
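For reference, the new constants occupy the same numeric slots as the old
IP_MP_ALG_* values, which is where the binary compatibility comes from. Written
with explicit values (they simply follow from the declaration order of the enum
added in patch 2/3), the mapping is:

enum rt_mp_alg_t {
	RT_MP_ALG_L3_HASH    = 0,	/* was IP_MP_ALG_NONE; the new default */
	RT_MP_ALG_PER_PACKET = 1,	/* was IP_MP_ALG_RR; round-robin */
	RT_MP_ALG_DRR        = 2,	/* reserved, not implemented */
	RT_MP_ALG_RANDOM     = 3,	/* reserved, not implemented */
	RT_MP_ALG_WRANDOM    = 4,	/* reserved, not implemented */
	RT_MP_ALG_L4_HASH    = 5,	/* new */
	__RT_MP_ALG_MAX
};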
The traditional modulo-N approach is replaced by the threshold-based approach
described in RFC 2992. This reduces flow disruption in case of link failures
or route changes.
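As a rough illustration of the threshold scheme (user-space sketch, not the
kernel code; the weights and helper names are made up): each next hop gets an
upper bound proportional to its cumulative weight, scaled to the 0..256 hash
range, and a packet selects the first next hop whose bound its hash value
falls below.

#include <stdio.h>

#define HASH_RANGE 256	/* hash values are folded to 0..255, as in the series */

struct nh {
	int weight;
	int upper_bound;	/* exclusive threshold within 0..HASH_RANGE */
};

/* Recompute the thresholds from the live weights - cf. fib_rebalance(). */
static void rebalance(struct nh *nhs, int n)
{
	int total = 0, w = 0, i;

	for (i = 0; i < n; i++)
		total += nhs[i].weight;

	for (i = 0; i < n; i++) {
		w += nhs[i].weight;
		/* DIV_ROUND_CLOSEST(HASH_RANGE * w, total) in kernel terms */
		nhs[i].upper_bound = (HASH_RANGE * w + total / 2) / total;
	}
}

/* Pick the first next hop whose threshold lies above the hash value. */
static int select_path(const struct nh *nhs, int n, unsigned int hash)
{
	int i;

	for (i = 0; i < n; i++)
		if ((int)(hash % HASH_RANGE) < nhs[i].upper_bound)
			return i;
	return 0;
}

int main(void)
{
	struct nh nhs[] = { { .weight = 1 }, { .weight = 2 }, { .weight = 1 } };

	rebalance(nhs, 3);
	/* upper bounds become 64, 192, 256: a 1:2:1 split of the hash space */
	printf("bounds: %d %d %d\n",
	       nhs[0].upper_bound, nhs[1].upper_bound, nhs[2].upper_bound);
	printf("hash 10 -> path %d, hash 100 -> path %d, hash 200 -> path %d\n",
	       select_path(nhs, 3, 10), select_path(nhs, 3, 100),
	       select_path(nhs, 3, 200));
	return 0;
}

When a path is added or removed, only the flows whose hash lands near the
shifted boundaries move to another next hop, instead of roughly (N-1)/N of all
flows as with modulo-N.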
To better support anycast environments, where PMTU discovery usually breaks
with multipath, certain ICMP packets are hashed on the embedded header in
their payload, ensuring that they are routed along the same path as the flow
they belong to.
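A rough sketch of the idea (user-space code, not the actual patch 3/3, which
is not quoted here; the structure and helper names are purely illustrative):

#include <stddef.h>
#include <stdint.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>

struct flow_key {
	uint32_t saddr, daddr;
	uint8_t proto;
};

/* For ICMP errors, the payload starts with the IP header of the packet that
 * triggered the error.  Hashing on that embedded header, instead of the outer
 * one, makes e.g. "fragmentation needed" follow the same multipath choice as
 * the flow it refers to.
 */
static int icmp_error_flow_key(const uint8_t *pkt, size_t len,
			       struct flow_key *key)
{
	const struct iphdr *outer = (const struct iphdr *)pkt;
	size_t off = outer->ihl * 4;
	const struct icmphdr *icmp;
	const struct iphdr *inner;

	if (outer->protocol != IPPROTO_ICMP || len < off + sizeof(*icmp))
		return -1;
	icmp = (const struct icmphdr *)(pkt + off);

	/* Only error messages carry the offending packet's header. */
	if (icmp->type != ICMP_DEST_UNREACH && icmp->type != ICMP_TIME_EXCEEDED &&
	    icmp->type != ICMP_PARAMETERPROB && icmp->type != ICMP_SOURCE_QUENCH &&
	    icmp->type != ICMP_REDIRECT)
		return -1;

	off += sizeof(*icmp);
	if (len < off + sizeof(*inner))
		return -1;
	inner = (const struct iphdr *)(pkt + off);

	key->saddr = inner->saddr;
	key->daddr = inner->daddr;
	key->proto = inner->protocol;
	return 0;
}

Note that the XOR-based hash from patch 2/3 is symmetric in source and
destination, so it does not matter that the embedded packet travels in the
opposite direction of the original flow.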
As a side effect, the multipath spinlock was removed and the code got faster.
I measured ip_mkroute_input (excl. __mkroute_input) on a Xeon X3350 (2.66GHz)
with two paths and L3 hashing:
1 thread:
Before: ~199.8 cycles(tsc)
After: ~75.2 cycles(tsc)
4 threads:
Before: ~393.9 cycles(tsc)
After: ~77.8 cycles(tsc)
If this patch series is accepted, a follow-up patch to iproute2 will be
submitted as well.
Best regards,
Peter Nørlund
Peter Nørlund (3):
ipv4: Lock-less per-packet multipath
ipv4: L3 and L4 hash-based multipath routing
ipv4: ICMP packet inspection for multipath
include/net/ip_fib.h | 10 ++-
include/net/route.h | 5 +
include/uapi/linux/rtnetlink.h | 14 ++++
net/ipv4/Kconfig | 1 +
net/ipv4/fib_frontend.c | 4 +
net/ipv4/fib_semantics.c | 146 +++++++++++++++++++++++++---------------
net/ipv4/icmp.c | 29 +++++++-
net/ipv4/route.c | 108 +++++++++++++++++++++++++++---
net/ipv4/xfrm4_policy.c | 2 -
9 files changed, 246 insertions(+), 73 deletions(-)
* [PATCH net-next 1/3] ipv4: Lock-less per-packet multipath [not found] ` <1434571686-5149-1-git-send-email-pch-chEQUL3jiZBWk0Htik3J/w@public.gmane.org> @ 2015-06-17 20:08 ` Peter Nørlund [not found] ` <1434571686-5149-2-git-send-email-pch-chEQUL3jiZBWk0Htik3J/w@public.gmane.org> 0 siblings, 1 reply; 8+ messages in thread From: Peter Nørlund @ 2015-06-17 20:08 UTC (permalink / raw) To: netdev-u79uwXL29TY76Z2rM5mHXA Cc: David S. Miller, Alexey Kuznetsov, James Morris, Hideaki YOSHIFUJI, Patrick McHardy, linux-api-u79uwXL29TY76Z2rM5mHXA, Peter Nørlund The current multipath attempted to be quasi random, but in most cases it behaved just like a round robin balancing. This patch refactors the algorithm to be exactly that and in doing so, avoids the spin lock. The new design paves the way for hash-based multipath, replacing the modulo with thresholds, minimizing disruption in case of failing paths or route replacements. Signed-off-by: Peter Nørlund <pch-chEQUL3jiZBWk0Htik3J/w@public.gmane.org> --- include/net/ip_fib.h | 6 +-- net/ipv4/Kconfig | 1 + net/ipv4/fib_semantics.c | 116 ++++++++++++++++++++++++++--------------------- 3 files changed, 68 insertions(+), 55 deletions(-) diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 54271ed..4be4f25 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -76,8 +76,8 @@ struct fib_nh { unsigned int nh_flags; unsigned char nh_scope; #ifdef CONFIG_IP_ROUTE_MULTIPATH - int nh_weight; - int nh_power; + int nh_mp_weight; + atomic_t nh_mp_upper_bound; #endif #ifdef CONFIG_IP_ROUTE_CLASSID __u32 nh_tclassid; @@ -115,7 +115,7 @@ struct fib_info { #define fib_advmss fib_metrics[RTAX_ADVMSS-1] int fib_nhs; #ifdef CONFIG_IP_ROUTE_MULTIPATH - int fib_power; + int fib_mp_weight; #endif struct rcu_head rcu; struct fib_nh fib_nh[0]; diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index d83071d..cb91f67 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -81,6 +81,7 @@ config IP_MULTIPLE_TABLES config IP_ROUTE_MULTIPATH bool "IP: equal cost multipath" depends on IP_ADVANCED_ROUTER + select BITREVERSE help Normally, the routing tables specify a single action to be taken in a deterministic manner for a given packet. If you say Y here diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 28ec3c1..8c8df80 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -15,6 +15,7 @@ #include <asm/uaccess.h> #include <linux/bitops.h> +#include <linux/bitrev.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/jiffies.h> @@ -57,7 +58,7 @@ static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE]; #ifdef CONFIG_IP_ROUTE_MULTIPATH -static DEFINE_SPINLOCK(fib_multipath_lock); +static DEFINE_PER_CPU(u8, fib_mp_rr_counter); #define for_nexthops(fi) { \ int nhsel; const struct fib_nh *nh; \ @@ -261,7 +262,7 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi) nh->nh_gw != onh->nh_gw || nh->nh_scope != onh->nh_scope || #ifdef CONFIG_IP_ROUTE_MULTIPATH - nh->nh_weight != onh->nh_weight || + nh->nh_mp_weight != onh->nh_mp_weight || #endif #ifdef CONFIG_IP_ROUTE_CLASSID nh->nh_tclassid != onh->nh_tclassid || @@ -449,6 +450,43 @@ static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining) return remaining > 0 ? 
0 : nhs; } +static void fib_rebalance(struct fib_info *fi) +{ + int factor; + int total; + int w; + + if (fi->fib_nhs < 2) + return; + + total = 0; + for_nexthops(fi) { + if (!(nh->nh_flags & RTNH_F_DEAD)) + total += nh->nh_mp_weight; + } endfor_nexthops(fi); + + if (likely(total != 0)) { + factor = DIV_ROUND_UP(total, 8388608); + total /= factor; + } else { + factor = 1; + } + + w = 0; + change_nexthops(fi) { + int upper_bound; + + if (nexthop_nh->nh_flags & RTNH_F_DEAD) { + upper_bound = -1; + } else { + w += nexthop_nh->nh_mp_weight / factor; + upper_bound = DIV_ROUND_CLOSEST(256 * w, total); + } + + atomic_set(&nexthop_nh->nh_mp_upper_bound, upper_bound); + } endfor_nexthops(fi); +} + static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, int remaining, struct fib_config *cfg) { @@ -461,7 +499,7 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, nexthop_nh->nh_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags; nexthop_nh->nh_oif = rtnh->rtnh_ifindex; - nexthop_nh->nh_weight = rtnh->rtnh_hops + 1; + nexthop_nh->nh_mp_weight = rtnh->rtnh_hops + 1; attrlen = rtnh_attrlen(rtnh); if (attrlen > 0) { @@ -884,7 +922,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) fi->fib_net->ipv4.fib_num_tclassid_users++; #endif #ifdef CONFIG_IP_ROUTE_MULTIPATH - nh->nh_weight = 1; + nh->nh_mp_weight = 1; #endif } @@ -936,8 +974,15 @@ struct fib_info *fib_create_info(struct fib_config *cfg) change_nexthops(fi) { fib_info_update_nh_saddr(net, nexthop_nh); +#ifdef CONFIG_IP_ROUTE_MULTIPATH + fi->fib_mp_weight += nexthop_nh->nh_mp_weight; +#endif } endfor_nexthops(fi) +#ifdef CONFIG_IP_ROUTE_MULTIPATH + fib_rebalance(fi); +#endif + link_it: ofi = fib_find_info(fi); if (ofi) { @@ -1050,7 +1095,7 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, goto nla_put_failure; rtnh->rtnh_flags = nh->nh_flags & 0xFF; - rtnh->rtnh_hops = nh->nh_weight - 1; + rtnh->rtnh_hops = nh->nh_mp_weight - 1; rtnh->rtnh_ifindex = nh->nh_oif; if (nh->nh_gw && @@ -1130,12 +1175,6 @@ int fib_sync_down_dev(struct net_device *dev, int force) else if (nexthop_nh->nh_dev == dev && nexthop_nh->nh_scope != scope) { nexthop_nh->nh_flags |= RTNH_F_DEAD; -#ifdef CONFIG_IP_ROUTE_MULTIPATH - spin_lock_bh(&fib_multipath_lock); - fi->fib_power -= nexthop_nh->nh_power; - nexthop_nh->nh_power = 0; - spin_unlock_bh(&fib_multipath_lock); -#endif dead++; } #ifdef CONFIG_IP_ROUTE_MULTIPATH @@ -1149,6 +1188,10 @@ int fib_sync_down_dev(struct net_device *dev, int force) fi->fib_flags |= RTNH_F_DEAD; ret++; } + +#ifdef CONFIG_IP_ROUTE_MULTIPATH + fib_rebalance(fi); +#endif } return ret; @@ -1254,16 +1297,15 @@ int fib_sync_up(struct net_device *dev) !__in_dev_get_rtnl(dev)) continue; alive++; - spin_lock_bh(&fib_multipath_lock); - nexthop_nh->nh_power = 0; nexthop_nh->nh_flags &= ~RTNH_F_DEAD; - spin_unlock_bh(&fib_multipath_lock); } endfor_nexthops(fi) if (alive > 0) { fi->fib_flags &= ~RTNH_F_DEAD; ret++; } + + fib_rebalance(fi); } return ret; @@ -1276,49 +1318,19 @@ int fib_sync_up(struct net_device *dev) void fib_select_multipath(struct fib_result *res) { struct fib_info *fi = res->fi; - int w; - - spin_lock_bh(&fib_multipath_lock); - if (fi->fib_power <= 0) { - int power = 0; - change_nexthops(fi) { - if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) { - power += nexthop_nh->nh_weight; - nexthop_nh->nh_power = nexthop_nh->nh_weight; - } - } endfor_nexthops(fi); - fi->fib_power = power; - if (power <= 0) { - spin_unlock_bh(&fib_multipath_lock); - /* Race condition: route has just become dead. 
*/ - res->nh_sel = 0; - return; - } - } + u8 w; + w = bitrev8(this_cpu_inc_return(fib_mp_rr_counter)); - /* w should be random number [0..fi->fib_power-1], - * it is pretty bad approximation. - */ - - w = jiffies % fi->fib_power; + for_nexthops(fi) { + if (w >= atomic_read(&nh->nh_mp_upper_bound)) + continue; - change_nexthops(fi) { - if (!(nexthop_nh->nh_flags & RTNH_F_DEAD) && - nexthop_nh->nh_power) { - w -= nexthop_nh->nh_power; - if (w <= 0) { - nexthop_nh->nh_power--; - fi->fib_power--; - res->nh_sel = nhsel; - spin_unlock_bh(&fib_multipath_lock); - return; - } - } + res->nh_sel = nhsel; + return; } endfor_nexthops(fi); - /* Race condition: route has just become dead. */ + /* Race condition: route has just become dead */ res->nh_sel = 0; - spin_unlock_bh(&fib_multipath_lock); } #endif -- 2.1.4 ^ permalink raw reply related [flat|nested] 8+ messages in thread
* Re: [PATCH net-next 1/3] ipv4: Lock-less per-packet multipath [not found] ` <1434571686-5149-2-git-send-email-pch-chEQUL3jiZBWk0Htik3J/w@public.gmane.org> @ 2015-06-18 19:42 ` Alexander Duyck [not found] ` <55831F0D.3040703-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> 0 siblings, 1 reply; 8+ messages in thread From: Alexander Duyck @ 2015-06-18 19:42 UTC (permalink / raw) To: Peter Nørlund, netdev-u79uwXL29TY76Z2rM5mHXA Cc: David S. Miller, Alexey Kuznetsov, James Morris, Hideaki YOSHIFUJI, Patrick McHardy, linux-api-u79uwXL29TY76Z2rM5mHXA On 06/17/2015 01:08 PM, Peter Nørlund wrote: > The current multipath attempted to be quasi random, but in most cases it > behaved just like a round robin balancing. This patch refactors the > algorithm to be exactly that and in doing so, avoids the spin lock. > > The new design paves the way for hash-based multipath, replacing the > modulo with thresholds, minimizing disruption in case of failing paths or > route replacements. > > Signed-off-by: Peter Nørlund <pch-chEQUL3jiZBWk0Htik3J/w@public.gmane.org> > --- > include/net/ip_fib.h | 6 +-- > net/ipv4/Kconfig | 1 + > net/ipv4/fib_semantics.c | 116 ++++++++++++++++++++++++++--------------------- > 3 files changed, 68 insertions(+), 55 deletions(-) > > diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h > index 54271ed..4be4f25 100644 > --- a/include/net/ip_fib.h > +++ b/include/net/ip_fib.h > @@ -76,8 +76,8 @@ struct fib_nh { > unsigned int nh_flags; > unsigned char nh_scope; > #ifdef CONFIG_IP_ROUTE_MULTIPATH > - int nh_weight; > - int nh_power; > + int nh_mp_weight; > + atomic_t nh_mp_upper_bound; > #endif > #ifdef CONFIG_IP_ROUTE_CLASSID > __u32 nh_tclassid; > @@ -115,7 +115,7 @@ struct fib_info { > #define fib_advmss fib_metrics[RTAX_ADVMSS-1] > int fib_nhs; > #ifdef CONFIG_IP_ROUTE_MULTIPATH > - int fib_power; > + int fib_mp_weight; > #endif > struct rcu_head rcu; > struct fib_nh fib_nh[0]; I could do without some of this renaming. For example you could probably not bother with adding the _mp piece to the name. That way we don't have to track all the nh_weight -> nh_mp_weight changes. Also you could probably just use the name fib_weight since not including the _mp was already the convention for the multipath portions of the structure anyway. This isn't really improving readability at all so I would say don't bother renaming it. > diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig > index d83071d..cb91f67 100644 > --- a/net/ipv4/Kconfig > +++ b/net/ipv4/Kconfig > @@ -81,6 +81,7 @@ config IP_MULTIPLE_TABLES > config IP_ROUTE_MULTIPATH > bool "IP: equal cost multipath" > depends on IP_ADVANCED_ROUTER > + select BITREVERSE > help > Normally, the routing tables specify a single action to be taken in > a deterministic manner for a given packet. 
If you say Y here > diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c > index 28ec3c1..8c8df80 100644 > --- a/net/ipv4/fib_semantics.c > +++ b/net/ipv4/fib_semantics.c > @@ -15,6 +15,7 @@ > > #include <asm/uaccess.h> > #include <linux/bitops.h> > +#include <linux/bitrev.h> > #include <linux/types.h> > #include <linux/kernel.h> > #include <linux/jiffies.h> > @@ -57,7 +58,7 @@ static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE]; > > #ifdef CONFIG_IP_ROUTE_MULTIPATH > > -static DEFINE_SPINLOCK(fib_multipath_lock); > +static DEFINE_PER_CPU(u8, fib_mp_rr_counter); > > #define for_nexthops(fi) { \ > int nhsel; const struct fib_nh *nh; \ > @@ -261,7 +262,7 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi) > nh->nh_gw != onh->nh_gw || > nh->nh_scope != onh->nh_scope || > #ifdef CONFIG_IP_ROUTE_MULTIPATH > - nh->nh_weight != onh->nh_weight || > + nh->nh_mp_weight != onh->nh_mp_weight || > #endif > #ifdef CONFIG_IP_ROUTE_CLASSID > nh->nh_tclassid != onh->nh_tclassid || > @@ -449,6 +450,43 @@ static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining) > return remaining > 0 ? 0 : nhs; > } > This is a good example. If we don't do the rename we don't have to review changes like the one above which just add extra overhead to the patch. > +static void fib_rebalance(struct fib_info *fi) > +{ > + int factor; > + int total; > + int w; > + > + if (fi->fib_nhs < 2) > + return; > + > + total = 0; > + for_nexthops(fi) { > + if (!(nh->nh_flags & RTNH_F_DEAD)) > + total += nh->nh_mp_weight; > + } endfor_nexthops(fi); > + > + if (likely(total != 0)) { > + factor = DIV_ROUND_UP(total, 8388608); > + total /= factor; > + } else { > + factor = 1; > + } > + So where does the 8388608 value come from? Is it just here to help restrict the upper_bound to a u8 value? > + w = 0; > + change_nexthops(fi) { > + int upper_bound; > + > + if (nexthop_nh->nh_flags & RTNH_F_DEAD) { > + upper_bound = -1; > + } else { > + w += nexthop_nh->nh_mp_weight / factor; > + upper_bound = DIV_ROUND_CLOSEST(256 * w, total); > + } This is doing some confusing stuff. I assume the whole point is to get the value to convert the upper_bound into a u8 value based on the weight where you end with 256 as the last value, do I have that right? I'm not really sure that is the best way to go from a code standpoint since fib_nhs is an int which technically means we could support more than 256 next hops for a given route. Also I assume that it is safe to divide by total because the assumption is that if one or more routes are not dead then total has a non-zero value, do I have that right? If so what is the point of setting the factor to 1 above, unless you are just hacking around a divide by 0 error. Maybe what you should do above is simply return if total == 0 since it doesn't look like you can safely update anything otherwise. 
> + > + atomic_set(&nexthop_nh->nh_mp_upper_bound, upper_bound); > + } endfor_nexthops(fi); > +} > + > static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, > int remaining, struct fib_config *cfg) > { > @@ -461,7 +499,7 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, > nexthop_nh->nh_flags = > (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags; > nexthop_nh->nh_oif = rtnh->rtnh_ifindex; > - nexthop_nh->nh_weight = rtnh->rtnh_hops + 1; > + nexthop_nh->nh_mp_weight = rtnh->rtnh_hops + 1; > > attrlen = rtnh_attrlen(rtnh); > if (attrlen > 0) { > @@ -884,7 +922,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) > fi->fib_net->ipv4.fib_num_tclassid_users++; > #endif > #ifdef CONFIG_IP_ROUTE_MULTIPATH > - nh->nh_weight = 1; > + nh->nh_mp_weight = 1; > #endif > } > More unnecessary rename noise. > @@ -936,8 +974,15 @@ struct fib_info *fib_create_info(struct fib_config *cfg) > > change_nexthops(fi) { > fib_info_update_nh_saddr(net, nexthop_nh); > +#ifdef CONFIG_IP_ROUTE_MULTIPATH > + fi->fib_mp_weight += nexthop_nh->nh_mp_weight; > +#endif > } endfor_nexthops(fi) > > +#ifdef CONFIG_IP_ROUTE_MULTIPATH > + fib_rebalance(fi); > +#endif > + > link_it: > ofi = fib_find_info(fi); > if (ofi) { > @@ -1050,7 +1095,7 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, > goto nla_put_failure; > > rtnh->rtnh_flags = nh->nh_flags & 0xFF; > - rtnh->rtnh_hops = nh->nh_weight - 1; > + rtnh->rtnh_hops = nh->nh_mp_weight - 1; > rtnh->rtnh_ifindex = nh->nh_oif; > > if (nh->nh_gw && > @@ -1130,12 +1175,6 @@ int fib_sync_down_dev(struct net_device *dev, int force) > else if (nexthop_nh->nh_dev == dev && > nexthop_nh->nh_scope != scope) { > nexthop_nh->nh_flags |= RTNH_F_DEAD; > -#ifdef CONFIG_IP_ROUTE_MULTIPATH > - spin_lock_bh(&fib_multipath_lock); > - fi->fib_power -= nexthop_nh->nh_power; > - nexthop_nh->nh_power = 0; > - spin_unlock_bh(&fib_multipath_lock); > -#endif > dead++; > } > #ifdef CONFIG_IP_ROUTE_MULTIPATH > @@ -1149,6 +1188,10 @@ int fib_sync_down_dev(struct net_device *dev, int force) > fi->fib_flags |= RTNH_F_DEAD; > ret++; > } > + > +#ifdef CONFIG_IP_ROUTE_MULTIPATH > + fib_rebalance(fi); > +#endif > } > > return ret; > @@ -1254,16 +1297,15 @@ int fib_sync_up(struct net_device *dev) > !__in_dev_get_rtnl(dev)) > continue; > alive++; > - spin_lock_bh(&fib_multipath_lock); > - nexthop_nh->nh_power = 0; > nexthop_nh->nh_flags &= ~RTNH_F_DEAD; > - spin_unlock_bh(&fib_multipath_lock); > } endfor_nexthops(fi) > > if (alive > 0) { > fi->fib_flags &= ~RTNH_F_DEAD; > ret++; > } > + > + fib_rebalance(fi); > } > > return ret; > @@ -1276,49 +1318,19 @@ int fib_sync_up(struct net_device *dev) > void fib_select_multipath(struct fib_result *res) > { > struct fib_info *fi = res->fi; > - int w; > - > - spin_lock_bh(&fib_multipath_lock); > - if (fi->fib_power <= 0) { > - int power = 0; > - change_nexthops(fi) { > - if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) { > - power += nexthop_nh->nh_weight; > - nexthop_nh->nh_power = nexthop_nh->nh_weight; > - } > - } endfor_nexthops(fi); > - fi->fib_power = power; > - if (power <= 0) { > - spin_unlock_bh(&fib_multipath_lock); > - /* Race condition: route has just become dead. */ > - res->nh_sel = 0; > - return; > - } > - } > + u8 w; > > + w = bitrev8(this_cpu_inc_return(fib_mp_rr_counter)); > > - /* w should be random number [0..fi->fib_power-1], > - * it is pretty bad approximation. 
> - */ > - > - w = jiffies % fi->fib_power; > + for_nexthops(fi) { > + if (w >= atomic_read(&nh->nh_mp_upper_bound)) > + continue; > > - change_nexthops(fi) { > - if (!(nexthop_nh->nh_flags & RTNH_F_DEAD) && > - nexthop_nh->nh_power) { > - w -= nexthop_nh->nh_power; > - if (w <= 0) { > - nexthop_nh->nh_power--; > - fi->fib_power--; > - res->nh_sel = nhsel; > - spin_unlock_bh(&fib_multipath_lock); > - return; > - } > - } > + res->nh_sel = nhsel; > + return; > } endfor_nexthops(fi); > > - /* Race condition: route has just become dead. */ > + /* Race condition: route has just become dead */ > res->nh_sel = 0; > - spin_unlock_bh(&fib_multipath_lock); > } > #endif > Instead of doing bitrev8 why not just hash jiffies since that is essentially what you are doing with this counter anyway. You could probably even look at converting from a u8 to something like a u32 which would give you a more pseudo random distribution. ^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH net-next 1/3] ipv4: Lock-less per-packet multipath [not found] ` <55831F0D.3040703-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> @ 2015-06-20 18:42 ` Peter Nørlund 0 siblings, 0 replies; 8+ messages in thread From: Peter Nørlund @ 2015-06-20 18:42 UTC (permalink / raw) To: Alexander Duyck Cc: netdev-u79uwXL29TY76Z2rM5mHXA, David S. Miller, Alexey Kuznetsov, James Morris, Hideaki YOSHIFUJI, Patrick McHardy, linux-api-u79uwXL29TY76Z2rM5mHXA On Thu, 18 Jun 2015 12:42:05 -0700 Alexander Duyck <alexander.h.duyck-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote: > On 06/17/2015 01:08 PM, Peter Nørlund wrote: > > The current multipath attempted to be quasi random, but in most > > cases it behaved just like a round robin balancing. This patch > > refactors the algorithm to be exactly that and in doing so, avoids > > the spin lock. > > > > The new design paves the way for hash-based multipath, replacing the > > modulo with thresholds, minimizing disruption in case of failing > > paths or route replacements. > > > > Signed-off-by: Peter Nørlund <pch-chEQUL3jiZBWk0Htik3J/w@public.gmane.org> > > --- > > include/net/ip_fib.h | 6 +-- > > net/ipv4/Kconfig | 1 + > > net/ipv4/fib_semantics.c | 116 > > ++++++++++++++++++++++++++--------------------- 3 files changed, 68 > > insertions(+), 55 deletions(-) > > > > diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h > > index 54271ed..4be4f25 100644 > > --- a/include/net/ip_fib.h > > +++ b/include/net/ip_fib.h > > @@ -76,8 +76,8 @@ struct fib_nh { > > unsigned int nh_flags; > > unsigned char nh_scope; > > #ifdef CONFIG_IP_ROUTE_MULTIPATH > > - int nh_weight; > > - int nh_power; > > + int nh_mp_weight; > > + atomic_t nh_mp_upper_bound; > > #endif > > #ifdef CONFIG_IP_ROUTE_CLASSID > > __u32 nh_tclassid; > > @@ -115,7 +115,7 @@ struct fib_info { > > #define fib_advmss fib_metrics[RTAX_ADVMSS-1] > > int fib_nhs; > > #ifdef CONFIG_IP_ROUTE_MULTIPATH > > - int fib_power; > > + int fib_mp_weight; > > #endif > > struct rcu_head rcu; > > struct fib_nh fib_nh[0]; > > I could do without some of this renaming. For example you could > probably not bother with adding the _mp piece to the name. That way > we don't have to track all the nh_weight -> nh_mp_weight changes. > Also you could probably just use the name fib_weight since not > including the _mp was already the convention for the multipath > portions of the structure anyway. > > This isn't really improving readability at all so I would say don't > bother renaming it. > Good point. I'll skip the renaming. > > diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig > > index d83071d..cb91f67 100644 > > --- a/net/ipv4/Kconfig > > +++ b/net/ipv4/Kconfig > > @@ -81,6 +81,7 @@ config IP_MULTIPLE_TABLES > > config IP_ROUTE_MULTIPATH > > bool "IP: equal cost multipath" > > depends on IP_ADVANCED_ROUTER > > + select BITREVERSE > > help > > Normally, the routing tables specify a single action to > > be taken in a deterministic manner for a given packet. 
If you say Y > > here diff --git a/net/ipv4/fib_semantics.c > > b/net/ipv4/fib_semantics.c index 28ec3c1..8c8df80 100644 > > --- a/net/ipv4/fib_semantics.c > > +++ b/net/ipv4/fib_semantics.c > > @@ -15,6 +15,7 @@ > > > > #include <asm/uaccess.h> > > #include <linux/bitops.h> > > +#include <linux/bitrev.h> > > #include <linux/types.h> > > #include <linux/kernel.h> > > #include <linux/jiffies.h> > > @@ -57,7 +58,7 @@ static struct hlist_head > > fib_info_devhash[DEVINDEX_HASHSIZE]; > > > > #ifdef CONFIG_IP_ROUTE_MULTIPATH > > > > -static DEFINE_SPINLOCK(fib_multipath_lock); > > +static DEFINE_PER_CPU(u8, fib_mp_rr_counter); > > > > #define for_nexthops(fi) > > { \ int nhsel; const > > struct fib_nh *nh; \ @@ -261,7 > > +262,7 @@ static inline int nh_comp(const struct fib_info *fi, > > const struct fib_info *ofi) nh->nh_gw != onh->nh_gw || > > nh->nh_scope != onh->nh_scope || #ifdef CONFIG_IP_ROUTE_MULTIPATH > > - nh->nh_weight != onh->nh_weight || > > + nh->nh_mp_weight != onh->nh_mp_weight || > > #endif > > #ifdef CONFIG_IP_ROUTE_CLASSID > > nh->nh_tclassid != onh->nh_tclassid || > > @@ -449,6 +450,43 @@ static int fib_count_nexthops(struct rtnexthop > > *rtnh, int remaining) return remaining > 0 ? 0 : nhs; > > } > > > > This is a good example. If we don't do the rename we don't have to > review changes like the one above which just add extra overhead to > the patch. > Right. > > +static void fib_rebalance(struct fib_info *fi) > > +{ > > + int factor; > > + int total; > > + int w; > > + > > + if (fi->fib_nhs < 2) > > + return; > > + > > + total = 0; > > + for_nexthops(fi) { > > + if (!(nh->nh_flags & RTNH_F_DEAD)) > > + total += nh->nh_mp_weight; > > + } endfor_nexthops(fi); > > + > > + if (likely(total != 0)) { > > + factor = DIV_ROUND_UP(total, 8388608); > > + total /= factor; > > + } else { > > + factor = 1; > > + } > > + > > So where does the 8388608 value come from? Is it just here to help > restrict the upper_bound to a u8 value? > Yes. Although I think it is rare for the weight to be that large, the API supports it, so I'd better make sure nothing weird happens. Or is it too hypothetical? Today, if one were to add two paths, each with a weight above 2^30, the total weight would overflow and become negative. fib_select_multipath would then think the route is dead and always select the first path. > > + w = 0; > > + change_nexthops(fi) { > > + int upper_bound; > > + > > + if (nexthop_nh->nh_flags & RTNH_F_DEAD) { > > + upper_bound = -1; > > + } else { > > + w += nexthop_nh->nh_mp_weight / factor; > > + upper_bound = DIV_ROUND_CLOSEST(256 * w, > > total); > > + } > > This is doing some confusing stuff. I assume the whole point is to > get the value to convert the upper_bound into a u8 value based on the > weight where you end with 256 as the last value, do I have that right? > Yes - and -1 is used to ensure that dead paths are never selected. > I'm not really sure that is the best way to go from a code standpoint > since fib_nhs is an int which technically means we could support more > than 256 next hops for a given route. > It is true that fib_nhs is an int, but nh_sel is an unsigned char. So although you could technically have more than 256 paths, fib_result would have to be modified to actually support more paths. > Also I assume that it is safe to divide by total because the > assumption is that if one or more routes are not dead then total has > a non-zero value, do I have that right? 
If so what is the point of > setting the factor to 1 above, unless you are just hacking around a > divide by 0 error. > You are right, it is superfluous. If all paths are dead, we never actually use the factor for anything. > Maybe what you should do above is simply return if total == 0 since > it doesn't look like you can safely update anything otherwise. > > > + > > + atomic_set(&nexthop_nh->nh_mp_upper_bound, > > upper_bound); > > + } endfor_nexthops(fi); > > +} > > + > > static int fib_get_nhs(struct fib_info *fi, struct rtnexthop > > *rtnh, int remaining, struct fib_config *cfg) > > { > > @@ -461,7 +499,7 @@ static int fib_get_nhs(struct fib_info *fi, > > struct rtnexthop *rtnh, nexthop_nh->nh_flags = > > (cfg->fc_flags & ~0xFF) | > > rtnh->rtnh_flags; nexthop_nh->nh_oif = rtnh->rtnh_ifindex; > > - nexthop_nh->nh_weight = rtnh->rtnh_hops + 1; > > + nexthop_nh->nh_mp_weight = rtnh->rtnh_hops + 1; > > > > attrlen = rtnh_attrlen(rtnh); > > if (attrlen > 0) { > > @@ -884,7 +922,7 @@ struct fib_info *fib_create_info(struct > > fib_config *cfg) fi->fib_net->ipv4.fib_num_tclassid_users++; > > #endif > > #ifdef CONFIG_IP_ROUTE_MULTIPATH > > - nh->nh_weight = 1; > > + nh->nh_mp_weight = 1; > > #endif > > } > > > > More unnecessary rename noise. Got it. > > > @@ -936,8 +974,15 @@ struct fib_info *fib_create_info(struct > > fib_config *cfg) > > > > change_nexthops(fi) { > > fib_info_update_nh_saddr(net, nexthop_nh); > > +#ifdef CONFIG_IP_ROUTE_MULTIPATH > > + fi->fib_mp_weight += nexthop_nh->nh_mp_weight; > > +#endif > > } endfor_nexthops(fi) > > > > +#ifdef CONFIG_IP_ROUTE_MULTIPATH > > + fib_rebalance(fi); > > +#endif > > + > > link_it: > > ofi = fib_find_info(fi); > > if (ofi) { > > @@ -1050,7 +1095,7 @@ int fib_dump_info(struct sk_buff *skb, u32 > > portid, u32 seq, int event, goto nla_put_failure; > > > > rtnh->rtnh_flags = nh->nh_flags & 0xFF; > > - rtnh->rtnh_hops = nh->nh_weight - 1; > > + rtnh->rtnh_hops = nh->nh_mp_weight - 1; > > rtnh->rtnh_ifindex = nh->nh_oif; > > > > if (nh->nh_gw && > > @@ -1130,12 +1175,6 @@ int fib_sync_down_dev(struct net_device > > *dev, int force) else if (nexthop_nh->nh_dev == dev && > > nexthop_nh->nh_scope != scope) { > > nexthop_nh->nh_flags |= > > RTNH_F_DEAD; -#ifdef CONFIG_IP_ROUTE_MULTIPATH > > - spin_lock_bh(&fib_multipath_lock); > > - fi->fib_power -= > > nexthop_nh->nh_power; > > - nexthop_nh->nh_power = 0; > > - > > spin_unlock_bh(&fib_multipath_lock); -#endif > > dead++; > > } > > #ifdef CONFIG_IP_ROUTE_MULTIPATH > > @@ -1149,6 +1188,10 @@ int fib_sync_down_dev(struct net_device > > *dev, int force) fi->fib_flags |= RTNH_F_DEAD; > > ret++; > > } > > + > > +#ifdef CONFIG_IP_ROUTE_MULTIPATH > > + fib_rebalance(fi); > > +#endif > > } > > > > return ret; > > @@ -1254,16 +1297,15 @@ int fib_sync_up(struct net_device *dev) > > !__in_dev_get_rtnl(dev)) > > continue; > > alive++; > > - spin_lock_bh(&fib_multipath_lock); > > - nexthop_nh->nh_power = 0; > > nexthop_nh->nh_flags &= ~RTNH_F_DEAD; > > - spin_unlock_bh(&fib_multipath_lock); > > } endfor_nexthops(fi) > > > > if (alive > 0) { > > fi->fib_flags &= ~RTNH_F_DEAD; > > ret++; > > } > > + > > + fib_rebalance(fi); > > } > > > > return ret; > > @@ -1276,49 +1318,19 @@ int fib_sync_up(struct net_device *dev) > > void fib_select_multipath(struct fib_result *res) > > { > > struct fib_info *fi = res->fi; > > - int w; > > - > > - spin_lock_bh(&fib_multipath_lock); > > - if (fi->fib_power <= 0) { > > - int power = 0; > > - change_nexthops(fi) { > > - if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) > > { > > - 
power += nexthop_nh->nh_weight; > > - nexthop_nh->nh_power = > > nexthop_nh->nh_weight; > > - } > > - } endfor_nexthops(fi); > > - fi->fib_power = power; > > - if (power <= 0) { > > - spin_unlock_bh(&fib_multipath_lock); > > - /* Race condition: route has just become > > dead. */ > > - res->nh_sel = 0; > > - return; > > - } > > - } > > + u8 w; > > > > + w = bitrev8(this_cpu_inc_return(fib_mp_rr_counter)); > > > > - /* w should be random number [0..fi->fib_power-1], > > - * it is pretty bad approximation. > > - */ > > - > > - w = jiffies % fi->fib_power; > > + for_nexthops(fi) { > > + if (w >= atomic_read(&nh->nh_mp_upper_bound)) > > + continue; > > > > - change_nexthops(fi) { > > - if (!(nexthop_nh->nh_flags & RTNH_F_DEAD) && > > - nexthop_nh->nh_power) { > > - w -= nexthop_nh->nh_power; > > - if (w <= 0) { > > - nexthop_nh->nh_power--; > > - fi->fib_power--; > > - res->nh_sel = nhsel; > > - > > spin_unlock_bh(&fib_multipath_lock); > > - return; > > - } > > - } > > + res->nh_sel = nhsel; > > + return; > > } endfor_nexthops(fi); > > > > - /* Race condition: route has just become dead. */ > > + /* Race condition: route has just become dead */ > > res->nh_sel = 0; > > - spin_unlock_bh(&fib_multipath_lock); > > } > > #endif > > > > Instead of doing bitrev8 why not just hash jiffies since that is > essentially what you are doing with this counter anyway. You could > probably even look at converting from a u8 to something like a u32 > which would give you a more pseudo random distribution. The old algorithm tried to be kinda random by using jiffies, but since jiffies has a relatively low resolution (at best 1kHz), at lot of packets in a row would get the same jiffies value. The code did try to adhere to the weight ratios and as a consequence it really worked like round robin (except at every tick when the jiffies variable would increment and offset the round robin slightly). Considering that, I thought I'd avoid any false pretenses and just do round robin. But since the new algorithm relies on thresholds and not modulo, the most significant bits are the ones that need to flip at every packet. So by reversing the bit order, I get exactly that. At a slow packet rate this is somewhat costly as the bitrev8 lookup table must be fetched from L3 cache or even memory, but as the packet rate increase, the likelihood of it being in L2 or L1 cache increases and cost go down. Regarding moving from u8 to larger integers, I chose 8-bit resolution since hashing L3 and L4 headers with XOR into a single byte actually produced fairly good results with 8 bits, while switching to 16 bits was terrible. Considering the purpose of the round robin counter here, the primary advantage of switching to higher resolution would be to better support a total weight above 255. ^ permalink raw reply [flat|nested] 8+ messages in thread
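To make the bit-reversal argument above concrete, here is a small user-space
demo (bitrev8() is reimplemented for illustration; in the kernel it comes from
<linux/bitrev.h>, and the kernel increments the per-CPU counter before
reversing, which only changes the starting point):

#include <stdio.h>
#include <stdint.h>

/* Same semantics as the kernel's bitrev8(): reverse the bits of a byte. */
static uint8_t bitrev8(uint8_t b)
{
	uint8_t r = 0;
	int i;

	for (i = 0; i < 8; i++)
		if (b & (1u << i))
			r |= 1u << (7 - i);
	return r;
}

int main(void)
{
	uint8_t counter = 0;
	int i;

	/* First eight packets: w = 0 128 64 192 32 160 96 224 */
	for (i = 0; i < 8; i++)
		printf("packet %d -> w = %u\n", i, bitrev8(counter++));
	return 0;
}

Reversing the bits of an incrementing counter makes the high bits toggle
fastest, so consecutive packets jump across the whole 0..255 threshold space
instead of walking through one path's slice at a time.  With the 1:2:1
thresholds 64/192/256 from the earlier sketch, this sequence selects paths
0, 1, 1, 2, 0, 1, 1, 2, ... - exactly a weighted round-robin.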
* [PATCH net-next 2/3] ipv4: L3 and L4 hash-based multipath routing 2015-06-17 20:08 [PATCH net-next 0/3] ipv4: Hash-based multipath routing Peter Nørlund [not found] ` <1434571686-5149-1-git-send-email-pch-chEQUL3jiZBWk0Htik3J/w@public.gmane.org> @ 2015-06-17 20:08 ` Peter Nørlund 2015-06-18 22:52 ` Alexander Duyck 2015-06-17 20:08 ` [PATCH net-next 3/3] ipv4: ICMP packet inspection for multipath Peter Nørlund 2 siblings, 1 reply; 8+ messages in thread From: Peter Nørlund @ 2015-06-17 20:08 UTC (permalink / raw) To: netdev Cc: David S. Miller, Alexey Kuznetsov, James Morris, Hideaki YOSHIFUJI, Patrick McHardy, linux-api, Peter Nørlund This patch adds L3 and L4 hash-based multipath routing, selectable on a per-route basis with the reintroduced RTA_MP_ALGO attribute. The default is now RT_MP_ALG_L3_HASH. Signed-off-by: Peter Nørlund <pch@ordbogen.com> --- include/net/ip_fib.h | 4 ++- include/net/route.h | 5 ++-- include/uapi/linux/rtnetlink.h | 14 ++++++++++- net/ipv4/fib_frontend.c | 4 +++ net/ipv4/fib_semantics.c | 34 ++++++++++++++++++++++--- net/ipv4/icmp.c | 4 +-- net/ipv4/route.c | 56 +++++++++++++++++++++++++++++++++++------- net/ipv4/xfrm4_policy.c | 2 +- 8 files changed, 103 insertions(+), 20 deletions(-) diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 4be4f25..250d98e 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -37,6 +37,7 @@ struct fib_config { u32 fc_flags; u32 fc_priority; __be32 fc_prefsrc; + int fc_mp_alg; struct nlattr *fc_mx; struct rtnexthop *fc_mp; int fc_mx_len; @@ -116,6 +117,7 @@ struct fib_info { int fib_nhs; #ifdef CONFIG_IP_ROUTE_MULTIPATH int fib_mp_weight; + int fib_mp_alg; #endif struct rcu_head rcu; struct fib_nh fib_nh[0]; @@ -308,7 +310,7 @@ int ip_fib_check_default(__be32 gw, struct net_device *dev); int fib_sync_down_dev(struct net_device *dev, int force); int fib_sync_down_addr(struct net *net, __be32 local); int fib_sync_up(struct net_device *dev); -void fib_select_multipath(struct fib_result *res); +void fib_select_multipath(struct fib_result *res, const struct flowi4 *flow); /* Exported by fib_trie.c */ void fib_trie_init(void); diff --git a/include/net/route.h b/include/net/route.h index fe22d03..1fc7deb 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -110,7 +110,8 @@ struct in_device; int ip_rt_init(void); void rt_cache_flush(struct net *net); void rt_flush_dev(struct net_device *dev); -struct rtable *__ip_route_output_key(struct net *, struct flowi4 *flp); +struct rtable *__ip_route_output_key(struct net *, struct flowi4 *flp, + const struct flowi4 *mp_flow); struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp, struct sock *sk); struct dst_entry *ipv4_blackhole_route(struct net *net, @@ -267,7 +268,7 @@ static inline struct rtable *ip_route_connect(struct flowi4 *fl4, sport, dport, sk); if (!dst || !src) { - rt = __ip_route_output_key(net, fl4); + rt = __ip_route_output_key(net, fl4, NULL); if (IS_ERR(rt)) return rt; ip_rt_put(rt); diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 17fb02f..dff4a72 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -271,6 +271,18 @@ enum rt_scope_t { #define RTM_F_EQUALIZE 0x400 /* Multipath equalizer: NI */ #define RTM_F_PREFIX 0x800 /* Prefix addresses */ +/* Multipath algorithms */ + +enum rt_mp_alg_t { + RT_MP_ALG_L3_HASH, /* Was IP_MP_ALG_NONE */ + RT_MP_ALG_PER_PACKET, /* Was IP_MP_ALG_RR */ + RT_MP_ALG_DRR, /* not used */ + RT_MP_ALG_RANDOM, /* not used */ + RT_MP_ALG_WRANDOM, /* 
not used */ + RT_MP_ALG_L4_HASH, + __RT_MP_ALG_MAX +}; + /* Reserved table identifiers */ enum rt_class_t { @@ -301,7 +313,7 @@ enum rtattr_type_t { RTA_FLOW, RTA_CACHEINFO, RTA_SESSION, /* no longer used */ - RTA_MP_ALGO, /* no longer used */ + RTA_MP_ALGO, RTA_TABLE, RTA_MARK, RTA_MFC_STATS, diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 872494e..376e8c1 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -590,6 +590,7 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = { [RTA_PREFSRC] = { .type = NLA_U32 }, [RTA_METRICS] = { .type = NLA_NESTED }, [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, + [RTA_MP_ALGO] = { .type = NLA_U32 }, [RTA_FLOW] = { .type = NLA_U32 }, }; @@ -650,6 +651,9 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, cfg->fc_mp = nla_data(attr); cfg->fc_mp_len = nla_len(attr); break; + case RTA_MP_ALGO: + cfg->fc_mp_alg = nla_get_u32(attr); + break; case RTA_FLOW: cfg->fc_flow = nla_get_u32(attr); break; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 8c8df80..da06e88 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -257,6 +257,11 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi) { const struct fib_nh *onh = ofi->fib_nh; +#ifdef CONFIG_IP_ROUTE_MULTIPATH + if (fi->fib_mp_alg != ofi->fib_mp_alg) + return -1; +#endif + for_nexthops(fi) { if (nh->nh_oif != onh->nh_oif || nh->nh_gw != onh->nh_gw || @@ -896,6 +901,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) if (cfg->fc_mp) { #ifdef CONFIG_IP_ROUTE_MULTIPATH + fi->fib_mp_alg = cfg->fc_mp_alg; err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg); if (err != 0) goto failure; @@ -1085,6 +1091,10 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, struct rtnexthop *rtnh; struct nlattr *mp; + if (fi->fib_mp_alg && + nla_put_u32(skb, RTA_MP_ALGO, fi->fib_mp_alg)) + goto nla_put_failure; + mp = nla_nest_start(skb, RTA_MULTIPATH); if (!mp) goto nla_put_failure; @@ -1312,15 +1322,31 @@ int fib_sync_up(struct net_device *dev) } /* - * The algorithm is suboptimal, but it provides really - * fair weighted route distribution. 
+ * Compute multipath hash based on 3- or 5-tuple */ -void fib_select_multipath(struct fib_result *res) +static int fib_multipath_hash(const struct fib_result *res, + const struct flowi4 *flow) +{ + u32 hash = flow->saddr ^ flow->daddr; + + if (res->fi->fib_mp_alg == RT_MP_ALG_L4_HASH && flow->flowi4_proto != 0) + hash ^= flow->flowi4_proto ^ flow->fl4_sport ^ flow->fl4_dport; + + hash ^= hash >> 16; + hash ^= hash >> 8; + return hash & 0xFF; +} + +void fib_select_multipath(struct fib_result *res, const struct flowi4 *flow) { struct fib_info *fi = res->fi; u8 w; - w = bitrev8(this_cpu_inc_return(fib_mp_rr_counter)); + if (res->fi->fib_mp_alg == RT_MP_ALG_PER_PACKET) { + w = bitrev8(this_cpu_inc_return(fib_mp_rr_counter)); + } else { + w = fib_multipath_hash(res, flow); + } for_nexthops(fi) { if (w >= atomic_read(&nh->nh_mp_upper_bound)) diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index f5203fb..3abcfea 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -459,7 +459,7 @@ static struct rtable *icmp_route_lookup(struct net *net, fl4->fl4_icmp_type = type; fl4->fl4_icmp_code = code; security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4)); - rt = __ip_route_output_key(net, fl4); + rt = __ip_route_output_key(net, fl4, NULL); if (IS_ERR(rt)) return rt; @@ -481,7 +481,7 @@ static struct rtable *icmp_route_lookup(struct net *net, goto relookup_failed; if (inet_addr_type(net, fl4_dec.saddr) == RTN_LOCAL) { - rt2 = __ip_route_output_key(net, &fl4_dec); + rt2 = __ip_route_output_key(net, &fl4_dec, NULL); if (IS_ERR(rt2)) err = PTR_ERR(rt2); } else { diff --git a/net/ipv4/route.c b/net/ipv4/route.c index f605598..a1ec62c 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1006,7 +1006,7 @@ void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, __build_flow_key(&fl4, NULL, iph, oif, RT_TOS(iph->tos), protocol, mark, flow_flags); - rt = __ip_route_output_key(net, &fl4); + rt = __ip_route_output_key(net, &fl4, NULL); if (!IS_ERR(rt)) { __ip_rt_update_pmtu(rt, &fl4, mtu); ip_rt_put(rt); @@ -1025,7 +1025,7 @@ static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) if (!fl4.flowi4_mark) fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark); - rt = __ip_route_output_key(sock_net(sk), &fl4); + rt = __ip_route_output_key(sock_net(sk), &fl4, NULL); if (!IS_ERR(rt)) { __ip_rt_update_pmtu(rt, &fl4, mtu); ip_rt_put(rt); @@ -1094,7 +1094,7 @@ void ipv4_redirect(struct sk_buff *skb, struct net *net, __build_flow_key(&fl4, NULL, iph, oif, RT_TOS(iph->tos), protocol, mark, flow_flags); - rt = __ip_route_output_key(net, &fl4); + rt = __ip_route_output_key(net, &fl4, NULL); if (!IS_ERR(rt)) { __ip_do_redirect(rt, skb, &fl4, false); ip_rt_put(rt); @@ -1109,7 +1109,7 @@ void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk) struct rtable *rt; __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); - rt = __ip_route_output_key(sock_net(sk), &fl4); + rt = __ip_route_output_key(sock_net(sk), &fl4, NULL); if (!IS_ERR(rt)) { __ip_do_redirect(rt, skb, &fl4, false); ip_rt_put(rt); @@ -1631,6 +1631,39 @@ out: return err; } +#ifdef CONFIG_IP_ROUTE_MULTIPATH +/* Fill flow key data based on packet for use in multipath routing. 
*/ +static void ip_multipath_flow(const struct sk_buff *skb, struct flowi4 *flow) +{ + const struct iphdr *iph; + + iph = ip_hdr(skb); + + flow->saddr = iph->saddr; + flow->daddr = iph->daddr; + flow->flowi4_proto = iph->protocol; + flow->fl4_sport = 0; + flow->fl4_dport = 0; + + if (unlikely(ip_is_fragment(iph))) + return; + + if (iph->protocol == IPPROTO_TCP || + iph->protocol == IPPROTO_UDP || + iph->protocol == IPPROTO_SCTP) { + __be16 _ports; + const __be16 *ports; + + ports = skb_header_pointer(skb, iph->ihl * 4, sizeof(_ports), + &_ports); + if (ports) { + flow->fl4_sport = ports[0]; + flow->fl4_dport = ports[1]; + } + } +} +#endif /* CONFIG_IP_ROUTE_MULTIPATH */ + static int ip_mkroute_input(struct sk_buff *skb, struct fib_result *res, const struct flowi4 *fl4, @@ -1638,8 +1671,12 @@ static int ip_mkroute_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, u32 tos) { #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (res->fi && res->fi->fib_nhs > 1) - fib_select_multipath(res); + if (res->fi && res->fi->fib_nhs > 1) { + struct flowi4 mp_flow; + + ip_multipath_flow(skb, &mp_flow); + fib_select_multipath(res, &mp_flow); + } #endif /* create a routing cache entry */ @@ -2012,7 +2049,8 @@ add: * Major route resolver routine. */ -struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) +struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4, + const struct flowi4 *mp_flow) { struct net_device *dev_out = NULL; __u8 tos = RT_FL_TOS(fl4); @@ -2170,7 +2208,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) #ifdef CONFIG_IP_ROUTE_MULTIPATH if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0) - fib_select_multipath(&res); + fib_select_multipath(&res, (mp_flow ? mp_flow : fl4)); else #endif if (!res.prefixlen && @@ -2273,7 +2311,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, struct sock *sk) { - struct rtable *rt = __ip_route_output_key(net, flp4); + struct rtable *rt = __ip_route_output_key(net, flp4, NULL); if (IS_ERR(rt)) return rt; diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index bff6974..7eae158 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -31,7 +31,7 @@ static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4, if (saddr) fl4->saddr = saddr->a4; - rt = __ip_route_output_key(net, fl4); + rt = __ip_route_output_key(net, fl4, NULL); if (!IS_ERR(rt)) return &rt->dst; -- 2.1.4 ^ permalink raw reply related [flat|nested] 8+ messages in thread
* Re: [PATCH net-next 2/3] ipv4: L3 and L4 hash-based multipath routing 2015-06-17 20:08 ` [PATCH net-next 2/3] ipv4: L3 and L4 hash-based multipath routing Peter Nørlund @ 2015-06-18 22:52 ` Alexander Duyck 2015-06-20 18:46 ` Peter Nørlund 0 siblings, 1 reply; 8+ messages in thread From: Alexander Duyck @ 2015-06-18 22:52 UTC (permalink / raw) To: Peter Nørlund, netdev Cc: David S. Miller, Alexey Kuznetsov, James Morris, Hideaki YOSHIFUJI, Patrick McHardy, linux-api On 06/17/2015 01:08 PM, Peter Nørlund wrote: > This patch adds L3 and L4 hash-based multipath routing, selectable on a > per-route basis with the reintroduced RTA_MP_ALGO attribute. The default is > now RT_MP_ALG_L3_HASH. > > Signed-off-by: Peter Nørlund <pch@ordbogen.com> > --- > include/net/ip_fib.h | 4 ++- > include/net/route.h | 5 ++-- > include/uapi/linux/rtnetlink.h | 14 ++++++++++- > net/ipv4/fib_frontend.c | 4 +++ > net/ipv4/fib_semantics.c | 34 ++++++++++++++++++++++--- > net/ipv4/icmp.c | 4 +-- > net/ipv4/route.c | 56 +++++++++++++++++++++++++++++++++++------- > net/ipv4/xfrm4_policy.c | 2 +- > 8 files changed, 103 insertions(+), 20 deletions(-) > > diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h > index 4be4f25..250d98e 100644 > --- a/include/net/ip_fib.h > +++ b/include/net/ip_fib.h > @@ -37,6 +37,7 @@ struct fib_config { > u32 fc_flags; > u32 fc_priority; > __be32 fc_prefsrc; > + int fc_mp_alg; > struct nlattr *fc_mx; > struct rtnexthop *fc_mp; > int fc_mx_len; > @@ -116,6 +117,7 @@ struct fib_info { > int fib_nhs; > #ifdef CONFIG_IP_ROUTE_MULTIPATH > int fib_mp_weight; > + int fib_mp_alg; > #endif > struct rcu_head rcu; > struct fib_nh fib_nh[0]; > @@ -308,7 +310,7 @@ int ip_fib_check_default(__be32 gw, struct net_device *dev); > int fib_sync_down_dev(struct net_device *dev, int force); > int fib_sync_down_addr(struct net *net, __be32 local); > int fib_sync_up(struct net_device *dev); > -void fib_select_multipath(struct fib_result *res); > +void fib_select_multipath(struct fib_result *res, const struct flowi4 *flow); > > /* Exported by fib_trie.c */ > void fib_trie_init(void); > diff --git a/include/net/route.h b/include/net/route.h > index fe22d03..1fc7deb 100644 > --- a/include/net/route.h > +++ b/include/net/route.h > @@ -110,7 +110,8 @@ struct in_device; > int ip_rt_init(void); > void rt_cache_flush(struct net *net); > void rt_flush_dev(struct net_device *dev); > -struct rtable *__ip_route_output_key(struct net *, struct flowi4 *flp); > +struct rtable *__ip_route_output_key(struct net *, struct flowi4 *flp, > + const struct flowi4 *mp_flow); > struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp, > struct sock *sk); > struct dst_entry *ipv4_blackhole_route(struct net *net, > @@ -267,7 +268,7 @@ static inline struct rtable *ip_route_connect(struct flowi4 *fl4, > sport, dport, sk); > > if (!dst || !src) { > - rt = __ip_route_output_key(net, fl4); > + rt = __ip_route_output_key(net, fl4, NULL); > if (IS_ERR(rt)) > return rt; > ip_rt_put(rt); > diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h > index 17fb02f..dff4a72 100644 > --- a/include/uapi/linux/rtnetlink.h > +++ b/include/uapi/linux/rtnetlink.h > @@ -271,6 +271,18 @@ enum rt_scope_t { > #define RTM_F_EQUALIZE 0x400 /* Multipath equalizer: NI */ > #define RTM_F_PREFIX 0x800 /* Prefix addresses */ > > +/* Multipath algorithms */ > + > +enum rt_mp_alg_t { > + RT_MP_ALG_L3_HASH, /* Was IP_MP_ALG_NONE */ > + RT_MP_ALG_PER_PACKET, /* Was IP_MP_ALG_RR */ > + RT_MP_ALG_DRR, /* not used */ > + RT_MP_ALG_RANDOM, /* 
not used */ > + RT_MP_ALG_WRANDOM, /* not used */ > + RT_MP_ALG_L4_HASH, > + __RT_MP_ALG_MAX > +}; > + > /* Reserved table identifiers */ > > enum rt_class_t { > @@ -301,7 +313,7 @@ enum rtattr_type_t { > RTA_FLOW, > RTA_CACHEINFO, > RTA_SESSION, /* no longer used */ > - RTA_MP_ALGO, /* no longer used */ > + RTA_MP_ALGO, > RTA_TABLE, > RTA_MARK, > RTA_MFC_STATS, > diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c > index 872494e..376e8c1 100644 > --- a/net/ipv4/fib_frontend.c > +++ b/net/ipv4/fib_frontend.c > @@ -590,6 +590,7 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = { > [RTA_PREFSRC] = { .type = NLA_U32 }, > [RTA_METRICS] = { .type = NLA_NESTED }, > [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, > + [RTA_MP_ALGO] = { .type = NLA_U32 }, > [RTA_FLOW] = { .type = NLA_U32 }, > }; > > @@ -650,6 +651,9 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, > cfg->fc_mp = nla_data(attr); > cfg->fc_mp_len = nla_len(attr); > break; > + case RTA_MP_ALGO: > + cfg->fc_mp_alg = nla_get_u32(attr); > + break; > case RTA_FLOW: > cfg->fc_flow = nla_get_u32(attr); > break; > diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c > index 8c8df80..da06e88 100644 > --- a/net/ipv4/fib_semantics.c > +++ b/net/ipv4/fib_semantics.c > @@ -257,6 +257,11 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi) > { > const struct fib_nh *onh = ofi->fib_nh; > > +#ifdef CONFIG_IP_ROUTE_MULTIPATH > + if (fi->fib_mp_alg != ofi->fib_mp_alg) > + return -1; > +#endif > + > for_nexthops(fi) { > if (nh->nh_oif != onh->nh_oif || > nh->nh_gw != onh->nh_gw || > @@ -896,6 +901,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) > > if (cfg->fc_mp) { > #ifdef CONFIG_IP_ROUTE_MULTIPATH > + fi->fib_mp_alg = cfg->fc_mp_alg; > err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg); > if (err != 0) > goto failure; > @@ -1085,6 +1091,10 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, > struct rtnexthop *rtnh; > struct nlattr *mp; > > + if (fi->fib_mp_alg && > + nla_put_u32(skb, RTA_MP_ALGO, fi->fib_mp_alg)) > + goto nla_put_failure; > + > mp = nla_nest_start(skb, RTA_MULTIPATH); > if (!mp) > goto nla_put_failure; > @@ -1312,15 +1322,31 @@ int fib_sync_up(struct net_device *dev) > } > > /* > - * The algorithm is suboptimal, but it provides really > - * fair weighted route distribution. > + * Compute multipath hash based on 3- or 5-tuple > */ > -void fib_select_multipath(struct fib_result *res) > +static int fib_multipath_hash(const struct fib_result *res, > + const struct flowi4 *flow) > +{ > + u32 hash = flow->saddr ^ flow->daddr; > + > + if (res->fi->fib_mp_alg == RT_MP_ALG_L4_HASH && flow->flowi4_proto != 0) > + hash ^= flow->flowi4_proto ^ flow->fl4_sport ^ flow->fl4_dport; > + > + hash ^= hash >> 16; > + hash ^= hash >> 8; > + return hash & 0xFF; > +} > + This hash is still far from optimal. Really I think you should look at something such as jhash_3words or the like for mixing up the addresses. Right now just XORing the values together like you are will end up with a fairly high collision rate since for example in the case of two endpoints on the same subnet you would lose the subnet as a result of XORing the source and destination addresses. Also you would lose the port data in the case of a protocol using something such as UDP where the source and destination ports might be the same value. 
> +void fib_select_multipath(struct fib_result *res, const struct flowi4 *flow) > { > struct fib_info *fi = res->fi; > u8 w; > > - w = bitrev8(this_cpu_inc_return(fib_mp_rr_counter)); > + if (res->fi->fib_mp_alg == RT_MP_ALG_PER_PACKET) { > + w = bitrev8(this_cpu_inc_return(fib_mp_rr_counter)); > + } else { > + w = fib_multipath_hash(res, flow); > + } > > for_nexthops(fi) { > if (w >= atomic_read(&nh->nh_mp_upper_bound)) > diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c > index f5203fb..3abcfea 100644 > --- a/net/ipv4/icmp.c > +++ b/net/ipv4/icmp.c > @@ -459,7 +459,7 @@ static struct rtable *icmp_route_lookup(struct net *net, > fl4->fl4_icmp_type = type; > fl4->fl4_icmp_code = code; > security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4)); > - rt = __ip_route_output_key(net, fl4); > + rt = __ip_route_output_key(net, fl4, NULL); > if (IS_ERR(rt)) > return rt; > > @@ -481,7 +481,7 @@ static struct rtable *icmp_route_lookup(struct net *net, > goto relookup_failed; > > if (inet_addr_type(net, fl4_dec.saddr) == RTN_LOCAL) { > - rt2 = __ip_route_output_key(net, &fl4_dec); > + rt2 = __ip_route_output_key(net, &fl4_dec, NULL); > if (IS_ERR(rt2)) > err = PTR_ERR(rt2); > } else { > diff --git a/net/ipv4/route.c b/net/ipv4/route.c > index f605598..a1ec62c 100644 > --- a/net/ipv4/route.c > +++ b/net/ipv4/route.c > @@ -1006,7 +1006,7 @@ void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, > > __build_flow_key(&fl4, NULL, iph, oif, > RT_TOS(iph->tos), protocol, mark, flow_flags); > - rt = __ip_route_output_key(net, &fl4); > + rt = __ip_route_output_key(net, &fl4, NULL); > if (!IS_ERR(rt)) { > __ip_rt_update_pmtu(rt, &fl4, mtu); > ip_rt_put(rt); > @@ -1025,7 +1025,7 @@ static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) > if (!fl4.flowi4_mark) > fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark); > > - rt = __ip_route_output_key(sock_net(sk), &fl4); > + rt = __ip_route_output_key(sock_net(sk), &fl4, NULL); > if (!IS_ERR(rt)) { > __ip_rt_update_pmtu(rt, &fl4, mtu); > ip_rt_put(rt); > @@ -1094,7 +1094,7 @@ void ipv4_redirect(struct sk_buff *skb, struct net *net, > > __build_flow_key(&fl4, NULL, iph, oif, > RT_TOS(iph->tos), protocol, mark, flow_flags); > - rt = __ip_route_output_key(net, &fl4); > + rt = __ip_route_output_key(net, &fl4, NULL); > if (!IS_ERR(rt)) { > __ip_do_redirect(rt, skb, &fl4, false); > ip_rt_put(rt); > @@ -1109,7 +1109,7 @@ void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk) > struct rtable *rt; > > __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); > - rt = __ip_route_output_key(sock_net(sk), &fl4); > + rt = __ip_route_output_key(sock_net(sk), &fl4, NULL); > if (!IS_ERR(rt)) { > __ip_do_redirect(rt, skb, &fl4, false); > ip_rt_put(rt); > @@ -1631,6 +1631,39 @@ out: > return err; > } > > +#ifdef CONFIG_IP_ROUTE_MULTIPATH > +/* Fill flow key data based on packet for use in multipath routing. */ > +static void ip_multipath_flow(const struct sk_buff *skb, struct flowi4 *flow) > +{ > + const struct iphdr *iph; > + > + iph = ip_hdr(skb); > + > + flow->saddr = iph->saddr; > + flow->daddr = iph->daddr; > + flow->flowi4_proto = iph->protocol; > + flow->fl4_sport = 0; > + flow->fl4_dport = 0; > + > + if (unlikely(ip_is_fragment(iph))) > + return; > + I'm not sure if checking for fragmentation is enough. For example if this system is routing and received a flow of UDP packets, some fragmented some not then it might end up mixing them over 2 separate next hops since some will include L4 header data and some won't. 
As such you may want to have the option to exclude UDP from the protocols listed below. > + if (iph->protocol == IPPROTO_TCP || > + iph->protocol == IPPROTO_UDP || > + iph->protocol == IPPROTO_SCTP) { > + __be16 _ports; > + const __be16 *ports; > + > + ports = skb_header_pointer(skb, iph->ihl * 4, sizeof(_ports), > + &_ports); > + if (ports) { > + flow->fl4_sport = ports[0]; > + flow->fl4_dport = ports[1]; > + } > + } > +} > +#endif /* CONFIG_IP_ROUTE_MULTIPATH */ > + > static int ip_mkroute_input(struct sk_buff *skb, > struct fib_result *res, > const struct flowi4 *fl4, > @@ -1638,8 +1671,12 @@ static int ip_mkroute_input(struct sk_buff *skb, > __be32 daddr, __be32 saddr, u32 tos) > { > #ifdef CONFIG_IP_ROUTE_MULTIPATH > - if (res->fi && res->fi->fib_nhs > 1) > - fib_select_multipath(res); > + if (res->fi && res->fi->fib_nhs > 1) { > + struct flowi4 mp_flow; > + > + ip_multipath_flow(skb, &mp_flow); > + fib_select_multipath(res, &mp_flow); > + } What is the point in populating the mp_flow if you don't know if it is going to be used? You are populating it in ip_multipath_flow, and then you might completely ignore it in fib_select_multipath. Maybe instead of passing the mp_flow you could instead look at passing a function pointer that would alter the flow for the multipath case instead. > #endif > > /* create a routing cache entry */ > @@ -2012,7 +2049,8 @@ add: > * Major route resolver routine. > */ > > -struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) > +struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4, > + const struct flowi4 *mp_flow) > { > struct net_device *dev_out = NULL; > __u8 tos = RT_FL_TOS(fl4); > @@ -2170,7 +2208,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) > > #ifdef CONFIG_IP_ROUTE_MULTIPATH > if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0) > - fib_select_multipath(&res); > + fib_select_multipath(&res, (mp_flow ? mp_flow : fl4)); > else > #endif > if (!res.prefixlen && > @@ -2273,7 +2311,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or > struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, > struct sock *sk) > { > - struct rtable *rt = __ip_route_output_key(net, flp4); > + struct rtable *rt = __ip_route_output_key(net, flp4, NULL); > > if (IS_ERR(rt)) > return rt; > diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c > index bff6974..7eae158 100644 > --- a/net/ipv4/xfrm4_policy.c > +++ b/net/ipv4/xfrm4_policy.c > @@ -31,7 +31,7 @@ static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4, > if (saddr) > fl4->saddr = saddr->a4; > > - rt = __ip_route_output_key(net, fl4); > + rt = __ip_route_output_key(net, fl4, NULL); > if (!IS_ERR(rt)) > return &rt->dst; > > ^ permalink raw reply [flat|nested] 8+ messages in thread
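For reference, a sketch of what the jhash_3words() suggestion above could look
like (not part of the posted series; the function, its seed parameter and the
final fold to the 0..255 range are illustrative):

#include <linux/jhash.h>
#include <net/flow.h>

/* Illustrative only: mix the 5-tuple with jhash so that, e.g., two hosts on
 * the same subnet no longer cancel out the subnet bits, and identical source
 * and destination ports still contribute to the hash.
 */
static u32 fib_multipath_hash_jhash(const struct flowi4 *flow, u32 seed)
{
	u32 ports = ((u32)ntohs(flow->fl4_sport) << 16) | ntohs(flow->fl4_dport);
	u32 hash;

	hash = jhash_3words((__force u32)flow->saddr,
			    (__force u32)flow->daddr,
			    ports ^ flow->flowi4_proto, seed);

	/* the series folds the hash down to the 0..255 threshold range */
	return hash >> 24;
}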
* Re: [PATCH net-next 2/3] ipv4: L3 and L4 hash-based multipath routing 2015-06-18 22:52 ` Alexander Duyck @ 2015-06-20 18:46 ` Peter Nørlund 0 siblings, 0 replies; 8+ messages in thread From: Peter Nørlund @ 2015-06-20 18:46 UTC (permalink / raw) To: Alexander Duyck Cc: netdev, David S. Miller, Alexey Kuznetsov, James Morris, Hideaki YOSHIFUJI, Patrick McHardy, linux-api On Thu, 18 Jun 2015 15:52:22 -0700 Alexander Duyck <alexander.h.duyck@redhat.com> wrote: > > > On 06/17/2015 01:08 PM, Peter Nørlund wrote: > > This patch adds L3 and L4 hash-based multipath routing, selectable > > on a per-route basis with the reintroduced RTA_MP_ALGO attribute. > > The default is now RT_MP_ALG_L3_HASH. > > > > Signed-off-by: Peter Nørlund <pch@ordbogen.com> > > --- > > include/net/ip_fib.h | 4 ++- > > include/net/route.h | 5 ++-- > > include/uapi/linux/rtnetlink.h | 14 ++++++++++- > > net/ipv4/fib_frontend.c | 4 +++ > > net/ipv4/fib_semantics.c | 34 ++++++++++++++++++++++--- > > net/ipv4/icmp.c | 4 +-- > > net/ipv4/route.c | 56 > > +++++++++++++++++++++++++++++++++++------- > > net/ipv4/xfrm4_policy.c | 2 +- 8 files changed, 103 > > insertions(+), 20 deletions(-) > > > > diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h > > index 4be4f25..250d98e 100644 > > --- a/include/net/ip_fib.h > > +++ b/include/net/ip_fib.h > > @@ -37,6 +37,7 @@ struct fib_config { > > u32 fc_flags; > > u32 fc_priority; > > __be32 fc_prefsrc; > > + int fc_mp_alg; > > struct nlattr *fc_mx; > > struct rtnexthop *fc_mp; > > int fc_mx_len; > > @@ -116,6 +117,7 @@ struct fib_info { > > int fib_nhs; > > #ifdef CONFIG_IP_ROUTE_MULTIPATH > > int fib_mp_weight; > > + int fib_mp_alg; > > #endif > > struct rcu_head rcu; > > struct fib_nh fib_nh[0]; > > @@ -308,7 +310,7 @@ int ip_fib_check_default(__be32 gw, struct > > net_device *dev); int fib_sync_down_dev(struct net_device *dev, int > > force); int fib_sync_down_addr(struct net *net, __be32 local); > > int fib_sync_up(struct net_device *dev); > > -void fib_select_multipath(struct fib_result *res); > > +void fib_select_multipath(struct fib_result *res, const struct > > flowi4 *flow); > > > > /* Exported by fib_trie.c */ > > void fib_trie_init(void); > > diff --git a/include/net/route.h b/include/net/route.h > > index fe22d03..1fc7deb 100644 > > --- a/include/net/route.h > > +++ b/include/net/route.h > > @@ -110,7 +110,8 @@ struct in_device; > > int ip_rt_init(void); > > void rt_cache_flush(struct net *net); > > void rt_flush_dev(struct net_device *dev); > > -struct rtable *__ip_route_output_key(struct net *, struct flowi4 > > *flp); +struct rtable *__ip_route_output_key(struct net *, struct > > flowi4 *flp, > > + const struct flowi4 *mp_flow); > > struct rtable *ip_route_output_flow(struct net *, struct flowi4 > > *flp, struct sock *sk); > > struct dst_entry *ipv4_blackhole_route(struct net *net, > > @@ -267,7 +268,7 @@ static inline struct rtable > > *ip_route_connect(struct flowi4 *fl4, sport, dport, sk); > > > > if (!dst || !src) { > > - rt = __ip_route_output_key(net, fl4); > > + rt = __ip_route_output_key(net, fl4, NULL); > > if (IS_ERR(rt)) > > return rt; > > ip_rt_put(rt); > > diff --git a/include/uapi/linux/rtnetlink.h > > b/include/uapi/linux/rtnetlink.h index 17fb02f..dff4a72 100644 > > --- a/include/uapi/linux/rtnetlink.h > > +++ b/include/uapi/linux/rtnetlink.h > > @@ -271,6 +271,18 @@ enum rt_scope_t { > > #define RTM_F_EQUALIZE 0x400 /* Multipath > > equalizer: NI */ #define RTM_F_PREFIX > > 0x800 /* Prefix addresses */ > > > > +/* Multipath algorithms */ > > + > > 
+enum rt_mp_alg_t { > > + RT_MP_ALG_L3_HASH, /* Was IP_MP_ALG_NONE */ > > + RT_MP_ALG_PER_PACKET, /* Was IP_MP_ALG_RR */ > > + RT_MP_ALG_DRR, /* not used */ > > + RT_MP_ALG_RANDOM, /* not used */ > > + RT_MP_ALG_WRANDOM, /* not used */ > > + RT_MP_ALG_L4_HASH, > > + __RT_MP_ALG_MAX > > +}; > > + > > /* Reserved table identifiers */ > > > > enum rt_class_t { > > @@ -301,7 +313,7 @@ enum rtattr_type_t { > > RTA_FLOW, > > RTA_CACHEINFO, > > RTA_SESSION, /* no longer used */ > > - RTA_MP_ALGO, /* no longer used */ > > + RTA_MP_ALGO, > > RTA_TABLE, > > RTA_MARK, > > RTA_MFC_STATS, > > diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c > > index 872494e..376e8c1 100644 > > --- a/net/ipv4/fib_frontend.c > > +++ b/net/ipv4/fib_frontend.c > > @@ -590,6 +590,7 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX > > + 1] = { [RTA_PREFSRC] = { .type = NLA_U32 }, > > [RTA_METRICS] = { .type = NLA_NESTED }, > > [RTA_MULTIPATH] = { .len = sizeof(struct > > rtnexthop) }, > > + [RTA_MP_ALGO] = { .type = NLA_U32 }, > > [RTA_FLOW] = { .type = NLA_U32 }, > > }; > > > > @@ -650,6 +651,9 @@ static int rtm_to_fib_config(struct net *net, > > struct sk_buff *skb, cfg->fc_mp = nla_data(attr); > > cfg->fc_mp_len = nla_len(attr); > > break; > > + case RTA_MP_ALGO: > > + cfg->fc_mp_alg = nla_get_u32(attr); > > + break; > > case RTA_FLOW: > > cfg->fc_flow = nla_get_u32(attr); > > break; > > diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c > > index 8c8df80..da06e88 100644 > > --- a/net/ipv4/fib_semantics.c > > +++ b/net/ipv4/fib_semantics.c > > @@ -257,6 +257,11 @@ static inline int nh_comp(const struct > > fib_info *fi, const struct fib_info *ofi) { > > const struct fib_nh *onh = ofi->fib_nh; > > > > +#ifdef CONFIG_IP_ROUTE_MULTIPATH > > + if (fi->fib_mp_alg != ofi->fib_mp_alg) > > + return -1; > > +#endif > > + > > for_nexthops(fi) { > > if (nh->nh_oif != onh->nh_oif || > > nh->nh_gw != onh->nh_gw || > > @@ -896,6 +901,7 @@ struct fib_info *fib_create_info(struct > > fib_config *cfg) > > > > if (cfg->fc_mp) { > > #ifdef CONFIG_IP_ROUTE_MULTIPATH > > + fi->fib_mp_alg = cfg->fc_mp_alg; > > err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, > > cfg); if (err != 0) > > goto failure; > > @@ -1085,6 +1091,10 @@ int fib_dump_info(struct sk_buff *skb, u32 > > portid, u32 seq, int event, struct rtnexthop *rtnh; > > struct nlattr *mp; > > > > + if (fi->fib_mp_alg && > > + nla_put_u32(skb, RTA_MP_ALGO, fi->fib_mp_alg)) > > + goto nla_put_failure; > > + > > mp = nla_nest_start(skb, RTA_MULTIPATH); > > if (!mp) > > goto nla_put_failure; > > @@ -1312,15 +1322,31 @@ int fib_sync_up(struct net_device *dev) > > } > > > > /* > > - * The algorithm is suboptimal, but it provides really > > - * fair weighted route distribution. > > + * Compute multipath hash based on 3- or 5-tuple > > */ > > -void fib_select_multipath(struct fib_result *res) > > +static int fib_multipath_hash(const struct fib_result *res, > > + const struct flowi4 *flow) > > +{ > > + u32 hash = flow->saddr ^ flow->daddr; > > + > > + if (res->fi->fib_mp_alg == RT_MP_ALG_L4_HASH && > > flow->flowi4_proto != 0) > > + hash ^= flow->flowi4_proto ^ flow->fl4_sport ^ > > flow->fl4_dport; + > > + hash ^= hash >> 16; > > + hash ^= hash >> 8; > > + return hash & 0xFF; > > +} > > + > > This hash is still far from optimal. Really I think you should look > at something such as jhash_3words or the like for mixing up the > addresses. 
Right now just XORing the values together like you are > will end up with a fairly high collision rate since for example in > the case of two endpoints on the same subnet you would lose the > subnet as a result of XORing the source and destination addresses. > Also you would lose the port data in the case of a protocol using > something such as UDP where the source and destination ports might be > the same value. > When I begun to work on the multipath code, I took a look at what the hardware routers did, and found that you could typically choose between XOR, CRC16, and CRC32. CRC32 produces excellent hashes but is slow (unless you take advantage of SSE 4.2), while XOR is fast but less optimal. When I tested the XOR hash on the routers of ordbogen.com (my work place), I was actually surprised to see that the XOR hash was sufficient as long as I stuck with 8 bits. But you are probably right. While HTTP-traffic will likely balance well enough with XOR, other kind of traffic might not. I didn't know the Jenkins hash function, so I haven't tested it, but by the looks of it, I too think it is a good candidate. It also adds the possibility of extending the key space to 16 bits or more. > > +void fib_select_multipath(struct fib_result *res, const struct > > flowi4 *flow) { > > struct fib_info *fi = res->fi; > > u8 w; > > > > - w = bitrev8(this_cpu_inc_return(fib_mp_rr_counter)); > > + if (res->fi->fib_mp_alg == RT_MP_ALG_PER_PACKET) { > > + w = > > bitrev8(this_cpu_inc_return(fib_mp_rr_counter)); > > + } else { > > + w = fib_multipath_hash(res, flow); > > + } > > > > for_nexthops(fi) { > > if (w >= atomic_read(&nh->nh_mp_upper_bound)) > > diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c > > index f5203fb..3abcfea 100644 > > --- a/net/ipv4/icmp.c > > +++ b/net/ipv4/icmp.c > > @@ -459,7 +459,7 @@ static struct rtable *icmp_route_lookup(struct > > net *net, fl4->fl4_icmp_type = type; > > fl4->fl4_icmp_code = code; > > security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4)); > > - rt = __ip_route_output_key(net, fl4); > > + rt = __ip_route_output_key(net, fl4, NULL); > > if (IS_ERR(rt)) > > return rt; > > > > @@ -481,7 +481,7 @@ static struct rtable *icmp_route_lookup(struct > > net *net, goto relookup_failed; > > > > if (inet_addr_type(net, fl4_dec.saddr) == RTN_LOCAL) { > > - rt2 = __ip_route_output_key(net, &fl4_dec); > > + rt2 = __ip_route_output_key(net, &fl4_dec, NULL); > > if (IS_ERR(rt2)) > > err = PTR_ERR(rt2); > > } else { > > diff --git a/net/ipv4/route.c b/net/ipv4/route.c > > index f605598..a1ec62c 100644 > > --- a/net/ipv4/route.c > > +++ b/net/ipv4/route.c > > @@ -1006,7 +1006,7 @@ void ipv4_update_pmtu(struct sk_buff *skb, > > struct net *net, u32 mtu, > > > > __build_flow_key(&fl4, NULL, iph, oif, > > RT_TOS(iph->tos), protocol, mark, > > flow_flags); > > - rt = __ip_route_output_key(net, &fl4); > > + rt = __ip_route_output_key(net, &fl4, NULL); > > if (!IS_ERR(rt)) { > > __ip_rt_update_pmtu(rt, &fl4, mtu); > > ip_rt_put(rt); > > @@ -1025,7 +1025,7 @@ static void __ipv4_sk_update_pmtu(struct > > sk_buff *skb, struct sock *sk, u32 mtu) if (!fl4.flowi4_mark) > > fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), > > skb->mark); > > > > - rt = __ip_route_output_key(sock_net(sk), &fl4); > > + rt = __ip_route_output_key(sock_net(sk), &fl4, NULL); > > if (!IS_ERR(rt)) { > > __ip_rt_update_pmtu(rt, &fl4, mtu); > > ip_rt_put(rt); > > @@ -1094,7 +1094,7 @@ void ipv4_redirect(struct sk_buff *skb, > > struct net *net, > > > > __build_flow_key(&fl4, NULL, iph, oif, > > RT_TOS(iph->tos), protocol, 
mark, > > flow_flags); > > - rt = __ip_route_output_key(net, &fl4); > > + rt = __ip_route_output_key(net, &fl4, NULL); > > if (!IS_ERR(rt)) { > > __ip_do_redirect(rt, skb, &fl4, false); > > ip_rt_put(rt); > > @@ -1109,7 +1109,7 @@ void ipv4_sk_redirect(struct sk_buff *skb, > > struct sock *sk) struct rtable *rt; > > > > __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); > > - rt = __ip_route_output_key(sock_net(sk), &fl4); > > + rt = __ip_route_output_key(sock_net(sk), &fl4, NULL); > > if (!IS_ERR(rt)) { > > __ip_do_redirect(rt, skb, &fl4, false); > > ip_rt_put(rt); > > @@ -1631,6 +1631,39 @@ out: > > return err; > > } > > > > +#ifdef CONFIG_IP_ROUTE_MULTIPATH > > +/* Fill flow key data based on packet for use in multipath > > routing. */ +static void ip_multipath_flow(const struct sk_buff > > *skb, struct flowi4 *flow) +{ > > + const struct iphdr *iph; > > + > > + iph = ip_hdr(skb); > > + > > + flow->saddr = iph->saddr; > > + flow->daddr = iph->daddr; > > + flow->flowi4_proto = iph->protocol; > > + flow->fl4_sport = 0; > > + flow->fl4_dport = 0; > > + > > + if (unlikely(ip_is_fragment(iph))) > > + return; > > + > > I'm not sure if checking for fragmentation is enough. For example if > this system is routing and received a flow of UDP packets, some > fragmented some not then it might end up mixing them over 2 separate > next hops since some will include L4 header data and some won't. > > As such you may want to have the option to exclude UDP from the > protocols listed below. > Just like the hash algorithms, I looked at the hardware routers for inspiration, and they typically do like this. It is a known problem, and it can just as well happen with TCP, so I don't think adding a special case for UDP is the way to go. But what about looking at the don't-fragment-bit? If the DF-bit is set on a packet, it is usually set on all packets in that flow, so one can assume that that flow will never fragment. If it is clear, you can't be sure and should probably avoid L4 hashing altogether. The interesting question is how many flows will be excluded from L4 hashing with this approach. > > + if (iph->protocol == IPPROTO_TCP || > > + iph->protocol == IPPROTO_UDP || > > + iph->protocol == IPPROTO_SCTP) { > > + __be16 _ports; > > + const __be16 *ports; > > + > > + ports = skb_header_pointer(skb, iph->ihl * 4, > > sizeof(_ports), > > + &_ports); > > + if (ports) { > > + flow->fl4_sport = ports[0]; > > + flow->fl4_dport = ports[1]; > > + } > > + } > > +} > > +#endif /* CONFIG_IP_ROUTE_MULTIPATH */ > > + > > static int ip_mkroute_input(struct sk_buff *skb, > > struct fib_result *res, > > const struct flowi4 *fl4, > > @@ -1638,8 +1671,12 @@ static int ip_mkroute_input(struct sk_buff > > *skb, __be32 daddr, __be32 saddr, u32 tos) > > { > > #ifdef CONFIG_IP_ROUTE_MULTIPATH > > - if (res->fi && res->fi->fib_nhs > 1) > > - fib_select_multipath(res); > > + if (res->fi && res->fi->fib_nhs > 1) { > > + struct flowi4 mp_flow; > > + > > + ip_multipath_flow(skb, &mp_flow); > > + fib_select_multipath(res, &mp_flow); > > + } > > What is the point in populating the mp_flow if you don't know if it > is going to be used? You are populating it in ip_multipath_flow, and > then you might completely ignore it in fib_select_multipath. > > Maybe instead of passing the mp_flow you could instead look at > passing a function pointer that would alter the flow for the > multipath case instead. > Definably not a bad idea. It would remove unnecessary overhead when running per-packet multipath. 
> > #endif > > > > /* create a routing cache entry */ > > @@ -2012,7 +2049,8 @@ add: > > * Major route resolver routine. > > */ > > > > -struct rtable *__ip_route_output_key(struct net *net, struct > > flowi4 *fl4) +struct rtable *__ip_route_output_key(struct net *net, > > struct flowi4 *fl4, > > + const struct flowi4 *mp_flow) > > { > > struct net_device *dev_out = NULL; > > __u8 tos = RT_FL_TOS(fl4); > > @@ -2170,7 +2208,7 @@ struct rtable *__ip_route_output_key(struct > > net *net, struct flowi4 *fl4) > > > > #ifdef CONFIG_IP_ROUTE_MULTIPATH > > if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0) > > - fib_select_multipath(&res); > > + fib_select_multipath(&res, (mp_flow ? mp_flow : > > fl4)); else > > #endif > > if (!res.prefixlen && > > @@ -2273,7 +2311,7 @@ struct dst_entry *ipv4_blackhole_route(struct > > net *net, struct dst_entry *dst_or struct rtable > > *ip_route_output_flow(struct net *net, struct flowi4 *flp4, struct > > sock *sk) { > > - struct rtable *rt = __ip_route_output_key(net, flp4); > > + struct rtable *rt = __ip_route_output_key(net, flp4, NULL); > > > > if (IS_ERR(rt)) > > return rt; > > diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c > > index bff6974..7eae158 100644 > > --- a/net/ipv4/xfrm4_policy.c > > +++ b/net/ipv4/xfrm4_policy.c > > @@ -31,7 +31,7 @@ static struct dst_entry > > *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4, if (saddr) > > fl4->saddr = saddr->a4; > > > > - rt = __ip_route_output_key(net, fl4); > > + rt = __ip_route_output_key(net, fl4, NULL); > > if (!IS_ERR(rt)) > > return &rt->dst; > > > > ^ permalink raw reply [flat|nested] 8+ messages in thread
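To make the collision concern discussed above concrete, the following userspace toy compares the XOR fold from the patch with a simple multiply/shift avalanche mixer. The mixer and its constants are illustrative stand-ins, not jhash_3words itself; the point is only that XOR cancels every bit the two addresses share, while a mixing hash keeps those shared bits in play.

/*
 * Same-subnet endpoints: XOR of saddr and daddr zeroes the shared
 * subnet bits, so the folded 8-bit key depends only on the host bits.
 */
#include <stdint.h>
#include <stdio.h>

/* XOR-fold to 8 bits, as in the hunk under discussion. */
static uint8_t xor_hash(uint32_t saddr, uint32_t daddr)
{
        uint32_t h = saddr ^ daddr;

        h ^= h >> 16;
        h ^= h >> 8;
        return h & 0xff;
}

/* Toy avalanche mixer over the same inputs (illustrative constants). */
static uint8_t mix_hash(uint32_t saddr, uint32_t daddr)
{
        uint64_t h = saddr * 0x9e3779b97f4a7c15ULL;

        h ^= daddr * 0xc2b2ae3d27d4eb4fULL;
        h ^= h >> 29;
        h *= 0xbf58476d1ce4e5b9ULL;
        h ^= h >> 32;
        return h & 0xff;
}

int main(void)
{
        uint32_t a = 0xc000020a;        /* 192.0.2.10 */
        uint32_t b = 0xc0000214;        /* 192.0.2.20 */
        uint32_t c = 0xc0000215;        /* 192.0.2.21 */

        printf("raw xor a^b = %08x (subnet bits cancelled)\n",
               (unsigned int)(a ^ b));
        printf("xor_hash: a,b=%02x  a,c=%02x\n", xor_hash(a, b), xor_hash(a, c));
        printf("mix_hash: a,b=%02x  a,c=%02x\n", mix_hash(a, b), mix_hash(a, c));
        return 0;
}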
* [PATCH net-next 3/3] ipv4: ICMP packet inspection for multipath 2015-06-17 20:08 [PATCH net-next 0/3] ipv4: Hash-based multipath routing Peter Nørlund [not found] ` <1434571686-5149-1-git-send-email-pch-chEQUL3jiZBWk0Htik3J/w@public.gmane.org> 2015-06-17 20:08 ` [PATCH net-next 2/3] ipv4: L3 and L4 hash-based multipath routing Peter Nørlund @ 2015-06-17 20:08 ` Peter Nørlund 2 siblings, 0 replies; 8+ messages in thread From: Peter Nørlund @ 2015-06-17 20:08 UTC (permalink / raw) To: netdev Cc: David S. Miller, Alexey Kuznetsov, James Morris, Hideaki YOSHIFUJI, Patrick McHardy, linux-api, Peter Nørlund ICMP packets are inspected to let them route together with the flow they belong to, allowing anycast environments to work with ECMP. Signed-off-by: Peter Nørlund <pch@ordbogen.com> --- net/ipv4/icmp.c | 27 ++++++++++++++++++- net/ipv4/route.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 92 insertions(+), 15 deletions(-) diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 3abcfea..20f1d5e 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -447,6 +447,7 @@ static struct rtable *icmp_route_lookup(struct net *net, { struct rtable *rt, *rt2; struct flowi4 fl4_dec; + struct flowi4 mp_flow; int err; memset(fl4, 0, sizeof(*fl4)); @@ -459,7 +460,31 @@ static struct rtable *icmp_route_lookup(struct net *net, fl4->fl4_icmp_type = type; fl4->fl4_icmp_code = code; security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4)); - rt = __ip_route_output_key(net, fl4, NULL); + + /* Source and destination is swapped. See ip_multipath_flow */ + mp_flow.saddr = iph->daddr; + mp_flow.daddr = iph->saddr; + mp_flow.flowi4_proto = iph->protocol; + mp_flow.fl4_sport = 0; + mp_flow.fl4_dport = 0; + if (!ip_is_fragment(iph)) { + if (iph->protocol == IPPROTO_TCP || + iph->protocol == IPPROTO_UDP || + iph->protocol == IPPROTO_SCTP) { + __be16 _ports[2]; + const __be16 *ports; + + ports = skb_header_pointer(skb_in, iph->ihl * 4, + sizeof(_ports), + &_ports); + if (ports) { + mp_flow.fl4_sport = ports[1]; + mp_flow.fl4_dport = ports[0]; + } + } + } + + rt = __ip_route_output_key(net, fl4, &mp_flow); if (IS_ERR(rt)) return rt; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index a1ec62c..bab4318 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1635,31 +1635,83 @@ out: /* Fill flow key data based on packet for use in multipath routing. 
*/ static void ip_multipath_flow(const struct sk_buff *skb, struct flowi4 *flow) { - const struct iphdr *iph; - - iph = ip_hdr(skb); - - flow->saddr = iph->saddr; - flow->daddr = iph->daddr; - flow->flowi4_proto = iph->protocol; + struct icmphdr _icmph; + struct iphdr _inner_iph; + const struct iphdr *outer_iph; + const struct icmphdr *icmph; + const struct iphdr *inner_iph; + unsigned int offset; + __be16 _ports[2]; + const __be16 *ports; + + outer_iph = ip_hdr(skb); + + flow->saddr = outer_iph->saddr; + flow->daddr = outer_iph->daddr; + flow->flowi4_proto = outer_iph->protocol; flow->fl4_sport = 0; flow->fl4_dport = 0; - if (unlikely(ip_is_fragment(iph))) + if (unlikely(ip_is_fragment(outer_iph))) return; - if (iph->protocol == IPPROTO_TCP || - iph->protocol == IPPROTO_UDP || - iph->protocol == IPPROTO_SCTP) { - __be16 _ports; - const __be16 *ports; + offset = outer_iph->ihl * 4; - ports = skb_header_pointer(skb, iph->ihl * 4, sizeof(_ports), + if (outer_iph->protocol == IPPROTO_TCP || + outer_iph->protocol == IPPROTO_UDP || + outer_iph->protocol == IPPROTO_SCTP) { + ports = skb_header_pointer(skb, offset, sizeof(_ports), &_ports); if (ports) { flow->fl4_sport = ports[0]; flow->fl4_dport = ports[1]; } + + return; + } + + if (outer_iph->protocol != IPPROTO_ICMP) + return; + + icmph = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); + if (!icmph) + return; + + if (icmph->type != ICMP_DEST_UNREACH && + icmph->type != ICMP_SOURCE_QUENCH && + icmph->type != ICMP_REDIRECT && + icmph->type != ICMP_TIME_EXCEEDED && + icmph->type != ICMP_PARAMETERPROB) { + return; + } + + offset += sizeof(_icmph); + inner_iph = skb_header_pointer(skb, offset, sizeof(_inner_iph), + &_inner_iph); + if (inner_iph) + return; + + /* Since the ICMP payload contains a packet sent from the current + * recipient, we swap source and destination addresses and ports + */ + flow->saddr = inner_iph->daddr; + flow->daddr = inner_iph->saddr; + flow->flowi4_proto = inner_iph->protocol; + + if (unlikely(ip_is_fragment(inner_iph))) + return; + + if (inner_iph->protocol != IPPROTO_TCP && + inner_iph->protocol != IPPROTO_UDP && + inner_iph->protocol != IPPROTO_SCTP) { + return; + } + + offset += inner_iph->ihl * 4; + ports = skb_header_pointer(skb, offset, sizeof(_ports), &_ports); + if (ports) { + flow->fl4_sport = ports[1]; + flow->fl4_dport = ports[0]; } } #endif /* CONFIG_IP_ROUTE_MULTIPATH */ -- 2.1.4 ^ permalink raw reply related [flat|nested] 8+ messages in thread
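For readers tracing the new ICMP branch, here is a hedged userspace model of the same inner-header walk: check that the ICMP type carries an embedded packet, pull the inner IPv4 header, swap addresses and ports, and fall back to an L3-only key for fragments. The names flow_key and icmp_inner_flow are invented for the sketch, and explicit length checks stand in for skb_header_pointer().

/* Userspace sketch only; not the kernel implementation. */
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>

struct flow_key {
        uint32_t saddr, daddr;
        uint8_t  proto;
        uint16_t sport, dport;  /* network byte order */
};

/* buf points at the ICMP header of an error message; len is what remains. */
int icmp_inner_flow(const uint8_t *buf, size_t len, struct flow_key *fk)
{
        struct icmphdr icmph;
        struct iphdr inner;
        size_t off = sizeof(icmph);

        if (len < sizeof(icmph))
                return -1;
        memcpy(&icmph, buf, sizeof(icmph));

        if (icmph.type != ICMP_DEST_UNREACH &&
            icmph.type != ICMP_SOURCE_QUENCH &&
            icmph.type != ICMP_REDIRECT &&
            icmph.type != ICMP_TIME_EXCEEDED &&
            icmph.type != ICMP_PARAMETERPROB)
                return -1;      /* not an error carrying an inner packet */

        if (len < off + sizeof(inner))
                return -1;
        memcpy(&inner, buf + off, sizeof(inner));

        /* The payload carries a packet sent by the node the error is
         * addressed to, so swap source and destination to match that flow. */
        fk->saddr = inner.daddr;
        fk->daddr = inner.saddr;
        fk->proto = inner.protocol;
        fk->sport = 0;
        fk->dport = 0;

        if (inner.frag_off & htons(IP_MF | IP_OFFMASK))
                return 0;       /* fragment: fall back to the L3 key */

        if (inner.protocol == IPPROTO_TCP ||
            inner.protocol == IPPROTO_UDP ||
            inner.protocol == IPPROTO_SCTP) {
                uint16_t ports[2];

                off += inner.ihl * 4;
                if (len < off + sizeof(ports))
                        return 0;
                memcpy(ports, buf + off, sizeof(ports));
                fk->sport = ports[1];   /* swapped, like the addresses */
                fk->dport = ports[0];
        }
        return 0;
}

A caller would feed it the ICMP portion of the packet being routed and then hash the resulting key exactly as for a forwarded TCP or UDP packet.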
Thread overview: 8+ messages
2015-06-17 20:08 [PATCH net-next 0/3] ipv4: Hash-based multipath routing Peter Nørlund
2015-06-17 20:08 ` [PATCH net-next 1/3] ipv4: Lock-less per-packet multipath Peter Nørlund
2015-06-18 19:42   ` Alexander Duyck
2015-06-20 18:42     ` Peter Nørlund
2015-06-17 20:08 ` [PATCH net-next 2/3] ipv4: L3 and L4 hash-based multipath routing Peter Nørlund
2015-06-18 22:52   ` Alexander Duyck
2015-06-20 18:46     ` Peter Nørlund
2015-06-17 20:08 ` [PATCH net-next 3/3] ipv4: ICMP packet inspection for multipath Peter Nørlund