Re: [PATCH net-next 1/3] ipv4: Lock-less per-packet multipath

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Alexander Duyck <alexander.h.duyck-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
To: "Peter Nørlund" <pch-chEQUL3jiZBWk0Htik3J/w@public.gmane.org>,
	netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Cc: "David S. Miller" <davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>,
	Alexey Kuznetsov <kuznet-v/Mj1YrvjDBInbfyfbPRSQ@public.gmane.org>,
	James Morris <jmorris-gx6/JNMH7DfYtjvyW6yDsg@public.gmane.org>,
	Hideaki YOSHIFUJI
	<yoshfuji-VfPWfsRibaP+Ru+s062T9g@public.gmane.org>,
	Patrick McHardy <kaber-dcUjhNyLwpNeoWH0uzbU5w@public.gmane.org>,
	linux-api-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Subject: Re: [PATCH net-next 1/3] ipv4: Lock-less per-packet multipath
Date: Thu, 18 Jun 2015 12:42:05 -0700	[thread overview]
Message-ID: <55831F0D.3040703@redhat.com> (raw)
In-Reply-To: <1434571686-5149-2-git-send-email-pch-chEQUL3jiZBWk0Htik3J/w@public.gmane.org>

On 06/17/2015 01:08 PM, Peter Nørlund wrote:
> The current multipath attempted to be quasi random, but in most cases it
> behaved just like a round robin balancing. This patch refactors the
> algorithm to be exactly that and in doing so, avoids the spin lock.
>
> The new design paves the way for hash-based multipath, replacing the
> modulo with thresholds, minimizing disruption in case of failing paths or
> route replacements.
>
> Signed-off-by: Peter Nørlund <pch-chEQUL3jiZBWk0Htik3J/w@public.gmane.org>
> ---
>   include/net/ip_fib.h     |   6 +--
>   net/ipv4/Kconfig         |   1 +
>   net/ipv4/fib_semantics.c | 116 ++++++++++++++++++++++++++---------------------
>   3 files changed, 68 insertions(+), 55 deletions(-)
>
> diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
> index 54271ed..4be4f25 100644
> --- a/include/net/ip_fib.h
> +++ b/include/net/ip_fib.h
> @@ -76,8 +76,8 @@ struct fib_nh {
>   	unsigned int		nh_flags;
>   	unsigned char		nh_scope;
>   #ifdef CONFIG_IP_ROUTE_MULTIPATH
> -	int			nh_weight;
> -	int			nh_power;
> +	int			nh_mp_weight;
> +	atomic_t		nh_mp_upper_bound;
>   #endif
>   #ifdef CONFIG_IP_ROUTE_CLASSID
>   	__u32			nh_tclassid;
> @@ -115,7 +115,7 @@ struct fib_info {
>   #define fib_advmss fib_metrics[RTAX_ADVMSS-1]
>   	int			fib_nhs;
>   #ifdef CONFIG_IP_ROUTE_MULTIPATH
> -	int			fib_power;
> +	int			fib_mp_weight;
>   #endif
>   	struct rcu_head		rcu;
>   	struct fib_nh		fib_nh[0];

I could do without some of this renaming.  For example you could 
probably not bother with adding the _mp piece to the name.  That way we 
don't have to track all the nh_weight -> nh_mp_weight changes.   Also 
you could probably just use the name fib_weight since not including the 
_mp was already the convention for the multipath portions of the 
structure anyway.

This isn't really improving readability at all so I would say don't 
bother renaming it.

> diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
> index d83071d..cb91f67 100644
> --- a/net/ipv4/Kconfig
> +++ b/net/ipv4/Kconfig
> @@ -81,6 +81,7 @@ config IP_MULTIPLE_TABLES
>   config IP_ROUTE_MULTIPATH
>   	bool "IP: equal cost multipath"
>   	depends on IP_ADVANCED_ROUTER
> +	select BITREVERSE
>   	help
>   	  Normally, the routing tables specify a single action to be taken in
>   	  a deterministic manner for a given packet. If you say Y here
> diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
> index 28ec3c1..8c8df80 100644
> --- a/net/ipv4/fib_semantics.c
> +++ b/net/ipv4/fib_semantics.c
> @@ -15,6 +15,7 @@
>
>   #include <asm/uaccess.h>
>   #include <linux/bitops.h>
> +#include <linux/bitrev.h>
>   #include <linux/types.h>
>   #include <linux/kernel.h>
>   #include <linux/jiffies.h>
> @@ -57,7 +58,7 @@ static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
>
>   #ifdef CONFIG_IP_ROUTE_MULTIPATH
>
> -static DEFINE_SPINLOCK(fib_multipath_lock);
> +static DEFINE_PER_CPU(u8, fib_mp_rr_counter);
>
>   #define for_nexthops(fi) {						\
>   	int nhsel; const struct fib_nh *nh;				\
> @@ -261,7 +262,7 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
>   		    nh->nh_gw  != onh->nh_gw ||
>   		    nh->nh_scope != onh->nh_scope ||
>   #ifdef CONFIG_IP_ROUTE_MULTIPATH
> -		    nh->nh_weight != onh->nh_weight ||
> +		    nh->nh_mp_weight != onh->nh_mp_weight ||
>   #endif
>   #ifdef CONFIG_IP_ROUTE_CLASSID
>   		    nh->nh_tclassid != onh->nh_tclassid ||
> @@ -449,6 +450,43 @@ static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
>   	return remaining > 0 ? 0 : nhs;
>   }
>

This is a good example.  If we don't do the rename we don't have to 
review changes like the one above which just add extra overhead to the 
patch.

> +static void fib_rebalance(struct fib_info *fi)
> +{
> +	int factor;
> +	int total;
> +	int w;
> +
> +	if (fi->fib_nhs < 2)
> +		return;
> +
> +	total = 0;
> +	for_nexthops(fi) {
> +		if (!(nh->nh_flags & RTNH_F_DEAD))
> +			total += nh->nh_mp_weight;
> +	} endfor_nexthops(fi);
> +
> +	if (likely(total != 0)) {
> +		factor = DIV_ROUND_UP(total, 8388608);
> +		total /= factor;
> +	} else {
> +		factor = 1;
> +	}
> +

So where does the 8388608 value come from?  Is it just here to help 
restrict the upper_bound to a u8 value?

> +	w = 0;
> +	change_nexthops(fi) {
> +		int upper_bound;
> +
> +		if (nexthop_nh->nh_flags & RTNH_F_DEAD) {
> +			upper_bound = -1;
> +		} else {
> +			w += nexthop_nh->nh_mp_weight / factor;
> +			upper_bound = DIV_ROUND_CLOSEST(256 * w, total);
> +		}

This is doing some confusing stuff.  I assume the whole point is to get 
the value to convert the upper_bound into a u8 value based on the weight 
where you end with 256 as the last value, do I have that right?

I'm not really sure that is the best way to go from a code standpoint 
since fib_nhs is an int which technically means we could support more 
than 256 next hops for a given route.

Also I assume that it is safe to divide by total because the assumption 
is that if one or more routes are not dead then total has a non-zero 
value, do I have that right?  If so what is the point of setting the 
factor to 1 above, unless you are just hacking around a divide by 0 error.

Maybe what you should do above is simply return if total == 0 since it 
doesn't look like you can safely update anything otherwise.

> +
> +		atomic_set(&nexthop_nh->nh_mp_upper_bound, upper_bound);
> +	} endfor_nexthops(fi);
> +}
> +
>   static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
>   		       int remaining, struct fib_config *cfg)
>   {
> @@ -461,7 +499,7 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
>   		nexthop_nh->nh_flags =
>   			(cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
>   		nexthop_nh->nh_oif = rtnh->rtnh_ifindex;
> -		nexthop_nh->nh_weight = rtnh->rtnh_hops + 1;
> +		nexthop_nh->nh_mp_weight = rtnh->rtnh_hops + 1;
>
>   		attrlen = rtnh_attrlen(rtnh);
>   		if (attrlen > 0) {
> @@ -884,7 +922,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
>   			fi->fib_net->ipv4.fib_num_tclassid_users++;
>   #endif
>   #ifdef CONFIG_IP_ROUTE_MULTIPATH
> -		nh->nh_weight = 1;
> +		nh->nh_mp_weight = 1;
>   #endif
>   	}
>

More unnecessary rename noise.

> @@ -936,8 +974,15 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
>
>   	change_nexthops(fi) {
>   		fib_info_update_nh_saddr(net, nexthop_nh);
> +#ifdef CONFIG_IP_ROUTE_MULTIPATH
> +		fi->fib_mp_weight += nexthop_nh->nh_mp_weight;
> +#endif
>   	} endfor_nexthops(fi)
>
> +#ifdef CONFIG_IP_ROUTE_MULTIPATH
> +	fib_rebalance(fi);
> +#endif
> +
>   link_it:
>   	ofi = fib_find_info(fi);
>   	if (ofi) {
> @@ -1050,7 +1095,7 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
>   				goto nla_put_failure;
>
>   			rtnh->rtnh_flags = nh->nh_flags & 0xFF;
> -			rtnh->rtnh_hops = nh->nh_weight - 1;
> +			rtnh->rtnh_hops = nh->nh_mp_weight - 1;
>   			rtnh->rtnh_ifindex = nh->nh_oif;
>
>   			if (nh->nh_gw &&
> @@ -1130,12 +1175,6 @@ int fib_sync_down_dev(struct net_device *dev, int force)
>   			else if (nexthop_nh->nh_dev == dev &&
>   				 nexthop_nh->nh_scope != scope) {
>   				nexthop_nh->nh_flags |= RTNH_F_DEAD;
> -#ifdef CONFIG_IP_ROUTE_MULTIPATH
> -				spin_lock_bh(&fib_multipath_lock);
> -				fi->fib_power -= nexthop_nh->nh_power;
> -				nexthop_nh->nh_power = 0;
> -				spin_unlock_bh(&fib_multipath_lock);
> -#endif
>   				dead++;
>   			}
>   #ifdef CONFIG_IP_ROUTE_MULTIPATH
> @@ -1149,6 +1188,10 @@ int fib_sync_down_dev(struct net_device *dev, int force)
>   			fi->fib_flags |= RTNH_F_DEAD;
>   			ret++;
>   		}
> +
> +#ifdef CONFIG_IP_ROUTE_MULTIPATH
> +		fib_rebalance(fi);
> +#endif
>   	}
>
>   	return ret;
> @@ -1254,16 +1297,15 @@ int fib_sync_up(struct net_device *dev)
>   			    !__in_dev_get_rtnl(dev))
>   				continue;
>   			alive++;
> -			spin_lock_bh(&fib_multipath_lock);
> -			nexthop_nh->nh_power = 0;
>   			nexthop_nh->nh_flags &= ~RTNH_F_DEAD;
> -			spin_unlock_bh(&fib_multipath_lock);
>   		} endfor_nexthops(fi)
>
>   		if (alive > 0) {
>   			fi->fib_flags &= ~RTNH_F_DEAD;
>   			ret++;
>   		}
> +
> +		fib_rebalance(fi);
>   	}
>
>   	return ret;
> @@ -1276,49 +1318,19 @@ int fib_sync_up(struct net_device *dev)
>   void fib_select_multipath(struct fib_result *res)
>   {
>   	struct fib_info *fi = res->fi;
> -	int w;
> -
> -	spin_lock_bh(&fib_multipath_lock);
> -	if (fi->fib_power <= 0) {
> -		int power = 0;
> -		change_nexthops(fi) {
> -			if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) {
> -				power += nexthop_nh->nh_weight;
> -				nexthop_nh->nh_power = nexthop_nh->nh_weight;
> -			}
> -		} endfor_nexthops(fi);
> -		fi->fib_power = power;
> -		if (power <= 0) {
> -			spin_unlock_bh(&fib_multipath_lock);
> -			/* Race condition: route has just become dead. */
> -			res->nh_sel = 0;
> -			return;
> -		}
> -	}
> +	u8 w;
>
> +	w = bitrev8(this_cpu_inc_return(fib_mp_rr_counter));
>
> -	/* w should be random number [0..fi->fib_power-1],
> -	 * it is pretty bad approximation.
> -	 */
> -
> -	w = jiffies % fi->fib_power;
> +	for_nexthops(fi) {
> +		if (w >= atomic_read(&nh->nh_mp_upper_bound))
> +			continue;
>
> -	change_nexthops(fi) {
> -		if (!(nexthop_nh->nh_flags & RTNH_F_DEAD) &&
> -		    nexthop_nh->nh_power) {
> -			w -= nexthop_nh->nh_power;
> -			if (w <= 0) {
> -				nexthop_nh->nh_power--;
> -				fi->fib_power--;
> -				res->nh_sel = nhsel;
> -				spin_unlock_bh(&fib_multipath_lock);
> -				return;
> -			}
> -		}
> +		res->nh_sel = nhsel;
> +		return;
>   	} endfor_nexthops(fi);
>
> -	/* Race condition: route has just become dead. */
> +	/* Race condition: route has just become dead */
>   	res->nh_sel = 0;
> -	spin_unlock_bh(&fib_multipath_lock);
>   }
>   #endif
>

Instead of doing bitrev8 why not just hash jiffies since that is 
essentially what you are doing with this counter anyway.  You could 
probably even look at converting from a u8 to something like a u32 which 
would give you a more pseudo random distribution.

next prev parent reply	other threads:[~2015-06-18 19:42 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-06-17 20:08 [PATCH net-next 0/3] ipv4: Hash-based multipath routing Peter Nørlund
     [not found] ` <1434571686-5149-1-git-send-email-pch-chEQUL3jiZBWk0Htik3J/w@public.gmane.org>
2015-06-17 20:08   ` [PATCH net-next 1/3] ipv4: Lock-less per-packet multipath Peter Nørlund
     [not found]     ` <1434571686-5149-2-git-send-email-pch-chEQUL3jiZBWk0Htik3J/w@public.gmane.org>
2015-06-18 19:42       ` Alexander Duyck [this message]
     [not found]         ` <55831F0D.3040703-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2015-06-20 18:42           ` Peter Nørlund
2015-06-17 20:08 ` [PATCH net-next 2/3] ipv4: L3 and L4 hash-based multipath routing Peter Nørlund
2015-06-18 22:52   ` Alexander Duyck
2015-06-20 18:46     ` Peter Nørlund
2015-06-17 20:08 ` [PATCH net-next 3/3] ipv4: ICMP packet inspection for multipath Peter Nørlund

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=55831F0D.3040703@redhat.com \
    --to=alexander.h.duyck-h+wxahxf7alqt0dzr+alfa@public.gmane.org \
    --cc=davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org \
    --cc=jmorris-gx6/JNMH7DfYtjvyW6yDsg@public.gmane.org \
    --cc=kaber-dcUjhNyLwpNeoWH0uzbU5w@public.gmane.org \
    --cc=kuznet-v/Mj1YrvjDBInbfyfbPRSQ@public.gmane.org \
    --cc=linux-api-u79uwXL29TY76Z2rM5mHXA@public.gmane.org \
    --cc=netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org \
    --cc=pch-chEQUL3jiZBWk0Htik3J/w@public.gmane.org \
    --cc=yoshfuji-VfPWfsRibaP+Ru+s062T9g@public.gmane.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.