Re: [PATCH net-next 2/3] ipv4: L3 and L4 hash-based multipath routing

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Alexander Duyck <alexander.h.duyck@redhat.com>
To: "Peter Nørlund" <pch@ordbogen.com>, netdev@vger.kernel.org
Cc: "David S. Miller" <davem@davemloft.net>,
	Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>,
	James Morris <jmorris@namei.org>,
	Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>,
	Patrick McHardy <kaber@trash.net>,
	linux-api@vger.kernel.org
Subject: Re: [PATCH net-next 2/3] ipv4: L3 and L4 hash-based multipath routing
Date: Thu, 18 Jun 2015 15:52:22 -0700	[thread overview]
Message-ID: <55834BA6.7050405@redhat.com> (raw)
In-Reply-To: <1434571686-5149-3-git-send-email-pch@ordbogen.com>



On 06/17/2015 01:08 PM, Peter Nørlund wrote:
> This patch adds L3 and L4 hash-based multipath routing, selectable on a
> per-route basis with the reintroduced RTA_MP_ALGO attribute. The default is
> now RT_MP_ALG_L3_HASH.
>
> Signed-off-by: Peter Nørlund <pch@ordbogen.com>
> ---
>   include/net/ip_fib.h           |  4 ++-
>   include/net/route.h            |  5 ++--
>   include/uapi/linux/rtnetlink.h | 14 ++++++++++-
>   net/ipv4/fib_frontend.c        |  4 +++
>   net/ipv4/fib_semantics.c       | 34 ++++++++++++++++++++++---
>   net/ipv4/icmp.c                |  4 +--
>   net/ipv4/route.c               | 56 +++++++++++++++++++++++++++++++++++-------
>   net/ipv4/xfrm4_policy.c        |  2 +-
>   8 files changed, 103 insertions(+), 20 deletions(-)
>
> diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
> index 4be4f25..250d98e 100644
> --- a/include/net/ip_fib.h
> +++ b/include/net/ip_fib.h
> @@ -37,6 +37,7 @@ struct fib_config {
>   	u32			fc_flags;
>   	u32			fc_priority;
>   	__be32			fc_prefsrc;
> +	int			fc_mp_alg;
>   	struct nlattr		*fc_mx;
>   	struct rtnexthop	*fc_mp;
>   	int			fc_mx_len;
> @@ -116,6 +117,7 @@ struct fib_info {
>   	int			fib_nhs;
>   #ifdef CONFIG_IP_ROUTE_MULTIPATH
>   	int			fib_mp_weight;
> +	int			fib_mp_alg;
>   #endif
>   	struct rcu_head		rcu;
>   	struct fib_nh		fib_nh[0];
> @@ -308,7 +310,7 @@ int ip_fib_check_default(__be32 gw, struct net_device *dev);
>   int fib_sync_down_dev(struct net_device *dev, int force);
>   int fib_sync_down_addr(struct net *net, __be32 local);
>   int fib_sync_up(struct net_device *dev);
> -void fib_select_multipath(struct fib_result *res);
> +void fib_select_multipath(struct fib_result *res, const struct flowi4 *flow);
>
>   /* Exported by fib_trie.c */
>   void fib_trie_init(void);
> diff --git a/include/net/route.h b/include/net/route.h
> index fe22d03..1fc7deb 100644
> --- a/include/net/route.h
> +++ b/include/net/route.h
> @@ -110,7 +110,8 @@ struct in_device;
>   int ip_rt_init(void);
>   void rt_cache_flush(struct net *net);
>   void rt_flush_dev(struct net_device *dev);
> -struct rtable *__ip_route_output_key(struct net *, struct flowi4 *flp);
> +struct rtable *__ip_route_output_key(struct net *, struct flowi4 *flp,
> +				     const struct flowi4 *mp_flow);
>   struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp,
>   				    struct sock *sk);
>   struct dst_entry *ipv4_blackhole_route(struct net *net,
> @@ -267,7 +268,7 @@ static inline struct rtable *ip_route_connect(struct flowi4 *fl4,
>   			      sport, dport, sk);
>
>   	if (!dst || !src) {
> -		rt = __ip_route_output_key(net, fl4);
> +		rt = __ip_route_output_key(net, fl4, NULL);
>   		if (IS_ERR(rt))
>   			return rt;
>   		ip_rt_put(rt);
> diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
> index 17fb02f..dff4a72 100644
> --- a/include/uapi/linux/rtnetlink.h
> +++ b/include/uapi/linux/rtnetlink.h
> @@ -271,6 +271,18 @@ enum rt_scope_t {
>   #define RTM_F_EQUALIZE		0x400	/* Multipath equalizer: NI	*/
>   #define RTM_F_PREFIX		0x800	/* Prefix addresses		*/
>
> +/* Multipath algorithms */
> +
> +enum rt_mp_alg_t {
> +	RT_MP_ALG_L3_HASH,	/* Was IP_MP_ALG_NONE */
> +	RT_MP_ALG_PER_PACKET,	/* Was IP_MP_ALG_RR */
> +	RT_MP_ALG_DRR,		/* not used */
> +	RT_MP_ALG_RANDOM,	/* not used */
> +	RT_MP_ALG_WRANDOM,	/* not used */
> +	RT_MP_ALG_L4_HASH,
> +	__RT_MP_ALG_MAX
> +};
> +
>   /* Reserved table identifiers */
>
>   enum rt_class_t {
> @@ -301,7 +313,7 @@ enum rtattr_type_t {
>   	RTA_FLOW,
>   	RTA_CACHEINFO,
>   	RTA_SESSION, /* no longer used */
> -	RTA_MP_ALGO, /* no longer used */
> +	RTA_MP_ALGO,
>   	RTA_TABLE,
>   	RTA_MARK,
>   	RTA_MFC_STATS,
> diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
> index 872494e..376e8c1 100644
> --- a/net/ipv4/fib_frontend.c
> +++ b/net/ipv4/fib_frontend.c
> @@ -590,6 +590,7 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
>   	[RTA_PREFSRC]		= { .type = NLA_U32 },
>   	[RTA_METRICS]		= { .type = NLA_NESTED },
>   	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
> +	[RTA_MP_ALGO]		= { .type = NLA_U32 },
>   	[RTA_FLOW]		= { .type = NLA_U32 },
>   };
>
> @@ -650,6 +651,9 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
>   			cfg->fc_mp = nla_data(attr);
>   			cfg->fc_mp_len = nla_len(attr);
>   			break;
> +		case RTA_MP_ALGO:
> +			cfg->fc_mp_alg = nla_get_u32(attr);
> +			break;
>   		case RTA_FLOW:
>   			cfg->fc_flow = nla_get_u32(attr);
>   			break;
> diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
> index 8c8df80..da06e88 100644
> --- a/net/ipv4/fib_semantics.c
> +++ b/net/ipv4/fib_semantics.c
> @@ -257,6 +257,11 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
>   {
>   	const struct fib_nh *onh = ofi->fib_nh;
>
> +#ifdef CONFIG_IP_ROUTE_MULTIPATH
> +	if (fi->fib_mp_alg != ofi->fib_mp_alg)
> +		return -1;
> +#endif
> +
>   	for_nexthops(fi) {
>   		if (nh->nh_oif != onh->nh_oif ||
>   		    nh->nh_gw  != onh->nh_gw ||
> @@ -896,6 +901,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
>
>   	if (cfg->fc_mp) {
>   #ifdef CONFIG_IP_ROUTE_MULTIPATH
> +		fi->fib_mp_alg = cfg->fc_mp_alg;
>   		err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
>   		if (err != 0)
>   			goto failure;
> @@ -1085,6 +1091,10 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
>   		struct rtnexthop *rtnh;
>   		struct nlattr *mp;
>
> +		if (fi->fib_mp_alg &&
> +		    nla_put_u32(skb, RTA_MP_ALGO, fi->fib_mp_alg))
> +			goto nla_put_failure;
> +
>   		mp = nla_nest_start(skb, RTA_MULTIPATH);
>   		if (!mp)
>   			goto nla_put_failure;
> @@ -1312,15 +1322,31 @@ int fib_sync_up(struct net_device *dev)
>   }
>
>   /*
> - * The algorithm is suboptimal, but it provides really
> - * fair weighted route distribution.
> + * Compute multipath hash based on 3- or 5-tuple
>    */
> -void fib_select_multipath(struct fib_result *res)
> +static int fib_multipath_hash(const struct fib_result *res,
> +			      const struct flowi4 *flow)
> +{
> +	u32 hash = flow->saddr ^ flow->daddr;
> +
> +	if (res->fi->fib_mp_alg == RT_MP_ALG_L4_HASH && flow->flowi4_proto != 0)
> +		hash ^= flow->flowi4_proto ^ flow->fl4_sport ^ flow->fl4_dport;
> +
> +	hash ^= hash >> 16;
> +	hash ^= hash >> 8;
> +	return hash & 0xFF;
> +}
> +

This hash is still far from optimal.  Really I think you should look at 
something such as jhash_3words or the like for mixing up the addresses. 
  Right now just XORing the values together like you are will end up 
with a fairly high collision rate since for example in the case of two 
endpoints on the same subnet you would lose the subnet as a result of 
XORing the source and destination addresses.  Also you would lose the 
port data in the case of a protocol using something such as UDP where 
the source and destination ports might be the same value.

> +void fib_select_multipath(struct fib_result *res, const struct flowi4 *flow)
>   {
>   	struct fib_info *fi = res->fi;
>   	u8 w;
>
> -	w = bitrev8(this_cpu_inc_return(fib_mp_rr_counter));
> +	if (res->fi->fib_mp_alg == RT_MP_ALG_PER_PACKET) {
> +		w = bitrev8(this_cpu_inc_return(fib_mp_rr_counter));
> +	} else {
> +		w = fib_multipath_hash(res, flow);
> +	}
>
>   	for_nexthops(fi) {
>   		if (w >= atomic_read(&nh->nh_mp_upper_bound))
> diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
> index f5203fb..3abcfea 100644
> --- a/net/ipv4/icmp.c
> +++ b/net/ipv4/icmp.c
> @@ -459,7 +459,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
>   	fl4->fl4_icmp_type = type;
>   	fl4->fl4_icmp_code = code;
>   	security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4));
> -	rt = __ip_route_output_key(net, fl4);
> +	rt = __ip_route_output_key(net, fl4, NULL);
>   	if (IS_ERR(rt))
>   		return rt;
>
> @@ -481,7 +481,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
>   		goto relookup_failed;
>
>   	if (inet_addr_type(net, fl4_dec.saddr) == RTN_LOCAL) {
> -		rt2 = __ip_route_output_key(net, &fl4_dec);
> +		rt2 = __ip_route_output_key(net, &fl4_dec, NULL);
>   		if (IS_ERR(rt2))
>   			err = PTR_ERR(rt2);
>   	} else {
> diff --git a/net/ipv4/route.c b/net/ipv4/route.c
> index f605598..a1ec62c 100644
> --- a/net/ipv4/route.c
> +++ b/net/ipv4/route.c
> @@ -1006,7 +1006,7 @@ void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
>
>   	__build_flow_key(&fl4, NULL, iph, oif,
>   			 RT_TOS(iph->tos), protocol, mark, flow_flags);
> -	rt = __ip_route_output_key(net, &fl4);
> +	rt = __ip_route_output_key(net, &fl4, NULL);
>   	if (!IS_ERR(rt)) {
>   		__ip_rt_update_pmtu(rt, &fl4, mtu);
>   		ip_rt_put(rt);
> @@ -1025,7 +1025,7 @@ static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
>   	if (!fl4.flowi4_mark)
>   		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
>
> -	rt = __ip_route_output_key(sock_net(sk), &fl4);
> +	rt = __ip_route_output_key(sock_net(sk), &fl4, NULL);
>   	if (!IS_ERR(rt)) {
>   		__ip_rt_update_pmtu(rt, &fl4, mtu);
>   		ip_rt_put(rt);
> @@ -1094,7 +1094,7 @@ void ipv4_redirect(struct sk_buff *skb, struct net *net,
>
>   	__build_flow_key(&fl4, NULL, iph, oif,
>   			 RT_TOS(iph->tos), protocol, mark, flow_flags);
> -	rt = __ip_route_output_key(net, &fl4);
> +	rt = __ip_route_output_key(net, &fl4, NULL);
>   	if (!IS_ERR(rt)) {
>   		__ip_do_redirect(rt, skb, &fl4, false);
>   		ip_rt_put(rt);
> @@ -1109,7 +1109,7 @@ void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
>   	struct rtable *rt;
>
>   	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
> -	rt = __ip_route_output_key(sock_net(sk), &fl4);
> +	rt = __ip_route_output_key(sock_net(sk), &fl4, NULL);
>   	if (!IS_ERR(rt)) {
>   		__ip_do_redirect(rt, skb, &fl4, false);
>   		ip_rt_put(rt);
> @@ -1631,6 +1631,39 @@ out:
>   	return err;
>   }
>
> +#ifdef CONFIG_IP_ROUTE_MULTIPATH
> +/* Fill flow key data based on packet for use in multipath routing. */
> +static void ip_multipath_flow(const struct sk_buff *skb, struct flowi4 *flow)
> +{
> +	const struct iphdr *iph;
> +
> +	iph = ip_hdr(skb);
> +
> +	flow->saddr = iph->saddr;
> +	flow->daddr = iph->daddr;
> +	flow->flowi4_proto = iph->protocol;
> +	flow->fl4_sport = 0;
> +	flow->fl4_dport = 0;
> +
> +	if (unlikely(ip_is_fragment(iph)))
> +		return;
> +

I'm not sure if checking for fragmentation is enough.  For example if 
this system is routing and received a flow of UDP packets, some 
fragmented some not then it might end up mixing them over 2 separate 
next hops since some will include L4 header data and some won't.

As such you may want to have the option to exclude UDP from the 
protocols listed below.

> +	if (iph->protocol == IPPROTO_TCP ||
> +	    iph->protocol == IPPROTO_UDP ||
> +	    iph->protocol == IPPROTO_SCTP) {
> +		__be16 _ports;
> +		const __be16 *ports;
> +
> +		ports = skb_header_pointer(skb, iph->ihl * 4, sizeof(_ports),
> +					   &_ports);
> +		if (ports) {
> +			flow->fl4_sport = ports[0];
> +			flow->fl4_dport = ports[1];
> +		}
> +	}
> +}
> +#endif /* CONFIG_IP_ROUTE_MULTIPATH */
> +
>   static int ip_mkroute_input(struct sk_buff *skb,
>   			    struct fib_result *res,
>   			    const struct flowi4 *fl4,
> @@ -1638,8 +1671,12 @@ static int ip_mkroute_input(struct sk_buff *skb,
>   			    __be32 daddr, __be32 saddr, u32 tos)
>   {
>   #ifdef CONFIG_IP_ROUTE_MULTIPATH
> -	if (res->fi && res->fi->fib_nhs > 1)
> -		fib_select_multipath(res);
> +	if (res->fi && res->fi->fib_nhs > 1) {
> +		struct flowi4 mp_flow;
> +
> +		ip_multipath_flow(skb, &mp_flow);
> +		fib_select_multipath(res, &mp_flow);
> +	}

What is the point in populating the mp_flow if you don't know if it is 
going to be used?  You are populating it in ip_multipath_flow, and then 
you might completely ignore it in fib_select_multipath.

Maybe instead of passing the mp_flow you could instead look at passing a 
function pointer that would alter the flow for the multipath case instead.

>   #endif
>
>   	/* create a routing cache entry */
> @@ -2012,7 +2049,8 @@ add:
>    * Major route resolver routine.
>    */
>
> -struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
> +struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4,
> +				     const struct flowi4 *mp_flow)
>   {
>   	struct net_device *dev_out = NULL;
>   	__u8 tos = RT_FL_TOS(fl4);
> @@ -2170,7 +2208,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
>
>   #ifdef CONFIG_IP_ROUTE_MULTIPATH
>   	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
> -		fib_select_multipath(&res);
> +		fib_select_multipath(&res, (mp_flow ? mp_flow : fl4));
>   	else
>   #endif
>   	if (!res.prefixlen &&
> @@ -2273,7 +2311,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
>   struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
>   				    struct sock *sk)
>   {
> -	struct rtable *rt = __ip_route_output_key(net, flp4);
> +	struct rtable *rt = __ip_route_output_key(net, flp4, NULL);
>
>   	if (IS_ERR(rt))
>   		return rt;
> diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
> index bff6974..7eae158 100644
> --- a/net/ipv4/xfrm4_policy.c
> +++ b/net/ipv4/xfrm4_policy.c
> @@ -31,7 +31,7 @@ static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4,
>   	if (saddr)
>   		fl4->saddr = saddr->a4;
>
> -	rt = __ip_route_output_key(net, fl4);
> +	rt = __ip_route_output_key(net, fl4, NULL);
>   	if (!IS_ERR(rt))
>   		return &rt->dst;
>
>

next prev parent reply	other threads:[~2015-06-18 22:52 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-06-17 20:08 [PATCH net-next 0/3] ipv4: Hash-based multipath routing Peter Nørlund
     [not found] ` <1434571686-5149-1-git-send-email-pch-chEQUL3jiZBWk0Htik3J/w@public.gmane.org>
2015-06-17 20:08   ` [PATCH net-next 1/3] ipv4: Lock-less per-packet multipath Peter Nørlund
     [not found]     ` <1434571686-5149-2-git-send-email-pch-chEQUL3jiZBWk0Htik3J/w@public.gmane.org>
2015-06-18 19:42       ` Alexander Duyck
     [not found]         ` <55831F0D.3040703-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2015-06-20 18:42           ` Peter Nørlund
2015-06-17 20:08 ` [PATCH net-next 2/3] ipv4: L3 and L4 hash-based multipath routing Peter Nørlund
2015-06-18 22:52   ` Alexander Duyck [this message]
2015-06-20 18:46     ` Peter Nørlund
2015-06-17 20:08 ` [PATCH net-next 3/3] ipv4: ICMP packet inspection for multipath Peter Nørlund

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=55834BA6.7050405@redhat.com \
    --to=alexander.h.duyck@redhat.com \
    --cc=davem@davemloft.net \
    --cc=jmorris@namei.org \
    --cc=kaber@trash.net \
    --cc=kuznet@ms2.inr.ac.ru \
    --cc=linux-api@vger.kernel.org \
    --cc=netdev@vger.kernel.org \
    --cc=pch@ordbogen.com \
    --cc=yoshfuji@linux-ipv6.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.