From mboxrd@z Thu Jan 1 00:00:00 1970 From: Alexander Duyck Subject: Re: [PATCH net-next 2/3] ipv4: L3 and L4 hash-based multipath routing Date: Thu, 18 Jun 2015 15:52:22 -0700 Message-ID: <55834BA6.7050405@redhat.com> References: <1434571686-5149-1-git-send-email-pch@ordbogen.com> <1434571686-5149-3-git-send-email-pch@ordbogen.com> Mime-Version: 1.0 Content-Type: text/plain; charset=utf-8; format=flowed Content-Transfer-Encoding: QUOTED-PRINTABLE Cc: "David S. Miller" , Alexey Kuznetsov , James Morris , Hideaki YOSHIFUJI , Patrick McHardy , linux-api@vger.kernel.org To: =?UTF-8?B?UGV0ZXIgTsO4cmx1bmQ=?= , netdev@vger.kernel.org Return-path: Received: from mx1.redhat.com ([209.132.183.28]:58203 "EHLO mx1.redhat.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1750791AbbFRWw0 (ORCPT ); Thu, 18 Jun 2015 18:52:26 -0400 In-Reply-To: <1434571686-5149-3-git-send-email-pch@ordbogen.com> Sender: netdev-owner@vger.kernel.org List-ID: On 06/17/2015 01:08 PM, Peter N=C3=B8rlund wrote: > This patch adds L3 and L4 hash-based multipath routing, selectable on= a > per-route basis with the reintroduced RTA_MP_ALGO attribute. The defa= ult is > now RT_MP_ALG_L3_HASH. 
> > Signed-off-by: Peter N=C3=B8rlund > --- > include/net/ip_fib.h | 4 ++- > include/net/route.h | 5 ++-- > include/uapi/linux/rtnetlink.h | 14 ++++++++++- > net/ipv4/fib_frontend.c | 4 +++ > net/ipv4/fib_semantics.c | 34 ++++++++++++++++++++++--- > net/ipv4/icmp.c | 4 +-- > net/ipv4/route.c | 56 +++++++++++++++++++++++++++++++= ++++------- > net/ipv4/xfrm4_policy.c | 2 +- > 8 files changed, 103 insertions(+), 20 deletions(-) > > diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h > index 4be4f25..250d98e 100644 > --- a/include/net/ip_fib.h > +++ b/include/net/ip_fib.h > @@ -37,6 +37,7 @@ struct fib_config { > u32 fc_flags; > u32 fc_priority; > __be32 fc_prefsrc; > + int fc_mp_alg; > struct nlattr *fc_mx; > struct rtnexthop *fc_mp; > int fc_mx_len; > @@ -116,6 +117,7 @@ struct fib_info { > int fib_nhs; > #ifdef CONFIG_IP_ROUTE_MULTIPATH > int fib_mp_weight; > + int fib_mp_alg; > #endif > struct rcu_head rcu; > struct fib_nh fib_nh[0]; > @@ -308,7 +310,7 @@ int ip_fib_check_default(__be32 gw, struct net_de= vice *dev); > int fib_sync_down_dev(struct net_device *dev, int force); > int fib_sync_down_addr(struct net *net, __be32 local); > int fib_sync_up(struct net_device *dev); > -void fib_select_multipath(struct fib_result *res); > +void fib_select_multipath(struct fib_result *res, const struct flowi= 4 *flow); > > /* Exported by fib_trie.c */ > void fib_trie_init(void); > diff --git a/include/net/route.h b/include/net/route.h > index fe22d03..1fc7deb 100644 > --- a/include/net/route.h > +++ b/include/net/route.h > @@ -110,7 +110,8 @@ struct in_device; > int ip_rt_init(void); > void rt_cache_flush(struct net *net); > void rt_flush_dev(struct net_device *dev); > -struct rtable *__ip_route_output_key(struct net *, struct flowi4 *fl= p); > +struct rtable *__ip_route_output_key(struct net *, struct flowi4 *fl= p, > + const struct flowi4 *mp_flow); > struct rtable *ip_route_output_flow(struct net *, struct flowi4 *fl= p, > struct sock *sk); > struct dst_entry 
*ipv4_blackhole_route(struct net *net, > @@ -267,7 +268,7 @@ static inline struct rtable *ip_route_connect(str= uct flowi4 *fl4, > sport, dport, sk); > > if (!dst || !src) { > - rt =3D __ip_route_output_key(net, fl4); > + rt =3D __ip_route_output_key(net, fl4, NULL); > if (IS_ERR(rt)) > return rt; > ip_rt_put(rt); > diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtne= tlink.h > index 17fb02f..dff4a72 100644 > --- a/include/uapi/linux/rtnetlink.h > +++ b/include/uapi/linux/rtnetlink.h > @@ -271,6 +271,18 @@ enum rt_scope_t { > #define RTM_F_EQUALIZE 0x400 /* Multipath equalizer: NI */ > #define RTM_F_PREFIX 0x800 /* Prefix addresses */ > > +/* Multipath algorithms */ > + > +enum rt_mp_alg_t { > + RT_MP_ALG_L3_HASH, /* Was IP_MP_ALG_NONE */ > + RT_MP_ALG_PER_PACKET, /* Was IP_MP_ALG_RR */ > + RT_MP_ALG_DRR, /* not used */ > + RT_MP_ALG_RANDOM, /* not used */ > + RT_MP_ALG_WRANDOM, /* not used */ > + RT_MP_ALG_L4_HASH, > + __RT_MP_ALG_MAX > +}; > + > /* Reserved table identifiers */ > > enum rt_class_t { > @@ -301,7 +313,7 @@ enum rtattr_type_t { > RTA_FLOW, > RTA_CACHEINFO, > RTA_SESSION, /* no longer used */ > - RTA_MP_ALGO, /* no longer used */ > + RTA_MP_ALGO, > RTA_TABLE, > RTA_MARK, > RTA_MFC_STATS, > diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c > index 872494e..376e8c1 100644 > --- a/net/ipv4/fib_frontend.c > +++ b/net/ipv4/fib_frontend.c > @@ -590,6 +590,7 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX += 1] =3D { > [RTA_PREFSRC] =3D { .type =3D NLA_U32 }, > [RTA_METRICS] =3D { .type =3D NLA_NESTED }, > [RTA_MULTIPATH] =3D { .len =3D sizeof(struct rtnexthop) }, > + [RTA_MP_ALGO] =3D { .type =3D NLA_U32 }, > [RTA_FLOW] =3D { .type =3D NLA_U32 }, > }; > > @@ -650,6 +651,9 @@ static int rtm_to_fib_config(struct net *net, str= uct sk_buff *skb, > cfg->fc_mp =3D nla_data(attr); > cfg->fc_mp_len =3D nla_len(attr); > break; > + case RTA_MP_ALGO: > + cfg->fc_mp_alg =3D nla_get_u32(attr); > + break; > case RTA_FLOW: > 
cfg->fc_flow =3D nla_get_u32(attr); > break; > diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c > index 8c8df80..da06e88 100644 > --- a/net/ipv4/fib_semantics.c > +++ b/net/ipv4/fib_semantics.c > @@ -257,6 +257,11 @@ static inline int nh_comp(const struct fib_info = *fi, const struct fib_info *ofi) > { > const struct fib_nh *onh =3D ofi->fib_nh; > > +#ifdef CONFIG_IP_ROUTE_MULTIPATH > + if (fi->fib_mp_alg !=3D ofi->fib_mp_alg) > + return -1; > +#endif > + > for_nexthops(fi) { > if (nh->nh_oif !=3D onh->nh_oif || > nh->nh_gw !=3D onh->nh_gw || > @@ -896,6 +901,7 @@ struct fib_info *fib_create_info(struct fib_confi= g *cfg) > > if (cfg->fc_mp) { > #ifdef CONFIG_IP_ROUTE_MULTIPATH > + fi->fib_mp_alg =3D cfg->fc_mp_alg; > err =3D fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg); > if (err !=3D 0) > goto failure; > @@ -1085,6 +1091,10 @@ int fib_dump_info(struct sk_buff *skb, u32 por= tid, u32 seq, int event, > struct rtnexthop *rtnh; > struct nlattr *mp; > > + if (fi->fib_mp_alg && > + nla_put_u32(skb, RTA_MP_ALGO, fi->fib_mp_alg)) > + goto nla_put_failure; > + > mp =3D nla_nest_start(skb, RTA_MULTIPATH); > if (!mp) > goto nla_put_failure; > @@ -1312,15 +1322,31 @@ int fib_sync_up(struct net_device *dev) > } > > /* > - * The algorithm is suboptimal, but it provides really > - * fair weighted route distribution. > + * Compute multipath hash based on 3- or 5-tuple > */ > -void fib_select_multipath(struct fib_result *res) > +static int fib_multipath_hash(const struct fib_result *res, > + const struct flowi4 *flow) > +{ > + u32 hash =3D flow->saddr ^ flow->daddr; > + > + if (res->fi->fib_mp_alg =3D=3D RT_MP_ALG_L4_HASH && flow->flowi4_pr= oto !=3D 0) > + hash ^=3D flow->flowi4_proto ^ flow->fl4_sport ^ flow->fl4_dport; > + > + hash ^=3D hash >> 16; > + hash ^=3D hash >> 8; > + return hash & 0xFF; > +} > + This hash is still far from optimal. 
Really I think you should look at something such as jhash_3words or the like for mixing up the addresses. Right now, just XORing the values together as you are doing will end up with a fairly high collision rate since, for example, in the case of two endpoints on the same subnet you would lose the subnet as a result of XORing the source and destination addresses. Also, you would lose the port data in the case of a protocol such as UDP, where the source and destination ports might be the same value. > +void fib_select_multipath(struct fib_result *res, const struct flowi= 4 *flow) > { > struct fib_info *fi =3D res->fi; > u8 w; > > - w =3D bitrev8(this_cpu_inc_return(fib_mp_rr_counter)); > + if (res->fi->fib_mp_alg =3D=3D RT_MP_ALG_PER_PACKET) { > + w =3D bitrev8(this_cpu_inc_return(fib_mp_rr_counter)); > + } else { > + w =3D fib_multipath_hash(res, flow); > + } > > for_nexthops(fi) { > if (w >=3D atomic_read(&nh->nh_mp_upper_bound)) > diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c > index f5203fb..3abcfea 100644 > --- a/net/ipv4/icmp.c > +++ b/net/ipv4/icmp.c > @@ -459,7 +459,7 @@ static struct rtable *icmp_route_lookup(struct ne= t *net, > fl4->fl4_icmp_type =3D type; > fl4->fl4_icmp_code =3D code; > security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4)); > - rt =3D __ip_route_output_key(net, fl4); > + rt =3D __ip_route_output_key(net, fl4, NULL); > if (IS_ERR(rt)) > return rt; > > @@ -481,7 +481,7 @@ static struct rtable *icmp_route_lookup(struct ne= t *net, > goto relookup_failed; > > if (inet_addr_type(net, fl4_dec.saddr) =3D=3D RTN_LOCAL) { > - rt2 =3D __ip_route_output_key(net, &fl4_dec); > + rt2 =3D __ip_route_output_key(net, &fl4_dec, NULL); > if (IS_ERR(rt2)) > err =3D PTR_ERR(rt2); > } else { > diff --git a/net/ipv4/route.c b/net/ipv4/route.c > index f605598..a1ec62c 100644 > --- a/net/ipv4/route.c > +++ b/net/ipv4/route.c > @@ -1006,7 +1006,7 @@ void ipv4_update_pmtu(struct sk_buff *skb, stru= ct net *net, u32 mtu, > > 
__build_flow_key(&fl4, NULL, iph, oif, > RT_TOS(iph->tos), protocol, mark, flow_flags); > - rt =3D __ip_route_output_key(net, &fl4); > + rt =3D __ip_route_output_key(net, &fl4, NULL); > if (!IS_ERR(rt)) { > __ip_rt_update_pmtu(rt, &fl4, mtu); > ip_rt_put(rt); > @@ -1025,7 +1025,7 @@ static void __ipv4_sk_update_pmtu(struct sk_buf= f *skb, struct sock *sk, u32 mtu) > if (!fl4.flowi4_mark) > fl4.flowi4_mark =3D IP4_REPLY_MARK(sock_net(sk), skb->mark); > > - rt =3D __ip_route_output_key(sock_net(sk), &fl4); > + rt =3D __ip_route_output_key(sock_net(sk), &fl4, NULL); > if (!IS_ERR(rt)) { > __ip_rt_update_pmtu(rt, &fl4, mtu); > ip_rt_put(rt); > @@ -1094,7 +1094,7 @@ void ipv4_redirect(struct sk_buff *skb, struct = net *net, > > __build_flow_key(&fl4, NULL, iph, oif, > RT_TOS(iph->tos), protocol, mark, flow_flags); > - rt =3D __ip_route_output_key(net, &fl4); > + rt =3D __ip_route_output_key(net, &fl4, NULL); > if (!IS_ERR(rt)) { > __ip_do_redirect(rt, skb, &fl4, false); > ip_rt_put(rt); > @@ -1109,7 +1109,7 @@ void ipv4_sk_redirect(struct sk_buff *skb, stru= ct sock *sk) > struct rtable *rt; > > __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); > - rt =3D __ip_route_output_key(sock_net(sk), &fl4); > + rt =3D __ip_route_output_key(sock_net(sk), &fl4, NULL); > if (!IS_ERR(rt)) { > __ip_do_redirect(rt, skb, &fl4, false); > ip_rt_put(rt); > @@ -1631,6 +1631,39 @@ out: > return err; > } > > +#ifdef CONFIG_IP_ROUTE_MULTIPATH > +/* Fill flow key data based on packet for use in multipath routing. = */ > +static void ip_multipath_flow(const struct sk_buff *skb, struct flow= i4 *flow) > +{ > + const struct iphdr *iph; > + > + iph =3D ip_hdr(skb); > + > + flow->saddr =3D iph->saddr; > + flow->daddr =3D iph->daddr; > + flow->flowi4_proto =3D iph->protocol; > + flow->fl4_sport =3D 0; > + flow->fl4_dport =3D 0; > + > + if (unlikely(ip_is_fragment(iph))) > + return; > + I'm not sure if checking for fragmentation is enough. 
For example, if this system is routing and receives a flow of UDP packets, some fragmented and some not, then it might end up spreading them over 2 separate next hops, since some will include L4 header data and some won't. As such, you may want to have the option to exclude UDP from the protocols listed below. > + if (iph->protocol =3D=3D IPPROTO_TCP || > + iph->protocol =3D=3D IPPROTO_UDP || > + iph->protocol =3D=3D IPPROTO_SCTP) { > + __be16 _ports; > + const __be16 *ports; > + > + ports =3D skb_header_pointer(skb, iph->ihl * 4, sizeof(_ports), > + &_ports); > + if (ports) { > + flow->fl4_sport =3D ports[0]; > + flow->fl4_dport =3D ports[1]; > + } > + } > +} > +#endif /* CONFIG_IP_ROUTE_MULTIPATH */ > + > static int ip_mkroute_input(struct sk_buff *skb, > struct fib_result *res, > const struct flowi4 *fl4, > @@ -1638,8 +1671,12 @@ static int ip_mkroute_input(struct sk_buff *sk= b, > __be32 daddr, __be32 saddr, u32 tos) > { > #ifdef CONFIG_IP_ROUTE_MULTIPATH > - if (res->fi && res->fi->fib_nhs > 1) > - fib_select_multipath(res); > + if (res->fi && res->fi->fib_nhs > 1) { > + struct flowi4 mp_flow; > + > + ip_multipath_flow(skb, &mp_flow); > + fib_select_multipath(res, &mp_flow); > + } What is the point in populating the mp_flow if you don't know if it is going to be used? You are populating it in ip_multipath_flow, and then you might completely ignore it in fib_select_multipath. Maybe instead of passing the mp_flow you could look at passing a function pointer that would alter the flow for the multipath case. > #endif > > /* create a routing cache entry */ > @@ -2012,7 +2049,8 @@ add: > * Major route resolver routine. 
> */ > > -struct rtable *__ip_route_output_key(struct net *net, struct flowi4 = *fl4) > +struct rtable *__ip_route_output_key(struct net *net, struct flowi4 = *fl4, > + const struct flowi4 *mp_flow) > { > struct net_device *dev_out =3D NULL; > __u8 tos =3D RT_FL_TOS(fl4); > @@ -2170,7 +2208,7 @@ struct rtable *__ip_route_output_key(struct net= *net, struct flowi4 *fl4) > > #ifdef CONFIG_IP_ROUTE_MULTIPATH > if (res.fi->fib_nhs > 1 && fl4->flowi4_oif =3D=3D 0) > - fib_select_multipath(&res); > + fib_select_multipath(&res, (mp_flow ? mp_flow : fl4)); > else > #endif > if (!res.prefixlen && > @@ -2273,7 +2311,7 @@ struct dst_entry *ipv4_blackhole_route(struct n= et *net, struct dst_entry *dst_or > struct rtable *ip_route_output_flow(struct net *net, struct flowi4 = *flp4, > struct sock *sk) > { > - struct rtable *rt =3D __ip_route_output_key(net, flp4); > + struct rtable *rt =3D __ip_route_output_key(net, flp4, NULL); > > if (IS_ERR(rt)) > return rt; > diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c > index bff6974..7eae158 100644 > --- a/net/ipv4/xfrm4_policy.c > +++ b/net/ipv4/xfrm4_policy.c > @@ -31,7 +31,7 @@ static struct dst_entry *__xfrm4_dst_lookup(struct = net *net, struct flowi4 *fl4, > if (saddr) > fl4->saddr =3D saddr->a4; > > - rt =3D __ip_route_output_key(net, fl4); > + rt =3D __ip_route_output_key(net, fl4, NULL); > if (!IS_ERR(rt)) > return &rt->dst; > >