* [PATCH 1/1] net/ipv4: Enable flow-based ECMP
@ 2015-08-04 1:28 Richard Laing
2015-08-04 5:31 ` Stephen Hemminger
2015-08-04 7:37 ` Oleg A Arkhangelsky
0 siblings, 2 replies; 6+ messages in thread
From: Richard Laing @ 2015-08-04 1:28 UTC (permalink / raw)
To: netdev; +Cc: jmorris, Richard Laing
Enable flow-based ECMP.
Currently, if equal-cost multipath is enabled, the kernel chooses between
equal-cost paths for each matching packet; essentially, packets are
round-robined between the routes. This means that packets from a single
flow can traverse different routes. If one of the routes experiences
congestion this can result in delayed or out of order packets arriving
at the destination.
This patch allows packets to be routed based on their flow - packets
in the same flow will always use the same route. This prevents out of
order packets. There are other issues with round-robin based ECMP routing
related to variable path MTU handling and debugging. The default
behaviour is changed by this patch to enable flow based ECMP routing
rather than the previous round-robin routing. The behaviour can be
changed using a new sysctl option /net/ipv4/route/flow_based_ecmp.
See RFC2991 for more details on the problems associated with packet
based ECMP routing.
This patch relies on the skb hash value to select between routes. The
selection uses a hash-threshold algorithm (see RFC2992).
Signed-off-by: Richard Laing <richard.laing@alliedtelesis.co.nz>
---
include/net/flow.h | 8 ++++++++
include/net/ip_fib.h | 4 ++++
include/net/route.h | 2 ++
net/ipv4/fib_semantics.c | 30 ++++++++++++++++++++++++++++++
net/ipv4/route.c | 19 +++++++++++++++----
5 files changed, 59 insertions(+), 4 deletions(-)
diff --git a/include/net/flow.h b/include/net/flow.h
index 8109a15..b0a2524 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -79,6 +79,8 @@ struct flowi4 {
#define fl4_ipsec_spi uli.spi
#define fl4_mh_type uli.mht.type
#define fl4_gre_key uli.gre_key
+
+ __u32 flowi4_hash;
} __attribute__((__aligned__(BITS_PER_LONG/8)));
static inline void flowi4_init_output(struct flowi4 *fl4, int oif,
@@ -99,6 +101,7 @@ static inline void flowi4_init_output(struct flowi4 *fl4, int oif,
fl4->saddr = saddr;
fl4->fl4_dport = dport;
fl4->fl4_sport = sport;
+ fl4->flowi4_hash = 0;
}
/* Reset some input parameters after previous lookup */
@@ -182,6 +185,11 @@ static inline struct flowi *flowidn_to_flowi(struct flowidn *fldn)
return container_of(fldn, struct flowi, u.dn);
}
+static inline void flowi4_set_flow_hash(struct flowi4 *fl, __u32 hash)
+{
+ fl->flowi4_hash = hash;
+}
+
typedef unsigned long flow_compare_t;
static inline size_t flow_key_size(u16 family)
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 5fa643b..7db9f72 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -117,6 +117,8 @@ struct fib_info {
#ifdef CONFIG_IP_ROUTE_MULTIPATH
int fib_power;
#endif
+ /* Cache the number of live nexthops for flow based ECMP calculation. */
+ int live_nexthops;
struct rcu_head rcu;
struct fib_nh fib_nh[0];
#define fib_dev fib_nh[0].nh_dev
@@ -310,6 +312,8 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event);
int fib_sync_down_addr(struct net *net, __be32 local);
int fib_sync_up(struct net_device *dev, unsigned int nh_flags);
void fib_select_multipath(struct fib_result *res);
+void fib_select_multipath_for_flow(struct fib_result *res,
+ const struct flowi4 *fl4);
/* Exported by fib_trie.c */
void fib_trie_init(void);
diff --git a/include/net/route.h b/include/net/route.h
index fe22d03..a00e606 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -252,6 +252,8 @@ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, __be32
flowi4_init_output(fl4, oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE,
protocol, flow_flags, dst, src, dport, sport);
+
+ flowi4_set_flow_hash(fl4, sk->sk_txhash);
}
static inline struct rtable *ip_route_connect(struct flowi4 *fl4,
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 3a06586..0a56ad3 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -981,6 +981,7 @@ link_it:
head = &fib_info_devhash[hash];
hlist_add_head(&nexthop_nh->nh_hash, head);
} endfor_nexthops(fi)
+ fi->live_nexthops = fi->fib_nhs;
spin_unlock_bh(&fib_info_lock);
return fi;
@@ -1196,6 +1197,7 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event)
}
ret++;
}
+ fi->live_nexthops = fi->fib_nhs - dead;
}
return ret;
@@ -1331,6 +1333,7 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags)
if (alive > 0) {
fi->fib_flags &= ~nh_flags;
ret++;
+ fi->live_nexthops = alive;
}
}
@@ -1397,4 +1400,31 @@ void fib_select_multipath(struct fib_result *res)
res->nh_sel = 0;
spin_unlock_bh(&fib_multipath_lock);
}
+
+void fib_select_multipath_for_flow(struct fib_result *res,
+ const struct flowi4 *fl4)
+{
+ struct fib_info *fi = res->fi;
+ int multipath_entry;
+ int region_size;
+
+ if (fl4->flowi4_hash) {
+ /* Hash-threshold algorithm, see RFC2992. */
+ region_size = U32_MAX / fi->live_nexthops;
+ multipath_entry = fl4->flowi4_hash / region_size;
+
+ spin_lock_bh(&fib_multipath_lock);
+ for_nexthops(fi) {
+ if (!(nh->nh_flags & RTNH_F_DEAD)) {
+ res->nh_sel = nhsel;
+ if (multipath_entry == 0)
+ break;
+ multipath_entry--;
+ }
+ } endfor_nexthops(fi);
+ spin_unlock_bh(&fib_multipath_lock);
+ } else {
+ fib_select_multipath(res);
+ }
+}
#endif
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index e681b85..a9ac9ff 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -124,6 +124,7 @@ static int ip_rt_error_burst __read_mostly = 5 * HZ;
static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly = 256;
+static int ip_rt_flow_based_ecmp __read_mostly = 1;
/*
* Interface to generic destination cache.
@@ -1633,13 +1634,20 @@ out:
static int ip_mkroute_input(struct sk_buff *skb,
struct fib_result *res,
- const struct flowi4 *fl4,
+ struct flowi4 *fl4,
struct in_device *in_dev,
__be32 daddr, __be32 saddr, u32 tos)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
- if (res->fi && res->fi->fib_nhs > 1)
- fib_select_multipath(res);
+ if (res->fi && res->fi->fib_nhs > 1) {
+ if (ip_rt_flow_based_ecmp) {
+ if (skb)
+ flowi4_set_flow_hash(fl4, skb_get_hash(skb));
+ fib_select_multipath_for_flow(res, fl4);
+ } else {
+ fib_select_multipath(res);
+ }
+ }
#endif
/* create a routing cache entry */
@@ -2170,7 +2178,10 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
#ifdef CONFIG_IP_ROUTE_MULTIPATH
if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
- fib_select_multipath(&res);
+ if (ip_rt_flow_based_ecmp)
+ fib_select_multipath_for_flow(&res, fl4);
+ else
+ fib_select_multipath(&res);
else
#endif
if (!res.prefixlen &&
^ permalink raw reply related [flat|nested] 6+ messages in thread
* Re: [PATCH 1/1] net/ipv4: Enable flow-based ECMP
2015-08-04 1:28 [PATCH 1/1] net/ipv4: Enable flow-based ECMP Richard Laing
@ 2015-08-04 5:31 ` Stephen Hemminger
2015-08-04 22:07 ` Richard Laing
2015-08-04 7:37 ` Oleg A Arkhangelsky
1 sibling, 1 reply; 6+ messages in thread
From: Stephen Hemminger @ 2015-08-04 5:31 UTC (permalink / raw)
To: Richard Laing; +Cc: netdev, jmorris
On Tue, 4 Aug 2015 13:28:47 +1200
Richard Laing <richard.laing@alliedtelesis.co.nz> wrote:
> diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
> index 5fa643b..7db9f72 100644
> --- a/include/net/ip_fib.h
> +++ b/include/net/ip_fib.h
> @@ -117,6 +117,8 @@ struct fib_info {
> #ifdef CONFIG_IP_ROUTE_MULTIPATH
> int fib_power;
> #endif
> + /* Cache the number of live nexthops for flow based ECMP calculation. */
> + int live_nexthops;
unsigned or u16 ? rather than risking sign issues.
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH 1/1] net/ipv4: Enable flow-based ECMP
2015-08-04 1:28 [PATCH 1/1] net/ipv4: Enable flow-based ECMP Richard Laing
2015-08-04 5:31 ` Stephen Hemminger
@ 2015-08-04 7:37 ` Oleg A Arkhangelsky
2015-08-04 22:10 ` Richard Laing
2015-08-04 22:25 ` Tom Herbert
1 sibling, 2 replies; 6+ messages in thread
From: Oleg A Arkhangelsky @ 2015-08-04 7:37 UTC (permalink / raw)
To: Richard Laing, netdev@vger.kernel.org, pch; +Cc: jmorris@namei.org
04.08.2015, 04:29, "Richard Laing" <richard.laing@alliedtelesis.co.nz>:
> Enable flow-based ECMP.
Looks like your approach is only restricted to the case when sockets are
involved. What about forwarded traffic (IP routing case)?
Have you seen similar work done by Peter Nørlund?
http://comments.gmane.org/gmane.linux.kernel.api/12201
--
wbr, Oleg.
"Anarchy is about taking complete responsibility for yourself."
Alan Moore.
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH 1/1] net/ipv4: Enable flow-based ECMP
2015-08-04 5:31 ` Stephen Hemminger
@ 2015-08-04 22:07 ` Richard Laing
0 siblings, 0 replies; 6+ messages in thread
From: Richard Laing @ 2015-08-04 22:07 UTC (permalink / raw)
To: netdev@vger.kernel.org
Hi Stephen,
Given that fib_nhs is currently an int, I would rather keep live_nexthops
also as an int to match. Probably fib_nhs could at least be made
unsigned, or changed to u16 or perhaps even u8.
Best Regards,
Richard
On 08/04/2015 05:31 PM, Stephen Hemminger wrote:
> On Tue, 4 Aug 2015 13:28:47 +1200
> Richard Laing <richard.laing@alliedtelesis.co.nz> wrote:
>
>> diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
>> index 5fa643b..7db9f72 100644
>> --- a/include/net/ip_fib.h
>> +++ b/include/net/ip_fib.h
>> @@ -117,6 +117,8 @@ struct fib_info {
>> #ifdef CONFIG_IP_ROUTE_MULTIPATH
>> int fib_power;
>> #endif
>> + /* Cache the number of live nexthops for flow based ECMP calculation. */
>> + int live_nexthops;
> unsigned or u16 ? rather than risking sign issues.
--
*Richard Laing*
Software Team Leader*
Allied Telesis Labs*| 27 Nazareth Ave | Christchurch 8024 | New Zealand
Phone: +64 3 3393000 | DDI: +64 3 339 9248 | Web: *alliedtelesis.com
<http://alliedtelesis.com/>*<skype:andrewriddell3?chat>
http://productselector.alliedtelesis.eu/
<http://productselector.alliedtelesis.eu/>
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH 1/1] net/ipv4: Enable flow-based ECMP
2015-08-04 7:37 ` Oleg A Arkhangelsky
@ 2015-08-04 22:10 ` Richard Laing
2015-08-04 22:25 ` Tom Herbert
1 sibling, 0 replies; 6+ messages in thread
From: Richard Laing @ 2015-08-04 22:10 UTC (permalink / raw)
To: Oleg A Arkhangelsky, netdev@vger.kernel.org, pch@ordbogen.com
Cc: jmorris@namei.org
Hi Oleg,
I hadn't seen that patch, thanks, it looks like a pretty thorough solution.
Best Regards,
Richard
On 08/04/2015 07:37 PM, Oleg A Arkhangelsky wrote:
>
> 04.08.2015, 04:29, "Richard Laing" <richard.laing@alliedtelesis.co.nz>:
>> Enable flow-based ECMP.
> Looks like your approach is only restricted to the case when sockets are
> involved. What about forwarded traffic (IP routing case)?
>
> Have you seen similar work done by Peter Nørlund?
>
> http://comments.gmane.org/gmane.linux.kernel.api/12201
>
> --
> wbr, Oleg.
>
> "Anarchy is about taking complete responsibility for yourself."
> Alan Moore.
--
*Richard Laing*
Software Team Leader*
Allied Telesis Labs*| 27 Nazareth Ave | Christchurch 8024 | New Zealand
Phone: +64 3 3393000 | DDI: +64 3 339 9248 | Web: *alliedtelesis.com
<http://alliedtelesis.com/>*<skype:andrewriddell3?chat>
http://productselector.alliedtelesis.eu/
<http://productselector.alliedtelesis.eu/>
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH 1/1] net/ipv4: Enable flow-based ECMP
2015-08-04 7:37 ` Oleg A Arkhangelsky
2015-08-04 22:10 ` Richard Laing
@ 2015-08-04 22:25 ` Tom Herbert
1 sibling, 0 replies; 6+ messages in thread
From: Tom Herbert @ 2015-08-04 22:25 UTC (permalink / raw)
To: Oleg A Arkhangelsky
Cc: Richard Laing, netdev@vger.kernel.org, pch, jmorris@namei.org
On Tue, Aug 4, 2015 at 12:37 AM, Oleg A Arkhangelsky <sysoleg@yandex.ru> wrote:
>
>
> 04.08.2015, 04:29, "Richard Laing" <richard.laing@alliedtelesis.co.nz>:
>> Enable flow-based ECMP.
>
> Looks like your approach is only restricted to the case when sockets are
> involved. What about forwarded traffic (IP routing case)?
>
skb_get_hash_flowi4 can be called to get the hash now. This should
work with or without (connected) sockets.
Thanks,
Tom
> Have you seen similar work done by Peter Nørlund?
>
> http://comments.gmane.org/gmane.linux.kernel.api/12201
>
> --
> wbr, Oleg.
>
> "Anarchy is about taking complete responsibility for yourself."
> Alan Moore.
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2015-08-04 22:25 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz, follow: Atom feed)
-- links below jump to the message on this page --
2015-08-04 1:28 [PATCH 1/1] net/ipv4: Enable flow-based ECMP Richard Laing
2015-08-04 5:31 ` Stephen Hemminger
2015-08-04 22:07 ` Richard Laing
2015-08-04 7:37 ` Oleg A Arkhangelsky
2015-08-04 22:10 ` Richard Laing
2015-08-04 22:25 ` Tom Herbert
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).