* [RFC PATCH net-next v2] ipv6: implement consistent hashing for equal-cost multipath routing
@ 2016-11-29 17:15 David Lebrun
From: David Lebrun @ 2016-11-29 17:15 UTC (permalink / raw)
To: netdev; +Cc: David Lebrun
When multiple nexthops are available for a given route, the routing engine
chooses a nexthop by computing the flow hash through get_hash_from_flowi6
and by taking that value modulo the number of nexthops. The resulting value
indexes the nexthop to select. This method causes issues when a new nexthop
is added or one is removed (e.g. link failure). In that case, the number
of nexthops changes and potentially all the flows get re-routed to another
nexthop.

This patch implements a consistent hash method to select the nexthop in
case of ECMP. The idea is to generate K slices (or intervals) for each
route with multiple nexthops. The nexthops are randomly assigned to those
slices, in a uniform manner. The exponent x is configurable through the
sysctl net.ipv6.route.ecmp_slices, so K = 2^x is always a power of 2. To
select the nexthop, the algorithm takes the flow hash and computes an index
which is the flow hash modulo K. As K = 2^x, the modulo can be computed
using a simple binary AND operation (idx = hash & (K - 1)). The resulting
index references the selected nexthop. The lookup time complexity is thus
O(1).

When a nexthop is added, it steals K/N slices from the other nexthops,
where N is the new number of nexthops. The slices are stolen randomly and
uniformly from the other nexthops. When a nexthop is removed, the orphan
slices are randomly reassigned to the other nexthops.

The number of slices for a route also fixes the maximum number of nexthops
possible for that route.
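
To make the slice mechanics concrete, here is a minimal userspace sketch of
the map and of the lookup/stealing logic (not part of the patch; the
fixed-size map, the helper names and the use of rand() are illustrative
assumptions):

#include <stdint.h>
#include <stdlib.h>

#define ECMP_SLICES_EXP	5			/* default of net.ipv6.route.ecmp_slices */
#define K		(1u << ECMP_SLICES_EXP)	/* K = 2^x slices */

struct mp_map {
	uint8_t index[K];	/* slice -> nexthop index */
	unsigned int nr_nh;	/* current number of nexthops */
};

/* O(1) selection: as K is a power of 2, hash % K == hash & (K - 1) */
static unsigned int mp_select(const struct mp_map *m, uint32_t flow_hash)
{
	return m->index[flow_hash & (K - 1)];
}

/* Adding a nexthop steals about K/N slices from the existing ones, so only
 * the flows hashing to the stolen slices change nexthop.
 */
static void mp_add_nexthop(struct mp_map *m)
{
	unsigned int new_nh = m->nr_nh++;
	unsigned int stolen = 0;

	if (new_nh == 0)	/* first nexthop owns all (zeroed) slices */
		return;

	while (stolen < K / m->nr_nh) {
		unsigned int slice = (unsigned int)rand() % K;	/* prandom_u32() in-kernel */

		if (m->index[slice] != new_nh) {
			m->index[slice] = new_nh;
			stolen++;
		}
	}
}

int main(void)
{
	struct mp_map m = { .nr_nh = 1 };	/* nexthop 0 owns all slices */

	mp_add_nexthop(&m);	/* now ~K/2 slices map to nexthop 1 */
	return mp_select(&m, 0xdeadbeef);
}

With the patch's default x = 5 this gives K = 32 slices per multipath route;
raising it (e.g. sysctl -w net.ipv6.route.ecmp_slices=16) trades per-route
memory for a finer-grained, more consistent redistribution.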
Signed-off-by: David Lebrun <david.lebrun@uclouvain.be>
---
 include/net/ip6_fib.h    |  25 ++++++
 include/net/netns/ipv6.h |   1 +
 net/ipv6/ip6_fib.c       | 180 +++++++++++++++++++++++++++++++++++++++++++++-
 net/ipv6/route.c         |  58 ++++++++--------
 4 files changed, 235 insertions(+), 29 deletions(-)
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index a74e2aa..29594cc 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -93,6 +93,30 @@ struct rt6key {
 
 struct fib6_table;
 
+/* Multipath nexthop.
+ * @nh is the actual nexthop
+ * @nslices is the number of slices assigned to this nexthop
+ */
+struct rt6_multi_nh {
+	struct rt6_info *nh;
+	unsigned int nslices;
+};
+
+/* Multipath routes map.
+ * @nhs is an array of available nexthops
+ * @size is the number of elements in @nhs
+ * @nslices is the number of slices and the number of elements in @index
+ *	and is always in the form 2^x to prevent using a division for
+ *	the computation of the modulo.
+ * @index is an array mapping a slice index to a nexthop index in @nhs
+ */
+struct rt6_multi_map {
+	struct rt6_multi_nh *nhs;
+	unsigned int size;
+	unsigned int nslices;
+	__u8 *index;
+};
+
 struct rt6_info {
 	struct dst_entry		dst;
 
@@ -113,6 +137,7 @@ struct rt6_info {
 	 */
 	struct list_head		rt6i_siblings;
 	unsigned int			rt6i_nsiblings;
+	struct rt6_multi_map		*rt6i_nh_map;
 
 	atomic_t			rt6i_ref;
 
diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index de7745e..4967846 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -36,6 +36,7 @@ struct netns_sysctl_ipv6 {
 	int idgen_retries;
 	int idgen_delay;
 	int flowlabel_state_ranges;
+	int ip6_rt_ecmp_slices;
 };
 
 struct netns_ipv6 {
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index ef54852..b6b1895 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -727,6 +727,172 @@ static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn,
 	}
 }
 
+static void fib6_mp_free(struct rt6_info *rt)
+{
+	struct rt6_multi_map *nh_map = rt->rt6i_nh_map;
+	struct rt6_info *sibling;
+
+	if (nh_map) {
+		list_for_each_entry(sibling, &rt->rt6i_siblings,
+				    rt6i_siblings) {
+			sibling->rt6i_nh_map = NULL;
+		}
+
+		rt->rt6i_nh_map = NULL;
+
+		kfree(nh_map->nhs);
+		kfree(nh_map->index);
+		kfree(nh_map);
+	}
+}
+
+static int fib6_mp_extend(struct net *net, struct rt6_info *sref,
+			  struct rt6_info *rt)
+{
+	struct rt6_multi_map *nh_map = sref->rt6i_nh_map;
+	struct rt6_multi_nh *tmp_nhs, *old_nhs;
+	unsigned int kslices;
+	unsigned int i, j;
+
+	if (!nh_map) {
+		__u8 *index;
+
+		nh_map = kmalloc(sizeof(*nh_map), GFP_ATOMIC);
+		if (!nh_map)
+			return -ENOMEM;
+
+		nh_map->size = 1;
+		nh_map->nslices = (1 << net->ipv6.sysctl.ip6_rt_ecmp_slices);
+
+		tmp_nhs = kmalloc(sizeof(*tmp_nhs), GFP_ATOMIC);
+		if (!tmp_nhs) {
+			kfree(nh_map);
+			return -ENOMEM;
+		}
+
+		tmp_nhs[0].nh = sref;
+		tmp_nhs[0].nslices = nh_map->nslices;
+		nh_map->nhs = tmp_nhs;
+
+		index = kmalloc_array(nh_map->nslices, sizeof(*index),
+				      GFP_ATOMIC);
+		if (!index) {
+			kfree(nh_map->nhs);
+			kfree(nh_map);
+			return -ENOMEM;
+		}
+
+		for (i = 0; i < nh_map->nslices; i++)
+			index[i] = 0;
+
+		nh_map->index = index;
+		sref->rt6i_nh_map = nh_map;
+	}
+
+	if (nh_map->size == nh_map->nslices)
+		return -ENOBUFS;
+
+	nh_map->size++;
+
+	tmp_nhs = kmalloc_array(nh_map->size, sizeof(*tmp_nhs), GFP_ATOMIC);
+	if (!tmp_nhs)
+		return -ENOMEM;
+
+	for (i = 0; i < nh_map->size - 1; i++)
+		tmp_nhs[i] = nh_map->nhs[i];
+
+	tmp_nhs[nh_map->size - 1].nh = rt;
+	tmp_nhs[nh_map->size - 1].nslices = 0;
+
+	kslices = nh_map->nslices / nh_map->size;
+
+	/* Loop over the nexthops and steal a random slice until we have
+	 * kslices for the new nexthop. This algorithm takes slices as
+	 * evenly as possible from the current nexthops.
+	 *
+	 * At each iteration, @j is the index in tmp_nhs of the nexthop
+	 * to steal from and @idx is the index of the slice to steal
+	 * among @j's slices.
+	 */
+	for (i = 0, j = 0; i < kslices; i++) {
+		unsigned int idx, k = 0;
+
+		if (tmp_nhs[j].nslices == 1) {
+			/* nothing left to steal from @j; retry this
+			 * iteration with the next nexthop
+			 */
+			j = (j + 1) % (nh_map->size - 1);
+			i--;
+			continue;
+		}
+
+		idx = (prandom_u32() % tmp_nhs[j].nslices) + 1;
+		do {
+			if (nh_map->index[k++] == j)
+				idx--;
+		} while (idx);
+
+		nh_map->index[k - 1] = nh_map->size - 1;
+		tmp_nhs[nh_map->size - 1].nslices++;
+		tmp_nhs[j].nslices--;
+
+		j = (j + 1) % (nh_map->size - 1);
+	}
+
+	WARN_ON(tmp_nhs[nh_map->size - 1].nslices != kslices);
+
+	old_nhs = nh_map->nhs;
+	nh_map->nhs = tmp_nhs;
+	kfree(old_nhs);
+
+	rt->rt6i_nh_map = nh_map;
+
+	return 0;
+}
+
+static int fib6_mp_shrink(struct rt6_info *sref, struct rt6_info *rt)
+{
+	struct rt6_multi_map *nh_map = sref->rt6i_nh_map;
+	struct rt6_multi_nh *tmp_nhs, *old_nhs;
+	unsigned int i, j, idx = 0;
+
+	tmp_nhs = kmalloc_array(nh_map->size - 1, sizeof(*tmp_nhs), GFP_ATOMIC);
+	if (!tmp_nhs)
+		return -ENOMEM;
+
+	for (i = 0, j = 0; i < nh_map->size; i++) {
+		if (nh_map->nhs[i].nh != rt)
+			tmp_nhs[j++] = nh_map->nhs[i];
+		else
+			idx = i;
+	}
+
+	nh_map->size--;
+	old_nhs = nh_map->nhs;
+	nh_map->nhs = tmp_nhs;
+	kfree(old_nhs);
+
+	rt->rt6i_nh_map = NULL;
+
+	/* Update the slices index. For each slice mapping to the removed
+	 * nexthop, assign another random nexthop. For each slice mapping
+	 * to a nexthop that was after the removed nexthop, decrement the
+	 * nexthop index to reflect the shift. Nexthops that were before
+	 * the removed nexthop are unaffected.
+	 */
+	for (i = 0; i < nh_map->nslices; i++) {
+		if (nh_map->index[i] == idx) {
+			j = prandom_u32() % nh_map->size;
+			nh_map->index[i] = j;
+			nh_map->nhs[j].nslices++;
+		} else if (nh_map->index[i] > idx) {
+			nh_map->index[i]--;
+		}
+	}
+
+	return 0;
+}
+
 /*
  *	Insert routing information in a node.
  */
@@ -837,6 +997,9 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
 			}
 			sibling = sibling->dst.rt6_next;
 		}
+		err = fib6_mp_extend(info->nl_net, sibling, rt);
+		if (err < 0)
+			return err;
 		/* For each sibling in the list, increment the counter of
 		 * siblings. BUG() if counters does not match, list of siblings
 		 * is broken!
@@ -900,6 +1063,8 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
 			fn->fn_flags |= RTN_RTINFO;
 		}
 		nsiblings = iter->rt6i_nsiblings;
+		if (nsiblings)
+			fib6_mp_free(iter);
 		fib6_purge_rt(iter, fn, info->nl_net);
 		rt6_release(iter);
@@ -1407,13 +1572,20 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
 
 	/* Remove this entry from other siblings */
 	if (rt->rt6i_nsiblings) {
-		struct rt6_info *sibling, *next_sibling;
+		struct rt6_info *sibling, *next_sibling, *sref;
+
+		sref = list_first_entry(&rt->rt6i_siblings, struct rt6_info,
+					rt6i_siblings);
 
 		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->rt6i_siblings, rt6i_siblings)
 			sibling->rt6i_nsiblings--;
 		rt->rt6i_nsiblings = 0;
 		list_del_init(&rt->rt6i_siblings);
+		if (!sref->rt6i_nsiblings)
+			fib6_mp_free(sref);
+		else
+			fib6_mp_shrink(sref, rt);
 	}
 
 	/* Adjust walkers */
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index b317bb1..e9f13e0 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -427,39 +427,27 @@ static bool rt6_check_expired(const struct rt6_info *rt)
 	return false;
 }
 
-/* Multipath route selection:
- *   Hash based function using packet header and flowlabel.
- * Adapted from fib_info_hashfn()
- */
-static int rt6_info_hash_nhsfn(unsigned int candidate_count,
-			       const struct flowi6 *fl6)
-{
-	return get_hash_from_flowi6(fl6) % candidate_count;
-}
-
 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
 					     struct flowi6 *fl6, int oif,
 					     int strict)
 {
-	struct rt6_info *sibling, *next_sibling;
-	int route_choosen;
+	struct rt6_multi_map *nh_map = match->rt6i_nh_map;
+	__u32 hash = get_hash_from_flowi6(fl6);
+	unsigned int slice, idx;
+	struct rt6_info *res;
 
-	route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
-	/* Don't change the route, if route_choosen == 0
-	 * (siblings does not include ourself)
-	 */
-	if (route_choosen)
-		list_for_each_entry_safe(sibling, next_sibling,
-				&match->rt6i_siblings, rt6i_siblings) {
-			route_choosen--;
-			if (route_choosen == 0) {
-				if (rt6_score_route(sibling, oif, strict) < 0)
-					break;
-				match = sibling;
-				break;
-			}
-		}
-	return match;
+	WARN_ON(!nh_map);
+	if (!nh_map)
+		return match;
+
+	slice = hash & (nh_map->nslices - 1);
+	idx = nh_map->index[slice];
+	res = nh_map->nhs[idx].nh;
+
+	if (rt6_score_route(res, oif, strict) < 0)
+		res = match;
+
+	return res;
 }
 
 /*
@@ -3550,6 +3538,9 @@ int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
 	return 0;
 }
 
+static int one = 1;
+static int thirtyone = 31;
+
 struct ctl_table ipv6_route_table_template[] = {
 	{
 		.procname	= "flush",
@@ -3621,6 +3612,15 @@ struct ctl_table ipv6_route_table_template[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_ms_jiffies,
 	},
+	{
+		.procname	= "ecmp_slices",
+		.data		= &init_net.ipv6.sysctl.ip6_rt_ecmp_slices,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &one,
+		.extra2		= &thirtyone,
+	},
 	{ }
 };
@@ -3644,6 +3644,7 @@ struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
 		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
 		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
 		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
+		table[10].data = &net->ipv6.sysctl.ip6_rt_ecmp_slices;
 
 		/* Don't export sysctls to unprivileged users */
 		if (net->user_ns != &init_user_ns)
@@ -3707,6 +3708,7 @@ static int __net_init ip6_route_net_init(struct net *net)
 	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
 	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
 	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
+	net->ipv6.sysctl.ip6_rt_ecmp_slices = 5;
 
 	net->ipv6.ip6_rt_gc_expire = 30*HZ;
--
2.7.3
* Re: [RFC PATCH net-next v2] ipv6: implement consistent hashing for equal-cost multipath routing
From: Hannes Frederic Sowa @ 2016-11-30 3:52 UTC (permalink / raw)
To: David Lebrun, netdev
Hi,
On Tue, Nov 29, 2016, at 18:15, David Lebrun wrote:
> When multiple nexthops are available for a given route, the routing engine
> chooses a nexthop by computing the flow hash through get_hash_from_flowi6
> and by taking that value modulo the number of nexthops. The resulting value
> indexes the nexthop to select. This method causes issues when a new nexthop
> is added or one is removed (e.g. link failure). In that case, the number
> of nexthops changes and potentially all the flows get re-routed to another
> nexthop.
>
> This patch implements a consistent hash method to select the nexthop in
> case of ECMP. The idea is to generate K slices (or intervals) for each
> route with multiple nexthops. The nexthops are randomly assigned to those
> slices, in a uniform manner. The exponent x is configurable through the
> sysctl net.ipv6.route.ecmp_slices, so K = 2^x is always a power of 2. To
> select the nexthop, the algorithm takes the flow hash and computes an index
> which is the flow hash modulo K. As K = 2^x, the modulo can be computed
> using a simple binary AND operation (idx = hash & (K - 1)). The resulting
> index references the selected nexthop. The lookup time complexity is thus
> O(1).
>
> When a nexthop is added, it steals K/N slices from the other nexthops,
> where N is the new number of nexthops. The slices are stolen randomly and
> uniformly from the other nexthops. When a nexthop is removed, the orphan
> slices are randomly reassigned to the other nexthops.
>
> The number of slices for a route also fixes the maximum number of nexthops
> possible for that route.
In the worst case this causes 2 GB (order-19) allocations (x == 31) to
happen in GFP_ATOMIC context (due to the write lock) and could cause
routing table updates to fail because of memory fragmentation. Are you
sure the upper limit of 31 is reasonable? I would very much prefer an
upper limit of 25 or below for x, to stay within the bounds of the slab
allocators (which is still a lot and would probably still cause
allocation failures!). Unfortunately, because of the nature of the
sysctl, you can't really create a dedicated cache for it. :/
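
For reference, the arithmetic behind those figures (assuming 4 KiB pages
and the patch's one-byte slice entries):

	x == 31: index[] = 2^31 entries * sizeof(__u8) = 2 GiB
	         2 GiB / 4 KiB per page = 2^19 pages, i.e. an order-19 allocation
	x == 25: index[] = 2^25 bytes = 32 MiB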
Also, by design this should all be RCU one day, and having that much
data outstanding during routing table mutation worries me a bit.

I am a fan of consistent hashing, but I am not so sure whether it
belongs in a generic ECMP implementation or in its own ipvs or
netfilter module, where you specifically know how much memory to burn
for it.

Also, please convert the sysctl to a netlink attribute if you pursue
this: if I change the sysctl while my quagga is hammering the routing
table, I would like to know which nodes allocate what amount of memory.
Bye,
Hannes
* Re: [RFC PATCH net-next v2] ipv6: implement consistent hashing for equal-cost multipath routing
From: Tom Herbert @ 2016-11-30 4:04 UTC (permalink / raw)
To: David Lebrun; +Cc: Linux Kernel Network Developers
On Tue, Nov 29, 2016 at 9:15 AM, David Lebrun <david.lebrun@uclouvain.be> wrote:
> When multiple nexthops are available for a given route, the routing engine
> chooses a nexthop by computing the flow hash through get_hash_from_flowi6
> and by taking that value modulo the number of nexthops. The resulting value
> indexes the nexthop to select. This method causes issues when a new nexthop
> is added or one is removed (e.g. link failure). In that case, the number
> of nexthops changes and potentially all the flows get re-routed to another
> nexthop.
>
This is a lot of code to make ECMP work better. Can you be more
specific as to what the "issues" are? Assuming this is just the
transient packet reordering that happens on a link flap, I am wondering
whether this complexity is justified.
Tom
* Re: [RFC PATCH net-next v2] ipv6: implement consistent hashing for equal-cost multipath routing
From: David Lebrun @ 2016-11-30 7:56 UTC (permalink / raw)
To: Hannes Frederic Sowa, netdev
On 11/30/2016 04:52 AM, Hannes Frederic Sowa wrote:
> In the worst case this causes 2 GB (order-19) allocations (x == 31) to
> happen in GFP_ATOMIC context (due to the write lock) and could cause
> routing table updates to fail because of memory fragmentation. Are you
> sure the upper limit of 31 is reasonable? I would very much prefer an
> upper limit of 25 or below for x, to stay within the bounds of the slab
> allocators (which is still a lot and would probably still cause
> allocation failures!). Unfortunately, because of the nature of the
> sysctl, you can't really create a dedicated cache for it. :/
>
Agreed. I think that even something like 16 would be more than
sufficient: that would enable 65536 slices, which is way more than
enough for good balancing with a reasonable number of nexthops (I
wonder whether there are actual deployments with more than 32 nexthops
for a route).
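(For scale, with one byte per slice: x = 16 gives an index[] of
2^16 * sizeof(__u8) = 64 KiB per multipath route, against the 2 GiB
worst case at x = 31.)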
> Also, by design this should all be RCU one day, and having that much
> data outstanding during routing table mutation worries me a bit.
>
> I am a fan of consistent hashing, but I am not so sure whether it
> belongs in a generic ECMP implementation or in its own ipvs or
> netfilter module, where you specifically know how much memory to burn
> for it.
>
The complexity of the consistent hashing code might warrant something
like that, but I am not sure of the implications.
> Also, please convert the sysctl to a netlink attribute if you pursue
> this: if I change the sysctl while my quagga is hammering the routing
> table, I would like to know which nodes allocate what amount of memory.
Yes, that was the idea.
Thanks for the feedback
David
* Re: [RFC PATCH net-next v2] ipv6: implement consistent hashing for equal-cost multipath routing
From: David Miller @ 2016-11-30 19:49 UTC (permalink / raw)
To: david.lebrun; +Cc: netdev
From: David Lebrun <david.lebrun@uclouvain.be>
Date: Tue, 29 Nov 2016 18:15:18 +0100
> When multiple nexthops are available for a given route, the routing engine
> chooses a nexthop by computing the flow hash through get_hash_from_flowi6
> and by taking that value modulo the number of nexthops. The resulting value
> indexes the nexthop to select. This method causes issues when a new nexthop
> is added or one is removed (e.g. link failure). In that case, the number
> of nexthops changes and potentially all the flows get re-routed to another
> nexthop.
Interesting approach, but like Hannes I worry about the memory consumption
bounds.
Limiting to 1<<16 is interesting, but if you can limit to 1<<8 (256
nexthops) maybe the state requirement can be compressed even further?
We can always increase this if necessary in the future if someone
reports a reasonable use case that really needs it. Let's start
simple and small first.
* Re: [RFC PATCH net-next v2] ipv6: implement consistent hashing for equal-cost multipath routing
From: Hannes Frederic Sowa @ 2016-12-01 5:30 UTC (permalink / raw)
To: David Miller, david.lebrun; +Cc: netdev
On Wed, Nov 30, 2016, at 20:49, David Miller wrote:
> Interesting approach, but like Hannes I worry about the memory
> consumption
> bounds.
>
> Limiting to 1<<16 is interesting, but if you can limit to 1<<8 (256
> nexthops) maybe the state requirement can be compressed even further?
From what I have heard, the user-space DPDK-like implementations of
consistent hashing actually use as much memory as possible (2^32
entries) to consistently steer flows to their targets, so every u32
flow hash value has a unique corresponding mapping. The slice count
doesn't change the number of usable nexthops either way: that is
limited by the datatype of the array entries, a u8 in this patch, so at
most 256 nexthops can be referenced independently of the slice count.
Lowering the number of slices makes the algorithm less accurate, as
more flows map to a single nexthop selector, causing more inconsistent
routing decisions when changes happen.
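
In concrete numbers, assuming the patch's data types:

	full-resolution map: 2^32 slices * sizeof(__u8) = 4 GiB per route
	nexthop limit:       a __u8 entry can index at most 2^8 = 256 nexthops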
> We can always increase this if necessary in the future if someone
> reports a reasonable use case that really needs it. Let's start
> simple and small first.
If people really rely on consistent hashing for load balancer setups
where the backends don't synchronize their state across servers (the
IPv6 anycast use case), they would probably always want to go with the
full-sized state table, as every update of this map would cause user
errors and inconsistent state on the backends.
Thus, I think that we should also allow for maximum allocations,
wherever they come from, as it is already out there and deployed.
Bye,
Hannes
* Re: [RFC PATCH net-next v2] ipv6: implement consistent hashing for equal-cost multipath routing
From: Roopa Prabhu @ 2016-12-01 17:55 UTC (permalink / raw)
To: David Lebrun; +Cc: Hannes Frederic Sowa, netdev@vger.kernel.org
On Tue, Nov 29, 2016 at 11:56 PM, David Lebrun
<david.lebrun@uclouvain.be> wrote:
> On 11/30/2016 04:52 AM, Hannes Frederic Sowa wrote:
>
>> Also, please convert the sysctl to a netlink attribute if you pursue
>> this: if I change the sysctl while my quagga is hammering the routing
>> table, I would like to know which nodes allocate what amount of memory.
>
> Yes, that was the idea.
>
I think it's best for it to be a global setting, and that's why a
sysctl seems like the best way (unless there are other ways to set this
globally via rtnetlink). If it helps, most hw switch vendors supporting
this feature also provide a global tunable knob, and it is not on by
default.
* Re: [RFC PATCH net-next v2] ipv6: implement consistent hashing for equal-cost multipath routing
From: David Lebrun @ 2016-12-02 8:29 UTC (permalink / raw)
To: Roopa Prabhu; +Cc: Hannes Frederic Sowa, netdev@vger.kernel.org
On 12/01/2016 06:55 PM, Roopa Prabhu wrote:
> I think it's best for it to be a global setting, and that's why a
> sysctl seems like the best way (unless there are other ways to set this
> globally via rtnetlink). If it helps, most hw switch vendors supporting
> this feature also provide a global tunable knob, and it is not on by
> default.
What I had in mind was to keep the global sysctl but also provide a
per-route attribute, which would default to the global sysctl.
David
* Re: [RFC PATCH net-next v2] ipv6: implement consistent hashing for equal-cost multipath routing
From: David Lebrun @ 2016-12-02 9:29 UTC (permalink / raw)
To: Tom Herbert; +Cc: Linux Kernel Network Developers
On 11/30/2016 05:04 AM, Tom Herbert wrote:
> This is a lot of code to make ECMP work better. Can you be more
> specific as to what the "issues" are? Assuming this is just the
> transient packet reordering that happens on a link flap, I am wondering
> whether this complexity is justified.
Inconsistent hashing is an issue when the load balancer is in front of
stateful backends keeping per-flow state. Also, if neighbors are
constantly being added and removed, flows are constantly changing nexthops.
David