[RFC PATCH net-next] ipv6: implement consistent hashing for equal-cost multipath routing

netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [RFC PATCH net-next] ipv6: implement consistent hashing for equal-cost multipath routing
@ 2016-11-24 19:59 David Lebrun
  2016-11-28 16:22 ` David Miller
  0 siblings, 1 reply; 7+ messages in thread
From: David Lebrun @ 2016-11-24 19:59 UTC (permalink / raw)
  To: netdev; +Cc: David Lebrun

When multiple nexthops are available for a given route, the routing engine
chooses a nexthop by computing the flow hash through get_hash_from_flowi6
and by taking that value modulo the number of nexthops. The resulting value
indexes the nexthop to select. This method causes issues when a new nexthop
is added or one is removed (e.g. link failure). In that case, the number
of nexthops changes and potentially all the flows get re-routed to another
nexthop.

This patch implements a consistent hash method to select the nexthop in
case of ECMP. The idea is to generate N random numbers (__u32) for each
nexthop, where N is configurable. In order to select a nexthop, we find the
number that is directly higher than the flow hash (which is also __u32).
The nexthop associated to the number is then selected. The lookup method
performs a binary search over the sorted array of numbers, which yields a
time complexity of O(log n), where n is the number of nexthops times the
number of random values generated for each number.

This feature can be enabled through the CONFIG_IPV6_MULTIPATH_CONSISTENT
option and the number of random values generated for each nexthop is
defined through CONFIG_IPV6_MPCONSIST_BUCKETSIZE.

Signed-off-by: David Lebrun <david.lebrun@uclouvain.be>
---
 include/net/ip6_fib.h |  20 ++++
 net/ipv6/Kconfig      |  26 +++++
 net/ipv6/Makefile     |   1 +
 net/ipv6/ip6_ecmp.c   | 263 ++++++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv6/ip6_fib.c    |  18 ++++
 net/ipv6/route.c      |  56 +++++++++++
 6 files changed, 384 insertions(+)
 create mode 100644 net/ipv6/ip6_ecmp.c

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index a74e2aa..e22417d 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -93,6 +93,16 @@ struct rt6key {
 
 struct fib6_table;
 
+struct rt6_multi_nh {
+	__u32		key;
+	struct rt6_info *nh;
+};
+
+struct rt6_multi_map {
+	struct rt6_multi_nh	*nhs;
+	unsigned int		size;
+};
+
 struct rt6_info {
 	struct dst_entry		dst;
 
@@ -113,6 +123,9 @@ struct rt6_info {
 	 */
 	struct list_head		rt6i_siblings;
 	unsigned int			rt6i_nsiblings;
+#ifdef CONFIG_IPV6_MULTIPATH_CONSISTENT
+	struct rt6_multi_map		*rt6i_nh_map;
+#endif
 
 	atomic_t			rt6i_ref;
 
@@ -302,4 +315,11 @@ static inline void              fib6_rules_cleanup(void)
 	return ;
 }
 #endif
+
+#ifdef CONFIG_IPV6_MULTIPATH_CONSISTENT
+int fib6_mp_shrink(struct rt6_info *sref, struct rt6_info *rt);
+int fib6_mp_extend(struct rt6_info *rt, struct rt6_multi_map *nh_map);
+void fib6_mp_free(struct rt6_info *rt);
+#endif
+
 #endif
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index ec1267e..ebfae0d 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -324,4 +324,30 @@ config IPV6_SEG6_HMAC
 
 	  If unsure, say N.
 
+config IPV6_MULTIPATH_CONSISTENT
+	bool "IPv6: enable consistent hashing for ECMP"
+	depends on IPV6
+	---help---
+	  Enable consistent hashing for Equal-Cost Multi-Path (ECMP)
+	  route selection. By default, the nexthop is selected by taking
+	  the flow hash modulo the number of nexthops. When a nexthop is
+	  added or removed (e.g. link failure), all flows might change to
+	  a different nexthop as the modulo changes. Enabling this option
+	  allows to ensure that when a nexthop is removed, only the affected
+	  flows are assigned to another nexthop, and they are balanced equally
+	  across the remaining nexthops. When a nexthop is added, only a subset
+	  of the flows are assigned to it. The lookup performance is O(log n)
+	  where n is the number of nexthops times CONFIG_IPV6_MPCONSIST_BUCKETSIZE.
+
+	  If unsure, say N.
+
+config IPV6_MPCONSIST_BUCKETSIZE
+	int "IPv6: bucket size for ECMP consistent hashing"
+	default 10
+	depends on IPV6_MULTIPATH_CONSISTENT
+	---help---
+	  Define the number of hash entries generated for each ECMP nexthop.
+	  A higher value increases the uniform distribution of flows across
+	  nexthops, but also increases lookup performances logarithmically.
+
 endif # IPV6
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index a9e9fec..6481c5d 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -25,6 +25,7 @@ ipv6-$(CONFIG_SYN_COOKIES) += syncookies.o
 ipv6-$(CONFIG_NETLABEL) += calipso.o
 ipv6-$(CONFIG_IPV6_SEG6_LWTUNNEL) += seg6_iptunnel.o
 ipv6-$(CONFIG_IPV6_SEG6_HMAC) += seg6_hmac.o
+ipv6-$(CONFIG_IPV6_MULTIPATH_CONSISTENT) += ip6_ecmp.o
 
 ipv6-objs += $(ipv6-y)
 
diff --git a/net/ipv6/ip6_ecmp.c b/net/ipv6/ip6_ecmp.c
new file mode 100644
index 0000000..799a328
--- /dev/null
+++ b/net/ipv6/ip6_ecmp.c
@@ -0,0 +1,263 @@
+/*
+ * IPv6 Equal-Cost Multi-Path
+ *
+ * Author:
+ * David Lebrun <david.lebrun@uclouvain.be>
+
+ *  This program is free software; you can redistribute it and/or
+ *	  modify it under the terms of the GNU General Public License
+ *	  as published by the Free Software Foundation; either version
+ *	  2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/net.h>
+#include <linux/route.h>
+#include <linux/netdevice.h>
+#include <linux/in6.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+
+#include <net/ipv6.h>
+#include <net/ndisc.h>
+#include <net/addrconf.h>
+#include <net/lwtunnel.h>
+
+#include <net/ip6_fib.h>
+#include <net/ip6_route.h>
+
+static int mphash_bucket_size = CONFIG_IPV6_MPCONSIST_BUCKETSIZE;
+
+void fib6_mp_free(struct rt6_info *rt)
+{
+	struct rt6_multi_map *nh_map = rt->rt6i_nh_map;
+	struct rt6_info *sibling;
+
+	if (nh_map) {
+		list_for_each_entry(sibling, &rt->rt6i_siblings,
+				    rt6i_siblings) {
+			sibling->rt6i_nh_map = NULL;
+		}
+
+		rt->rt6i_nh_map = NULL;
+
+		kfree(nh_map->nhs);
+		kfree(nh_map);
+	}
+}
+
+static bool fib6_mp_key_exists(struct rt6_multi_nh *nhs, unsigned int size,
+			       __u32 key)
+{
+	unsigned int i;
+
+	for (i = 0; i < size; i++) {
+		if (nhs[i].key == key)
+			return true;
+	}
+
+	return false;
+}
+
+static void fib6_mp_populate(struct rt6_multi_nh *nhs, unsigned int offset,
+			     unsigned int size, unsigned int fullsize,
+			     struct rt6_info *rt)
+{
+	unsigned int i;
+
+	for (i = offset; i < offset + size; i++) {
+		__u32 key;
+
+		do {
+			get_random_bytes(&key, sizeof(key));
+		} while (fib6_mp_key_exists(nhs, fullsize, key));
+
+		nhs[i].key = key;
+		nhs[i].nh = rt;
+	}
+}
+
+static int fib6_mp_update(struct rt6_info *rt, struct rt6_multi_nh *nhs,
+			  unsigned int size, bool check)
+{
+	struct rt6_multi_map *new_map;
+	struct rt6_info *sibling;
+
+	new_map = kmalloc(sizeof(*new_map), GFP_ATOMIC);
+	if (!new_map)
+		return -ENOMEM;
+
+	new_map->nhs = nhs;
+	new_map->size = size;
+
+	list_for_each_entry(sibling, &rt->rt6i_siblings, rt6i_siblings) {
+		if (check)
+			WARN_ON(sibling->rt6i_nh_map);
+		sibling->rt6i_nh_map = new_map;
+	}
+
+	rt->rt6i_nh_map = new_map;
+
+	return 0;
+}
+
+static void ___merge(struct rt6_multi_nh *nhs, struct rt6_multi_nh *helper,
+		     unsigned int low, unsigned int middle, unsigned int high)
+{
+	unsigned int i, j, k;
+
+	i = low;
+	j = middle + 1;
+	k = 0;
+
+	while (i <= middle || j <= high) {
+		if (i <= middle && j <= high) {
+			if (nhs[i].key < nhs[j].key)
+				helper[k++] = nhs[i++];
+			else
+				helper[k++] = nhs[j++];
+		} else if (i <= middle) {
+			helper[k++] = nhs[i++];
+		} else {
+			helper[k++] = nhs[j++];
+		}
+	}
+
+	for (i = 0; i < (high - low + 1); i++)
+		nhs[low + i] = helper[i];
+}
+
+static void ___mergesort(struct rt6_multi_nh *nhs, struct rt6_multi_nh *helper,
+			 unsigned int low, unsigned int high)
+{
+	unsigned int middle;
+
+	if (low > high)
+		return;
+
+	middle = ((low + high) / 2);
+
+	if (low < high) {
+		___mergesort(nhs, helper, low, middle);
+		___mergesort(nhs, helper, middle + 1, high);
+	}
+
+	___merge(nhs, helper, low, middle, high);
+}
+
+static int fib6_mp_sort(struct rt6_multi_nh *nhs, unsigned int size)
+{
+	struct rt6_multi_nh *helper;
+
+	helper = kmalloc_array(size, sizeof(*helper), GFP_ATOMIC);
+	if (!helper)
+		return -ENOMEM;
+
+	___mergesort(nhs, helper, 0, size - 1);
+	kfree(helper);
+
+	return 0;
+}
+
+int fib6_mp_extend(struct rt6_info *rt, struct rt6_multi_map *nh_map)
+{
+	struct rt6_multi_nh *tmp_nhs;
+	unsigned int size;
+	int err;
+
+	if (!nh_map) {
+		struct rt6_info *sibling;
+
+		size = mphash_bucket_size * 2;
+		tmp_nhs = kcalloc(size, sizeof(struct rt6_multi_nh),
+				  GFP_ATOMIC);
+		if (!tmp_nhs)
+			return -ENOMEM;
+
+		sibling = list_first_entry(&rt->rt6i_siblings, struct rt6_info,
+					   rt6i_siblings);
+
+		fib6_mp_populate(tmp_nhs, 0, mphash_bucket_size, size, sibling);
+
+		fib6_mp_populate(tmp_nhs, mphash_bucket_size,
+				 mphash_bucket_size, size, rt);
+
+		err = fib6_mp_sort(tmp_nhs, size);
+		if (err) {
+			kfree(tmp_nhs);
+			return err;
+		}
+
+		err = fib6_mp_update(rt, tmp_nhs, size, true);
+		if (err) {
+			kfree(tmp_nhs);
+			return err;
+		}
+
+		return 0;
+	}
+
+	size = nh_map->size + mphash_bucket_size;
+	tmp_nhs = __krealloc(nh_map->nhs, size * sizeof(*tmp_nhs), GFP_ATOMIC);
+	if (!tmp_nhs)
+		return -ENOMEM;
+
+	memset(tmp_nhs + nh_map->size, 0,
+	       mphash_bucket_size * sizeof(*tmp_nhs));
+
+	fib6_mp_populate(tmp_nhs, nh_map->size, mphash_bucket_size, size, rt);
+
+	err = fib6_mp_sort(tmp_nhs, size);
+	if (err) {
+		kfree(tmp_nhs);
+		return err;
+	}
+
+	err = fib6_mp_update(rt, tmp_nhs, size, false);
+	if (err) {
+		kfree(tmp_nhs);
+		return err;
+	}
+
+	if (tmp_nhs != nh_map->nhs)
+		kfree(nh_map->nhs);
+	kfree(nh_map);
+
+	return 0;
+}
+
+int fib6_mp_shrink(struct rt6_info *sref, struct rt6_info *rt)
+{
+	struct rt6_multi_map *nh_map = sref->rt6i_nh_map;
+	struct rt6_multi_nh *tmp_nhs;
+	unsigned int size, i, j;
+	int err;
+
+	WARN_ON(!nh_map);
+	if (!nh_map)
+		return -ENOENT;
+
+	size = nh_map->size - mphash_bucket_size;
+	tmp_nhs = kcalloc(size, sizeof(*tmp_nhs), GFP_ATOMIC);
+	if (!tmp_nhs)
+		return -ENOMEM;
+
+	for (i = 0, j = 0; i < nh_map->size; i++) {
+		if (nh_map->nhs[i].nh != rt)
+			tmp_nhs[j++] = nh_map->nhs[i];
+	}
+
+	err = fib6_mp_update(sref, tmp_nhs, size, false);
+	if (err) {
+		kfree(tmp_nhs);
+		return err;
+	}
+
+	rt->rt6i_nh_map = NULL;
+	kfree(nh_map->nhs);
+	kfree(nh_map);
+
+	return 0;
+}
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index ef54852..ad5f645 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -837,6 +837,9 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
 			}
 			sibling = sibling->dst.rt6_next;
 		}
+#ifdef CONFIG_IPV6_MULTIPATH_CONSISTENT
+		fib6_mp_extend(rt, sibling->rt6i_nh_map);
+#endif
 		/* For each sibling in the list, increment the counter of
 		 * siblings. BUG() if counters does not match, list of siblings
 		 * is broken!
@@ -900,6 +903,10 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
 			fn->fn_flags |= RTN_RTINFO;
 		}
 		nsiblings = iter->rt6i_nsiblings;
+#ifdef CONFIG_IPV6_MULTIPATH_CONSISTENT
+		if (nsiblings)
+			fib6_mp_free(iter);
+#endif
 		fib6_purge_rt(iter, fn, info->nl_net);
 		rt6_release(iter);
 
@@ -1407,6 +1414,11 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
 
 	/* Remove this entry from other siblings */
 	if (rt->rt6i_nsiblings) {
+#ifdef CONFIG_IPV6_MULTIPATH_CONSISTENT
+		struct rt6_info *sref = list_first_entry(&rt->rt6i_siblings,
+							 struct rt6_info,
+							 rt6i_siblings);
+#endif
 		struct rt6_info *sibling, *next_sibling;
 
 		list_for_each_entry_safe(sibling, next_sibling,
@@ -1414,6 +1426,12 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
 			sibling->rt6i_nsiblings--;
 		rt->rt6i_nsiblings = 0;
 		list_del_init(&rt->rt6i_siblings);
+#ifdef CONFIG_IPV6_MULTIPATH_CONSISTENT
+		if (!sref->rt6i_nsiblings)
+			fib6_mp_free(sref);
+		else
+			fib6_mp_shrink(sref, rt);
+#endif
 	}
 
 	/* Adjust walkers */
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index b317bb1..109f371 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -427,6 +427,60 @@ static bool rt6_check_expired(const struct rt6_info *rt)
 	return false;
 }
 
+#ifdef CONFIG_IPV6_MULTIPATH_CONSISTENT
+
+static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
+					     struct flowi6 *fl6, int oif,
+					     int strict)
+{
+	struct rt6_multi_map *nh_map = match->rt6i_nh_map;
+	__u32 hash = get_hash_from_flowi6(fl6);
+	unsigned int left, right, idx;
+	struct rt6_info *res = NULL;
+
+	WARN_ON(!nh_map);
+	if (!nh_map)
+		return match;
+
+	if (hash <= nh_map->nhs[0].key ||
+	    hash > nh_map->nhs[nh_map->size - 1].key) {
+		res = nh_map->nhs[0].nh;
+		goto skip_lookup;
+	}
+
+	left = 0;
+	right = nh_map->size - 1;
+
+	do {
+		struct rt6_multi_nh *nh1, *nh2;
+
+		idx = (left + right) / 2;
+		nh1 = &nh_map->nhs[idx];
+		nh2 = &nh_map->nhs[idx + 1];
+
+		if (hash < nh1->key && hash < nh2->key)
+			right = idx;
+		else if (hash > nh1->key && hash > nh2->key)
+			left = idx + 1;
+		else if (hash == nh1->key)
+			res = nh1->nh;
+		else
+			res = nh2->nh;
+	} while (left != right && !res);
+
+	WARN_ON(!res);
+	if (!res)
+		return match;
+
+skip_lookup:
+	if (rt6_score_route(res, oif, strict) < 0)
+		res = match;
+
+	return res;
+}
+
+#else
+
 /* Multipath route selection:
  *   Hash based function using packet header and flowlabel.
  * Adapted from fib_info_hashfn()
@@ -462,6 +516,8 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
 	return match;
 }
 
+#endif
+
 /*
  *	Route lookup. Any table->tb6_lock is implied.
  */
-- 
2.7.3

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [RFC PATCH net-next] ipv6: implement consistent hashing for equal-cost multipath routing
  2016-11-24 19:59 [RFC PATCH net-next] ipv6: implement consistent hashing for equal-cost multipath routing David Lebrun
@ 2016-11-28 16:22 ` David Miller
  2016-11-28 20:16   ` David Lebrun
  0 siblings, 1 reply; 7+ messages in thread
From: David Miller @ 2016-11-28 16:22 UTC (permalink / raw)
  To: david.lebrun; +Cc: netdev

From: David Lebrun <david.lebrun@uclouvain.be>
Date: Thu, 24 Nov 2016 20:59:16 +0100

> When multiple nexthops are available for a given route, the routing engine
> chooses a nexthop by computing the flow hash through get_hash_from_flowi6
> and by taking that value modulo the number of nexthops. The resulting value
> indexes the nexthop to select. This method causes issues when a new nexthop
> is added or one is removed (e.g. link failure). In that case, the number
> of nexthops changes and potentially all the flows get re-routed to another
> nexthop.
> 
> This patch implements a consistent hash method to select the nexthop in
> case of ECMP. The idea is to generate N random numbers (__u32) for each
> nexthop, where N is configurable. In order to select a nexthop, we find the
> number that is directly higher than the flow hash (which is also __u32).
> The nexthop associated to the number is then selected. The lookup method
> performs a binary search over the sorted array of numbers, which yields a
> time complexity of O(log n), where n is the number of nexthops times the
> number of random values generated for each number.
> 
> This feature can be enabled through the CONFIG_IPV6_MULTIPATH_CONSISTENT
> option and the number of random values generated for each nexthop is
> defined through CONFIG_IPV6_MPCONSIST_BUCKETSIZE.
> 
> Signed-off-by: David Lebrun <david.lebrun@uclouvain.be>

Thanks for trying to solve this problem.

But we really don't want this to be Kconfig gated.  If we decide to
support this it should be a run-time selectable option.  Every
distribution on the planet is going to turn your Kconfig option on, so
whatever you think we might be saving by putting this behind Kconfig
checks won't be realized for large swaths of the userbase.

I also question how necessary %100 consistent hashing of ECMP really
is.

If we can do something at extremely low cost and arrive at a very low
disruption rate, honestly that's probably good enough.

I assume you've looked over RFC2992.  A low disrutpion algorithm is
described there and if we could just get rid of the divide it might
be extremely desirable.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [RFC PATCH net-next] ipv6: implement consistent hashing for equal-cost multipath routing
  2016-11-28 16:22 ` David Miller
@ 2016-11-28 20:16   ` David Lebrun
  2016-11-28 20:32     ` David Miller
  0 siblings, 1 reply; 7+ messages in thread
From: David Lebrun @ 2016-11-28 20:16 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

[-- Attachment #1: Type: text/plain, Size: 2123 bytes --]

On 11/28/2016 05:22 PM, David Miller wrote:
> Thanks for trying to solve this problem.
> 
> But we really don't want this to be Kconfig gated.  If we decide to
> support this it should be a run-time selectable option.  Every
> distribution on the planet is going to turn your Kconfig option on, so
> whatever you think we might be saving by putting this behind Kconfig
> checks won't be realized for large swaths of the userbase.
> 

OK for that

> I also question how necessary %100 consistent hashing of ECMP really
> is.
> 
> If we can do something at extremely low cost and arrive at a very low
> disruption rate, honestly that's probably good enough.
> 
> I assume you've looked over RFC2992.  A low disrutpion algorithm is
> described there and if we could just get rid of the divide it might
> be extremely desirable.

I wanted a solution that has very low disruption for both scale up and
scale down. When a next-hop is added, only 1/N flows should be disrupted
(i.e. in this case, reaffected to the new next-hop). When a next-hop is
removed, only the flows that were handled by this particular next-hop
should be disrupted, and those flows should be equally rebalanced across
the remaining next-hops. Although the solution presented in RFC2992 has
a "reasonably" low disruption (looks like it can get up to 1/2
disruption though), I'm not sure if the equal-rebalancing part holds.

The advantage of my solution over RFC2992 is lowest possible disruption
and equal rebalancing of affected flows. The disadvantage is the lookup
complexity of O(log n) vs O(1). Although from a theoretical viewpoint
O(1) is obviously better, would O(log n) have an effectively measurable
negative impact on scalability ? If we consider 32 next-hops for a route
and 100 pseudo-random numbers generated per next-hop, the lookup
algorithm would have to perform in the worst case log2 3200 = 11
comparisons to select a next-hop for that route.

For my part I prefer the minimal disruption over constant time lookup.
I'll try to come up with some measurements to see the actual impact.

David

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 163 bytes --]

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [RFC PATCH net-next] ipv6: implement consistent hashing for equal-cost multipath routing
  2016-11-28 20:16   ` David Lebrun
@ 2016-11-28 20:32     ` David Miller
  2016-11-28 20:58       ` David Lebrun
  2016-11-30 13:12       ` Hannes Frederic Sowa
  0 siblings, 2 replies; 7+ messages in thread
From: David Miller @ 2016-11-28 20:32 UTC (permalink / raw)
  To: david.lebrun; +Cc: netdev

From: David Lebrun <david.lebrun@uclouvain.be>
Date: Mon, 28 Nov 2016 21:16:19 +0100

> The advantage of my solution over RFC2992 is lowest possible disruption
> and equal rebalancing of affected flows. The disadvantage is the lookup
> complexity of O(log n) vs O(1). Although from a theoretical viewpoint
> O(1) is obviously better, would O(log n) have an effectively measurable
> negative impact on scalability ? If we consider 32 next-hops for a route
> and 100 pseudo-random numbers generated per next-hop, the lookup
> algorithm would have to perform in the worst case log2 3200 = 11
> comparisons to select a next-hop for that route.

When I was working on the routing cache removal in ipv4 I compared
using a stupid O(1) hash lookup of the FIB entries vs. the O(log n)
fib_trie stuff actually in use.

It did make a difference.

This is a lookup that can be invoked 20 million times per second or
more.

Every cycle matters.

We already have a lot of trouble getting under the cycle budget one
has for routing at wire speed for very high link rates, please don't
make it worse.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [RFC PATCH net-next] ipv6: implement consistent hashing for equal-cost multipath routing
  2016-11-28 20:32     ` David Miller
@ 2016-11-28 20:58       ` David Lebrun
  2016-11-30 13:12       ` Hannes Frederic Sowa
  1 sibling, 0 replies; 7+ messages in thread
From: David Lebrun @ 2016-11-28 20:58 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

[-- Attachment #1: Type: text/plain, Size: 635 bytes --]

On 11/28/2016 09:32 PM, David Miller wrote:
> When I was working on the routing cache removal in ipv4 I compared
> using a stupid O(1) hash lookup of the FIB entries vs. the O(log n)
> fib_trie stuff actually in use.
> 
> It did make a difference.
> 
> This is a lookup that can be invoked 20 million times per second or
> more.
> 
> Every cycle matters.
> 
> We already have a lot of trouble getting under the cycle budget one
> has for routing at wire speed for very high link rates, please don't
> make it worse.

OK, so O(1) mandatory. I will continue in that direction then.

Thanks for the feedback

David


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 163 bytes --]

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [RFC PATCH net-next] ipv6: implement consistent hashing for equal-cost multipath routing
  2016-11-28 20:32     ` David Miller
  2016-11-28 20:58       ` David Lebrun
@ 2016-11-30 13:12       ` Hannes Frederic Sowa
  2016-12-01 20:26         ` David Miller
  1 sibling, 1 reply; 7+ messages in thread
From: Hannes Frederic Sowa @ 2016-11-30 13:12 UTC (permalink / raw)
  To: David Miller, david.lebrun; +Cc: netdev

On Mon, Nov 28, 2016, at 21:32, David Miller wrote:
> From: David Lebrun <david.lebrun@uclouvain.be>
> Date: Mon, 28 Nov 2016 21:16:19 +0100
> 
> > The advantage of my solution over RFC2992 is lowest possible disruption
> > and equal rebalancing of affected flows. The disadvantage is the lookup
> > complexity of O(log n) vs O(1). Although from a theoretical viewpoint
> > O(1) is obviously better, would O(log n) have an effectively measurable
> > negative impact on scalability ? If we consider 32 next-hops for a route
> > and 100 pseudo-random numbers generated per next-hop, the lookup
> > algorithm would have to perform in the worst case log2 3200 = 11
> > comparisons to select a next-hop for that route.
> 
> When I was working on the routing cache removal in ipv4 I compared
> using a stupid O(1) hash lookup of the FIB entries vs. the O(log n)
> fib_trie stuff actually in use.
> 
> It did make a difference.
> 
> This is a lookup that can be invoked 20 million times per second or
> more.
> 
> Every cycle matters.
> 
> We already have a lot of trouble getting under the cycle budget one
> has for routing at wire speed for very high link rates, please don't
> make it worse.

David, one question: do you remember if you measured with linked lists
at that time or also with arrays. I actually would expect small arrays
that entirely fit into cachelines to be actually faster than our current
approach, which also walks a linked list, probably the best algorithm to
trash cache lines. I ask because I currently prefer this approach more
than having large allocations in the O(1) case because of easier code
and easier management.

Thanks,
Hannes

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [RFC PATCH net-next] ipv6: implement consistent hashing for equal-cost multipath routing
  2016-11-30 13:12       ` Hannes Frederic Sowa
@ 2016-12-01 20:26         ` David Miller
  0 siblings, 0 replies; 7+ messages in thread
From: David Miller @ 2016-12-01 20:26 UTC (permalink / raw)
  To: hannes; +Cc: david.lebrun, netdev

From: Hannes Frederic Sowa <hannes@stressinduktion.org>
Date: Wed, 30 Nov 2016 14:12:48 +0100

> David, one question: do you remember if you measured with linked lists
> at that time or also with arrays. I actually would expect small arrays
> that entirely fit into cachelines to be actually faster than our current
> approach, which also walks a linked list, probably the best algorithm to
> trash cache lines. I ask because I currently prefer this approach more
> than having large allocations in the O(1) case because of easier code
> and easier management.

I did not try this and I do agree with you that for extremely small table
sizes a list or array would perform better because of the cache behavior.

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2016-12-01 20:26 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2016-11-24 19:59 [RFC PATCH net-next] ipv6: implement consistent hashing for equal-cost multipath routing David Lebrun
2016-11-28 16:22 ` David Miller
2016-11-28 20:16   ` David Lebrun
2016-11-28 20:32     ` David Miller
2016-11-28 20:58       ` David Lebrun
2016-11-30 13:12       ` Hannes Frederic Sowa
2016-12-01 20:26         ` David Miller

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).