[RFC PATCH 1/1] net/ipv4: Enable flow-based ECMP

netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [RFC PATCH 1/1] net/ipv4: Enable flow-based ECMP
@ 2015-07-28  2:27 Richard Laing
  2015-07-28  7:20 ` Michal Kubecek
                   ` (3 more replies)
  0 siblings, 4 replies; 14+ messages in thread
From: Richard Laing @ 2015-07-28  2:27 UTC (permalink / raw)
  To: netdev@vger.kernel.org; +Cc: jmorris@namei.org

From: Richard Laing <richard.laing@alliedtelesis.co.nz>

Enable flow-based ECMP.

Currently if equal-cost multipath is enabled the kernel chooses between
equal cost paths for each matching packet, essentially packets are
round-robined between the routes. This means that packets from a single
flow can traverse different routes. If one of the routes experiences
congestion this can result in delayed or out of order packets arriving
at the destination.

This patch allows packets to be routed based on their
flow - packets in the same flow will always use the same route. This
prevents out of order packets. There are other issues with round-robin
based ECMP routing related to variable path MTU handling and debugging.
See RFC2991 for more details on the problems associated with packet
based ECMP routing.

This patch relies on the skb hash value to select between routes. The
selection uses a hash-threshold algorithm (see RFC2992).

Signed-off-by: Richard Laing <richard.laing@alliedtelesis.co.nz>
---

  include/net/flow.h       |   14 ++++++++++++++
  include/net/ip_fib.h     |    9 +++++++++
  include/net/route.h      |    4 ++++
  net/ipv4/Kconfig         |    9 +++++++++
  net/ipv4/fib_semantics.c |   38 ++++++++++++++++++++++++++++++++++++++
  net/ipv4/route.c         |   14 +++++++++++++-
  6 files changed, 87 insertions(+), 1 deletion(-)

diff --git a/include/net/flow.h b/include/net/flow.h
index 8109a15..d1d933d 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -79,6 +79,10 @@ struct flowi4 {
  #define fl4_ipsec_spi        uli.spi
  #define fl4_mh_type        uli.mht.type
  #define fl4_gre_key        uli.gre_key
+
+#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
+    __u32    flowi4_hash;
+#endif
  } __attribute__((__aligned__(BITS_PER_LONG/8)));

  static inline void flowi4_init_output(struct flowi4 *fl4, int oif,
@@ -99,6 +103,9 @@ static inline void flowi4_init_output(struct flowi4 
*fl4, int oif,
      fl4->saddr = saddr;
      fl4->fl4_dport = dport;
      fl4->fl4_sport = sport;
+#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
+    fl4->flowi4_hash = 0;
+#endif
  }

  /* Reset some input parameters after previous lookup */
@@ -182,6 +189,13 @@ static inline struct flowi *flowidn_to_flowi(struct 
flowidn *fldn)
      return container_of(fldn, struct flowi, u.dn);
  }

+#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
+static inline void flowi4_set_flow_hash(struct flowi4 *fl, __u32 hash)
+{
+    fl->flowi4_hash = hash;
+}
+#endif
+
  typedef unsigned long flow_compare_t;

  static inline size_t flow_key_size(u16 family)
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 5fa643b..c841698 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -117,6 +117,10 @@ struct fib_info {
  #ifdef CONFIG_IP_ROUTE_MULTIPATH
      int            fib_power;
  #endif
+#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
+    /* Cache the number of live nexthops for flow based ECMP 
calculation. */
+    int            live_nexthops;
+#endif
      struct rcu_head        rcu;
      struct fib_nh        fib_nh[0];
  #define fib_dev        fib_nh[0].nh_dev
@@ -311,6 +315,11 @@ int fib_sync_down_addr(struct net *net, __be32 local);
  int fib_sync_up(struct net_device *dev, unsigned int nh_flags);
  void fib_select_multipath(struct fib_result *res);

+#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
+void fib_select_multipath_for_flow(struct fib_result *res,
+                   const struct flowi4 *fl4);
+#endif
+
  /* Exported by fib_trie.c */
  void fib_trie_init(void);
  struct fib_table *fib_trie_table(u32 id, struct fib_table *alias);
diff --git a/include/net/route.h b/include/net/route.h
index fe22d03..fdbbe7f 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -252,6 +252,10 @@ static inline void ip_route_connect_init(struct 
flowi4 *fl4, __be32 dst, __be32

      flowi4_init_output(fl4, oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE,
                 protocol, flow_flags, dst, src, dport, sport);
+
+#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
+    flowi4_set_flow_hash(fl4, sk->sk_hash);
+#endif
  }

  static inline struct rtable *ip_route_connect(struct flowi4 *fl4,
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 6fb3c90..76714d0 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -90,6 +90,15 @@ config IP_ROUTE_MULTIPATH
        equal "cost" and chooses one of them in a non-deterministic fashion
        if a matching packet arrives.

+config IP_FLOW_BASED_MULTIPATH
+    bool "IP: flow based equal cost multipath"
+    depends on IP_ROUTE_MULTIPATH
+    help
+      Normally if equal cost multipath is enabled the router chooses 
between
+          equal "cost" paths for each matching packet, essentially 
packets are round-
+          robined between the routes. This option allows packets to be 
routed based on
+          their flow - packets in the same flow will always use the 
same route.
+
  config IP_ROUTE_VERBOSE
      bool "IP: verbose route monitoring"
      depends on IP_ADVANCED_ROUTER
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 3a06586..e4750e6 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -981,6 +981,9 @@ link_it:
          head = &fib_info_devhash[hash];
          hlist_add_head(&nexthop_nh->nh_hash, head);
      } endfor_nexthops(fi)
+#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
+    fi->live_nexthops = fi->fib_nhs;
+#endif
      spin_unlock_bh(&fib_info_lock);
      return fi;

@@ -1196,6 +1199,9 @@ int fib_sync_down_dev(struct net_device *dev, 
unsigned long event)
              }
              ret++;
          }
+#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
+        fi->live_nexthops = fi->fib_nhs - dead;
+#endif
      }

      return ret;
@@ -1331,6 +1337,9 @@ int fib_sync_up(struct net_device *dev, unsigned 
int nh_flags)
          if (alive > 0) {
              fi->fib_flags &= ~nh_flags;
              ret++;
+#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
+            fi->live_nexthops = alive;
+#endif
          }
      }

@@ -1397,4 +1406,33 @@ void fib_select_multipath(struct fib_result *res)
      res->nh_sel = 0;
      spin_unlock_bh(&fib_multipath_lock);
  }
+
+#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
+void fib_select_multipath_for_flow(struct fib_result *res,
+                   const struct flowi4 *fl4)
+{
+    struct fib_info *fi = res->fi;
+    int multipath_entry;
+    int region_size;
+
+    if (fl4->flowi4_hash) {
+        /* Hash-threshold algorithm, see RFC2992. */
+        region_size = U32_MAX / fi->live_nexthops;
+        multipath_entry = fl4->flowi4_hash / region_size;
+
+        spin_lock_bh(&fib_multipath_lock);
+        for_nexthops(fi) {
+            if (!(nh->nh_flags & RTNH_F_DEAD)) {
+                res->nh_sel = nhsel;
+                if (multipath_entry == 0)
+                    break;
+                multipath_entry--;
+            }
+        } endfor_nexthops(fi);
+        spin_unlock_bh(&fib_multipath_lock);
+    } else {
+        fib_select_multipath(res);
+    }
+}
+#endif
  #endif
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index e681b85..4629c04 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1638,9 +1638,17 @@ static int ip_mkroute_input(struct sk_buff *skb,
                  __be32 daddr, __be32 saddr, u32 tos)
  {
  #ifdef CONFIG_IP_ROUTE_MULTIPATH
-    if (res->fi && res->fi->fib_nhs > 1)
+    if (res->fi && res->fi->fib_nhs > 1) {
+#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
+        if (skb)
+            flowi4_set_flow_hash((struct flowi4 *)fl4,
+                         skb_get_hash(skb));
+        fib_select_multipath_for_flow(res, fl4);
+#else
          fib_select_multipath(res);
  #endif
+    }
+#endif

      /* create a routing cache entry */
      return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
@@ -2170,7 +2178,11 @@ struct rtable *__ip_route_output_key(struct net 
*net, struct flowi4 *fl4)

  #ifdef CONFIG_IP_ROUTE_MULTIPATH
      if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
+#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
+        fib_select_multipath_for_flow(&res, fl4);
+#else
          fib_select_multipath(&res);
+#endif
      else
  #endif
      if (!res.prefixlen &&

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [RFC PATCH 1/1] net/ipv4: Enable flow-based ECMP
  2015-07-28  2:27 [RFC PATCH 1/1] net/ipv4: Enable flow-based ECMP Richard Laing
@ 2015-07-28  7:20 ` Michal Kubecek
  2015-07-28 19:51 ` Stephen Hemminger
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 14+ messages in thread
From: Michal Kubecek @ 2015-07-28  7:20 UTC (permalink / raw)
  To: Richard Laing; +Cc: netdev@vger.kernel.org, jmorris@namei.org

On Tue, Jul 28, 2015 at 02:27:57AM +0000, Richard Laing wrote:
> From: Richard Laing <richard.laing@alliedtelesis.co.nz>
> 
> Enable flow-based ECMP.
> 
> Currently if equal-cost multipath is enabled the kernel chooses between
> equal cost paths for each matching packet, essentially packets are
> round-robined between the routes. This means that packets from a single
> flow can traverse different routes. If one of the routes experiences
> congestion this can result in delayed or out of order packets arriving
> at the destination.
> 
> This patch allows packets to be routed based on their
> flow - packets in the same flow will always use the same route. This
> prevents out of order packets. There are other issues with round-robin
> based ECMP routing related to variable path MTU handling and debugging.
> See RFC2991 for more details on the problems associated with packet
> based ECMP routing.
> 
> This patch relies on the skb hash value to select between routes. The
> selection uses a hash-threshold algorithm (see RFC2992).
> 
> Signed-off-by: Richard Laing <richard.laing@alliedtelesis.co.nz>

The patch looks corrupted (long lines split, tabs converted to (four?)
spaces etc.

                                                        Michal Kubecek

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC PATCH 1/1] net/ipv4: Enable flow-based ECMP
  2015-07-28  2:27 [RFC PATCH 1/1] net/ipv4: Enable flow-based ECMP Richard Laing
  2015-07-28  7:20 ` Michal Kubecek
@ 2015-07-28 19:51 ` Stephen Hemminger
  2015-07-28 21:16   ` Richard Laing
  2015-07-28 21:02 ` Tom Herbert
  2015-07-29  7:56 ` Julian Anastasov
  3 siblings, 1 reply; 14+ messages in thread
From: Stephen Hemminger @ 2015-07-28 19:51 UTC (permalink / raw)
  To: Richard Laing; +Cc: netdev@vger.kernel.org, jmorris@namei.org

On Tue, 28 Jul 2015 02:27:57 +0000
Richard Laing <Richard.Laing@alliedtelesis.co.nz> wrote:

> From: Richard Laing <richard.laing@alliedtelesis.co.nz>
> 
> Enable flow-based ECMP.
> 
> Currently if equal-cost multipath is enabled the kernel chooses between
> equal cost paths for each matching packet, essentially packets are
> round-robined between the routes. This means that packets from a single
> flow can traverse different routes. If one of the routes experiences
> congestion this can result in delayed or out of order packets arriving
> at the destination.
> 
> This patch allows packets to be routed based on their
> flow - packets in the same flow will always use the same route. This
> prevents out of order packets. There are other issues with round-robin
> based ECMP routing related to variable path MTU handling and debugging.
> See RFC2991 for more details on the problems associated with packet
> based ECMP routing.
> 
> This patch relies on the skb hash value to select between routes. The
> selection uses a hash-threshold algorithm (see RFC2992).
> 
> Signed-off-by: Richard Laing <richard.laing@alliedtelesis.co.nz>

ECMP needs some work, glad to see someone has started again.
A little bit of history, the original ECMP algorithm selection code was dropped years ago
because it was buggy. The API was good, but the implemenation had problems.

Somewhere in my dustbin, I have some patches from internal stuff that allows
selecting the algorithm at runtime.

Your implementation makes it a compile time choice. Although this is good
for testing, it really needs to be a sysctl or route attribute to be
useful in a distro environment.

As others said, your mailer is mangling the lines by wrapping.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC PATCH 1/1] net/ipv4: Enable flow-based ECMP
  2015-07-28  2:27 [RFC PATCH 1/1] net/ipv4: Enable flow-based ECMP Richard Laing
  2015-07-28  7:20 ` Michal Kubecek
  2015-07-28 19:51 ` Stephen Hemminger
@ 2015-07-28 21:02 ` Tom Herbert
  2015-07-28 21:15   ` Tom Herbert
  2015-07-28 21:20   ` Richard Laing
  2015-07-29  7:56 ` Julian Anastasov
  3 siblings, 2 replies; 14+ messages in thread
From: Tom Herbert @ 2015-07-28 21:02 UTC (permalink / raw)
  To: Richard Laing; +Cc: netdev@vger.kernel.org, jmorris@namei.org

On Mon, Jul 27, 2015 at 7:27 PM, Richard Laing
<Richard.Laing@alliedtelesis.co.nz> wrote:
> From: Richard Laing <richard.laing@alliedtelesis.co.nz>
>
> Enable flow-based ECMP.
>
> Currently if equal-cost multipath is enabled the kernel chooses between
> equal cost paths for each matching packet, essentially packets are
> round-robined between the routes. This means that packets from a single
> flow can traverse different routes. If one of the routes experiences
> congestion this can result in delayed or out of order packets arriving
> at the destination.
>
Richard, someone was complaining to me just last week about the
weakness of the round robin algorithm. Thanks for looking into this!

> This patch allows packets to be routed based on their
> flow - packets in the same flow will always use the same route. This
> prevents out of order packets. There are other issues with round-robin
> based ECMP routing related to variable path MTU handling and debugging.
> See RFC2991 for more details on the problems associated with packet
> based ECMP routing.
>
> This patch relies on the skb hash value to select between routes. The
> selection uses a hash-threshold algorithm (see RFC2992).
>
> Signed-off-by: Richard Laing <richard.laing@alliedtelesis.co.nz>
> ---
>
>   include/net/flow.h       |   14 ++++++++++++++
>   include/net/ip_fib.h     |    9 +++++++++
>   include/net/route.h      |    4 ++++
>   net/ipv4/Kconfig         |    9 +++++++++
>   net/ipv4/fib_semantics.c |   38 ++++++++++++++++++++++++++++++++++++++
>   net/ipv4/route.c         |   14 +++++++++++++-
>   6 files changed, 87 insertions(+), 1 deletion(-)
>
> diff --git a/include/net/flow.h b/include/net/flow.h
> index 8109a15..d1d933d 100644
> --- a/include/net/flow.h
> +++ b/include/net/flow.h
> @@ -79,6 +79,10 @@ struct flowi4 {
>   #define fl4_ipsec_spi        uli.spi
>   #define fl4_mh_type        uli.mht.type
>   #define fl4_gre_key        uli.gre_key
> +
> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH

Why bother making this a CONFIG, round robin is a miserable algorithm
anyway and nearly all the other packet steering mechanisms already use
a hash.

> +    __u32    flowi4_hash;
> +#endif
>   } __attribute__((__aligned__(BITS_PER_LONG/8)));
>
>   static inline void flowi4_init_output(struct flowi4 *fl4, int oif,
> @@ -99,6 +103,9 @@ static inline void flowi4_init_output(struct flowi4
> *fl4, int oif,
>       fl4->saddr = saddr;
>       fl4->fl4_dport = dport;
>       fl4->fl4_sport = sport;
> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
> +    fl4->flowi4_hash = 0;
> +#endif
>   }
>
>   /* Reset some input parameters after previous lookup */
> @@ -182,6 +189,13 @@ static inline struct flowi *flowidn_to_flowi(struct
> flowidn *fldn)
>       return container_of(fldn, struct flowi, u.dn);
>   }
>
> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
> +static inline void flowi4_set_flow_hash(struct flowi4 *fl, __u32 hash)
> +{
> +    fl->flowi4_hash = hash;
> +}
> +#endif
> +
>   typedef unsigned long flow_compare_t;
>
>   static inline size_t flow_key_size(u16 family)
> diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
> index 5fa643b..c841698 100644
> --- a/include/net/ip_fib.h
> +++ b/include/net/ip_fib.h
> @@ -117,6 +117,10 @@ struct fib_info {
>   #ifdef CONFIG_IP_ROUTE_MULTIPATH
>       int            fib_power;
>   #endif
> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
> +    /* Cache the number of live nexthops for flow based ECMP
> calculation. */
> +    int            live_nexthops;
> +#endif
>       struct rcu_head        rcu;
>       struct fib_nh        fib_nh[0];
>   #define fib_dev        fib_nh[0].nh_dev
> @@ -311,6 +315,11 @@ int fib_sync_down_addr(struct net *net, __be32 local);
>   int fib_sync_up(struct net_device *dev, unsigned int nh_flags);
>   void fib_select_multipath(struct fib_result *res);
>
> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
> +void fib_select_multipath_for_flow(struct fib_result *res,
> +                   const struct flowi4 *fl4);
> +#endif
> +
>   /* Exported by fib_trie.c */
>   void fib_trie_init(void);
>   struct fib_table *fib_trie_table(u32 id, struct fib_table *alias);
> diff --git a/include/net/route.h b/include/net/route.h
> index fe22d03..fdbbe7f 100644
> --- a/include/net/route.h
> +++ b/include/net/route.h
> @@ -252,6 +252,10 @@ static inline void ip_route_connect_init(struct
> flowi4 *fl4, __be32 dst, __be32
>
>       flowi4_init_output(fl4, oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE,
>                  protocol, flow_flags, dst, src, dport, sport);
> +
> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
> +    flowi4_set_flow_hash(fl4, sk->sk_hash);

Should be sk_txhash I think.

> +#endif
>   }
>
>   static inline struct rtable *ip_route_connect(struct flowi4 *fl4,
> diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
> index 6fb3c90..76714d0 100644
> --- a/net/ipv4/Kconfig
> +++ b/net/ipv4/Kconfig
> @@ -90,6 +90,15 @@ config IP_ROUTE_MULTIPATH
>         equal "cost" and chooses one of them in a non-deterministic fashion
>         if a matching packet arrives.
>
> +config IP_FLOW_BASED_MULTIPATH
> +    bool "IP: flow based equal cost multipath"
> +    depends on IP_ROUTE_MULTIPATH
> +    help
> +      Normally if equal cost multipath is enabled the router chooses
> between
> +          equal "cost" paths for each matching packet, essentially
> packets are round-
> +          robined between the routes. This option allows packets to be
> routed based on
> +          their flow - packets in the same flow will always use the
> same route.
> +
>   config IP_ROUTE_VERBOSE
>       bool "IP: verbose route monitoring"
>       depends on IP_ADVANCED_ROUTER
> diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
> index 3a06586..e4750e6 100644
> --- a/net/ipv4/fib_semantics.c
> +++ b/net/ipv4/fib_semantics.c
> @@ -981,6 +981,9 @@ link_it:
>           head = &fib_info_devhash[hash];
>           hlist_add_head(&nexthop_nh->nh_hash, head);
>       } endfor_nexthops(fi)
> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
> +    fi->live_nexthops = fi->fib_nhs;
> +#endif
>       spin_unlock_bh(&fib_info_lock);
>       return fi;
>
> @@ -1196,6 +1199,9 @@ int fib_sync_down_dev(struct net_device *dev,
> unsigned long event)
>               }
>               ret++;
>           }
> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
> +        fi->live_nexthops = fi->fib_nhs - dead;
> +#endif
>       }
>
>       return ret;
> @@ -1331,6 +1337,9 @@ int fib_sync_up(struct net_device *dev, unsigned
> int nh_flags)
>           if (alive > 0) {
>               fi->fib_flags &= ~nh_flags;
>               ret++;
> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
> +            fi->live_nexthops = alive;
> +#endif
>           }
>       }
>
> @@ -1397,4 +1406,33 @@ void fib_select_multipath(struct fib_result *res)
>       res->nh_sel = 0;
>       spin_unlock_bh(&fib_multipath_lock);
>   }
> +
> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
> +void fib_select_multipath_for_flow(struct fib_result *res,
> +                   const struct flowi4 *fl4)
> +{
> +    struct fib_info *fi = res->fi;
> +    int multipath_entry;
> +    int region_size;
> +
> +    if (fl4->flowi4_hash) {
> +        /* Hash-threshold algorithm, see RFC2992. */
> +        region_size = U32_MAX / fi->live_nexthops;
> +        multipath_entry = fl4->flowi4_hash / region_size;
> +
> +        spin_lock_bh(&fib_multipath_lock);
> +        for_nexthops(fi) {
> +            if (!(nh->nh_flags & RTNH_F_DEAD)) {
> +                res->nh_sel = nhsel;
> +                if (multipath_entry == 0)
> +                    break;
> +                multipath_entry--;
> +            }
> +        } endfor_nexthops(fi);
> +        spin_unlock_bh(&fib_multipath_lock);
> +    } else {
> +        fib_select_multipath(res);
> +    }
> +}
> +#endif
>   #endif
> diff --git a/net/ipv4/route.c b/net/ipv4/route.c
> index e681b85..4629c04 100644
> --- a/net/ipv4/route.c
> +++ b/net/ipv4/route.c
> @@ -1638,9 +1638,17 @@ static int ip_mkroute_input(struct sk_buff *skb,
>                   __be32 daddr, __be32 saddr, u32 tos)
>   {
>   #ifdef CONFIG_IP_ROUTE_MULTIPATH
> -    if (res->fi && res->fi->fib_nhs > 1)
> +    if (res->fi && res->fi->fib_nhs > 1) {
> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
> +        if (skb)
> +            flowi4_set_flow_hash((struct flowi4 *)fl4,
> +                         skb_get_hash(skb));
> +        fib_select_multipath_for_flow(res, fl4);
> +#else
>           fib_select_multipath(res);
>   #endif
> +    }
> +#endif
>
>       /* create a routing cache entry */
>       return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
> @@ -2170,7 +2178,11 @@ struct rtable *__ip_route_output_key(struct net
> *net, struct flowi4 *fl4)
>
>   #ifdef CONFIG_IP_ROUTE_MULTIPATH
>       if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
> +        fib_select_multipath_for_flow(&res, fl4);
> +#else
>           fib_select_multipath(&res);
> +#endif
>       else
>   #endif
>       if (!res.prefixlen &&

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC PATCH 1/1] net/ipv4: Enable flow-based ECMP
  2015-07-28 21:02 ` Tom Herbert
@ 2015-07-28 21:15   ` Tom Herbert
  2015-07-28 21:22     ` Richard Laing
  2015-07-28 21:20   ` Richard Laing
  1 sibling, 1 reply; 14+ messages in thread
From: Tom Herbert @ 2015-07-28 21:15 UTC (permalink / raw)
  To: Richard Laing; +Cc: netdev@vger.kernel.org, jmorris@namei.org, Martin Lau

On Tue, Jul 28, 2015 at 2:02 PM, Tom Herbert <tom@herbertland.com> wrote:
> On Mon, Jul 27, 2015 at 7:27 PM, Richard Laing
> <Richard.Laing@alliedtelesis.co.nz> wrote:
>> From: Richard Laing <richard.laing@alliedtelesis.co.nz>
>>
>> Enable flow-based ECMP.
>>
>> Currently if equal-cost multipath is enabled the kernel chooses between
>> equal cost paths for each matching packet, essentially packets are
>> round-robined between the routes. This means that packets from a single
>> flow can traverse different routes. If one of the routes experiences
>> congestion this can result in delayed or out of order packets arriving
>> at the destination.
>>
> Richard, someone was complaining to me just last week about the
> weakness of the round robin algorithm. Thanks for looking into this!
>
>> This patch allows packets to be routed based on their
>> flow - packets in the same flow will always use the same route. This
>> prevents out of order packets. There are other issues with round-robin
>> based ECMP routing related to variable path MTU handling and debugging.
>> See RFC2991 for more details on the problems associated with packet
>> based ECMP routing.
>>

btw, it looks like IPv6 is already doing the hash but is calculating
it by hand instead of using sk_txhash or skb->hash. It would be nice
if we could unify this between v4 and v6 at some point to both using
the existing hashes.

Tom

>> This patch relies on the skb hash value to select between routes. The
>> selection uses a hash-threshold algorithm (see RFC2992).
>>
>> Signed-off-by: Richard Laing <richard.laing@alliedtelesis.co.nz>
>> ---
>>
>>   include/net/flow.h       |   14 ++++++++++++++
>>   include/net/ip_fib.h     |    9 +++++++++
>>   include/net/route.h      |    4 ++++
>>   net/ipv4/Kconfig         |    9 +++++++++
>>   net/ipv4/fib_semantics.c |   38 ++++++++++++++++++++++++++++++++++++++
>>   net/ipv4/route.c         |   14 +++++++++++++-
>>   6 files changed, 87 insertions(+), 1 deletion(-)
>>
>> diff --git a/include/net/flow.h b/include/net/flow.h
>> index 8109a15..d1d933d 100644
>> --- a/include/net/flow.h
>> +++ b/include/net/flow.h
>> @@ -79,6 +79,10 @@ struct flowi4 {
>>   #define fl4_ipsec_spi        uli.spi
>>   #define fl4_mh_type        uli.mht.type
>>   #define fl4_gre_key        uli.gre_key
>> +
>> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
>
> Why bother making this a CONFIG, round robin is a miserable algorithm
> anyway and nearly all the other packet steering mechanisms already use
> a hash.
>
>> +    __u32    flowi4_hash;
>> +#endif
>>   } __attribute__((__aligned__(BITS_PER_LONG/8)));
>>
>>   static inline void flowi4_init_output(struct flowi4 *fl4, int oif,
>> @@ -99,6 +103,9 @@ static inline void flowi4_init_output(struct flowi4
>> *fl4, int oif,
>>       fl4->saddr = saddr;
>>       fl4->fl4_dport = dport;
>>       fl4->fl4_sport = sport;
>> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
>> +    fl4->flowi4_hash = 0;
>> +#endif
>>   }
>>
>>   /* Reset some input parameters after previous lookup */
>> @@ -182,6 +189,13 @@ static inline struct flowi *flowidn_to_flowi(struct
>> flowidn *fldn)
>>       return container_of(fldn, struct flowi, u.dn);
>>   }
>>
>> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
>> +static inline void flowi4_set_flow_hash(struct flowi4 *fl, __u32 hash)
>> +{
>> +    fl->flowi4_hash = hash;
>> +}
>> +#endif
>> +
>>   typedef unsigned long flow_compare_t;
>>
>>   static inline size_t flow_key_size(u16 family)
>> diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
>> index 5fa643b..c841698 100644
>> --- a/include/net/ip_fib.h
>> +++ b/include/net/ip_fib.h
>> @@ -117,6 +117,10 @@ struct fib_info {
>>   #ifdef CONFIG_IP_ROUTE_MULTIPATH
>>       int            fib_power;
>>   #endif
>> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
>> +    /* Cache the number of live nexthops for flow based ECMP
>> calculation. */
>> +    int            live_nexthops;
>> +#endif
>>       struct rcu_head        rcu;
>>       struct fib_nh        fib_nh[0];
>>   #define fib_dev        fib_nh[0].nh_dev
>> @@ -311,6 +315,11 @@ int fib_sync_down_addr(struct net *net, __be32 local);
>>   int fib_sync_up(struct net_device *dev, unsigned int nh_flags);
>>   void fib_select_multipath(struct fib_result *res);
>>
>> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
>> +void fib_select_multipath_for_flow(struct fib_result *res,
>> +                   const struct flowi4 *fl4);
>> +#endif
>> +
>>   /* Exported by fib_trie.c */
>>   void fib_trie_init(void);
>>   struct fib_table *fib_trie_table(u32 id, struct fib_table *alias);
>> diff --git a/include/net/route.h b/include/net/route.h
>> index fe22d03..fdbbe7f 100644
>> --- a/include/net/route.h
>> +++ b/include/net/route.h
>> @@ -252,6 +252,10 @@ static inline void ip_route_connect_init(struct
>> flowi4 *fl4, __be32 dst, __be32
>>
>>       flowi4_init_output(fl4, oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE,
>>                  protocol, flow_flags, dst, src, dport, sport);
>> +
>> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
>> +    flowi4_set_flow_hash(fl4, sk->sk_hash);
>
> Should be sk_txhash I think.
>
>> +#endif
>>   }
>>
>>   static inline struct rtable *ip_route_connect(struct flowi4 *fl4,
>> diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
>> index 6fb3c90..76714d0 100644
>> --- a/net/ipv4/Kconfig
>> +++ b/net/ipv4/Kconfig
>> @@ -90,6 +90,15 @@ config IP_ROUTE_MULTIPATH
>>         equal "cost" and chooses one of them in a non-deterministic fashion
>>         if a matching packet arrives.
>>
>> +config IP_FLOW_BASED_MULTIPATH
>> +    bool "IP: flow based equal cost multipath"
>> +    depends on IP_ROUTE_MULTIPATH
>> +    help
>> +      Normally if equal cost multipath is enabled the router chooses
>> between
>> +          equal "cost" paths for each matching packet, essentially
>> packets are round-
>> +          robined between the routes. This option allows packets to be
>> routed based on
>> +          their flow - packets in the same flow will always use the
>> same route.
>> +
>>   config IP_ROUTE_VERBOSE
>>       bool "IP: verbose route monitoring"
>>       depends on IP_ADVANCED_ROUTER
>> diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
>> index 3a06586..e4750e6 100644
>> --- a/net/ipv4/fib_semantics.c
>> +++ b/net/ipv4/fib_semantics.c
>> @@ -981,6 +981,9 @@ link_it:
>>           head = &fib_info_devhash[hash];
>>           hlist_add_head(&nexthop_nh->nh_hash, head);
>>       } endfor_nexthops(fi)
>> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
>> +    fi->live_nexthops = fi->fib_nhs;
>> +#endif
>>       spin_unlock_bh(&fib_info_lock);
>>       return fi;
>>
>> @@ -1196,6 +1199,9 @@ int fib_sync_down_dev(struct net_device *dev,
>> unsigned long event)
>>               }
>>               ret++;
>>           }
>> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
>> +        fi->live_nexthops = fi->fib_nhs - dead;
>> +#endif
>>       }
>>
>>       return ret;
>> @@ -1331,6 +1337,9 @@ int fib_sync_up(struct net_device *dev, unsigned
>> int nh_flags)
>>           if (alive > 0) {
>>               fi->fib_flags &= ~nh_flags;
>>               ret++;
>> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
>> +            fi->live_nexthops = alive;
>> +#endif
>>           }
>>       }
>>
>> @@ -1397,4 +1406,33 @@ void fib_select_multipath(struct fib_result *res)
>>       res->nh_sel = 0;
>>       spin_unlock_bh(&fib_multipath_lock);
>>   }
>> +
>> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
>> +void fib_select_multipath_for_flow(struct fib_result *res,
>> +                   const struct flowi4 *fl4)
>> +{
>> +    struct fib_info *fi = res->fi;
>> +    int multipath_entry;
>> +    int region_size;
>> +
>> +    if (fl4->flowi4_hash) {
>> +        /* Hash-threshold algorithm, see RFC2992. */
>> +        region_size = U32_MAX / fi->live_nexthops;
>> +        multipath_entry = fl4->flowi4_hash / region_size;
>> +
>> +        spin_lock_bh(&fib_multipath_lock);
>> +        for_nexthops(fi) {
>> +            if (!(nh->nh_flags & RTNH_F_DEAD)) {
>> +                res->nh_sel = nhsel;
>> +                if (multipath_entry == 0)
>> +                    break;
>> +                multipath_entry--;
>> +            }
>> +        } endfor_nexthops(fi);
>> +        spin_unlock_bh(&fib_multipath_lock);
>> +    } else {
>> +        fib_select_multipath(res);
>> +    }
>> +}
>> +#endif
>>   #endif
>> diff --git a/net/ipv4/route.c b/net/ipv4/route.c
>> index e681b85..4629c04 100644
>> --- a/net/ipv4/route.c
>> +++ b/net/ipv4/route.c
>> @@ -1638,9 +1638,17 @@ static int ip_mkroute_input(struct sk_buff *skb,
>>                   __be32 daddr, __be32 saddr, u32 tos)
>>   {
>>   #ifdef CONFIG_IP_ROUTE_MULTIPATH
>> -    if (res->fi && res->fi->fib_nhs > 1)
>> +    if (res->fi && res->fi->fib_nhs > 1) {
>> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
>> +        if (skb)
>> +            flowi4_set_flow_hash((struct flowi4 *)fl4,
>> +                         skb_get_hash(skb));
>> +        fib_select_multipath_for_flow(res, fl4);
>> +#else
>>           fib_select_multipath(res);
>>   #endif
>> +    }
>> +#endif
>>
>>       /* create a routing cache entry */
>>       return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
>> @@ -2170,7 +2178,11 @@ struct rtable *__ip_route_output_key(struct net
>> *net, struct flowi4 *fl4)
>>
>>   #ifdef CONFIG_IP_ROUTE_MULTIPATH
>>       if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
>> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
>> +        fib_select_multipath_for_flow(&res, fl4);
>> +#else
>>           fib_select_multipath(&res);
>> +#endif
>>       else
>>   #endif
>>       if (!res.prefixlen &&

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC PATCH 1/1] net/ipv4: Enable flow-based ECMP
  2015-07-28 19:51 ` Stephen Hemminger
@ 2015-07-28 21:16   ` Richard Laing
  0 siblings, 0 replies; 14+ messages in thread
From: Richard Laing @ 2015-07-28 21:16 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev@vger.kernel.org, jmorris@namei.org


On 07/29/2015 07:51 AM, Stephen Hemminger wrote:
> On Tue, 28 Jul 2015 02:27:57 +0000
> Richard Laing <Richard.Laing@alliedtelesis.co.nz> wrote:
>
>> From: Richard Laing <richard.laing@alliedtelesis.co.nz>
>>
>> Enable flow-based ECMP.
>>
>> Currently if equal-cost multipath is enabled the kernel chooses between
>> equal cost paths for each matching packet, essentially packets are
>> round-robined between the routes. This means that packets from a single
>> flow can traverse different routes. If one of the routes experiences
>> congestion this can result in delayed or out of order packets arriving
>> at the destination.
>>
>> This patch allows packets to be routed based on their
>> flow - packets in the same flow will always use the same route. This
>> prevents out of order packets. There are other issues with round-robin
>> based ECMP routing related to variable path MTU handling and debugging.
>> See RFC2991 for more details on the problems associated with packet
>> based ECMP routing.
>>
>> This patch relies on the skb hash value to select between routes. The
>> selection uses a hash-threshold algorithm (see RFC2992).
>>
>> Signed-off-by: Richard Laing <richard.laing@alliedtelesis.co.nz>
> ECMP needs some work, glad to see someone has started again.
> A little bit of history, the original ECMP algorithm selection code was dropped years ago
> because it was buggy. The API was good, but the implemenation had problems.
>
> Somewhere in my dustbin, I have some patches from internal stuff that allows
> selecting the algorithm at runtime.
>
> Your implementation makes it a compile time choice. Although this is good
> for testing, it really needs to be a sysctl or route attribute to be
> useful in a distro environment.
>
> As others said, your mailer is mangling the lines by wrapping.

Thanks Stephen, not sure what happened with the formatting, I'll try and 
find out. I will look at a sysctl option to enable the flow based ECMP 
option.

Cheers
Richard

-- 
Richard Laing
Software Team Leader
Allied Telesis Labs| 27 Nazareth Ave | Christchurch 8024 | New Zealand
Phone: +64 3 339 9248
Web: www.alliedtelesis.com

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC PATCH 1/1] net/ipv4: Enable flow-based ECMP
  2015-07-28 21:02 ` Tom Herbert
  2015-07-28 21:15   ` Tom Herbert
@ 2015-07-28 21:20   ` Richard Laing
  2015-07-29  6:11     ` Michal Kubecek
  1 sibling, 1 reply; 14+ messages in thread
From: Richard Laing @ 2015-07-28 21:20 UTC (permalink / raw)
  To: Tom Herbert; +Cc: netdev@vger.kernel.org, jmorris@namei.org



On 07/29/2015 09:02 AM, Tom Herbert wrote:
> On Mon, Jul 27, 2015 at 7:27 PM, Richard Laing
> <Richard.Laing@alliedtelesis.co.nz> wrote:
>> From: Richard Laing <richard.laing@alliedtelesis.co.nz>
>>
>> Enable flow-based ECMP.
>>
>> Currently if equal-cost multipath is enabled the kernel chooses between
>> equal cost paths for each matching packet, essentially packets are
>> round-robined between the routes. This means that packets from a single
>> flow can traverse different routes. If one of the routes experiences
>> congestion this can result in delayed or out of order packets arriving
>> at the destination.
>>
> Richard, someone was complaining to me just last week about the
> weakness of the round robin algorithm. Thanks for looking into this!
>
>> This patch allows packets to be routed based on their
>> flow - packets in the same flow will always use the same route. This
>> prevents out of order packets. There are other issues with round-robin
>> based ECMP routing related to variable path MTU handling and debugging.
>> See RFC2991 for more details on the problems associated with packet
>> based ECMP routing.
>>
>> This patch relies on the skb hash value to select between routes. The
>> selection uses a hash-threshold algorithm (see RFC2992).
>>
>> Signed-off-by: Richard Laing <richard.laing@alliedtelesis.co.nz>
>> ---
>>
>>    include/net/flow.h       |   14 ++++++++++++++
>>    include/net/ip_fib.h     |    9 +++++++++
>>    include/net/route.h      |    4 ++++
>>    net/ipv4/Kconfig         |    9 +++++++++
>>    net/ipv4/fib_semantics.c |   38 ++++++++++++++++++++++++++++++++++++++
>>    net/ipv4/route.c         |   14 +++++++++++++-
>>    6 files changed, 87 insertions(+), 1 deletion(-)
>>
>> diff --git a/include/net/flow.h b/include/net/flow.h
>> index 8109a15..d1d933d 100644
>> --- a/include/net/flow.h
>> +++ b/include/net/flow.h
>> @@ -79,6 +79,10 @@ struct flowi4 {
>>    #define fl4_ipsec_spi        uli.spi
>>    #define fl4_mh_type        uli.mht.type
>>    #define fl4_gre_key        uli.gre_key
>> +
>> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
> Why bother making this a CONFIG, round robin is a miserable algorithm
> anyway and nearly all the other packet steering mechanisms already use
> a hash.

Fair enough, I will look at making it a sysctl option. I guess the 
default can be the current behaviour.
>
>> +    __u32    flowi4_hash;
>> +#endif
>>    } __attribute__((__aligned__(BITS_PER_LONG/8)));
>>
>>    static inline void flowi4_init_output(struct flowi4 *fl4, int oif,
>> @@ -99,6 +103,9 @@ static inline void flowi4_init_output(struct flowi4
>> *fl4, int oif,
>>        fl4->saddr = saddr;
>>        fl4->fl4_dport = dport;
>>        fl4->fl4_sport = sport;
>> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
>> +    fl4->flowi4_hash = 0;
>> +#endif
>>    }
>>
>>    /* Reset some input parameters after previous lookup */
>> @@ -182,6 +189,13 @@ static inline struct flowi *flowidn_to_flowi(struct
>> flowidn *fldn)
>>        return container_of(fldn, struct flowi, u.dn);
>>    }
>>
>> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
>> +static inline void flowi4_set_flow_hash(struct flowi4 *fl, __u32 hash)
>> +{
>> +    fl->flowi4_hash = hash;
>> +}
>> +#endif
>> +
>>    typedef unsigned long flow_compare_t;
>>
>>    static inline size_t flow_key_size(u16 family)
>> diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
>> index 5fa643b..c841698 100644
>> --- a/include/net/ip_fib.h
>> +++ b/include/net/ip_fib.h
>> @@ -117,6 +117,10 @@ struct fib_info {
>>    #ifdef CONFIG_IP_ROUTE_MULTIPATH
>>        int            fib_power;
>>    #endif
>> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
>> +    /* Cache the number of live nexthops for flow based ECMP
>> calculation. */
>> +    int            live_nexthops;
>> +#endif
>>        struct rcu_head        rcu;
>>        struct fib_nh        fib_nh[0];
>>    #define fib_dev        fib_nh[0].nh_dev
>> @@ -311,6 +315,11 @@ int fib_sync_down_addr(struct net *net, __be32 local);
>>    int fib_sync_up(struct net_device *dev, unsigned int nh_flags);
>>    void fib_select_multipath(struct fib_result *res);
>>
>> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
>> +void fib_select_multipath_for_flow(struct fib_result *res,
>> +                   const struct flowi4 *fl4);
>> +#endif
>> +
>>    /* Exported by fib_trie.c */
>>    void fib_trie_init(void);
>>    struct fib_table *fib_trie_table(u32 id, struct fib_table *alias);
>> diff --git a/include/net/route.h b/include/net/route.h
>> index fe22d03..fdbbe7f 100644
>> --- a/include/net/route.h
>> +++ b/include/net/route.h
>> @@ -252,6 +252,10 @@ static inline void ip_route_connect_init(struct
>> flowi4 *fl4, __be32 dst, __be32
>>
>>        flowi4_init_output(fl4, oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE,
>>                   protocol, flow_flags, dst, src, dport, sport);
>> +
>> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
>> +    flowi4_set_flow_hash(fl4, sk->sk_hash);
> Should be sk_txhash I think.

I will check, it looks likely!
>
>> +#endif
>>    }
>>
>>    static inline struct rtable *ip_route_connect(struct flowi4 *fl4,
>> diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
>> index 6fb3c90..76714d0 100644
>> --- a/net/ipv4/Kconfig
>> +++ b/net/ipv4/Kconfig
>> @@ -90,6 +90,15 @@ config IP_ROUTE_MULTIPATH
>>          equal "cost" and chooses one of them in a non-deterministic fashion
>>          if a matching packet arrives.
>>
>> +config IP_FLOW_BASED_MULTIPATH
>> +    bool "IP: flow based equal cost multipath"
>> +    depends on IP_ROUTE_MULTIPATH
>> +    help
>> +      Normally if equal cost multipath is enabled the router chooses
>> between
>> +          equal "cost" paths for each matching packet, essentially
>> packets are round-
>> +          robined between the routes. This option allows packets to be
>> routed based on
>> +          their flow - packets in the same flow will always use the
>> same route.
>> +
>>    config IP_ROUTE_VERBOSE
>>        bool "IP: verbose route monitoring"
>>        depends on IP_ADVANCED_ROUTER
>> diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
>> index 3a06586..e4750e6 100644
>> --- a/net/ipv4/fib_semantics.c
>> +++ b/net/ipv4/fib_semantics.c
>> @@ -981,6 +981,9 @@ link_it:
>>            head = &fib_info_devhash[hash];
>>            hlist_add_head(&nexthop_nh->nh_hash, head);
>>        } endfor_nexthops(fi)
>> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
>> +    fi->live_nexthops = fi->fib_nhs;
>> +#endif
>>        spin_unlock_bh(&fib_info_lock);
>>        return fi;
>>
>> @@ -1196,6 +1199,9 @@ int fib_sync_down_dev(struct net_device *dev,
>> unsigned long event)
>>                }
>>                ret++;
>>            }
>> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
>> +        fi->live_nexthops = fi->fib_nhs - dead;
>> +#endif
>>        }
>>
>>        return ret;
>> @@ -1331,6 +1337,9 @@ int fib_sync_up(struct net_device *dev, unsigned
>> int nh_flags)
>>            if (alive > 0) {
>>                fi->fib_flags &= ~nh_flags;
>>                ret++;
>> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
>> +            fi->live_nexthops = alive;
>> +#endif
>>            }
>>        }
>>
>> @@ -1397,4 +1406,33 @@ void fib_select_multipath(struct fib_result *res)
>>        res->nh_sel = 0;
>>        spin_unlock_bh(&fib_multipath_lock);
>>    }
>> +
>> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
>> +void fib_select_multipath_for_flow(struct fib_result *res,
>> +                   const struct flowi4 *fl4)
>> +{
>> +    struct fib_info *fi = res->fi;
>> +    int multipath_entry;
>> +    int region_size;
>> +
>> +    if (fl4->flowi4_hash) {
>> +        /* Hash-threshold algorithm, see RFC2992. */
>> +        region_size = U32_MAX / fi->live_nexthops;
>> +        multipath_entry = fl4->flowi4_hash / region_size;
>> +
>> +        spin_lock_bh(&fib_multipath_lock);
>> +        for_nexthops(fi) {
>> +            if (!(nh->nh_flags & RTNH_F_DEAD)) {
>> +                res->nh_sel = nhsel;
>> +                if (multipath_entry == 0)
>> +                    break;
>> +                multipath_entry--;
>> +            }
>> +        } endfor_nexthops(fi);
>> +        spin_unlock_bh(&fib_multipath_lock);
>> +    } else {
>> +        fib_select_multipath(res);
>> +    }
>> +}
>> +#endif
>>    #endif
>> diff --git a/net/ipv4/route.c b/net/ipv4/route.c
>> index e681b85..4629c04 100644
>> --- a/net/ipv4/route.c
>> +++ b/net/ipv4/route.c
>> @@ -1638,9 +1638,17 @@ static int ip_mkroute_input(struct sk_buff *skb,
>>                    __be32 daddr, __be32 saddr, u32 tos)
>>    {
>>    #ifdef CONFIG_IP_ROUTE_MULTIPATH
>> -    if (res->fi && res->fi->fib_nhs > 1)
>> +    if (res->fi && res->fi->fib_nhs > 1) {
>> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
>> +        if (skb)
>> +            flowi4_set_flow_hash((struct flowi4 *)fl4,
>> +                         skb_get_hash(skb));
>> +        fib_select_multipath_for_flow(res, fl4);
>> +#else
>>            fib_select_multipath(res);
>>    #endif
>> +    }
>> +#endif
>>
>>        /* create a routing cache entry */
>>        return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
>> @@ -2170,7 +2178,11 @@ struct rtable *__ip_route_output_key(struct net
>> *net, struct flowi4 *fl4)
>>
>>    #ifdef CONFIG_IP_ROUTE_MULTIPATH
>>        if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
>> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
>> +        fib_select_multipath_for_flow(&res, fl4);
>> +#else
>>            fib_select_multipath(&res);
>> +#endif
>>        else
>>    #endif
>>        if (!res.prefixlen &&

-- 
Richard Laing
Software Team Leader
Allied Telesis Labs| 27 Nazareth Ave | Christchurch 8024 | New Zealand
Phone: +64 3 339 9248
Web: www.alliedtelesis.com

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC PATCH 1/1] net/ipv4: Enable flow-based ECMP
  2015-07-28 21:15   ` Tom Herbert
@ 2015-07-28 21:22     ` Richard Laing
  0 siblings, 0 replies; 14+ messages in thread
From: Richard Laing @ 2015-07-28 21:22 UTC (permalink / raw)
  To: Tom Herbert; +Cc: netdev@vger.kernel.org, jmorris@namei.org, Martin Lau



On 07/29/2015 09:15 AM, Tom Herbert wrote:
> On Tue, Jul 28, 2015 at 2:02 PM, Tom Herbert <tom@herbertland.com> wrote:
>> On Mon, Jul 27, 2015 at 7:27 PM, Richard Laing
>> <Richard.Laing@alliedtelesis.co.nz> wrote:
>>> From: Richard Laing <richard.laing@alliedtelesis.co.nz>
>>>
>>> Enable flow-based ECMP.
>>>
>>> Currently if equal-cost multipath is enabled the kernel chooses between
>>> equal cost paths for each matching packet, essentially packets are
>>> round-robined between the routes. This means that packets from a single
>>> flow can traverse different routes. If one of the routes experiences
>>> congestion this can result in delayed or out of order packets arriving
>>> at the destination.
>>>
>> Richard, someone was complaining to me just last week about the
>> weakness of the round robin algorithm. Thanks for looking into this!
>>
>>> This patch allows packets to be routed based on their
>>> flow - packets in the same flow will always use the same route. This
>>> prevents out of order packets. There are other issues with round-robin
>>> based ECMP routing related to variable path MTU handling and debugging.
>>> See RFC2991 for more details on the problems associated with packet
>>> based ECMP routing.
>>>
> btw, it looks like IPv6 is already doing the hash but is calculating
> it by hand instead of using sk_txhash or skb->hash. It would be nice
> if we could unify this between v4 and v6 at some point to both using
> the existing hashes.
>
> Tom

Thanks Tom, yes IPv6 does work just fine currently. Assuming I get a 
version of this patch pushed updating the IPv6 version would seem like a 
good option.


-- 
Richard Laing
Software Team Leader
Allied Telesis Labs| 27 Nazareth Ave | Christchurch 8024 | New Zealand
Phone: +64 3 339 9248
Web: www.alliedtelesis.com

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC PATCH 1/1] net/ipv4: Enable flow-based ECMP
  2015-07-28 21:20   ` Richard Laing
@ 2015-07-29  6:11     ` Michal Kubecek
  2015-07-29  6:32       ` David Miller
  2015-07-29 16:12       ` Tom Herbert
  0 siblings, 2 replies; 14+ messages in thread
From: Michal Kubecek @ 2015-07-29  6:11 UTC (permalink / raw)
  To: Richard Laing; +Cc: Tom Herbert, netdev@vger.kernel.org, jmorris@namei.org

On Tue, Jul 28, 2015 at 09:20:02PM +0000, Richard Laing wrote:
> >> diff --git a/include/net/flow.h b/include/net/flow.h
> >> index 8109a15..d1d933d 100644
> >> --- a/include/net/flow.h
> >> +++ b/include/net/flow.h
> >> @@ -79,6 +79,10 @@ struct flowi4 {
> >>    #define fl4_ipsec_spi        uli.spi
> >>    #define fl4_mh_type        uli.mht.type
> >>    #define fl4_gre_key        uli.gre_key
> >> +
> >> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
> > Why bother making this a CONFIG, round robin is a miserable algorithm
> > anyway and nearly all the other packet steering mechanisms already use
> > a hash.
> 
> Fair enough, I will look at making it a sysctl option. I guess the 
> default can be the current behaviour.

Hm... that's an interesting question. In general, it's better to use
current behaviour as a default so that people are not surprised on
upgrade. On the other hand, it used to be per-flow  - or rather per
destination - earlier (until the routing cache removal, I believe) and
per-flow distribution is IMHO preferrable in majority of use cases. In
theory, there was a route attribute "equalize" to switch to per-packet
distribution, but it was never actually implemented, AFAIK (it was
recognized by ip and passed to kernel but ignored there).

Anyway, config option is definitely inconvenient as most users install
distribution kernels and do not configure their kernels themselves. Even
boot parameter would be better - but sysctl sounds much better. Having
both sysctl and per-route attribute would be perfect, of course.

                                                        Michal Kubecek

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC PATCH 1/1] net/ipv4: Enable flow-based ECMP
  2015-07-29  6:11     ` Michal Kubecek
@ 2015-07-29  6:32       ` David Miller
  2015-07-29 16:12       ` Tom Herbert
  1 sibling, 0 replies; 14+ messages in thread
From: David Miller @ 2015-07-29  6:32 UTC (permalink / raw)
  To: mkubecek; +Cc: Richard.Laing, tom, netdev, jmorris

From: Michal Kubecek <mkubecek@suse.cz>
Date: Wed, 29 Jul 2015 08:11:48 +0200

> Having both sysctl and per-route attribute would be perfect, of
> course.

+1

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC PATCH 1/1] net/ipv4: Enable flow-based ECMP
  2015-07-28  2:27 [RFC PATCH 1/1] net/ipv4: Enable flow-based ECMP Richard Laing
                   ` (2 preceding siblings ...)
  2015-07-28 21:02 ` Tom Herbert
@ 2015-07-29  7:56 ` Julian Anastasov
  3 siblings, 0 replies; 14+ messages in thread
From: Julian Anastasov @ 2015-07-29  7:56 UTC (permalink / raw)
  To: Richard Laing; +Cc: netdev@vger.kernel.org, jmorris@namei.org


	Hello,

On Tue, 28 Jul 2015, Richard Laing wrote:

> From: Richard Laing <richard.laing@alliedtelesis.co.nz>
> 
> Enable flow-based ECMP.
> 
> Currently if equal-cost multipath is enabled the kernel chooses between
> equal cost paths for each matching packet, essentially packets are
> round-robined between the routes. This means that packets from a single
> flow can traverse different routes. If one of the routes experiences
> congestion this can result in delayed or out of order packets arriving
> at the destination.
> 
> This patch allows packets to be routed based on their
> flow - packets in the same flow will always use the same route. This
> prevents out of order packets. There are other issues with round-robin
> based ECMP routing related to variable path MTU handling and debugging.
> See RFC2991 for more details on the problems associated with packet
> based ECMP routing.
> 
> This patch relies on the skb hash value to select between routes. The
> selection uses a hash-threshold algorithm (see RFC2992).

	What about forwarding?

	Also, we can make it lockless and to consider
nexthop weights. The DNS SRV (RFC 2782:Weight) has such WRR algorithm,
I'll try to describe it with such example, may be it can
be properly implemented but this is just to show the idea:

- 2 NHs, alive:
- nexthop 1: weight 10
- nexthop 2: weight 20

- calculate the sum of weight of all nexthops: 10+20=30,
maintain it in fib_info, may be for all alive nexthops

- get a random number in the 0..29 range (in our case L3/L4 hash % 30):

rand_val = hash % sum;

	This means the lowest bits of hash must be random.
rand_val in range 0..9 should use NH1, 10..29 NH2.

- walk the list with nexthops by increasing the running sum:

	int run;

again:
	run = 0;
	for_nexthops(fi) {
		/* run: ->10->30 */
		run += nh->nh_weight;
		if (run > rand_val)
			goto found;
	}

	/* race on NH DEAD flag change, retry? */
	smp_rmb();
	goto again;

found:
	/* use nhsel */

	Some questions remain:

- events can mark nexthops as down/dead/whatever and they may be
ignored. As result, same hash can go to different nexthop for
next route lookups. One option is to walk even dead nexthops
but if we select such one we have to to skip to the next available.
As result, hashes that hit alive NH will always use their NH,
only hashes that hit dead NH will get new NH. Then
change of dead state will not affect the binding of hash
to alive nexthop.

Regards

--
Julian Anastasov <ja@ssi.bg>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC PATCH 1/1] net/ipv4: Enable flow-based ECMP
  2015-07-29  6:11     ` Michal Kubecek
  2015-07-29  6:32       ` David Miller
@ 2015-07-29 16:12       ` Tom Herbert
  2015-07-30  1:23         ` Richard Laing
  1 sibling, 1 reply; 14+ messages in thread
From: Tom Herbert @ 2015-07-29 16:12 UTC (permalink / raw)
  To: Michal Kubecek; +Cc: Richard Laing, netdev@vger.kernel.org, jmorris@namei.org

On Tue, Jul 28, 2015 at 11:11 PM, Michal Kubecek <mkubecek@suse.cz> wrote:
> On Tue, Jul 28, 2015 at 09:20:02PM +0000, Richard Laing wrote:
>> >> diff --git a/include/net/flow.h b/include/net/flow.h
>> >> index 8109a15..d1d933d 100644
>> >> --- a/include/net/flow.h
>> >> +++ b/include/net/flow.h
>> >> @@ -79,6 +79,10 @@ struct flowi4 {
>> >>    #define fl4_ipsec_spi        uli.spi
>> >>    #define fl4_mh_type        uli.mht.type
>> >>    #define fl4_gre_key        uli.gre_key
>> >> +
>> >> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
>> > Why bother making this a CONFIG, round robin is a miserable algorithm
>> > anyway and nearly all the other packet steering mechanisms already use
>> > a hash.
>>
>> Fair enough, I will look at making it a sysctl option. I guess the
>> default can be the current behaviour.
>
> Hm... that's an interesting question. In general, it's better to use
> current behaviour as a default so that people are not surprised on
> upgrade. On the other hand, it used to be per-flow  - or rather per
> destination - earlier (until the routing cache removal, I believe) and
> per-flow distribution is IMHO preferrable in majority of use cases. In
> theory, there was a route attribute "equalize" to switch to per-packet
> distribution, but it was never actually implemented, AFAIK (it was
> recognized by ip and passed to kernel but ignored there).
>
> Anyway, config option is definitely inconvenient as most users install
> distribution kernels and do not configure their kernels themselves. Even
> boot parameter would be better - but sysctl sounds much better. Having
> both sysctl and per-route attribute would be perfect, of course.
>
IPv6 routing already uses a hash without any capability of setting
this to be round robin, probably every network device on the planet
performs ECMP using a hash since it is stateless algorithm, and as I
pointed most of our other packet steering mechanisms use a hash. So
the IPv4 path seems to be the odd man out here. Keeping round robin
seems around awfully conservative to me, and implies that we need to
implement it in IPv6 to maintain feature parity.

Tom

>                                                         Michal Kubecek
>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC PATCH 1/1] net/ipv4: Enable flow-based ECMP
  2015-07-29 16:12       ` Tom Herbert
@ 2015-07-30  1:23         ` Richard Laing
  2016-06-17 18:52           ` Jean He
  0 siblings, 1 reply; 14+ messages in thread
From: Richard Laing @ 2015-07-30  1:23 UTC (permalink / raw)
  To: Tom Herbert, Michal Kubecek; +Cc: netdev@vger.kernel.org, jmorris@namei.org



On 07/30/2015 04:12 AM, Tom Herbert wrote:
> On Tue, Jul 28, 2015 at 11:11 PM, Michal Kubecek <mkubecek@suse.cz> wrote:
>> On Tue, Jul 28, 2015 at 09:20:02PM +0000, Richard Laing wrote:
>>>>> diff --git a/include/net/flow.h b/include/net/flow.h
>>>>> index 8109a15..d1d933d 100644
>>>>> --- a/include/net/flow.h
>>>>> +++ b/include/net/flow.h
>>>>> @@ -79,6 +79,10 @@ struct flowi4 {
>>>>>     #define fl4_ipsec_spi        uli.spi
>>>>>     #define fl4_mh_type        uli.mht.type
>>>>>     #define fl4_gre_key        uli.gre_key
>>>>> +
>>>>> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
>>>> Why bother making this a CONFIG, round robin is a miserable algorithm
>>>> anyway and nearly all the other packet steering mechanisms already use
>>>> a hash.
>>> Fair enough, I will look at making it a sysctl option. I guess the
>>> default can be the current behaviour.
>> Hm... that's an interesting question. In general, it's better to use
>> current behaviour as a default so that people are not surprised on
>> upgrade. On the other hand, it used to be per-flow  - or rather per
>> destination - earlier (until the routing cache removal, I believe) and
>> per-flow distribution is IMHO preferrable in majority of use cases. In
>> theory, there was a route attribute "equalize" to switch to per-packet
>> distribution, but it was never actually implemented, AFAIK (it was
>> recognized by ip and passed to kernel but ignored there).
>>
>> Anyway, config option is definitely inconvenient as most users install
>> distribution kernels and do not configure their kernels themselves. Even
>> boot parameter would be better - but sysctl sounds much better. Having
>> both sysctl and per-route attribute would be perfect, of course.
>>
> IPv6 routing already uses a hash without any capability of setting
> this to be round robin, probably every network device on the planet
> performs ECMP using a hash since it is stateless algorithm, and as I
> pointed most of our other packet steering mechanisms use a hash. So
> the IPv4 path seems to be the odd man out here. Keeping round robin
> seems around awfully conservative to me, and implies that we need to
> implement it in IPv6 to maintain feature parity.
>
> Tom

Thanks Tom, I would certainly agree that flow based is preferable to the current behaviour and would be happy to make it the default. I will update my patch with the feedback and send an update, no ETA right now!

Cheers
Richard

--
Richard Laing
Software Team Leader
Allied Telesis Labs| 27 Nazareth Ave | Christchurch 8024 | New Zealand
Phone: +64 3 339 9248
Web: www.alliedtelesis.com

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC PATCH 1/1] net/ipv4: Enable flow-based ECMP
  2015-07-30  1:23         ` Richard Laing
@ 2016-06-17 18:52           ` Jean He
  0 siblings, 0 replies; 14+ messages in thread
From: Jean He @ 2016-06-17 18:52 UTC (permalink / raw)
  To: netdev

Richard Laing <Richard.Laing <at> alliedtelesis.co.nz> writes:

> >>> Fair enough, I will look at making it a sysctl option. I guess the
> >>> default can be the current behaviour.

> Cheers
> Richard
> 

Hi Richard,

Could you please share if this new patch for per-flow IPv4 ECMP has landed?

If so, which version should I use? 

And is there a sysctl option that we can just use to enable per-flow hashing?

Many thanks!
Jean

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2016-06-17 18:57 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2015-07-28  2:27 [RFC PATCH 1/1] net/ipv4: Enable flow-based ECMP Richard Laing
2015-07-28  7:20 ` Michal Kubecek
2015-07-28 19:51 ` Stephen Hemminger
2015-07-28 21:16   ` Richard Laing
2015-07-28 21:02 ` Tom Herbert
2015-07-28 21:15   ` Tom Herbert
2015-07-28 21:22     ` Richard Laing
2015-07-28 21:20   ` Richard Laing
2015-07-29  6:11     ` Michal Kubecek
2015-07-29  6:32       ` David Miller
2015-07-29 16:12       ` Tom Herbert
2015-07-30  1:23         ` Richard Laing
2016-06-17 18:52           ` Jean He
2015-07-29  7:56 ` Julian Anastasov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).