From: pch-chEQUL3jiZBWk0Htik3J/w@public.gmane.org
To: netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Cc: "David S. Miller" <davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>,
"Alexey Kuznetsov"
<kuznet-v/Mj1YrvjDBInbfyfbPRSQ@public.gmane.org>,
"James Morris" <jmorris-gx6/JNMH7DfYtjvyW6yDsg@public.gmane.org>,
"Hideaki YOSHIFUJI"
<yoshfuji-VfPWfsRibaP+Ru+s062T9g@public.gmane.org>,
"Patrick McHardy" <kaber-dcUjhNyLwpNeoWH0uzbU5w@public.gmane.org>,
linux-api-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
"Roopa Prabhu"
<roopa-qUQiAmfTcIp+XZJcv9eMoEEOCMrvLtNR@public.gmane.org>,
"Scott Feldman" <sfeldma-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>,
"Eric W. Biederman"
<ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>,
"Nicolas Dichtel"
<nicolas.dichtel-pdR9zngts4EAvxtiuMwx3w@public.gmane.org>,
"Thomas Graf" <tgraf-G/eBtMaohhA@public.gmane.org>,
"Jiri Benc" <jbenc-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>,
"Peter Nørlund" <pch-chEQUL3jiZBWk0Htik3J/w@public.gmane.org>
Subject: [PATCH v2 net-next 2/3] ipv4: L3 and L4 hash-based multipath routing
Date: Fri, 28 Aug 2015 22:00:49 +0200 [thread overview]
Message-ID: <1440792050-2109-3-git-send-email-pch@ordbogen.com> (raw)
In-Reply-To: <1440792050-2109-1-git-send-email-pch-chEQUL3jiZBWk0Htik3J/w@public.gmane.org>
From: Peter Nørlund <pch-chEQUL3jiZBWk0Htik3J/w@public.gmane.org>
This patch adds L3 and L4 hash-based multipath routing, selectable on a
per-route basis with the reintroduced RTA_MP_ALGO attribute. The default is
now RT_MP_ALG_L3_HASH.
Signed-off-by: Peter Nørlund <pch-chEQUL3jiZBWk0Htik3J/w@public.gmane.org>
---
include/net/ip_fib.h | 22 ++++++++++++++++-
include/uapi/linux/rtnetlink.h | 14 ++++++++++-
net/ipv4/fib_frontend.c | 4 +++
net/ipv4/fib_semantics.c | 43 +++++++++++++++++++++++++++-----
net/ipv4/route.c | 56 ++++++++++++++++++++++++++++++++++++++++--
5 files changed, 129 insertions(+), 10 deletions(-)
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 18a3c7f..21e74b5 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -37,6 +37,7 @@ struct fib_config {
u32 fc_flags;
u32 fc_priority;
__be32 fc_prefsrc;
+ int fc_mp_alg;
struct nlattr *fc_mx;
struct rtnexthop *fc_mp;
int fc_mx_len;
@@ -119,6 +120,7 @@ struct fib_info {
int fib_nhs;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
int fib_weight;
+ int fib_mp_alg;
#endif
struct rcu_head rcu;
struct fib_nh fib_nh[0];
@@ -312,7 +314,25 @@ int ip_fib_check_default(__be32 gw, struct net_device *dev);
int fib_sync_down_dev(struct net_device *dev, unsigned long event);
int fib_sync_down_addr(struct net *net, __be32 local);
int fib_sync_up(struct net_device *dev, unsigned int nh_flags);
-void fib_select_multipath(struct fib_result *res);
+
+struct multipath_flow4 { /* IPv4 flow key filled by a multipath_flow4_func_t */
+ __be32 saddr;
+ __be32 daddr;
+ union {
+ __be32 ports; /* sport and dport viewed as a single 32-bit word */
+ struct {
+ __be16 sport;
+ __be16 dport;
+ };
+ };
+};
+
+typedef void (*multipath_flow4_func_t)(struct multipath_flow4 *flow,
+ void *ctx);
+
+void fib_select_multipath(struct fib_result *res,
+ multipath_flow4_func_t flow_func,
+ void *ctx);
/* Exported by fib_trie.c */
void fib_trie_init(void);
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 0d3d3cc..2563a96 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -271,6 +271,18 @@ enum rt_scope_t {
#define RTM_F_EQUALIZE 0x400 /* Multipath equalizer: NI */
#define RTM_F_PREFIX 0x800 /* Prefix addresses */
+/* Multipath algorithms */
+
+enum rt_mp_alg_t {
+ RT_MP_ALG_L3_HASH, /* Was IP_MP_ALG_NONE; value 0, hash on addresses only */
+ RT_MP_ALG_PER_PACKET, /* Was IP_MP_ALG_RR */
+ RT_MP_ALG_DRR, /* not used */
+ RT_MP_ALG_RANDOM, /* not used */
+ RT_MP_ALG_WRANDOM, /* not used */
+ RT_MP_ALG_L4_HASH, /* hash on addresses and L4 ports */
+ __RT_MP_ALG_MAX /* one past the last algorithm value */
+};
+
/* Reserved table identifiers */
enum rt_class_t {
@@ -301,7 +313,7 @@ enum rtattr_type_t {
RTA_FLOW,
RTA_CACHEINFO,
RTA_SESSION, /* no longer used */
- RTA_MP_ALGO, /* no longer used */
+ RTA_MP_ALGO,
RTA_TABLE,
RTA_MARK,
RTA_MFC_STATS,
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 7fa2771..5ba4442 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -622,6 +622,7 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
[RTA_PREFSRC] = { .type = NLA_U32 },
[RTA_METRICS] = { .type = NLA_NESTED },
[RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
+ [RTA_MP_ALGO] = { .type = NLA_U32 },
[RTA_FLOW] = { .type = NLA_U32 },
[RTA_ENCAP_TYPE] = { .type = NLA_U16 },
[RTA_ENCAP] = { .type = NLA_NESTED },
@@ -684,6 +685,9 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
cfg->fc_mp = nla_data(attr);
cfg->fc_mp_len = nla_len(attr);
break;
+ case RTA_MP_ALGO:
+ cfg->fc_mp_alg = nla_get_u32(attr);
+ break;
case RTA_FLOW:
cfg->fc_flow = nla_get_u32(attr);
break;
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index becb63f..3a80b1a 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -259,6 +259,11 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
{
const struct fib_nh *onh = ofi->fib_nh;
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ if (fi->fib_mp_alg != ofi->fib_mp_alg)
+ return -1;
+#endif
+
for_nexthops(fi) {
if (nh->nh_oif != onh->nh_oif ||
nh->nh_gw != onh->nh_gw ||
@@ -1028,6 +1033,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
if (cfg->fc_mp) {
#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ fi->fib_mp_alg = cfg->fc_mp_alg;
err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
if (err != 0)
goto failure;
@@ -1245,6 +1251,10 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
struct rtnexthop *rtnh;
struct nlattr *mp;
+ if (fi->fib_mp_alg &&
+ nla_put_u32(skb, RTA_MP_ALGO, fi->fib_mp_alg))
+ goto nla_put_failure;
+
mp = nla_nest_start(skb, RTA_MULTIPATH);
if (!mp)
goto nla_put_failure;
@@ -1520,16 +1530,37 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags)
#ifdef CONFIG_IP_ROUTE_MULTIPATH
-/*
- * The algorithm is suboptimal, but it provides really
- * fair weighted route distribution.
- */
-void fib_select_multipath(struct fib_result *res)
+/* Compute the multipath hash for one flow.
+ *
+ * flow_func(&flow, ctx) fills in the flow key. For RT_MP_ALG_L4_HASH the
+ * hash covers saddr, daddr and the combined ports word; for every other
+ * algorithm value only saddr and daddr are hashed. The protocol number is
+ * not part of the key. The jhash result is shifted right by one bit before
+ * being returned as an int.
+ * NOTE(review): the __be32 fields are passed to jhash as-is - presumably
+ * sparse wants a __force cast here; confirm.
+ */
+static int fib_multipath_hash(const struct fib_result *res,
+ multipath_flow4_func_t flow_func, void *ctx)
+{
+ struct multipath_flow4 flow;
+
+ flow_func(&flow, ctx);
+
+ if (res->fi->fib_mp_alg == RT_MP_ALG_L4_HASH)
+ return jhash_3words(flow.saddr, flow.daddr, flow.ports, 0) >> 1;
+ else
+ return jhash_2words(flow.saddr, flow.daddr, 0) >> 1;
+}
+
+static int fib_multipath_perpacket(void)
+{ /* Per-packet selector: bump this CPU's fib_multipath_rr_counter, bit-reverse
+   * it and shift right by one. NOTE(review): the bit reversal presumably
+   * spreads consecutive packets across the nh_upper_bound range; confirm.
+   */
+ return bitrev32(this_cpu_inc_return(fib_multipath_rr_counter)) >> 1;
+}
+
+void fib_select_multipath(struct fib_result *res,
+ multipath_flow4_func_t flow_func,
+ void *ctx)
{
struct fib_info *fi = res->fi;
int h;
- h = bitrev32(this_cpu_inc_return(fib_multipath_rr_counter)) >> 1;
+ if (res->fi->fib_mp_alg == RT_MP_ALG_PER_PACKET) {
+ h = fib_multipath_perpacket();
+ } else {
+ h = fib_multipath_hash(res, flow_func, ctx);
+ }
for_nexthops(fi) {
if (h > atomic_read(&nh->nh_upper_bound))
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index f3087aa..f50f84f 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1643,6 +1643,58 @@ out:
return err;
}
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+
+/* Fill multipath flow key data based on socket buffer.
+ *
+ * ctx is the sk_buff being routed. Addresses come from its IPv4 header.
+ * Ports start out as zero and are only read when the header has IP_DF set
+ * and the protocol is TCP, UDP or SCTP; they are taken from the first four
+ * bytes after the IP header (ihl * 4). If skb_header_pointer() returns
+ * NULL, ports stay zero.
+ * NOTE(review): returning early whenever DF is clear also skips the ports
+ * of unfragmented packets - presumably deliberate, so that a flow which
+ * may be fragmented always hashes on addresses only; confirm.
+ */
+static void ip_multipath_flow_skb(struct multipath_flow4 *flow, void *ctx)
+{
+ const struct sk_buff *skb = (const struct sk_buff *)ctx;
+ const struct iphdr *iph;
+
+ iph = ip_hdr(skb);
+
+ flow->saddr = iph->saddr;
+ flow->daddr = iph->daddr;
+ flow->ports = 0;
+
+ if (unlikely(!(iph->frag_off & htons(IP_DF)))) /* DF clear: L3 key only */
+ return;
+
+ if (iph->protocol == IPPROTO_TCP ||
+ iph->protocol == IPPROTO_UDP ||
+ iph->protocol == IPPROTO_SCTP) {
+ __be16 _ports[2]; /* bounce buffer for skb_header_pointer() */
+ const __be16 *ports;
+
+ ports = skb_header_pointer(skb, iph->ihl * 4, sizeof(_ports),
+ &_ports);
+ if (ports) {
+ flow->sport = ports[0];
+ flow->dport = ports[1];
+ }
+ }
+}
+
+/* Fill multipath flow key data based on flowi4.
+ *
+ * ctx is the flowi4 used for the lookup. Addresses are copied from it.
+ * Ports are copied from fl4_sport/fl4_dport when flowi4_proto is TCP, UDP
+ * or SCTP; for any other protocol the ports word is zeroed. Unlike
+ * ip_multipath_flow_skb(), no DF check is made here.
+ */
+static void ip_multipath_flow_fl4(struct multipath_flow4 *flow, void *ctx)
+{
+ const struct flowi4 *fl4 = (const struct flowi4 *)ctx;
+
+ flow->saddr = fl4->saddr;
+ flow->daddr = fl4->daddr;
+
+ if (fl4->flowi4_proto == IPPROTO_TCP ||
+ fl4->flowi4_proto == IPPROTO_UDP ||
+ fl4->flowi4_proto == IPPROTO_SCTP) {
+ flow->sport = fl4->fl4_sport;
+ flow->dport = fl4->fl4_dport;
+ } else {
+ flow->ports = 0;
+ }
+}
+
+#endif /* CONFIG_IP_ROUTE_MULTIPATH */
+
static int ip_mkroute_input(struct sk_buff *skb,
struct fib_result *res,
const struct flowi4 *fl4,
@@ -1651,7 +1703,7 @@ static int ip_mkroute_input(struct sk_buff *skb,
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
if (res->fi && res->fi->fib_nhs > 1)
- fib_select_multipath(res);
+ fib_select_multipath(res, ip_multipath_flow_skb, skb);
#endif
/* create a routing cache entry */
@@ -2197,7 +2249,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
#ifdef CONFIG_IP_ROUTE_MULTIPATH
if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
- fib_select_multipath(&res);
+ fib_select_multipath(&res, ip_multipath_flow_fl4, fl4);
else
#endif
if (!res.prefixlen &&
--
2.1.4
next prev parent reply other threads:[~2015-08-28 20:00 UTC|newest]
Thread overview: 13+ messages / expand[flat|nested] mbox.gz Atom feed top
2015-08-28 20:00 [PATCH v2 net-next 0/3] ipv4: Hash-based multipath routing pch
[not found] ` <1440792050-2109-1-git-send-email-pch-chEQUL3jiZBWk0Htik3J/w@public.gmane.org>
2015-08-28 20:00 ` [PATCH v2 net-next 1/3] ipv4: Lock-less per-packet multipath pch-chEQUL3jiZBWk0Htik3J/w
2015-08-28 20:00 ` pch-chEQUL3jiZBWk0Htik3J/w [this message]
2015-08-30 22:48 ` [PATCH v2 net-next 2/3] ipv4: L3 and L4 hash-based multipath routing Tom Herbert
2015-08-29 20:14 ` [PATCH v2 net-next 0/3] ipv4: Hash-based " David Miller
[not found] ` <20150829.131429.360433621593751136.davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>
2015-08-29 20:31 ` Peter Nørlund
2015-08-29 20:46 ` David Miller
[not found] ` <20150829.134628.1013990034021542524.davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>
2015-08-29 20:55 ` Scott Feldman
2015-08-29 20:59 ` Tom Herbert
2015-08-30 21:28 ` Peter Nørlund
2015-08-30 22:29 ` Tom Herbert
2015-08-31 9:02 ` Thomas Graf
2015-08-28 20:00 ` [PATCH v2 net-next 3/3] ipv4: ICMP packet inspection for L3 multipath pch
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1440792050-2109-3-git-send-email-pch@ordbogen.com \
--to=pch-chequl3jizbwk0htik3j/w@public.gmane.org \
--cc=davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org \
--cc=ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org \
--cc=jbenc-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org \
--cc=jmorris-gx6/JNMH7DfYtjvyW6yDsg@public.gmane.org \
--cc=kaber-dcUjhNyLwpNeoWH0uzbU5w@public.gmane.org \
--cc=kuznet-v/Mj1YrvjDBInbfyfbPRSQ@public.gmane.org \
--cc=linux-api-u79uwXL29TY76Z2rM5mHXA@public.gmane.org \
--cc=netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org \
--cc=nicolas.dichtel-pdR9zngts4EAvxtiuMwx3w@public.gmane.org \
--cc=roopa-qUQiAmfTcIp+XZJcv9eMoEEOCMrvLtNR@public.gmane.org \
--cc=sfeldma-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org \
--cc=tgraf-G/eBtMaohhA@public.gmane.org \
--cc=yoshfuji-VfPWfsRibaP+Ru+s062T9g@public.gmane.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox