public inbox for netdev@vger.kernel.org
 help / color / mirror / Atom feed
From: pch-chEQUL3jiZBWk0Htik3J/w@public.gmane.org
To: netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Cc: "David S. Miller" <davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>,
	"Alexey Kuznetsov"
	<kuznet-v/Mj1YrvjDBInbfyfbPRSQ@public.gmane.org>,
	"James Morris" <jmorris-gx6/JNMH7DfYtjvyW6yDsg@public.gmane.org>,
	"Hideaki YOSHIFUJI"
	<yoshfuji-VfPWfsRibaP+Ru+s062T9g@public.gmane.org>,
	"Patrick McHardy" <kaber-dcUjhNyLwpNeoWH0uzbU5w@public.gmane.org>,
	linux-api-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	"Roopa Prabhu"
	<roopa-qUQiAmfTcIp+XZJcv9eMoEEOCMrvLtNR@public.gmane.org>,
	"Scott Feldman" <sfeldma-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>,
	"Eric W. Biederman"
	<ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>,
	"Nicolas Dichtel"
	<nicolas.dichtel-pdR9zngts4EAvxtiuMwx3w@public.gmane.org>,
	"Thomas Graf" <tgraf-G/eBtMaohhA@public.gmane.org>,
	"Jiri Benc" <jbenc-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>,
	"Peter Nørlund" <pch-chEQUL3jiZBWk0Htik3J/w@public.gmane.org>
Subject: [PATCH v2 net-next 2/3] ipv4: L3 and L4 hash-based multipath routing
Date: Fri, 28 Aug 2015 22:00:49 +0200	[thread overview]
Message-ID: <1440792050-2109-3-git-send-email-pch@ordbogen.com> (raw)
In-Reply-To: <1440792050-2109-1-git-send-email-pch-chEQUL3jiZBWk0Htik3J/w@public.gmane.org>

From: Peter Nørlund <pch-chEQUL3jiZBWk0Htik3J/w@public.gmane.org>

This patch adds L3 and L4 hash-based multipath routing, selectable on a
per-route basis with the reintroduced RTA_MP_ALGO attribute. The default is
now RT_MP_ALG_L3_HASH.

Signed-off-by: Peter Nørlund <pch-chEQUL3jiZBWk0Htik3J/w@public.gmane.org>
---
 include/net/ip_fib.h           | 22 ++++++++++++++++-
 include/uapi/linux/rtnetlink.h | 14 ++++++++++-
 net/ipv4/fib_frontend.c        |  4 +++
 net/ipv4/fib_semantics.c       | 43 +++++++++++++++++++++++++++-----
 net/ipv4/route.c               | 56 ++++++++++++++++++++++++++++++++++++++++--
 5 files changed, 129 insertions(+), 10 deletions(-)

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 18a3c7f..21e74b5 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -37,6 +37,7 @@ struct fib_config {
 	u32			fc_flags;
 	u32			fc_priority;
 	__be32			fc_prefsrc;
+	int			fc_mp_alg;
 	struct nlattr		*fc_mx;
 	struct rtnexthop	*fc_mp;
 	int			fc_mx_len;
@@ -119,6 +120,7 @@ struct fib_info {
 	int			fib_nhs;
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 	int			fib_weight;
+	int			fib_mp_alg;
 #endif
 	struct rcu_head		rcu;
 	struct fib_nh		fib_nh[0];
@@ -312,7 +314,25 @@ int ip_fib_check_default(__be32 gw, struct net_device *dev);
 int fib_sync_down_dev(struct net_device *dev, unsigned long event);
 int fib_sync_down_addr(struct net *net, __be32 local);
 int fib_sync_up(struct net_device *dev, unsigned int nh_flags);
-void fib_select_multipath(struct fib_result *res);
+
+struct multipath_flow4 {
+	__be32 saddr;
+	__be32 daddr;
+	union {
+		__be32 ports;
+		struct {
+			__be16 sport;
+			__be16 dport;
+		};
+	};
+};
+
+typedef void (*multipath_flow4_func_t)(struct multipath_flow4 *flow,
+				       void *ctx);
+
+void fib_select_multipath(struct fib_result *res,
+			  multipath_flow4_func_t flow_func,
+			  void *ctx);
 
 /* Exported by fib_trie.c */
 void fib_trie_init(void);
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 0d3d3cc..2563a96 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -271,6 +271,18 @@ enum rt_scope_t {
 #define RTM_F_EQUALIZE		0x400	/* Multipath equalizer: NI	*/
 #define RTM_F_PREFIX		0x800	/* Prefix addresses		*/
 
+/* Multipath algorithms */
+
+enum rt_mp_alg_t {
+	RT_MP_ALG_L3_HASH,	/* Was IP_MP_ALG_NONE */
+	RT_MP_ALG_PER_PACKET,	/* Was IP_MP_ALG_RR */
+	RT_MP_ALG_DRR,		/* not used */
+	RT_MP_ALG_RANDOM,	/* not used */
+	RT_MP_ALG_WRANDOM,	/* not used */
+	RT_MP_ALG_L4_HASH,
+	__RT_MP_ALG_MAX
+};
+
 /* Reserved table identifiers */
 
 enum rt_class_t {
@@ -301,7 +313,7 @@ enum rtattr_type_t {
 	RTA_FLOW,
 	RTA_CACHEINFO,
 	RTA_SESSION, /* no longer used */
-	RTA_MP_ALGO, /* no longer used */
+	RTA_MP_ALGO,
 	RTA_TABLE,
 	RTA_MARK,
 	RTA_MFC_STATS,
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 7fa2771..5ba4442 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -622,6 +622,7 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
 	[RTA_PREFSRC]		= { .type = NLA_U32 },
 	[RTA_METRICS]		= { .type = NLA_NESTED },
 	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
+	[RTA_MP_ALGO]		= { .type = NLA_U32 },
 	[RTA_FLOW]		= { .type = NLA_U32 },
 	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
 	[RTA_ENCAP]		= { .type = NLA_NESTED },
@@ -684,6 +685,9 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
 			cfg->fc_mp = nla_data(attr);
 			cfg->fc_mp_len = nla_len(attr);
 			break;
+		case RTA_MP_ALGO:
+			cfg->fc_mp_alg = nla_get_u32(attr);
+			break;
 		case RTA_FLOW:
 			cfg->fc_flow = nla_get_u32(attr);
 			break;
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index becb63f..3a80b1a 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -259,6 +259,11 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
 {
 	const struct fib_nh *onh = ofi->fib_nh;
 
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+	if (fi->fib_mp_alg != ofi->fib_mp_alg)
+		return -1;
+#endif
+
 	for_nexthops(fi) {
 		if (nh->nh_oif != onh->nh_oif ||
 		    nh->nh_gw  != onh->nh_gw ||
@@ -1028,6 +1033,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
 
 	if (cfg->fc_mp) {
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
+		fi->fib_mp_alg = cfg->fc_mp_alg;
 		err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
 		if (err != 0)
 			goto failure;
@@ -1245,6 +1251,10 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
 		struct rtnexthop *rtnh;
 		struct nlattr *mp;
 
+		if (fi->fib_mp_alg &&
+		    nla_put_u32(skb, RTA_MP_ALGO, fi->fib_mp_alg))
+			goto nla_put_failure;
+
 		mp = nla_nest_start(skb, RTA_MULTIPATH);
 		if (!mp)
 			goto nla_put_failure;
@@ -1520,16 +1530,37 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags)
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 
-/*
- * The algorithm is suboptimal, but it provides really
- * fair weighted route distribution.
- */
-void fib_select_multipath(struct fib_result *res)
+/* Compute multipath hash based on 3- or 5-tuple */
+static int fib_multipath_hash(const struct fib_result *res,
+			      multipath_flow4_func_t flow_func, void *ctx)
+{
+	struct multipath_flow4 flow;
+
+	flow_func(&flow, ctx);
+
+	if (res->fi->fib_mp_alg == RT_MP_ALG_L4_HASH)
+		return jhash_3words(flow.saddr, flow.daddr, flow.ports, 0) >> 1;
+	else
+		return jhash_2words(flow.saddr, flow.daddr, 0) >> 1;
+}
+
+static int fib_multipath_perpacket(void)
+{
+	return bitrev32(this_cpu_inc_return(fib_multipath_rr_counter)) >> 1;
+}
+
+void fib_select_multipath(struct fib_result *res,
+			  multipath_flow4_func_t flow_func,
+			  void *ctx)
 {
 	struct fib_info *fi = res->fi;
 	int h;
 
-	h = bitrev32(this_cpu_inc_return(fib_multipath_rr_counter)) >> 1;
+	if (res->fi->fib_mp_alg == RT_MP_ALG_PER_PACKET) {
+		h = fib_multipath_perpacket();
+	} else {
+		h = fib_multipath_hash(res, flow_func, ctx);
+	}
 
 	for_nexthops(fi) {
 		if (h > atomic_read(&nh->nh_upper_bound))
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index f3087aa..f50f84f 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1643,6 +1643,58 @@ out:
 	return err;
 }
 
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+
+/* Fill multipath flow key data based on socket buffer */
+static void ip_multipath_flow_skb(struct multipath_flow4 *flow, void *ctx)
+{
+	const struct sk_buff *skb = (const struct sk_buff *)ctx;
+	const struct iphdr *iph;
+
+	iph = ip_hdr(skb);
+
+	flow->saddr = iph->saddr;
+	flow->daddr = iph->daddr;
+	flow->ports = 0;
+
+	if (unlikely(!(iph->frag_off & htons(IP_DF))))
+		return;
+
+	if (iph->protocol == IPPROTO_TCP ||
+	    iph->protocol == IPPROTO_UDP ||
+	    iph->protocol == IPPROTO_SCTP) {
+		__be16 _ports[2];
+		const __be16 *ports;
+
+		ports = skb_header_pointer(skb, iph->ihl * 4, sizeof(_ports),
+					   &_ports);
+		if (ports) {
+			flow->sport = ports[0];
+			flow->dport = ports[1];
+		}
+	}
+}
+
+/* Fill multipath flow key data based on flowi4  */
+static void ip_multipath_flow_fl4(struct multipath_flow4 *flow, void *ctx)
+{
+	const struct flowi4 *fl4 = (const struct flowi4 *)ctx;
+
+	flow->saddr = fl4->saddr;
+	flow->daddr = fl4->daddr;
+
+	if (fl4->flowi4_proto == IPPROTO_TCP ||
+	    fl4->flowi4_proto == IPPROTO_UDP ||
+	    fl4->flowi4_proto == IPPROTO_SCTP) {
+		flow->sport = fl4->fl4_sport;
+		flow->dport = fl4->fl4_dport;
+	} else {
+		flow->ports = 0;
+	}
+}
+
+#endif /* CONFIG_IP_ROUTE_MULTIPATH */
+
 static int ip_mkroute_input(struct sk_buff *skb,
 			    struct fib_result *res,
 			    const struct flowi4 *fl4,
@@ -1651,7 +1703,7 @@ static int ip_mkroute_input(struct sk_buff *skb,
 {
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 	if (res->fi && res->fi->fib_nhs > 1)
-		fib_select_multipath(res);
+		fib_select_multipath(res, ip_multipath_flow_skb, skb);
 #endif
 
 	/* create a routing cache entry */
@@ -2197,7 +2249,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
-		fib_select_multipath(&res);
+		fib_select_multipath(&res, ip_multipath_flow_fl4, fl4);
 	else
 #endif
 	if (!res.prefixlen &&
-- 
2.1.4

  parent reply	other threads:[~2015-08-28 20:00 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-08-28 20:00 [PATCH v2 net-next 0/3] ipv4: Hash-based multipath routing pch
     [not found] ` <1440792050-2109-1-git-send-email-pch-chEQUL3jiZBWk0Htik3J/w@public.gmane.org>
2015-08-28 20:00   ` [PATCH v2 net-next 1/3] ipv4: Lock-less per-packet multipath pch-chEQUL3jiZBWk0Htik3J/w
2015-08-28 20:00   ` pch-chEQUL3jiZBWk0Htik3J/w [this message]
2015-08-30 22:48     ` [PATCH v2 net-next 2/3] ipv4: L3 and L4 hash-based multipath routing Tom Herbert
2015-08-29 20:14   ` [PATCH v2 net-next 0/3] ipv4: Hash-based " David Miller
     [not found]     ` <20150829.131429.360433621593751136.davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>
2015-08-29 20:31       ` Peter Nørlund
2015-08-29 20:46         ` David Miller
     [not found]           ` <20150829.134628.1013990034021542524.davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>
2015-08-29 20:55             ` Scott Feldman
2015-08-29 20:59             ` Tom Herbert
2015-08-30 21:28               ` Peter Nørlund
2015-08-30 22:29                 ` Tom Herbert
2015-08-31  9:02                   ` Thomas Graf
2015-08-28 20:00 ` [PATCH v2 net-next 3/3] ipv4: ICMP packet inspection for L3 multipath pch

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1440792050-2109-3-git-send-email-pch@ordbogen.com \
    --to=pch-chequl3jizbwk0htik3j/w@public.gmane.org \
    --cc=davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org \
    --cc=ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org \
    --cc=jbenc-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org \
    --cc=jmorris-gx6/JNMH7DfYtjvyW6yDsg@public.gmane.org \
    --cc=kaber-dcUjhNyLwpNeoWH0uzbU5w@public.gmane.org \
    --cc=kuznet-v/Mj1YrvjDBInbfyfbPRSQ@public.gmane.org \
    --cc=linux-api-u79uwXL29TY76Z2rM5mHXA@public.gmane.org \
    --cc=netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org \
    --cc=nicolas.dichtel-pdR9zngts4EAvxtiuMwx3w@public.gmane.org \
    --cc=roopa-qUQiAmfTcIp+XZJcv9eMoEEOCMrvLtNR@public.gmane.org \
    --cc=sfeldma-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org \
    --cc=tgraf-G/eBtMaohhA@public.gmane.org \
    --cc=yoshfuji-VfPWfsRibaP+Ru+s062T9g@public.gmane.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox