From: pch@ordbogen.com
To: netdev@vger.kernel.org
Cc: "David S. Miller" <davem@davemloft.net>,
"Alexey Kuznetsov" <kuznet@ms2.inr.ac.ru>,
"James Morris" <jmorris@namei.org>,
"Hideaki YOSHIFUJI" <yoshfuji@linux-ipv6.org>,
"Patrick McHardy" <kaber@trash.net>,
linux-api@vger.kernel.org,
"Roopa Prabhu" <roopa@cumulusnetworks.com>,
"Scott Feldman" <sfeldma@gmail.com>,
"Eric W. Biederman" <ebiederm@xmission.com>,
"Nicolas Dichtel" <nicolas.dichtel@6wind.com>,
"Thomas Graf" <tgraf@suug.ch>, "Jiri Benc" <jbenc@redhat.com>,
"Peter Nørlund" <pch@ordbogen.com>
Subject: [PATCH v2 net-next 3/3] ipv4: ICMP packet inspection for L3 multipath
Date: Fri, 28 Aug 2015 22:00:50 +0200 [thread overview]
Message-ID: <1440792050-2109-4-git-send-email-pch@ordbogen.com> (raw)
In-Reply-To: <1440792050-2109-1-git-send-email-pch@ordbogen.com>
From: Peter Nørlund <pch@ordbogen.com>
When doing L3-based multipath, ICMP error packets are inspected so that they
are routed over the same path as the flow they relate to, allowing anycast
environments to work with ECMP.
Signed-off-by: Peter Nørlund <pch@ordbogen.com>
---
include/net/ip_fib.h | 2 +-
include/net/route.h | 12 ++++++-
net/ipv4/fib_semantics.c | 2 +-
net/ipv4/icmp.c | 34 +++++++++++++++++++-
net/ipv4/route.c | 82 ++++++++++++++++++++++++++++++++++++++----------
5 files changed, 112 insertions(+), 20 deletions(-)
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 21e74b5..3e5d4ed 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -328,7 +328,7 @@ struct multipath_flow4 {
};
typedef void (*multipath_flow4_func_t)(struct multipath_flow4 *flow,
- void *ctx);
+ enum rt_mp_alg_t algo, void *ctx);
void fib_select_multipath(struct fib_result *res,
multipath_flow4_func_t flow_func,
diff --git a/include/net/route.h b/include/net/route.h
index 395d79b..ccb85fc 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -28,6 +28,7 @@
#include <net/inetpeer.h>
#include <net/flow.h>
#include <net/inet_sock.h>
+#include <net/ip_fib.h>
#include <linux/in_route.h>
#include <linux/rtnetlink.h>
#include <linux/rcupdate.h>
@@ -110,7 +111,16 @@ struct in_device;
int ip_rt_init(void);
void rt_cache_flush(struct net *net);
void rt_flush_dev(struct net_device *dev);
-struct rtable *__ip_route_output_key(struct net *, struct flowi4 *flp);
+struct rtable *__ip_route_output_key_flow(struct net *, struct flowi4 *flp,
+ multipath_flow4_func_t flow_func,
+ void *ctx);
+
+static inline struct rtable *__ip_route_output_key(struct net *net,
+ struct flowi4 *flp)
+{
+ return __ip_route_output_key_flow(net, flp, NULL, NULL);
+}
+
struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp,
struct sock *sk);
struct dst_entry *ipv4_blackhole_route(struct net *net,
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 3a80b1a..000c535 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1536,7 +1536,7 @@ static int fib_multipath_hash(const struct fib_result *res,
{
struct multipath_flow4 flow;
- flow_func(&flow, ctx);
+ flow_func(&flow, res->fi->fib_mp_alg, ctx);
if (res->fi->fib_mp_alg == RT_MP_ALG_L4_HASH)
return jhash_3words(flow.saddr, flow.daddr, flow.ports, 0) >> 1;
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index f16488e..0e25fe4 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -439,6 +439,38 @@ out_unlock:
icmp_xmit_unlock(sk);
}
+/* Flow key for an ICMP error sent in reply to skb @ctx: addresses and ports
+ * are swapped to match the reverse direction. See ip_multipath_flow_skb.
+ */
+static void icmp_multipath_flow(struct multipath_flow4 *flow,
+				enum rt_mp_alg_t algo, void *ctx)
+{
+	const struct sk_buff *skb = (const struct sk_buff *)ctx;
+	const struct iphdr *iph = ip_hdr(skb);
+	__be16 _ports[2];
+	const __be16 *ports;
+
+	flow->saddr = iph->daddr;
+	flow->daddr = iph->saddr;
+	flow->ports = 0;
+
+	/* Ports only enter the hash for the L4 algorithm */
+	if (algo != RT_MP_ALG_L4_HASH)
+		return;
+
+	if (unlikely(!(iph->frag_off & htons(IP_DF))))
+		return;
+
+	if (iph->protocol != IPPROTO_TCP && iph->protocol != IPPROTO_UDP &&
+	    iph->protocol != IPPROTO_SCTP)
+		return;
+	ports = skb_header_pointer(skb, iph->ihl * 4, sizeof(_ports), &_ports);
+	if (ports) {
+		flow->sport = ports[1];
+		flow->dport = ports[0];
+	}
+}
+
static struct rtable *icmp_route_lookup(struct net *net,
struct flowi4 *fl4,
struct sk_buff *skb_in,
@@ -463,7 +495,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
fl4->flowi4_oif = vrf_master_ifindex(skb_in->dev) ? : skb_in->dev->ifindex;
security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4));
- rt = __ip_route_output_key(net, fl4);
+ rt = __ip_route_output_key_flow(net, fl4, icmp_multipath_flow, skb_in);
if (IS_ERR(rt))
return rt;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index f50f84f..edbeb56 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1646,37 +1646,82 @@ out:
#ifdef CONFIG_IP_ROUTE_MULTIPATH
/* Fill multipath flow key data based on socket buffer */
-static void ip_multipath_flow_skb(struct multipath_flow4 *flow, void *ctx)
+static void ip_multipath_flow_skb(struct multipath_flow4 *flow,
+ enum rt_mp_alg_t algo, void *ctx)
{
const struct sk_buff *skb = (const struct sk_buff *)ctx;
- const struct iphdr *iph;
+ struct icmphdr _icmph;
+ struct iphdr _inner_iph;
+ const struct iphdr *outer_iph;
+ const struct icmphdr *icmph;
+ const struct iphdr *inner_iph;
+ unsigned int offset;
- iph = ip_hdr(skb);
+ outer_iph = ip_hdr(skb);
- flow->saddr = iph->saddr;
- flow->daddr = iph->daddr;
+ flow->saddr = outer_iph->saddr;
+ flow->daddr = outer_iph->daddr;
flow->ports = 0;
- if (unlikely(!(iph->frag_off & htons(IP_DF))))
- return;
+ offset = outer_iph->ihl * 4;
- if (iph->protocol == IPPROTO_TCP ||
- iph->protocol == IPPROTO_UDP ||
- iph->protocol == IPPROTO_SCTP) {
+ if (algo == RT_MP_ALG_L4_HASH) {
__be16 _ports[2];
const __be16 *ports;
- ports = skb_header_pointer(skb, iph->ihl * 4, sizeof(_ports),
+ if (unlikely(!(outer_iph->frag_off & htons(IP_DF))))
+ return;
+
+ if (outer_iph->protocol != IPPROTO_TCP &&
+ outer_iph->protocol != IPPROTO_UDP &&
+ outer_iph->protocol != IPPROTO_SCTP) {
+ return;
+ }
+
+ ports = skb_header_pointer(skb, offset, sizeof(_ports),
&_ports);
if (ports) {
flow->sport = ports[0];
flow->dport = ports[1];
}
+
+ return;
+ }
+
+ if (outer_iph->protocol != IPPROTO_ICMP)
+ return;
+
+ if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
+ return;
+
+ icmph = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
+ if (!icmph)
+ return;
+
+ if (icmph->type != ICMP_DEST_UNREACH &&
+ icmph->type != ICMP_SOURCE_QUENCH &&
+ icmph->type != ICMP_REDIRECT &&
+ icmph->type != ICMP_TIME_EXCEEDED &&
+ icmph->type != ICMP_PARAMETERPROB) {
+ return;
}
+
+ offset += sizeof(_icmph);
+ inner_iph = skb_header_pointer(skb, offset, sizeof(_inner_iph),
+ &_inner_iph);
+ if (!inner_iph)
+ return;
+
+ /* Since the ICMP payload contains a packet sent from the current
+ * recipient, we swap source and destination addresses
+ */
+ flow->saddr = inner_iph->daddr;
+ flow->daddr = inner_iph->saddr;
}
/* Fill multipath flow key data based on flowi4 */
-static void ip_multipath_flow_fl4(struct multipath_flow4 *flow, void *ctx)
+static void ip_multipath_flow_fl4(struct multipath_flow4 *flow,
+ enum rt_mp_alg_t algo, void *ctx)
{
const struct flowi4 *fl4 = (const struct flowi4 *)ctx;
@@ -2086,7 +2131,9 @@ add:
* Major route resolver routine.
*/
-struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
+struct rtable *__ip_route_output_key_flow(struct net *net, struct flowi4 *fl4,
+ multipath_flow4_func_t flow_func,
+ void *ctx)
{
struct net_device *dev_out = NULL;
__u8 tos = RT_FL_TOS(fl4);
@@ -2248,9 +2295,12 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
- if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
- fib_select_multipath(&res, ip_multipath_flow_fl4, fl4);
- else
+ if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0) {
+ if (flow_func)
+ fib_select_multipath(&res, flow_func, ctx);
+ else
+ fib_select_multipath(&res, ip_multipath_flow_fl4, fl4);
+ } else
#endif
if (!res.prefixlen &&
res.table->tb_num_default > 1 &&
--
2.1.4
next prev parent reply other threads:[~2015-08-28 20:06 UTC|newest]
Thread overview: 13+ messages / expand[flat|nested] mbox.gz Atom feed top
2015-08-28 20:00 [PATCH v2 net-next 0/3] ipv4: Hash-based multipath routing pch
2015-08-28 20:00 ` pch [this message]
[not found] ` <1440792050-2109-1-git-send-email-pch-chEQUL3jiZBWk0Htik3J/w@public.gmane.org>
2015-08-28 20:00 ` [PATCH v2 net-next 1/3] ipv4: Lock-less per-packet multipath pch-chEQUL3jiZBWk0Htik3J/w
2015-08-28 20:00 ` [PATCH v2 net-next 2/3] ipv4: L3 and L4 hash-based multipath routing pch-chEQUL3jiZBWk0Htik3J/w
2015-08-30 22:48 ` Tom Herbert
2015-08-29 20:14 ` [PATCH v2 net-next 0/3] ipv4: Hash-based " David Miller
[not found] ` <20150829.131429.360433621593751136.davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>
2015-08-29 20:31 ` Peter Nørlund
2015-08-29 20:46 ` David Miller
[not found] ` <20150829.134628.1013990034021542524.davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>
2015-08-29 20:55 ` Scott Feldman
2015-08-29 20:59 ` Tom Herbert
2015-08-30 21:28 ` Peter Nørlund
2015-08-30 22:29 ` Tom Herbert
2015-08-31 9:02 ` Thomas Graf
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1440792050-2109-4-git-send-email-pch@ordbogen.com \
--to=pch@ordbogen.com \
--cc=davem@davemloft.net \
--cc=ebiederm@xmission.com \
--cc=jbenc@redhat.com \
--cc=jmorris@namei.org \
--cc=kaber@trash.net \
--cc=kuznet@ms2.inr.ac.ru \
--cc=linux-api@vger.kernel.org \
--cc=netdev@vger.kernel.org \
--cc=nicolas.dichtel@6wind.com \
--cc=roopa@cumulusnetworks.com \
--cc=sfeldma@gmail.com \
--cc=tgraf@suug.ch \
--cc=yoshfuji@linux-ipv6.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox