From: Peter Oskolkov <posk@google.com>
To: Alexei Starovoitov <ast@kernel.org>,
Daniel Borkmann <daniel@iogearbox.net>,
netdev@vger.kernel.org
Cc: Peter Oskolkov <posk@posk.io>, David Ahern <dsahern@gmail.com>,
Willem de Bruijn <willemb@google.com>,
Peter Oskolkov <posk@google.com>
Subject: [PATCH bpf-next v11 5/7] bpf: add handling of BPF_LWT_REROUTE to lwt_bpf.c
Date: Wed, 13 Feb 2019 11:53:39 -0800 [thread overview]
Message-ID: <20190213195341.184969-6-posk@google.com> (raw)
In-Reply-To: <20190213195341.184969-1-posk@google.com>
This patch builds on top of the previous patch in the patchset,
which added BPF_LWT_ENCAP_IP mode to bpf_lwt_push_encap. As the
encapping can result in the skb needing to go via a different
interface/route/dst, bpf programs can indicate this by returning
BPF_LWT_REROUTE, which triggers a new route lookup for the skb.
v8 changes: fix kbuild errors when LWTUNNEL_BPF is builtin, but
IPV6 is a module: as LWTUNNEL_BPF can only be either Y or N,
call IPV6 routing functions only if they are built-in.
v9 changes:
- fixed a kbuild test robot compiler warning;
- call IPV6 routing functions via ipv6_stub.
v10 changes: removed unnecessary IS_ENABLED and pr_warn_once.
v11 changes: fixed a potential dst leak.
Signed-off-by: Peter Oskolkov <posk@google.com>
---
net/core/lwt_bpf.c | 126 ++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 124 insertions(+), 2 deletions(-)
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index 079871fc020f..32251f3fcda0 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -17,6 +17,7 @@
#include <linux/bpf.h>
#include <net/lwtunnel.h>
#include <net/gre.h>
+#include <net/ip6_route.h>
struct bpf_lwt_prog {
struct bpf_prog *prog;
@@ -56,6 +57,7 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
switch (ret) {
case BPF_OK:
+ case BPF_LWT_REROUTE:
break;
case BPF_REDIRECT:
@@ -88,6 +90,30 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
return ret;
}
+static int bpf_lwt_input_reroute(struct sk_buff *skb)
+{
+ int err = -EINVAL;
+
+ if (skb->protocol == htons(ETH_P_IP)) {
+ struct iphdr *iph = ip_hdr(skb);
+
+ err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
+ iph->tos, skb_dst(skb)->dev);
+ } else if (skb->protocol == htons(ETH_P_IPV6)) {
+ err = ipv6_stub->ipv6_route_input(skb);
+ } else {
+ err = -EAFNOSUPPORT;
+ }
+
+ if (err)
+ goto err;
+ return dst_input(skb);
+
+err:
+ kfree_skb(skb);
+ return err;
+}
+
static int bpf_input(struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
@@ -99,11 +125,11 @@ static int bpf_input(struct sk_buff *skb)
ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT);
if (ret < 0)
return ret;
+ if (ret == BPF_LWT_REROUTE)
+ return bpf_lwt_input_reroute(skb);
}
if (unlikely(!dst->lwtstate->orig_input)) {
- pr_warn_once("orig_input not set on dst for prog %s\n",
- bpf->out.name);
kfree_skb(skb);
return -EINVAL;
}
@@ -148,6 +174,91 @@ static int xmit_check_hhlen(struct sk_buff *skb)
return 0;
}
+static int bpf_lwt_xmit_reroute(struct sk_buff *skb)
+{
+ struct net_device *l3mdev = l3mdev_master_dev_rcu(skb_dst(skb)->dev);
+ int oif = l3mdev ? l3mdev->ifindex : 0;
+ struct dst_entry *dst = NULL;
+ struct sock *sk;
+ struct net *net;
+ bool ipv4;
+ int err;
+
+ if (skb->protocol == htons(ETH_P_IP))
+ ipv4 = true;
+ else if (skb->protocol == htons(ETH_P_IPV6))
+ ipv4 = false;
+ else
+ return -EAFNOSUPPORT;
+
+ sk = sk_to_full_sk(skb->sk);
+ if (sk) {
+ if (sk->sk_bound_dev_if)
+ oif = sk->sk_bound_dev_if;
+ net = sock_net(sk);
+ } else {
+ net = dev_net(skb_dst(skb)->dev);
+ }
+
+ if (ipv4) {
+ struct iphdr *iph = ip_hdr(skb);
+ struct flowi4 fl4 = {};
+ struct rtable *rt;
+
+ fl4.flowi4_oif = oif;
+ fl4.flowi4_mark = skb->mark;
+ fl4.flowi4_uid = sock_net_uid(net, sk);
+ fl4.flowi4_tos = RT_TOS(iph->tos);
+ fl4.flowi4_flags = FLOWI_FLAG_ANYSRC;
+ fl4.flowi4_proto = iph->protocol;
+ fl4.daddr = iph->daddr;
+ fl4.saddr = iph->saddr;
+
+ rt = ip_route_output_key(net, &fl4);
+ if (IS_ERR(rt))
+ return -EINVAL;
+ dst = &rt->dst;
+ } else {
+ struct ipv6hdr *iph6 = ipv6_hdr(skb);
+ struct flowi6 fl6 = {};
+
+ fl6.flowi6_oif = oif;
+ fl6.flowi6_mark = skb->mark;
+ fl6.flowi6_uid = sock_net_uid(net, sk);
+ fl6.flowlabel = ip6_flowinfo(iph6);
+ fl6.flowi6_proto = iph6->nexthdr;
+ fl6.daddr = iph6->daddr;
+ fl6.saddr = iph6->saddr;
+
+ err = ipv6_stub->ipv6_dst_lookup(net, skb->sk, &dst, &fl6);
+ if (err || IS_ERR(dst))
+ return -EINVAL;
+ }
+ if (unlikely(dst->error)) {
+ dst_release(dst);
+ return -EINVAL;
+ }
+
+ /* Although skb header was reserved in bpf_lwt_push_ip_encap(), it
+ * was done for the previous dst, so we are doing it here again, in
+ * case the new dst needs much more space. The call below is a noop
+ * if there is enough header space in skb.
+ */
+ err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev));
+ if (unlikely(err))
+ return err;
+
+ skb_dst_drop(skb);
+ skb_dst_set(skb, dst);
+
+ err = dst_output(dev_net(skb_dst(skb)->dev), skb->sk, skb);
+ if (unlikely(err))
+ return err;
+
+ /* ip[6]_finish_output2 understand LWTUNNEL_XMIT_DONE */
+ return LWTUNNEL_XMIT_DONE;
+}
+
static int bpf_xmit(struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
@@ -155,11 +266,20 @@ static int bpf_xmit(struct sk_buff *skb)
bpf = bpf_lwt_lwtunnel(dst->lwtstate);
if (bpf->xmit.prog) {
+ __be16 proto = skb->protocol;
int ret;
ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT);
switch (ret) {
case BPF_OK:
+ /* If the header changed, e.g. via bpf_lwt_push_encap,
+ * BPF_LWT_REROUTE below should have been used if the
+ * protocol was also changed.
+ */
+ if (skb->protocol != proto) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
/* If the header was expanded, headroom might be too
* small for L2 header to come, expand as needed.
*/
@@ -170,6 +290,8 @@ static int bpf_xmit(struct sk_buff *skb)
return LWTUNNEL_XMIT_CONTINUE;
case BPF_REDIRECT:
return LWTUNNEL_XMIT_DONE;
+ case BPF_LWT_REROUTE:
+ return bpf_lwt_xmit_reroute(skb);
default:
return ret;
}
--
2.20.1.791.gb4d0f1c61a-goog
next prev parent reply other threads:[~2019-02-13 19:54 UTC|newest]
Thread overview: 15+ messages / expand[flat|nested] mbox.gz Atom feed top
2019-02-13 19:53 [PATCH bpf-next v11 0/7] bpf: add BPF_LWT_ENCAP_IP option to bpf_lwt_push_encap Peter Oskolkov
2019-02-13 19:53 ` [PATCH bpf-next v11 1/7] bpf: add plumbing for BPF_LWT_ENCAP_IP in bpf_lwt_push_encap Peter Oskolkov
2019-02-13 19:53 ` [PATCH bpf-next v11 2/7] bpf: implement BPF_LWT_ENCAP_IP mode " Peter Oskolkov
2019-02-13 19:53 ` [PATCH bpf-next v11 3/7] bpf: handle GSO " Peter Oskolkov
2019-02-13 19:53 ` [PATCH bpf-next v11 4/7] ipv6_stub: add ipv6_route_input stub/proxy Peter Oskolkov
2019-02-13 19:53 ` Peter Oskolkov [this message]
2019-02-13 19:53 ` [PATCH bpf-next v11 6/7] bpf: sync <kdir>/include/.../bpf.h with tools/include/.../bpf.h Peter Oskolkov
2019-02-13 19:53 ` [PATCH bpf-next v11 7/7] selftests: bpf: add test_lwt_ip_encap selftest Peter Oskolkov
2019-02-14 6:03 ` Stanislav Fomichev
2019-02-14 0:46 ` [PATCH bpf-next v11 0/7] bpf: add BPF_LWT_ENCAP_IP option to bpf_lwt_push_encap David Ahern
2019-02-14 2:39 ` Alexei Starovoitov
2019-02-14 3:44 ` David Ahern
2019-02-14 4:21 ` Alexei Starovoitov
2019-02-14 5:36 ` Peter Oskolkov
2019-02-14 6:11 ` Peter Oskolkov
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20190213195341.184969-6-posk@google.com \
--to=posk@google.com \
--cc=ast@kernel.org \
--cc=daniel@iogearbox.net \
--cc=dsahern@gmail.com \
--cc=netdev@vger.kernel.org \
--cc=posk@posk.io \
--cc=willemb@google.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.