Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next 01/10] udp: expose inet cork to udp
From: Willem de Bruijn @ 2018-04-25 18:55 UTC (permalink / raw)
  To: netdev; +Cc: davem, alexander.duyck, dmichail, Willem de Bruijn
In-Reply-To: <20180425185601.55511-1-willemdebruijn.kernel@gmail.com>

From: Willem de Bruijn <willemb@google.com>

UDP segmentation offload needs access to inet_cork in the udp layer.
Pass the struct to ip(6)_make_skb instead of allocating it on the
stack in that function itself.

This patch is a noop otherwise.

Signed-off-by: Willem de Bruijn <willemb@google.com>
---
 include/net/ip.h      |  2 +-
 include/net/ipv6.h    |  1 +
 net/ipv4/ip_output.c  | 17 ++++++++---------
 net/ipv4/udp.c        |  4 +++-
 net/ipv6/ip6_output.c | 20 ++++++++++----------
 net/ipv6/udp.c        |  3 ++-
 6 files changed, 25 insertions(+), 22 deletions(-)

diff --git a/include/net/ip.h b/include/net/ip.h
index dc4a2d6e58a5..7ec543a64bbc 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -171,7 +171,7 @@ struct sk_buff *ip_make_skb(struct sock *sk, struct flowi4 *fl4,
 					int len, int odd, struct sk_buff *skb),
 			    void *from, int length, int transhdrlen,
 			    struct ipcm_cookie *ipc, struct rtable **rtp,
-			    unsigned int flags);
+			    struct inet_cork *cork, unsigned int flags);
 
 static inline struct sk_buff *ip_finish_skb(struct sock *sk, struct flowi4 *fl4)
 {
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 68b167d98879..0dd722cab037 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -950,6 +950,7 @@ struct sk_buff *ip6_make_skb(struct sock *sk,
 			     void *from, int length, int transhdrlen,
 			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
 			     struct rt6_info *rt, unsigned int flags,
+			     struct inet_cork_full *cork,
 			     const struct sockcm_cookie *sockc);
 
 static inline struct sk_buff *ip6_finish_skb(struct sock *sk)
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 83c73bab2c3d..2883ff1e909c 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1470,9 +1470,8 @@ struct sk_buff *ip_make_skb(struct sock *sk,
 					int len, int odd, struct sk_buff *skb),
 			    void *from, int length, int transhdrlen,
 			    struct ipcm_cookie *ipc, struct rtable **rtp,
-			    unsigned int flags)
+			    struct inet_cork *cork, unsigned int flags)
 {
-	struct inet_cork cork;
 	struct sk_buff_head queue;
 	int err;
 
@@ -1481,22 +1480,22 @@ struct sk_buff *ip_make_skb(struct sock *sk,
 
 	__skb_queue_head_init(&queue);
 
-	cork.flags = 0;
-	cork.addr = 0;
-	cork.opt = NULL;
-	err = ip_setup_cork(sk, &cork, ipc, rtp);
+	cork->flags = 0;
+	cork->addr = 0;
+	cork->opt = NULL;
+	err = ip_setup_cork(sk, cork, ipc, rtp);
 	if (err)
 		return ERR_PTR(err);
 
-	err = __ip_append_data(sk, fl4, &queue, &cork,
+	err = __ip_append_data(sk, fl4, &queue, cork,
 			       &current->task_frag, getfrag,
 			       from, length, transhdrlen, flags);
 	if (err) {
-		__ip_flush_pending_frames(sk, &queue, &cork);
+		__ip_flush_pending_frames(sk, &queue, cork);
 		return ERR_PTR(err);
 	}
 
-	return __ip_make_skb(sk, fl4, &queue, &cork);
+	return __ip_make_skb(sk, fl4, &queue, cork);
 }
 
 /*
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 24b5c59b1c53..6b9d8017b319 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1030,9 +1030,11 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 
 	/* Lockless fast path for the non-corking case. */
 	if (!corkreq) {
+		struct inet_cork cork;
+
 		skb = ip_make_skb(sk, fl4, getfrag, msg, ulen,
 				  sizeof(struct udphdr), &ipc, &rt,
-				  msg->msg_flags);
+				  &cork, msg->msg_flags);
 		err = PTR_ERR(skb);
 		if (!IS_ERR_OR_NULL(skb))
 			err = udp_send_skb(skb, fl4);
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 6a477d54f8c7..7fa1db447405 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1755,9 +1755,9 @@ struct sk_buff *ip6_make_skb(struct sock *sk,
 			     void *from, int length, int transhdrlen,
 			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
 			     struct rt6_info *rt, unsigned int flags,
+			     struct inet_cork_full *cork,
 			     const struct sockcm_cookie *sockc)
 {
-	struct inet_cork_full cork;
 	struct inet6_cork v6_cork;
 	struct sk_buff_head queue;
 	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
@@ -1768,27 +1768,27 @@ struct sk_buff *ip6_make_skb(struct sock *sk,
 
 	__skb_queue_head_init(&queue);
 
-	cork.base.flags = 0;
-	cork.base.addr = 0;
-	cork.base.opt = NULL;
-	cork.base.dst = NULL;
+	cork->base.flags = 0;
+	cork->base.addr = 0;
+	cork->base.opt = NULL;
+	cork->base.dst = NULL;
 	v6_cork.opt = NULL;
-	err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
+	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
 	if (err) {
-		ip6_cork_release(&cork, &v6_cork);
+		ip6_cork_release(cork, &v6_cork);
 		return ERR_PTR(err);
 	}
 	if (ipc6->dontfrag < 0)
 		ipc6->dontfrag = inet6_sk(sk)->dontfrag;
 
-	err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
+	err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
 				&current->task_frag, getfrag, from,
 				length + exthdrlen, transhdrlen + exthdrlen,
 				flags, ipc6, sockc);
 	if (err) {
-		__ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
+		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
 		return ERR_PTR(err);
 	}
 
-	return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
+	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
 }
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 4ec76a87aeb8..824797f8d1ab 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1324,12 +1324,13 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 
 	/* Lockless fast path for the non-corking case */
 	if (!corkreq) {
+		struct inet_cork_full cork;
 		struct sk_buff *skb;
 
 		skb = ip6_make_skb(sk, getfrag, msg, ulen,
 				   sizeof(struct udphdr), &ipc6,
 				   &fl6, (struct rt6_info *)dst,
-				   msg->msg_flags, &sockc);
+				   msg->msg_flags, &cork, &sockc);
 		err = PTR_ERR(skb);
 		if (!IS_ERR_OR_NULL(skb))
 			err = udp_v6_send_skb(skb, &fl6);
-- 
2.17.0.441.gb46fe60e1d-goog

^ permalink raw reply related

* [PATCH net-next 00/10] udp gso
From: Willem de Bruijn @ 2018-04-25 18:55 UTC (permalink / raw)
  To: netdev; +Cc: davem, alexander.duyck, dmichail, Willem de Bruijn

From: Willem de Bruijn <willemb@google.com>

Segmentation offload reduces cycles/byte for large packets by
amortizing the cost of protocol stack traversal.

This patchset implements GSO for UDP. A process can concatenate and
submit multiple datagrams to the same destination in one send call
by setting socket option SOL_UDP/UDP_SEGMENT with the segment size,
or passing an analogous cmsg at send time.

The stack will send the entire large (up to network layer max size)
datagram through the protocol layer. At the GSO layer, it is broken
up in individual segments. All receive the same network layer header
and UDP src and dst port. All but the last segment have the same UDP
header, but the last may differ in length and checksum.

Initial results show a significant reduction in UDP cycles/byte.
See the main patch for more details and benchmark results.

        udp
          876 MB/s 14873 msg/s 624666 calls/s
            11,205,777,429      cycles
    
        udp gso
         2139 MB/s 36282 msg/s 36282 calls/s
            11,204,374,561      cycles

The patch set is broken down as follows:
- patch 1 is a prerequisite: code rearrangement, noop otherwise
- patch 2 is the core feature
- patch 3,4,6 are refinements
- patch 5 adds the cmsg interface
- patch 7..10 are tests

This idea was presented previously at netconf 2017-2
http://vger.kernel.org/netconf2017_files/rx_hardening_and_udp_gso.pdf

Changes RFC -> v1
  - MSG_MORE:
      fixed, by allowing checksum offload with corking if gso
  - SKB_GSO_UDP_L4:
      made independent from SKB_GSO_UDP
      and removed skb_is_ufo() wrapper
  - NETIF_F_GSO_UDP_L4:
      add to netdev_features_string
      and to netdev-features.txt
      add BUILD_BUG_ON to match SKB_GSO_UDP_L4 value
  - UDP_MAX_SEGMENTS:
      introduce limit on number of segments per gso skb
      to avoid extreme cases like IP_MAX_MTU/IPV4_MIN_MTU
  - CHECKSUM_PARTIAL:
      test against missing feature after ndo_features_check
      if not supported return error, analogous to udp_send_check
  - MSG_ZEROCOPY: removed, deferred for now

Willem de Bruijn (11):
  udp: expose inet cork to udp
  udp: add gso
  udp: better wmem accounting on gso
  udp: paged allocation with gso
  udp: add gso segment cmsg
  udp: add gso support to virtual devices
  selftests: udp gso
  selftests: udp gso with connected sockets
  selftests: udp gso with corking
  selftests: udp gso benchmark

 Documentation/networking/netdev-features.txt  |   7 +
 include/linux/netdev_features.h               |   5 +-
 include/linux/netdevice.h                     |   1 +
 include/linux/skbuff.h                        |   2 +
 include/linux/udp.h                           |   3 +
 include/net/inet_sock.h                       |   1 +
 include/net/ip.h                              |   3 +-
 include/net/ipv6.h                            |   2 +
 include/net/udp.h                             |   5 +
 include/uapi/linux/udp.h                      |   1 +
 net/core/ethtool.c                            |   1 +
 net/core/skbuff.c                             |   2 +
 net/ipv4/ip_output.c                          |  41 +-
 net/ipv4/udp.c                                |  80 ++-
 net/ipv4/udp_offload.c                        |  68 +-
 net/ipv6/ip6_offload.c                        |   6 +-
 net/ipv6/ip6_output.c                         |  45 +-
 net/ipv6/udp.c                                |  31 +-
 net/ipv6/udp_offload.c                        |  19 +-
 tools/testing/selftests/net/.gitignore        |   3 +
 tools/testing/selftests/net/Makefile          |   4 +-
 tools/testing/selftests/net/udpgso.c          | 621 ++++++++++++++++++
 tools/testing/selftests/net/udpgso.sh         |  29 +
 tools/testing/selftests/net/udpgso_bench.sh   |  74 +++
 tools/testing/selftests/net/udpgso_bench_rx.c | 265 ++++++++
 tools/testing/selftests/net/udpgso_bench_tx.c | 420 ++++++++++++
 26 files changed, 1688 insertions(+), 51 deletions(-)
 create mode 100644 tools/testing/selftests/net/udpgso.c
 create mode 100755 tools/testing/selftests/net/udpgso.sh
 create mode 100755 tools/testing/selftests/net/udpgso_bench.sh
 create mode 100644 tools/testing/selftests/net/udpgso_bench_rx.c
 create mode 100644 tools/testing/selftests/net/udpgso_bench_tx.c

-- 
2.17.0.441.gb46fe60e1d-goog

^ permalink raw reply

* Re: [PATCH 0/3] Introduce LSM-hook for socketpair(2)
From: Paul Moore @ 2018-04-25 18:51 UTC (permalink / raw)
  To: James Morris
  Cc: David Herrmann, linux-kernel, teg, Stephen Smalley, selinux,
	linux-security-module, Eric Paris, Serge E. Hallyn,
	David S. Miller, netdev
In-Reply-To: <alpine.LRH.2.21.1804260443110.11568@namei.org>

On Wed, Apr 25, 2018 at 2:44 PM, James Morris <jmorris@namei.org> wrote:
> On Mon, 23 Apr 2018, David Herrmann wrote:
>> This patch series tries to close this gap and makes both behave the
>> same. A new LSM-hook is added which allows LSMs to cache the correct
>> peer information on newly created socket-pairs.
>
> Looks okay to me.
>
> Once it's respun with the Smack backend and maybe the hook name change,
> I'll merge this unless DaveM wants it to go in via his networking tree.

Note my objection to the hook placement in patch 2/3; I think we
should move the hook out of the AF_UNIX layer and up into the socket
layer.

-- 
paul moore
www.paul-moore.com

^ permalink raw reply

* Re: [PATCH 0/3] Introduce LSM-hook for socketpair(2)
From: David Miller @ 2018-04-25 18:48 UTC (permalink / raw)
  To: jmorris
  Cc: dh.herrmann, linux-kernel, paul, teg, sds, selinux,
	linux-security-module, eparis, serge, netdev
In-Reply-To: <alpine.LRH.2.21.1804260443110.11568@namei.org>

From: James Morris <jmorris@namei.org>
Date: Thu, 26 Apr 2018 04:44:38 +1000 (AEST)

> On Mon, 23 Apr 2018, David Herrmann wrote:
> 
>> This patch series tries to close this gap and makes both behave the
>> same. A new LSM-hook is added which allows LSMs to cache the correct
>> peer information on newly created socket-pairs.
> 
> Looks okay to me.
> 
> Once it's respun with the Smack backend and maybe the hook name change, 
> I'll merge this unless DaveM wants it to go in via his networking tree.

No, feel free to take it via your tree James.

^ permalink raw reply

* Re: [Cake] [PATCH net-next v3] Add Common Applications Kept Enhanced (cake) qdisc
From: David Miller @ 2018-04-25 18:48 UTC (permalink / raw)
  To: toke; +Cc: eric.dumazet, chromatix99, netdev, cake
In-Reply-To: <87lgdb5e4w.fsf@toke.dk>

From: Toke Høiland-Jørgensen <toke@toke.dk>
Date: Wed, 25 Apr 2018 20:34:23 +0200

> Is it possible to get the sizes of the individual segments
> of a GSO packet? That way we could do the calculation for the whole
> super-packet...

Yes it is.

Otherwise the software GSO fallback wouldn't even be possible.

^ permalink raw reply

* Re: [PATCH net-next v3] Add Common Applications Kept Enhanced (cake) qdisc
From: Toke Høiland-Jørgensen @ 2018-04-25 18:46 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, cake, dave.taht
In-Reply-To: <20180425.143948.235979083753006.davem@davemloft.net>

David Miller <davem@davemloft.net> writes:

> From: Toke Høiland-Jørgensen <toke@toke.dk>
> Date: Wed, 25 Apr 2018 15:42:48 +0200
>
>> +static void *cake_zalloc(size_t sz)
>> +{
>> +	void *ptr = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN);
>> +
>> +	if (!ptr)
>> +		ptr = vzalloc(sz);
>> +	return ptr;
>> +}
>
> This is just kvzalloc(sz, GFP_KERNEL | __GFP_NOWARN)?

Ah yes; we actually fixed that before, but guess it got lost this time
around. Will fix :)

-Toke

^ permalink raw reply

* Re: [PATCH 0/3] Introduce LSM-hook for socketpair(2)
From: James Morris @ 2018-04-25 18:44 UTC (permalink / raw)
  To: David Herrmann
  Cc: linux-kernel, Paul Moore, teg, Stephen Smalley, selinux,
	linux-security-module, Eric Paris, Serge E. Hallyn,
	David S. Miller, netdev
In-Reply-To: <20180423133015.5455-1-dh.herrmann@gmail.com>

On Mon, 23 Apr 2018, David Herrmann wrote:

> This patch series tries to close this gap and makes both behave the
> same. A new LSM-hook is added which allows LSMs to cache the correct
> peer information on newly created socket-pairs.

Looks okay to me.

Once it's respun with the Smack backend and maybe the hook name change, 
I'll merge this unless DaveM wants it to go in via his networking tree.


-- 
James Morris
<jmorris@namei.org>

^ permalink raw reply

* Re: [PATCH net-next v3] Add Common Applications Kept Enhanced (cake) qdisc
From: David Miller @ 2018-04-25 18:39 UTC (permalink / raw)
  To: toke; +Cc: netdev, cake, dave.taht
In-Reply-To: <20180425134249.21300-1-toke@toke.dk>

From: Toke Høiland-Jørgensen <toke@toke.dk>
Date: Wed, 25 Apr 2018 15:42:48 +0200

> +static void *cake_zalloc(size_t sz)
> +{
> +	void *ptr = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN);
> +
> +	if (!ptr)
> +		ptr = vzalloc(sz);
> +	return ptr;
> +}

This is just kvzalloc(sz, GFP_KERNEL | __GFP_NOWARN)?

^ permalink raw reply

* Re: [PATCH net 1/1] net/smc: keep clcsock reference in smc_tcp_listen_work()
From: David Miller @ 2018-04-25 18:38 UTC (permalink / raw)
  To: ubraun; +Cc: netdev, linux-s390, schwidefsky, heiko.carstens, raspl, ubraun
In-Reply-To: <20180425104858.48953-1-ubraun@linux.ibm.com>

From: Ursula Braun <ubraun@linux.ibm.com>
Date: Wed, 25 Apr 2018 12:48:58 +0200

> The internal CLC socket should exist till the SMC-socket is released.
> Function tcp_listen_worker() releases the internal CLC socket of a
> listen socket, if an smc_close_active() is called. This function
> is called for the final release(), but it is called for shutdown
> SHUT_RDWR as well. This opens a door for protection faults, if
> socket calls using the internal CLC socket are called for a
> shutdown listen socket.
> 
> With the changes of
> commit 3d502067599f ("net/smc: simplify wait when closing listen socket")
> there is no need anymore to release the internal CLC socket in
> function tcp_listen_worker((). It is sufficient to release it in
> smc_release().
> 
> Fixes: 127f49705823 ("net/smc: release clcsock from tcp_listen_worker")
> Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
> Reported-by: syzbot+9045fc589fcd196ef522@syzkaller.appspotmail.com
> Reported-by: syzbot+28a2c86cf19c81d871fa@syzkaller.appspotmail.com
> Reported-by: syzbot+9605e6cace1b5efd4a0a@syzkaller.appspotmail.com
> Reported-by: syzbot+cf9012c597c8379d535c@syzkaller.appspotmail.com

Applied and queued up for -stable.

^ permalink raw reply

* Re: [PATCH 04/17] asm-generic: Remove unneeded __ARCH_WANT_SYS_LLSEEK macro
From: Geert Uytterhoeven @ 2018-04-25 18:36 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Linux-Arch, Eric W. Biederman, libc-alpha, y2038 Mailman List,
	Linux API, Linux Kernel Mailing List, Dominik Brodowski,
	Albert ARIBAUD (3ADEV), Peter Zijlstra, Deepa Dinamani, netdev,
	Darren Hart, Thomas Gleixner, Al Viro
In-Reply-To: <20180425160311.2718314-5-arnd@arndb.de>

On Wed, Apr 25, 2018 at 6:02 PM, Arnd Bergmann <arnd@arndb.de> wrote:
> The sys_llseek sytem call is needed on all 32-bit architectures and
> none of the 64-bit ones, so we can remove the __ARCH_WANT_SYS_LLSEEK guard
> and simplify the include/asm-generic/unistd.h header further.
>
> Since 32-bit tasks can run either natively or in compat mode on 64-bit
> architectures, we have to check for both !CONFIG_64BIT and CONFIG_COMPAT.
>
> There are a few 64-bit architectures that also reference sys_llseek
> in their 64-bit ABI (e.g. sparc), but I verified that those all
> select CONFIG_COMPAT, so the #if check is still correct here. It's
> a bit odd to include it in the syscall table though, as it's the
> same as sys_lseek() on 64-bit, but with strange calling conventions.
>
> Signed-off-by: Arnd Bergmann <arnd@arndb.de>

For m68k:
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>

Gr{oetje,eeting}s,

                        Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds
_______________________________________________
Y2038 mailing list
Y2038@lists.linaro.org
https://lists.linaro.org/mailman/listinfo/y2038

^ permalink raw reply

* Re: [PATCH 02/17] y2038: Remove newstat family from default syscall set
From: Geert Uytterhoeven @ 2018-04-25 18:36 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Linux-Arch, Eric W. Biederman, libc-alpha, y2038 Mailman List,
	Linux API, Linux Kernel Mailing List, Dominik Brodowski,
	Albert ARIBAUD (3ADEV), Peter Zijlstra, Deepa Dinamani, netdev,
	Darren Hart, Thomas Gleixner, Al Viro
In-Reply-To: <20180425160311.2718314-3-arnd@arndb.de>

On Wed, Apr 25, 2018 at 6:02 PM, Arnd Bergmann <arnd@arndb.de> wrote:
> We have four generations of stat() syscalls:
> - the oldstat syscalls that are only used on the older architectures
> - the newstat family that is used on all 64-bit architectures but
>   lacked support for large files on 32-bit architectures.
> - the stat64 family that is used mostly on 32-bit architectures to
>   replace newstat
> - statx() to replace all of the above, adding 64-bit timestamps among
>   other things.
>
> We already compile stat64 only on those architectures that need it,
> but newstat is always built, including on those that don't reference
> it. This adds a new __ARCH_WANT_NEW_STAT symbol along the lines of
> __ARCH_WANT_OLD_STAT and __ARCH_WANT_STAT64 to control compilation of
> newstat. All architectures that need it use an explict define, the
> others now get a little bit smaller, and future architecture (including
> 64-bit targets) won't ever see it.
>
> Signed-off-by: Arnd Bergmann <arnd@arndb.de>

For m68k:
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>

Gr{oetje,eeting}s,

                        Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds
_______________________________________________
Y2038 mailing list
Y2038@lists.linaro.org
https://lists.linaro.org/mailman/listinfo/y2038

^ permalink raw reply

* Re: [PATCH net-next v3] Add Common Applications Kept Enhanced (cake) qdisc
From: Toke Høiland-Jørgensen @ 2018-04-25 18:35 UTC (permalink / raw)
  To: Eric Dumazet, netdev; +Cc: cake, Dave Taht
In-Reply-To: <c3412b75-156d-3e2c-0f3f-781f2f8bb042@gmail.com>

Eric Dumazet <eric.dumazet@gmail.com> writes:

> On 04/25/2018 09:17 AM, Toke Høiland-Jørgensen wrote:
>
>> Or am I to interpret that as a hard NAK on having this feature in CAKE
>> (even if we fix the issues you pointed out)?
>
> No strong NACK, as long as the current code is fixed.
>
> Right now, a malicious packet will kill the box or something bad like
> that.

Yeah, that's not good, obviously. I may just pull it out for now and add
it back in a separate patch once we've had time to fix it :)

-Toke

^ permalink raw reply

* [RFC bpf-next 9/9] samples/bpf: Add examples of ipv4 and ipv6 forwarding in XDP
From: David Ahern @ 2018-04-25 18:34 UTC (permalink / raw)
  To: netdev, borkmann, ast
  Cc: shm, roopa, brouer, toke, john.fastabend, David Ahern
In-Reply-To: <20180425183449.25134-1-dsahern@gmail.com>

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 samples/bpf/Makefile                      |   4 +
 samples/bpf/xdp_fwd_kern.c                | 110 ++++++++++++++++++++++++
 samples/bpf/xdp_fwd_user.c                | 136 ++++++++++++++++++++++++++++++
 tools/testing/selftests/bpf/bpf_helpers.h |   3 +
 4 files changed, 253 insertions(+)
 create mode 100644 samples/bpf/xdp_fwd_kern.c
 create mode 100644 samples/bpf/xdp_fwd_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index aa8c392e2e52..c90d2ff74873 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -45,6 +45,7 @@ hostprogs-y += xdp_rxq_info
 hostprogs-y += syscall_tp
 hostprogs-y += cpustat
 hostprogs-y += xdp_adjust_tail
+hostprogs-y += xdp_fwd
 
 # Libbpf dependencies
 LIBBPF := ../../tools/lib/bpf/bpf.o ../../tools/lib/bpf/nlattr.o
@@ -97,6 +98,7 @@ xdp_rxq_info-objs := bpf_load.o $(LIBBPF) xdp_rxq_info_user.o
 syscall_tp-objs := bpf_load.o $(LIBBPF) syscall_tp_user.o
 cpustat-objs := bpf_load.o $(LIBBPF) cpustat_user.o
 xdp_adjust_tail-objs := bpf_load.o $(LIBBPF) xdp_adjust_tail_user.o
+xdp_fwd-objs := bpf_load.o $(LIBBPF) xdp_fwd_user.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -151,6 +153,7 @@ always += xdp2skb_meta_kern.o
 always += syscall_tp_kern.o
 always += cpustat_kern.o
 always += xdp_adjust_tail_kern.o
+always += xdp_fwd_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 HOSTCFLAGS += -I$(srctree)/tools/lib/
@@ -197,6 +200,7 @@ HOSTLOADLIBES_xdp_rxq_info += -lelf
 HOSTLOADLIBES_syscall_tp += -lelf
 HOSTLOADLIBES_cpustat += -lelf
 HOSTLOADLIBES_xdp_adjust_tail += -lelf
+HOSTLOADLIBES_xdp_fwd += -lelf
 
 # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline:
 #  make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang
diff --git a/samples/bpf/xdp_fwd_kern.c b/samples/bpf/xdp_fwd_kern.c
new file mode 100644
index 000000000000..4012c073fc45
--- /dev/null
+++ b/samples/bpf/xdp_fwd_kern.c
@@ -0,0 +1,110 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2017-18 David Ahern <dsahern@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#define KBUILD_MODNAME "foo"
+#include <uapi/linux/bpf.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/if_vlan.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+
+#include "bpf_helpers.h"
+
+#define IPV6_FLOWINFO_MASK              cpu_to_be32(0x0FFFFFFF)
+
+struct bpf_map_def SEC("maps") tx_port = {
+	.type = BPF_MAP_TYPE_DEVMAP,
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.max_entries = 64,
+};
+
+static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags)
+{
+	void *data_end = (void *)(long)ctx->data_end;
+	void *data = (void *)(long)ctx->data;
+	struct bpf_fib_lookup fib_params;
+	struct ethhdr *eth = data;
+	int out_index;
+	u16 h_proto;
+	u64 nh_off;
+
+	nh_off = sizeof(*eth);
+	if (data + nh_off > data_end)
+		return XDP_DROP;
+
+	__builtin_memset(&fib_params, 0, sizeof(fib_params));
+
+	h_proto = eth->h_proto;
+	if (h_proto == htons(ETH_P_IP)) {
+		struct iphdr *iph = data + nh_off;
+
+		if (iph + 1 > data_end)
+			return XDP_DROP;
+
+		fib_params.family	= AF_INET;
+		fib_params.tos		= iph->tos;
+		fib_params.l4_protocol	= iph->protocol;
+		fib_params.sport	= 0;
+		fib_params.dport	= 0;
+		fib_params.tot_len	= ntohs(iph->tot_len);
+		fib_params.ipv4_src	= iph->saddr;
+		fib_params.ipv4_dst	= iph->daddr;
+	} else if (h_proto == htons(ETH_P_IPV6)) {
+		struct ipv6hdr *iph = data + nh_off;
+
+		if (iph + 1 > data_end)
+			return XDP_DROP;
+
+		fib_params.family	= AF_INET6;
+		fib_params.flowlabel	= *(__be32 *)iph & IPV6_FLOWINFO_MASK;
+		fib_params.l4_protocol	= iph->nexthdr;
+		fib_params.sport	= 0;
+		fib_params.dport	= 0;
+		fib_params.tot_len	= ntohs(iph->payload_len);
+		fib_params.ipv6_src	= iph->saddr;
+		fib_params.ipv6_dst	= iph->daddr;
+	} else {
+		return XDP_PASS;
+	}
+
+	fib_params.ifindex = ctx->ingress_ifindex;
+
+	out_index = bpf_fib_lookup(ctx, &fib_params, sizeof(fib_params), flags);
+
+	/* verify egress index has xdp support */
+	// TO-DO bpf_map_lookup_elem(&tx_port, &key) fails with
+	//       cannot pass map_type 14 into func bpf_map_lookup_elem#1:
+	if (out_index > 0) {
+		memcpy(eth->h_dest, fib_params.dmac, ETH_ALEN);
+		memcpy(eth->h_source, fib_params.smac, ETH_ALEN);
+		return bpf_redirect_map(&tx_port, out_index, 0);
+	}
+
+	return XDP_PASS;
+}
+
+SEC("xdp_fwd")
+int xdp_fwd_prog(struct xdp_md *ctx)
+{
+	return xdp_fwd_flags(ctx, 0);
+}
+
+SEC("xdp_fwd_direct")
+int xdp_fwd_direct_prog(struct xdp_md *ctx)
+{
+	return xdp_fwd_flags(ctx, BPF_FIB_LOOKUP_DIRECT);
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/xdp_fwd_user.c b/samples/bpf/xdp_fwd_user.c
new file mode 100644
index 000000000000..9c6606f57126
--- /dev/null
+++ b/samples/bpf/xdp_fwd_user.c
@@ -0,0 +1,136 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2017-18 David Ahern <dsahern@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/bpf.h>
+#include <linux/if_link.h>
+#include <linux/limits.h>
+#include <net/if.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <libgen.h>
+
+#include "bpf_load.h"
+#include "bpf_util.h"
+#include "libbpf.h"
+
+
+static int do_attach(int idx, int fd, const char *name)
+{
+	int err;
+
+	err = bpf_set_link_xdp_fd(idx, fd, 0);
+	if (err < 0)
+		printf("ERROR: failed to attach program to %s\n", name);
+
+	return err;
+}
+
+static int do_detach(int idx, const char *name)
+{
+	int err;
+
+	err = bpf_set_link_xdp_fd(idx, -1, 0);
+	if (err < 0)
+		printf("ERROR: failed to detach program from %s\n", name);
+
+	return err;
+}
+
+static void usage(const char *prog)
+{
+	fprintf(stderr,
+		"usage: %s [OPTS] interface-list\n"
+		"\nOPTS:\n"
+		"    -d    detach program\n"
+		"    -D    direct table lookups (skip fib rules)\n",
+		prog);
+}
+
+int main(int argc, char **argv)
+{
+	char filename[PATH_MAX];
+	int opt, i, idx, err;
+	int prog_id = 0;
+	int attach = 1;
+	int ret = 0;
+
+	while ((opt = getopt(argc, argv, ":dD")) != -1) {
+		switch (opt) {
+		case 'd':
+			attach = 0;
+			break;
+		case 'D':
+			prog_id = 1;
+			break;
+		default:
+			usage(basename(argv[0]));
+			return 1;
+		}
+	}
+
+	if (optind == argc) {
+		usage(basename(argv[0]));
+		return 1;
+	}
+
+	if (attach) {
+		snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+		if (access(filename, O_RDONLY) < 0) {
+			printf("error accessing file %s: %s\n",
+				filename, strerror(errno));
+			return 1;
+		}
+
+		if (load_bpf_file(filename)) {
+			printf("%s", bpf_log_buf);
+			return 1;
+		}
+
+		if (!prog_fd[prog_id]) {
+			printf("load_bpf_file: %s\n", strerror(errno));
+			return 1;
+		}
+	}
+	if (attach) {
+		for (i = 1; i < 64; ++i)
+			bpf_map_update_elem(map_fd[0], &i, &i, 0);
+	}
+
+	for (i = optind; i < argc; ++i) {
+		idx = if_nametoindex(argv[i]);
+		if (!idx)
+			idx = strtoul(argv[i], NULL, 0);
+
+		if (!idx) {
+			fprintf(stderr, "Invalid arg\n");
+			return 1;
+		}
+		if (!attach) {
+			err = do_detach(idx, argv[i]);
+			if (err)
+				ret = err;
+		} else {
+			err = do_attach(idx, prog_fd[prog_id], argv[i]);
+			if (err)
+				ret = err;
+		}
+	}
+
+	return ret;
+}
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index 69d7b918e66a..4407766a5950 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -101,6 +101,9 @@ static int (*bpf_xdp_adjust_tail)(void *ctx, int offset) =
 static int (*bpf_skb_get_xfrm_state)(void *ctx, int index, void *state,
 				     int size, int flags) =
 	(void *) BPF_FUNC_skb_get_xfrm_state;
+static int (*bpf_fib_lookup)(void *ctx, struct bpf_fib_lookup *params,
+			     int plen, __u32 flags) =
+	(void *) BPF_FUNC_fib_lookup;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
-- 
2.11.0

^ permalink raw reply related

* [RFC bpf-next 8/9] bpf: Provide helper to do lookups in kernel FIB table
From: David Ahern @ 2018-04-25 18:34 UTC (permalink / raw)
  To: netdev, borkmann, ast
  Cc: shm, roopa, brouer, toke, john.fastabend, David Ahern
In-Reply-To: <20180425183449.25134-1-dsahern@gmail.com>

Provide a helper for doing a FIB and neighbor lookup in the kernel
tables from an XDP program. The helper provides a fastpath for forwarding
packets. If the packet is a local delivery or for any reason is not a
simple lookup and forward, the packet continues up the stack.

If it is to be forwarded, the forwarding can be done directly if the
neighbor is already known. If the neighbor does not exist, the first
few packets go up the stack for neighbor resolution. Once resolved, the
xdp program provides the fast path.

On successful lookup the nexthop dmac, current device smac and egress
device index are returned.

The API supports IPv4, IPv6 and MPLS protocols, but only IPv4 and IPv6
are implemented in this patch. The API includes layer 4 parameters if
the XDP program chooses to do deep packet inspection to allow compare
against ACLs implemented as FIB rules.

Header rewrite is left to the XDP program.

The lookup takes 2 flags:
- BPF_FIB_LOOKUP_DIRECT to do a lookup that bypasses FIB rules and goes
  straight to the table associated with the device (expert setting for
  those looking to maximize throughput)

- BPF_FIB_LOOKUP_OUTPUT to do a lookup from the egress perspective.
  Default is an ingress lookup.

Initial performance numbers collected by Jesper, forwarded packets/sec:

       Full stack    XDP FIB lookup    XDP Direct lookup
IPv4   1,947,969       7,074,156          7,415,333
IPv6   1,728,000       6,165,504          7,262,720


Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/uapi/linux/bpf.h |  68 +++++++++++++-
 net/core/filter.c        | 233 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 300 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index e6679393b687..82601c132b9f 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -10,6 +10,8 @@
 
 #include <linux/types.h>
 #include <linux/bpf_common.h>
+#include <linux/if_ether.h>
+#include <linux/in6.h>
 
 /* Extended instruction set based on top of classic BPF */
 
@@ -783,6 +785,17 @@ union bpf_attr {
  *     @size: size of 'struct bpf_xfrm_state'
  *     @flags: room for future extensions
  *     Return: 0 on success or negative error
+ *
+ * int bpf_fib_lookup(ctx, params, plen, flags)
+ *     Do a FIB lookup based on given parameters
+ *     @ctx:     pointer to context of type xdp_md
+ *     @params:  pointer to bpf_fib_lookup
+ *     @plen:    size of params argument
+ *     @flags:   u32 bitmask of BPF_FIB_LOOKUP_* flags
+ *     Return: egress device index if packet is to be forwarded,
+ *             0 for local delivery (anything that needs to be handled
+ *             by the full stack), or negative on error.
+ *             If index is > 0, output data in bpf_fib_lookup is set
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -851,7 +864,9 @@ union bpf_attr {
 	FN(msg_pull_data),		\
 	FN(bind),			\
 	FN(xdp_adjust_tail),		\
-	FN(skb_get_xfrm_state),
+	FN(skb_get_xfrm_state),		\
+	FN(fib_lookup),			\
+
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -1255,4 +1270,55 @@ struct bpf_raw_tracepoint_args {
 	__u64 args[0];
 };
 
+/* DIRECT:  Skip the FIB rules and go to FIB table associated with device
+ * OUTPUT:  Do lookup from egress perspective; default is ingress
+ */
+#define BPF_FIB_LOOKUP_DIRECT  BIT(0)
+#define BPF_FIB_LOOKUP_OUTPUT  BIT(1)
+
+struct bpf_fib_lookup {
+	/* input */
+	__u8	family;   /* network family, AF_INET, AF_INET6, AF_MPLS */
+
+	/* set if lookup is to consider L4 data - e.g., FIB rules */
+	__u8	l4_protocol;
+	__be16	sport;
+	__be16	dport;
+
+	/* total length of packet from network header - used for MTU check */
+	__u16	tot_len;
+	__u32	ifindex;  /* L3 device index for lookup */
+
+	union {
+		/* inputs to lookup */
+		__u8	tos;		/* AF_INET  */
+		__be32	flowlabel;	/* AF_INET6 */
+
+		/* output: metric of fib result */
+		__u32 rt_metric;
+	};
+
+	union {
+		__be32		mpls_in;
+		__be32		ipv4_src;
+		struct in6_addr	ipv6_src;
+	};
+
+	/* input to bpf_fib_lookup, *dst is destination address.
+	 * output: bpf_fib_lookup sets to gateway address
+	 */
+	union {
+		/* return for MPLS lookups */
+		__be32		mpls_out[4];  /* support up to 4 labels */
+		__be32		ipv4_dst;
+		struct in6_addr	ipv6_dst;
+	};
+
+	/* output */
+	__be16	h_vlan_proto;
+	__be16	h_vlan_TCI;
+	__u8	smac[ETH_ALEN];
+	__u8	dmac[ETH_ALEN];
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/net/core/filter.c b/net/core/filter.c
index 8e45c6c7ab08..37602b2fb94a 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -59,6 +59,10 @@
 #include <net/tcp.h>
 #include <net/xfrm.h>
 #include <linux/bpf_trace.h>
+#include <linux/inetdevice.h>
+#include <net/ip_fib.h>
+#include <net/flow.h>
+#include <net/arp.h>
 
 /**
  *	sk_filter_trim_cap - run a packet through a socket filter
@@ -3787,6 +3791,231 @@ static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = {
 };
 #endif
 
+#if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6)
+static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params,
+				  const struct neighbour *neigh,
+				  const struct net_device *dev)
+{
+	memcpy(params->dmac, neigh->ha, ETH_ALEN);
+	memcpy(params->smac, dev->dev_addr, ETH_ALEN);
+	params->h_vlan_TCI = 0;
+	params->h_vlan_proto = 0;
+
+	return dev->ifindex;
+}
+#endif
+
+#if IS_ENABLED(CONFIG_INET)
+static int bpf_ipv4_fib_lookup(struct xdp_buff *ctx,
+			       struct bpf_fib_lookup *params, u32 flags)
+{
+	struct net *net = dev_net(ctx->rxq->dev);
+	struct in_device *in_dev;
+	struct neighbour *neigh;
+	struct net_device *dev;
+	struct fib_result res;
+	struct fib_nh *nh;
+	struct flowi4 fl4;
+	int err;
+
+	dev = dev_get_by_index_rcu(net, params->ifindex);
+	if (unlikely(!dev))
+		return -ENODEV;
+
+	/* verify forwarding is enabled on this interface */
+	in_dev = __in_dev_get_rcu(dev);
+	if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev)))
+		return 0;
+
+	if (flags & BPF_FIB_LOOKUP_OUTPUT) {
+		fl4.flowi4_iif = 1;
+		fl4.flowi4_oif = params->ifindex;
+	} else {
+		fl4.flowi4_iif = params->ifindex;
+		fl4.flowi4_oif = 0;
+	}
+	fl4.flowi4_tos = params->tos & IPTOS_RT_MASK;
+	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
+	fl4.flowi4_flags = 0;
+
+	fl4.flowi4_proto = params->l4_protocol;
+	fl4.daddr = params->ipv4_dst;
+	fl4.saddr = params->ipv4_src;
+	fl4.fl4_sport = params->sport;
+	fl4.fl4_dport = params->dport;
+
+	if (flags & BPF_FIB_LOOKUP_DIRECT) {
+		u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
+		struct fib_table *tb;
+
+		tb = fib_get_table(net, tbid);
+		if (unlikely(!tb))
+			return 0;
+
+		err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
+	} else {
+		fl4.flowi4_mark = 0;
+		fl4.flowi4_secid = 0;
+		fl4.flowi4_tun_key.tun_id = 0;
+		fl4.flowi4_uid = sock_net_uid(net, NULL);
+
+		err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF);
+	}
+
+	if (err || res.type != RTN_UNICAST)
+		return 0;
+
+	if (res.fi->fib_nhs > 1)
+		fib_select_path(net, &res, &fl4, NULL);
+
+	nh = &res.fi->fib_nh[res.nh_sel];
+
+	/* do not handle lwt encaps right now */
+	if (nh->nh_lwtstate)
+		return 0;
+
+	dev = nh->nh_dev;
+	if (unlikely(!dev))
+		return 0;
+
+	if (nh->nh_gw)
+		params->ipv4_dst = nh->nh_gw;
+
+	params->rt_metric = res.fi->fib_priority;
+
+	/* xdp and cls_bpf programs are run in RCU-bh so
+	 * rcu_read_lock_bh is not needed here
+	 */
+	neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst);
+	if (neigh)
+		return bpf_fib_set_fwd_params(params, neigh, dev);
+
+	return 0;
+}
+#endif
+
+#if IS_ENABLED(CONFIG_IPV6)
+static int bpf_ipv6_fib_lookup(struct xdp_buff *ctx,
+			       struct bpf_fib_lookup *params, u32 flags)
+{
+	struct net *net = dev_net(ctx->rxq->dev);
+	struct neighbour *neigh;
+	struct net_device *dev;
+	struct fib6_info *f6i;
+	struct flowi6 fl6;
+	int strict = 0;
+	int oif;
+
+	/* link local addresses are never forwarded */
+	if (rt6_need_strict(&params->ipv6_dst) ||
+	    rt6_need_strict(&params->ipv6_src))
+		return 0;
+
+	dev = dev_get_by_index_rcu(net, params->ifindex);
+	if (unlikely(!dev))
+		return -ENODEV;
+
+	if (flags & BPF_FIB_LOOKUP_OUTPUT) {
+		fl6.flowi6_iif = 1;
+		oif = fl6.flowi6_oif = params->ifindex;
+	} else {
+		oif = fl6.flowi6_iif = params->ifindex;
+		fl6.flowi6_oif = 0;
+		strict = RT6_LOOKUP_F_HAS_SADDR;
+	}
+	fl6.flowlabel = params->flowlabel;
+	fl6.flowi6_scope = 0;
+	fl6.flowi6_flags = 0;
+	fl6.mp_hash = 0;
+
+	fl6.flowi6_proto = params->l4_protocol;
+	fl6.daddr = params->ipv6_dst;
+	fl6.saddr = params->ipv6_src;
+	fl6.fl6_sport = params->sport;
+	fl6.fl6_dport = params->dport;
+
+	if (flags & BPF_FIB_LOOKUP_DIRECT) {
+		u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
+		struct fib6_table *tb;
+
+		tb = ipv6_stub->fib6_get_table(net, tbid);
+		if (unlikely(!tb))
+			return 0;
+
+		f6i = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, strict);
+	} else {
+		fl6.flowi6_mark = 0;
+		fl6.flowi6_secid = 0;
+		fl6.flowi6_tun_key.tun_id = 0;
+		fl6.flowi6_uid = sock_net_uid(net, NULL);
+
+		f6i = ipv6_stub->fib6_lookup(net, oif, &fl6, strict);
+	}
+
+	if (unlikely(IS_ERR_OR_NULL(f6i) || f6i == net->ipv6.fib6_null_entry))
+		return 0;
+
+	if (unlikely(f6i->fib6_flags & RTF_REJECT ||
+	    f6i->fib6_type != RTN_UNICAST))
+		return 0;
+
+	if (f6i->fib6_nsiblings && fl6.flowi6_oif == 0)
+		f6i = ipv6_stub->fib6_multipath_select(net, f6i, &fl6,
+						       fl6.flowi6_oif, NULL,
+						       strict);
+
+	if (f6i->fib6_nh.nh_lwtstate)
+		return 0;
+
+	if (f6i->fib6_flags & RTF_GATEWAY)
+		params->ipv6_dst = f6i->fib6_nh.nh_gw;
+
+	dev = f6i->fib6_nh.nh_dev;
+	params->rt_metric = f6i->fib6_metric;
+
+	/* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is
+	 * not needed here. Can not use __ipv6_neigh_lookup_noref here
+	 * because we need to get nd_tbl via the stub
+	 */
+	neigh = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128,
+				      ndisc_hashfn, &params->ipv6_dst, dev);
+	if (neigh)
+		return bpf_fib_set_fwd_params(params, neigh, dev);
+
+	return 0;
+}
+#endif
+
+BPF_CALL_4(bpf_fib_lookup, struct xdp_buff *, ctx,
+	   struct bpf_fib_lookup *, params, int, plen, u32, flags)
+{
+	if (plen < sizeof(*params))
+		return -EINVAL;
+
+	switch (params->family) {
+#if IS_ENABLED(CONFIG_INET)
+	case AF_INET:
+		return bpf_ipv4_fib_lookup(ctx, params, flags);
+#endif
+#if IS_ENABLED(CONFIG_IPV6)
+	case AF_INET6:
+		return bpf_ipv6_fib_lookup(ctx, params, flags);
+#endif
+	}
+	return -ENOTSUPP;
+}
+
+static const struct bpf_func_proto bpf_fib_lookup_proto = {
+	.func		= bpf_fib_lookup,
+	.gpl_only	= true,
+	.pkt_access	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type      = ARG_PTR_TO_CTX,
+	.arg2_type      = ARG_PTR_TO_MEM,
+	.arg3_type      = ARG_CONST_SIZE,
+	.arg4_type	= ARG_ANYTHING,
+};
+
 static const struct bpf_func_proto *
 bpf_base_func_proto(enum bpf_func_id func_id)
 {
@@ -3861,6 +4090,8 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_socket_cookie_proto;
 	case BPF_FUNC_get_socket_uid:
 		return &bpf_get_socket_uid_proto;
+	case BPF_FUNC_fib_lookup:
+		return &bpf_fib_lookup_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
@@ -3957,6 +4188,8 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_xdp_redirect_map_proto;
 	case BPF_FUNC_xdp_adjust_tail:
 		return &bpf_xdp_adjust_tail_proto;
+	case BPF_FUNC_fib_lookup:
+		return &bpf_fib_lookup_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
-- 
2.11.0

^ permalink raw reply related

* [RFC bpf-next 7/9] net/ipv6: Add fib lookup stubs for use in bpf helper
From: David Ahern @ 2018-04-25 18:34 UTC (permalink / raw)
  To: netdev, borkmann, ast
  Cc: shm, roopa, brouer, toke, john.fastabend, David Ahern
In-Reply-To: <20180425183449.25134-1-dsahern@gmail.com>

Add stubs to retrieve a handle to an IPv6 FIB table, fib6_get_table,
a stub to do a lookup in a specific table, fib6_table_lookup, and
a stub for a full route lookup.

The stubs are needed for core bpf code to handle the case when the
IPv6 module is not builtin.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/net/addrconf.h   | 14 ++++++++++++++
 net/ipv6/addrconf_core.c | 33 ++++++++++++++++++++++++++++++++-
 net/ipv6/af_inet6.c      |  6 +++++-
 3 files changed, 51 insertions(+), 2 deletions(-)

diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 8312cc25a3af..ff766ab207e0 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -223,6 +223,20 @@ struct ipv6_stub {
 				 const struct in6_addr *addr);
 	int (*ipv6_dst_lookup)(struct net *net, struct sock *sk,
 			       struct dst_entry **dst, struct flowi6 *fl6);
+
+	struct fib6_table *(*fib6_get_table)(struct net *net, u32 id);
+	struct fib6_info *(*fib6_lookup)(struct net *net, int oif,
+					 struct flowi6 *fl6, int flags);
+	struct fib6_info *(*fib6_table_lookup)(struct net *net,
+					      struct fib6_table *table,
+					      int oif, struct flowi6 *fl6,
+					      int flags);
+	struct fib6_info *(*fib6_multipath_select)(const struct net *net,
+						   struct fib6_info *f6i,
+						   struct flowi6 *fl6, int oif,
+						   const struct sk_buff *skb,
+						   int strict);
+
 	void (*udpv6_encap_enable)(void);
 	void (*ndisc_send_na)(struct net_device *dev, const struct in6_addr *daddr,
 			      const struct in6_addr *solicited_addr,
diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c
index 32b564dfd02a..2fe754fd4f5e 100644
--- a/net/ipv6/addrconf_core.c
+++ b/net/ipv6/addrconf_core.c
@@ -134,8 +134,39 @@ static int eafnosupport_ipv6_dst_lookup(struct net *net, struct sock *u1,
 	return -EAFNOSUPPORT;
 }
 
+static struct fib6_table *eafnosupport_fib6_get_table(struct net *net, u32 id)
+{
+	return NULL;
+}
+
+static struct fib6_info *
+eafnosupport_fib6_table_lookup(struct net *net, struct fib6_table *table,
+			       int oif, struct flowi6 *fl6, int flags)
+{
+	return NULL;
+}
+
+static struct fib6_info *
+eafnosupport_fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
+			 int flags)
+{
+	return NULL;
+}
+
+static struct fib6_info *
+eafnosupport_fib6_multipath_select(const struct net *net, struct fib6_info *f6i,
+				   struct flowi6 *fl6, int oif,
+				   const struct sk_buff *skb, int strict)
+{
+	return f6i;
+}
+
 const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) {
-	.ipv6_dst_lookup = eafnosupport_ipv6_dst_lookup,
+	.ipv6_dst_lookup   = eafnosupport_ipv6_dst_lookup,
+	.fib6_get_table    = eafnosupport_fib6_get_table,
+	.fib6_table_lookup = eafnosupport_fib6_table_lookup,
+	.fib6_lookup       = eafnosupport_fib6_lookup,
+	.fib6_multipath_select = eafnosupport_fib6_multipath_select,
 };
 EXPORT_SYMBOL_GPL(ipv6_stub);
 
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 36d622c477b1..c0e8255d50bb 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -887,7 +887,11 @@ static struct pernet_operations inet6_net_ops = {
 static const struct ipv6_stub ipv6_stub_impl = {
 	.ipv6_sock_mc_join = ipv6_sock_mc_join,
 	.ipv6_sock_mc_drop = ipv6_sock_mc_drop,
-	.ipv6_dst_lookup = ip6_dst_lookup,
+	.ipv6_dst_lookup   = ip6_dst_lookup,
+	.fib6_get_table	   = fib6_get_table,
+	.fib6_table_lookup = fib6_table_lookup,
+	.fib6_lookup       = fib6_lookup,
+	.fib6_multipath_select = fib6_multipath_select,
 	.udpv6_encap_enable = udpv6_encap_enable,
 	.ndisc_send_na = ndisc_send_na,
 	.nd_tbl	= &nd_tbl,
-- 
2.11.0

^ permalink raw reply related

* [RFC bpf-next 6/9] net/ipv6: Update fib6 tracepoint to take fib6_info
From: David Ahern @ 2018-04-25 18:34 UTC (permalink / raw)
  To: netdev, borkmann, ast
  Cc: shm, roopa, brouer, toke, john.fastabend, David Ahern
In-Reply-To: <20180425183449.25134-1-dsahern@gmail.com>

Similar to IPv4, IPv6 should use the FIB lookup result in the
tracepoint.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/trace/events/fib6.h | 14 +++++++-------
 net/ipv6/route.c            | 14 ++++++--------
 2 files changed, 13 insertions(+), 15 deletions(-)

diff --git a/include/trace/events/fib6.h b/include/trace/events/fib6.h
index 7e8d48a81b91..1b8d951e3c12 100644
--- a/include/trace/events/fib6.h
+++ b/include/trace/events/fib6.h
@@ -12,10 +12,10 @@
 
 TRACE_EVENT(fib6_table_lookup,
 
-	TP_PROTO(const struct net *net, const struct rt6_info *rt,
+	TP_PROTO(const struct net *net, const struct fib6_info *f6i,
 		 struct fib6_table *table, const struct flowi6 *flp),
 
-	TP_ARGS(net, rt, table, flp),
+	TP_ARGS(net, f6i, table, flp),
 
 	TP_STRUCT__entry(
 		__field(	u32,	tb_id		)
@@ -48,20 +48,20 @@ TRACE_EVENT(fib6_table_lookup,
 		in6 = (struct in6_addr *)__entry->dst;
 		*in6 = flp->daddr;
 
-		if (rt->rt6i_idev) {
-			__assign_str(name, rt->rt6i_idev->dev->name);
+		if (f6i->fib6_nh.nh_dev) {
+			__assign_str(name, f6i->fib6_nh.nh_dev);
 		} else {
 			__assign_str(name, "");
 		}
-		if (rt == net->ipv6.ip6_null_entry) {
+		if (f6i == net->ipv6.fib6_null_entry) {
 			struct in6_addr in6_zero = {};
 
 			in6 = (struct in6_addr *)__entry->gw;
 			*in6 = in6_zero;
 
-		} else if (rt) {
+		} else if (f6i) {
 			in6 = (struct in6_addr *)__entry->gw;
-			*in6 = rt->rt6i_gateway;
+			*in6 = f6i->fib6_nh.nh_gw;
 		}
 	),
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 353645ed3d2c..e4163d19d905 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1078,6 +1078,8 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net,
 			goto restart;
 	}
 
+	trace_fib6_table_lookup(net, f6i, table, fl6);
+
 	/* Search through exception table */
 	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
 	if (rt) {
@@ -1096,8 +1098,6 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net,
 
 	rcu_read_unlock();
 
-	trace_fib6_table_lookup(net, rt, table, fl6);
-
 	return rt;
 }
 
@@ -1826,6 +1826,8 @@ struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
 		}
 	}
 
+	trace_fib6_table_lookup(net, f6i, table, fl6);
+
 	return f6i;
 }
 
@@ -1852,7 +1854,6 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 		rt = net->ipv6.ip6_null_entry;
 		rcu_read_unlock();
 		dst_hold(&rt->dst);
-		trace_fib6_table_lookup(net, rt, table, fl6);
 		return rt;
 	}
 
@@ -1863,7 +1864,6 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 			dst_use_noref(&rt->dst, jiffies);
 
 		rcu_read_unlock();
-		trace_fib6_table_lookup(net, rt, table, fl6);
 		return rt;
 	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
 			    !(f6i->fib6_flags & RTF_GATEWAY))) {
@@ -1889,9 +1889,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 			dst_hold(&uncached_rt->dst);
 		}
 
-		trace_fib6_table_lookup(net, uncached_rt, table, fl6);
 		return uncached_rt;
-
 	} else {
 		/* Get a percpu copy */
 
@@ -1905,7 +1903,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 
 		local_bh_enable();
 		rcu_read_unlock();
-		trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
+
 		return pcpu_rt;
 	}
 }
@@ -2483,7 +2481,7 @@ static struct rt6_info *__ip6_route_redirect(struct net *net,
 
 	rcu_read_unlock();
 
-	trace_fib6_table_lookup(net, ret, table, fl6);
+	trace_fib6_table_lookup(net, rt, table, fl6);
 	return ret;
 };
 
-- 
2.11.0

^ permalink raw reply related

* [RFC bpf-next 5/9] net/ipv6: Add fib6_lookup
From: David Ahern @ 2018-04-25 18:34 UTC (permalink / raw)
  To: netdev, borkmann, ast
  Cc: shm, roopa, brouer, toke, john.fastabend, David Ahern
In-Reply-To: <20180425183449.25134-1-dsahern@gmail.com>

Add IPv6 equivalent to fib_lookup. Does a fib lookup, including rules,
but returns a FIB entry, fib6_info, rather than a dst based rt6_info.
fib6_lookup is on the order of 40% faster than any of the dst based
lookup methods without custom rules and 25% faster with custom rules.

Since the lookup function has a completely different signature,
fib6_rule_action is split into 2 paths: the existing one is
renamed __fib6_rule_action and a new one for the fib6_info path
is added. fib6_rule_action decides which to call based on the
lookup_ptr. If it is fib6_table_lookup then the new path is taken.

Caller must hold rcu lock as no reference is taken on the returned
fib entry.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/net/ip6_fib.h |  6 ++++
 net/ipv6/fib6_rules.c | 83 +++++++++++++++++++++++++++++++++++++++++++++++++--
 net/ipv6/ip6_fib.c    |  7 +++++
 3 files changed, 94 insertions(+), 2 deletions(-)

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 4f7b8f59ea6d..d920dd00139b 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -376,6 +376,12 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
 				   const struct sk_buff *skb,
 				   int flags, pol_lookup_t lookup);
 
+/* called with rcu lock held; can return error pointer
+ * caller needs to select path
+ */
+struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
+			      int flags);
+
 /* called with rcu lock held; caller needs to select path */
 struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
 				    int oif, struct flowi6 *fl6, int strict);
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index d9c0bfd304f0..47da07aabda8 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -60,6 +60,39 @@ unsigned int fib6_rules_seq_read(struct net *net)
 	return fib_rules_seq_read(net, AF_INET6);
 }
 
+/* called with rcu lock held; no reference taken on fib6_info */
+struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
+			      int flags)
+{
+	struct fib6_info *f6i;
+	int err;
+
+	if (net->ipv6.fib6_has_custom_rules) {
+		struct fib_lookup_arg arg = {
+			.lookup_ptr = fib6_table_lookup,
+			.lookup_data = &oif,
+			.flags = FIB_LOOKUP_NOREF,
+		};
+
+		l3mdev_update_flow(net, flowi6_to_flowi(fl6));
+
+		err = fib_rules_lookup(net->ipv6.fib6_rules_ops,
+				       flowi6_to_flowi(fl6), flags, &arg);
+		if (err)
+			return ERR_PTR(err);
+
+		f6i = arg.result ? : net->ipv6.fib6_null_entry;
+	} else {
+		f6i = fib6_table_lookup(net, net->ipv6.fib6_local_tbl,
+					oif, fl6, flags);
+		if (!f6i || f6i == net->ipv6.fib6_null_entry)
+			f6i = fib6_table_lookup(net, net->ipv6.fib6_main_tbl,
+						oif, fl6, flags);
+	}
+
+	return f6i;
+}
+
 struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
 				   const struct sk_buff *skb,
 				   int flags, pol_lookup_t lookup)
@@ -121,8 +154,45 @@ static int fib6_rule_saddr(struct net *net, struct fib_rule *rule, int flags,
 	return 0;
 }
 
-static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
-			    int flags, struct fib_lookup_arg *arg)
+static int fib6_rule_action_alt(struct fib_rule *rule, struct flowi *flp,
+				int flags, struct fib_lookup_arg *arg)
+{
+	struct flowi6 *flp6 = &flp->u.ip6;
+	struct net *net = rule->fr_net;
+	struct fib6_table *table;
+	struct fib6_info *f6i;
+	int err = 0, *oif;
+	u32 tb_id;
+
+	switch (rule->action) {
+	case FR_ACT_TO_TBL:
+		break;
+	case FR_ACT_UNREACHABLE:
+		return -ENETUNREACH;
+	case FR_ACT_PROHIBIT:
+		return -EACCES;
+	case FR_ACT_BLACKHOLE:
+	default:
+		return -EINVAL;
+	}
+
+	tb_id = fib_rule_get_table(rule, arg);
+	table = fib6_get_table(net, tb_id);
+	if (!table)
+		return -EAGAIN;
+
+	oif = (int *)arg->lookup_data;
+	f6i = fib6_table_lookup(net, table, *oif, flp6, flags);
+	if (f6i != net->ipv6.fib6_null_entry)
+		err = fib6_rule_saddr(net, rule, flags, flp6,
+				      fib6_info_nh_dev(f6i));
+
+	arg->result = f6i;
+	return err;
+}
+
+static int __fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
+			      int flags, struct fib_lookup_arg *arg)
 {
 	struct flowi6 *flp6 = &flp->u.ip6;
 	struct rt6_info *rt = NULL;
@@ -182,6 +252,15 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
 	return err;
 }
 
+static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
+			    int flags, struct fib_lookup_arg *arg)
+{
+	if (arg->lookup_ptr == fib6_table_lookup)
+		return fib6_rule_action_alt(rule, flp, flags, arg);
+
+	return __fib6_rule_action(rule, flp, flags, arg);
+}
+
 static bool fib6_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg)
 {
 	struct rt6_info *rt = (struct rt6_info *) arg->result;
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index f6442d6c5603..422ce5ca7b33 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -354,6 +354,13 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
 	return &rt->dst;
 }
 
+/* called with rcu lock held; no reference taken on fib6_info */
+struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
+			      int flags)
+{
+	return fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6, flags);
+}
+
 static void __net_init fib6_tables_init(struct net *net)
 {
 	fib6_link_table(net, net->ipv6.fib6_main_tbl);
-- 
2.11.0

^ permalink raw reply related

* [RFC bpf-next 4/9] net/ipv6: Refactor fib6_rule_action
From: David Ahern @ 2018-04-25 18:34 UTC (permalink / raw)
  To: netdev, borkmann, ast
  Cc: shm, roopa, brouer, toke, john.fastabend, David Ahern
In-Reply-To: <20180425183449.25134-1-dsahern@gmail.com>

Move source address lookup from fib6_rule_action to a helper. It will be
used in a later patch by a second variant for fib6_rule_action.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 net/ipv6/fib6_rules.c | 52 ++++++++++++++++++++++++++++++---------------------
 1 file changed, 31 insertions(+), 21 deletions(-)

diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index df113c7b5fc8..d9c0bfd304f0 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -96,6 +96,31 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
 	return &net->ipv6.ip6_null_entry->dst;
 }
 
+static int fib6_rule_saddr(struct net *net, struct fib_rule *rule, int flags,
+			   struct flowi6 *flp6, const struct net_device *dev)
+{
+	struct fib6_rule *r = (struct fib6_rule *)rule;
+
+	/* If we need to find a source address for this traffic,
+	 * we check the result if it meets requirement of the rule.
+	 */
+	if ((rule->flags & FIB_RULE_FIND_SADDR) &&
+	    r->src.plen && !(flags & RT6_LOOKUP_F_HAS_SADDR)) {
+		struct in6_addr saddr;
+
+		if (ipv6_dev_get_saddr(net, dev, &flp6->daddr,
+				       rt6_flags2srcprefs(flags), &saddr))
+			return -EAGAIN;
+
+		if (!ipv6_prefix_equal(&saddr, &r->src.addr, r->src.plen))
+			return -EAGAIN;
+
+		flp6->saddr = saddr;
+	}
+
+	return 0;
+}
+
 static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
 			    int flags, struct fib_lookup_arg *arg)
 {
@@ -134,27 +159,12 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
 
 	rt = lookup(net, table, flp6, arg->lookup_data, flags);
 	if (rt != net->ipv6.ip6_null_entry) {
-		struct fib6_rule *r = (struct fib6_rule *)rule;
-
-		/*
-		 * If we need to find a source address for this traffic,
-		 * we check the result if it meets requirement of the rule.
-		 */
-		if ((rule->flags & FIB_RULE_FIND_SADDR) &&
-		    r->src.plen && !(flags & RT6_LOOKUP_F_HAS_SADDR)) {
-			struct in6_addr saddr;
-
-			if (ipv6_dev_get_saddr(net,
-					       ip6_dst_idev(&rt->dst)->dev,
-					       &flp6->daddr,
-					       rt6_flags2srcprefs(flags),
-					       &saddr))
-				goto again;
-			if (!ipv6_prefix_equal(&saddr, &r->src.addr,
-					       r->src.plen))
-				goto again;
-			flp6->saddr = saddr;
-		}
+		err = fib6_rule_saddr(net, rule, flags, flp6,
+				      ip6_dst_idev(&rt->dst)->dev);
+
+		if (err == -EAGAIN)
+			goto again;
+
 		err = rt->dst.error;
 		if (err != -EAGAIN)
 			goto out;
-- 
2.11.0

^ permalink raw reply related

* [RFC bpf-next 3/9] net/ipv6: Extract table lookup from ip6_pol_route
From: David Ahern @ 2018-04-25 18:34 UTC (permalink / raw)
  To: netdev, borkmann, ast
  Cc: shm, roopa, brouer, toke, john.fastabend, David Ahern
In-Reply-To: <20180425183449.25134-1-dsahern@gmail.com>

ip6_pol_route is used for ingress and egress FIB lookups. Refactor it
moving the table lookup into a separate fib6_table_lookup that can be
invoked separately and export the new function.

ip6_pol_route now calls fib6_table_lookup and uses the result to generate
a dst based rt6_info.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/net/ip6_fib.h |  4 ++++
 net/ipv6/route.c      | 39 +++++++++++++++++++++++++--------------
 2 files changed, 29 insertions(+), 14 deletions(-)

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 80d76d8dc683..4f7b8f59ea6d 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -376,6 +376,10 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
 				   const struct sk_buff *skb,
 				   int flags, pol_lookup_t lookup);
 
+/* called with rcu lock held; caller needs to select path */
+struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
+				    int oif, struct flowi6 *fl6, int strict);
+
 struct fib6_info *fib6_multipath_select(const struct net *net,
 					struct fib6_info *match,
 					struct flowi6 *fl6, int oif,
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 72c784add7a1..353645ed3d2c 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1799,21 +1799,12 @@ void rt6_age_exceptions(struct fib6_info *rt,
 	rcu_read_unlock_bh();
 }
 
-struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
-			       int oif, struct flowi6 *fl6,
-			       const struct sk_buff *skb, int flags)
+/* must be called with rcu lock held */
+struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
+				    int oif, struct flowi6 *fl6, int strict)
 {
 	struct fib6_node *fn, *saved_fn;
 	struct fib6_info *f6i;
-	struct rt6_info *rt;
-	int strict = 0;
-
-	strict |= flags & RT6_LOOKUP_F_IFACE;
-	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
-	if (net->ipv6.devconf_all->forwarding == 0)
-		strict |= RT6_LOOKUP_F_REACHABLE;
-
-	rcu_read_lock();
 
 	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
 	saved_fn = fn;
@@ -1823,8 +1814,6 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 
 redo_rt6_select:
 	f6i = rt6_select(net, fn, oif, strict);
-	if (f6i->fib6_nsiblings)
-		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);
 	if (f6i == net->ipv6.fib6_null_entry) {
 		fn = fib6_backtrack(fn, &fl6->saddr);
 		if (fn)
@@ -1837,6 +1826,28 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 		}
 	}
 
+	return f6i;
+}
+
+struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
+			       int oif, struct flowi6 *fl6,
+			       const struct sk_buff *skb, int flags)
+{
+	struct fib6_info *f6i;
+	struct rt6_info *rt;
+	int strict = 0;
+
+	strict |= flags & RT6_LOOKUP_F_IFACE;
+	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
+	if (net->ipv6.devconf_all->forwarding == 0)
+		strict |= RT6_LOOKUP_F_REACHABLE;
+
+	rcu_read_lock();
+
+	f6i = fib6_table_lookup(net, table, oif, fl6, strict);
+	if (f6i->fib6_nsiblings)
+		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);
+
 	if (f6i == net->ipv6.fib6_null_entry) {
 		rt = net->ipv6.ip6_null_entry;
 		rcu_read_unlock();
-- 
2.11.0

^ permalink raw reply related

* [RFC bpf-next 2/9] net/ipv6: Rename rt6_multipath_select
From: David Ahern @ 2018-04-25 18:34 UTC (permalink / raw)
  To: netdev, borkmann, ast
  Cc: shm, roopa, brouer, toke, john.fastabend, David Ahern
In-Reply-To: <20180425183449.25134-1-dsahern@gmail.com>

Rename rt6_multipath_select to fib6_multipath_select and export it.
A later patch wants access to it similar to IPv4's fib_select_path.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/net/ip6_fib.h |  5 +++++
 net/ipv6/route.c      | 17 +++++++++--------
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 5a16630179cb..80d76d8dc683 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -376,6 +376,11 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
 				   const struct sk_buff *skb,
 				   int flags, pol_lookup_t lookup);
 
+struct fib6_info *fib6_multipath_select(const struct net *net,
+					struct fib6_info *match,
+					struct flowi6 *fl6, int oif,
+					const struct sk_buff *skb, int strict);
+
 struct fib6_node *fib6_node_lookup(struct fib6_node *root,
 				   const struct in6_addr *daddr,
 				   const struct in6_addr *saddr);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index e4082c612fcc..72c784add7a1 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -419,11 +419,11 @@ static bool rt6_check_expired(const struct rt6_info *rt)
 	return false;
 }
 
-static struct fib6_info *rt6_multipath_select(const struct net *net,
-					      struct fib6_info *match,
-					     struct flowi6 *fl6, int oif,
-					     const struct sk_buff *skb,
-					     int strict)
+struct fib6_info *fib6_multipath_select(const struct net *net,
+					struct fib6_info *match,
+					struct flowi6 *fl6, int oif,
+					const struct sk_buff *skb,
+					int strict)
 {
 	struct fib6_info *sibling, *next_sibling;
 
@@ -1068,8 +1068,9 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net,
 		f6i = rt6_device_match(net, f6i, &fl6->saddr,
 				      fl6->flowi6_oif, flags);
 		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
-			f6i = rt6_multipath_select(net, f6i, fl6,
-						   fl6->flowi6_oif, skb, flags);
+			f6i = fib6_multipath_select(net, f6i, fl6,
+						    fl6->flowi6_oif, skb,
+						    flags);
 	}
 	if (f6i == net->ipv6.fib6_null_entry) {
 		fn = fib6_backtrack(fn, &fl6->saddr);
@@ -1823,7 +1824,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 redo_rt6_select:
 	f6i = rt6_select(net, fn, oif, strict);
 	if (f6i->fib6_nsiblings)
-		f6i = rt6_multipath_select(net, f6i, fl6, oif, skb, strict);
+		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);
 	if (f6i == net->ipv6.fib6_null_entry) {
 		fn = fib6_backtrack(fn, &fl6->saddr);
 		if (fn)
-- 
2.11.0

^ permalink raw reply related

* [RFC bpf-next 1/9] net/ipv6: Rename fib6_lookup to fib6_node_lookup
From: David Ahern @ 2018-04-25 18:34 UTC (permalink / raw)
  To: netdev, borkmann, ast
  Cc: shm, roopa, brouer, toke, john.fastabend, David Ahern
In-Reply-To: <20180425183449.25134-1-dsahern@gmail.com>

Rename fib6_lookup to fib6_node_lookup to better reflect what it
returns. The fib6_lookup name will be used in a later patch for
an IPv6 equivalent to IPv4's fib_lookup.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/net/ip6_fib.h | 6 +++---
 net/ipv6/ip6_fib.c    | 5 +++--
 net/ipv6/route.c      | 8 ++++----
 3 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 1af450d4e923..5a16630179cb 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -376,9 +376,9 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
 				   const struct sk_buff *skb,
 				   int flags, pol_lookup_t lookup);
 
-struct fib6_node *fib6_lookup(struct fib6_node *root,
-			      const struct in6_addr *daddr,
-			      const struct in6_addr *saddr);
+struct fib6_node *fib6_node_lookup(struct fib6_node *root,
+				   const struct in6_addr *daddr,
+				   const struct in6_addr *saddr);
 
 struct fib6_node *fib6_locate(struct fib6_node *root,
 			      const struct in6_addr *daddr, int dst_len,
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 6421c893466e..f6442d6c5603 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -1422,8 +1422,9 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root,
 
 /* called with rcu_read_lock() held
  */
-struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *daddr,
-			      const struct in6_addr *saddr)
+struct fib6_node *fib6_node_lookup(struct fib6_node *root,
+				   const struct in6_addr *daddr,
+				   const struct in6_addr *saddr)
 {
 	struct fib6_node *fn;
 	struct lookup_args args[] = {
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 0407bbc5a028..e4082c612fcc 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1006,7 +1006,7 @@ static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
 		pn = rcu_dereference(fn->parent);
 		sn = FIB6_SUBTREE(pn);
 		if (sn && sn != fn)
-			fn = fib6_lookup(sn, NULL, saddr);
+			fn = fib6_node_lookup(sn, NULL, saddr);
 		else
 			fn = pn;
 		if (fn->fn_flags & RTN_RTINFO)
@@ -1059,7 +1059,7 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net,
 		flags &= ~RT6_LOOKUP_F_IFACE;
 
 	rcu_read_lock();
-	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
+	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
 restart:
 	f6i = rcu_dereference(fn->leaf);
 	if (!f6i) {
@@ -1814,7 +1814,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 
 	rcu_read_lock();
 
-	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
+	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
 	saved_fn = fn;
 
 	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
@@ -2417,7 +2417,7 @@ static struct rt6_info *__ip6_route_redirect(struct net *net,
 	 */
 
 	rcu_read_lock();
-	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
+	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
 restart:
 	for_each_fib6_node_rt_rcu(fn) {
 		if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
-- 
2.11.0

^ permalink raw reply related

* [RFC bpf-next 0/9] bpf: Add helper to do FIB lookups
From: David Ahern @ 2018-04-25 18:34 UTC (permalink / raw)
  To: netdev, borkmann, ast
  Cc: shm, roopa, brouer, toke, john.fastabend, David Ahern

Provide a helper for doing a FIB and neighbor lookups in the kernel
tables from an XDP program. The helper provides a fastpath for forwarding
packets. If the packet is a local delivery or for any reason is not a
simple lookup and forward, the packet is expected to continue up the stack
for full processing.

Patches 1-6 do some more refactoring to IPv6 with the end goal of
extracting a FIB lookup function that aligns with fib_lookup for IPv4,
basically returning a fib6_info without creating a dst based entry.

Patch 7 adds lookup functions to the ipv6 stub. These are needed since
bpf is built into the kernel and ipv6 may not be built or loaded.

Patch 8 adds the bpf helper and 9 is a sample program.

David Ahern (9):
  net/ipv6: Rename fib6_lookup to fib6_node_lookup
  net/ipv6: Rename rt6_multipath_select
  net/ipv6: Extract table lookup from ip6_pol_route
  net/ipv6: Refactor fib6_rule_action
  net/ipv6: Add fib6_lookup
  net/ipv6: Update fib6 tracepoint to take fib6_info
  net/ipv6: Add fib lookup stubs for use in bpf helper
  bpf: Provide helper to do lookups in kernel FIB table
  samples/bpf: Add examples of ipv4 and ipv6 forwarding in XDP

 include/net/addrconf.h                    |  14 ++
 include/net/ip6_fib.h                     |  21 ++-
 include/trace/events/fib6.h               |  14 +-
 include/uapi/linux/bpf.h                  |  68 ++++++++-
 net/core/filter.c                         | 233 ++++++++++++++++++++++++++++++
 net/ipv6/addrconf_core.c                  |  33 ++++-
 net/ipv6/af_inet6.c                       |   6 +-
 net/ipv6/fib6_rules.c                     | 135 ++++++++++++++---
 net/ipv6/ip6_fib.c                        |  12 +-
 net/ipv6/route.c                          |  76 +++++-----
 samples/bpf/Makefile                      |   4 +
 samples/bpf/xdp_fwd_kern.c                | 110 ++++++++++++++
 samples/bpf/xdp_fwd_user.c                | 136 +++++++++++++++++
 tools/testing/selftests/bpf/bpf_helpers.h |   3 +
 14 files changed, 794 insertions(+), 71 deletions(-)
 create mode 100644 samples/bpf/xdp_fwd_kern.c
 create mode 100644 samples/bpf/xdp_fwd_user.c

-- 
2.11.0

^ permalink raw reply

* Re: [PATCH] rds: ib: Fix missing call to rds_ib_dev_put in rds_ib_setup_qp
From: David Miller @ 2018-04-25 18:34 UTC (permalink / raw)
  To: dag.moxnes; +Cc: santosh.shilimkar, netdev, linux-rdma, rds-devel, linux-kernel
In-Reply-To: <1524655321-8379-1-git-send-email-dag.moxnes@oracle.com>

From: Dag Moxnes <dag.moxnes@oracle.com>
Date: Wed, 25 Apr 2018 13:22:01 +0200

> The function rds_ib_setup_qp is calling rds_ib_get_client_data and
> should correspondingly call rds_ib_dev_put. This call was lost in
> the non-error path with the introduction of error handling done in
> commit 3b12f73a5c29 ("rds: ib: add error handle")
> 
> Signed-off-by: Dag Moxnes <dag.moxnes@oracle.com>

Applied, thanks.

^ permalink raw reply

* Re: [Cake] [PATCH net-next v3] Add Common Applications Kept Enhanced (cake) qdisc
From: Toke Høiland-Jørgensen @ 2018-04-25 18:34 UTC (permalink / raw)
  To: Eric Dumazet, Jonathan Morton; +Cc: netdev, cake
In-Reply-To: <ca555f33-4d79-2515-e3ec-9c28d2418fba@gmail.com>

Eric Dumazet <eric.dumazet@gmail.com> writes:

> On 04/25/2018 09:52 AM, Jonathan Morton wrote:
>>> We can see here the high cost of forcing software GSO :/
>>>
>>> Really, this should be done only :
>>> 1) If requested by the admin ( tc .... gso ....)
>>>
>>> 2) If packet size is above a threshold.
>>>  The threshold could be set by the admin, and/or based on a fraction of the bandwidth parameter.
>>>
>>> I totally understand why you prefer to segment yourself for < 100 Mbit links.
>>>
>>> But this makes no sense on 10Gbit+
>> 
>> It is absolutely necessary, so far as I can see, to segment GSO
>> superpackets when overhead compensation is selected - as it very
>> often should be, even on pure Ethernet links. Without that, the
>> calculation of link occupancy time will be wrong. (The actual
>> transmission time of an Ethernet frame is rather more than just 14
>> bytes longer than the underlying IP packet.)
>
> Just fix the overhead compensation computation in the code.
>
> skb in a qdisc have everything you need.
>
> qdisc_pkt_len_init() has initialized qdisc_skb_cb(skb)->pkt_len with
> the exact bytes on the wire, and you have gso_segs to perform any
> adjustement you need to do.

The problem is that may not be the right values. For example, in many
CPEs there's a built-in switch that strips VLAN tags before the packet
actually hits the wire. So we do need to be able to get the actual
packet size. Is it possible to get the sizes of the individual segments
of a GSO packet? That way we could do the calculation for the whole
super-packet...

> Do not kill GSO only because you do not want to deal with it.

It's not (just) that we don't want to deal with it, it is also a very
aggressive optimisation for latency, which makes a lot of sense at
residential internet bandwidths.

>> Another reason to apply GSO segmentation is to achieve maximal
>> smoothness of flow isolation. This should still be achievable within
>> some tolerance at high link rates, but calculating this tolerance is
>> complicated by the triple-isolate algorithm.
>> 
>> If there's a way to obtain the individual packet sizes without
>> incurring the full segmentation overhead, it may be worth considering
>> (at high link rates only). I would want to leave it on by default,
>> because some of Cake's demonstrably superior latency performance
>> depends on seeing the real packets, not the aggregates, and the
>> overhead only becomes significant above 100Mbps on weak MIPS CPUs and
>> 1Gbps on vaguely modern x86 stuff.
>
> Again, these arguments are moot on 10Gbit+.
>
> Lets build the future, do not pretend GSO/TSO is not part of it.

I'm all for being future proof, but we also do want well-tested code.
Reworking this feature to function correctly with GSO is going to take
some work. Would you accept gating GSO splitting on bandwidth for now,
and adding correct overhead compensation for GSO packets later?

something like:

if (shaper_active() && (bandwidth <= 1Gbps || overhead_configured))
  split_gso()

-Toke

^ permalink raw reply

* [PATCH net] tcp: ignore Fast Open on repair mode
From: Yuchung Cheng @ 2018-04-25 18:33 UTC (permalink / raw)
  To: davem; +Cc: netdev, edumazet, ncardwell, Yuchung Cheng

The TCP repair sequence of operation is to first set the socket in
repair mode, then inject the TCP stats into the socket with repair
socket options, then call connect() to re-activate the socket. The
connect syscall simply returns and set state to ESTABLISHED
mode. As a result Fast Open is meaningless for TCP repair.

However allowing sendto() system call with MSG_FASTOPEN flag half-way
during the repair operation could unexpectedly cause data to be
sent, before the operation finishes changing the internal TCP stats
(e.g. MSS).  This in turn triggers TCP warnings on inconsistent
packet accounting.

The fix is to simply disallow Fast Open operation once the socket
is in the repair mode.

Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Reviewed-by: Neal Cardwell <ncardwell@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
---
 net/ipv4/tcp.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 9ce1c726185e..4b18ad41d4df 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1204,7 +1204,8 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 			uarg->zerocopy = 0;
 	}
 
-	if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect)) {
+	if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect) &&
+	    !tp->repair) {
 		err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size);
 		if (err == -EINPROGRESS && copied_syn > 0)
 			goto out;
-- 
2.17.0.441.gb46fe60e1d-goog

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox