* Re: [PATCH 2/3] selftests/bpf: test_xdp_noinline.c: fix 'noinline' macro expansion
From: Daniel Borkmann @ 2018-04-27 9:58 UTC (permalink / raw)
To: Sirio Balmelli, ast; +Cc: netdev
In-Reply-To: <20180426083125.GA13968@vm4>
On 04/26/2018 10:31 AM, Sirio Balmelli wrote:
> Compiling with clang 7.0.0 yields:
> test_xdp_noinline.c:470:24: warning: unknown attribute '__attribute__' ignored [-Wunknown-attributes]
> ../../../include/linux/compiler-gcc.h:24:19: note: expanded from macro 'noinline'
> ^
> test_xdp_noinline.c:494:24: error: use of undeclared identifier 'noinline'; did you mean 'inline'?
> static __attribute__ ((noinline))
>
> This appears to be the 'noinline' attribute being itself macro-expanded,
> so the compiler sees '__attribute__ ((__attribute__((noinline))))'.
>
> Fix using an #ifndef.
> Homogenize function declarations.
>
> Signed-off-by: Sirio Balmelli <sirio@b-ad.ch>
I think this error is a result of your previous patch, which suddenly pulls
in kernel headers. Otherwise include/linux/compiler-gcc.h should
never have been included. That's why you see the wrong expansion of ...
__attribute__ ((noinline))
... into ...
__attribute__ ((__attribute__ ((noinline))))
... since noinline is additionally defined in include/linux/compiler-gcc.h.
> ---
> tools/testing/selftests/bpf/test_xdp_noinline.c | 79 +++++++++++++------------
> 1 file changed, 42 insertions(+), 37 deletions(-)
>
> diff --git a/tools/testing/selftests/bpf/test_xdp_noinline.c b/tools/testing/selftests/bpf/test_xdp_noinline.c
> index 5e4aac7..5b5f3f2 100644
> --- a/tools/testing/selftests/bpf/test_xdp_noinline.c
> +++ b/tools/testing/selftests/bpf/test_xdp_noinline.c
> @@ -15,6 +15,11 @@
> #include <linux/udp.h>
> #include "bpf_helpers.h"
>
> +/* some compiler-specific header might define this */
> +#ifndef noinline
> +#define noinline (__attribute__ ((noinline)))
> +#endif
> +
> #define bpf_printk(fmt, ...) \
> ({ \
> char ____fmt[] = fmt; \
> @@ -55,7 +60,7 @@ static __u32 rol32(__u32 word, unsigned int shift)
>
> typedef unsigned int u32;
>
> -static __attribute__ ((noinline))
> +static noinline
> u32 jhash(const void *key, u32 length, u32 initval)
> {
> u32 a, b, c;
> @@ -92,7 +97,7 @@ u32 jhash(const void *key, u32 length, u32 initval)
> return c;
> }
>
> -static __attribute__ ((noinline))
> +static noinline
> u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval)
> {
> a += initval;
> @@ -102,7 +107,7 @@ u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval)
> return c;
> }
>
> -static __attribute__ ((noinline))
> +static noinline
> u32 jhash_2words(u32 a, u32 b, u32 initval)
> {
> return __jhash_nwords(a, b, 0, initval + JHASH_INITVAL + (2 << 2));
> @@ -239,7 +244,7 @@ static inline __u64 calc_offset(bool is_ipv6, bool is_icmp)
> return off;
> }
>
> -static __attribute__ ((noinline))
> +static noinline
> bool parse_udp(void *data, void *data_end,
> bool is_ipv6, struct packet_description *pckt)
> {
> @@ -261,7 +266,7 @@ bool parse_udp(void *data, void *data_end,
> return 1;
> }
>
> -static __attribute__ ((noinline))
> +static noinline
> bool parse_tcp(void *data, void *data_end,
> bool is_ipv6, struct packet_description *pckt)
> {
> @@ -285,7 +290,7 @@ bool parse_tcp(void *data, void *data_end,
> return 1;
> }
>
> -static __attribute__ ((noinline))
> +static noinline
> bool encap_v6(struct xdp_md *xdp, struct ctl_value *cval,
> struct packet_description *pckt,
> struct real_definition *dst, __u32 pkt_bytes)
> @@ -328,7 +333,7 @@ bool encap_v6(struct xdp_md *xdp, struct ctl_value *cval,
> return 1;
> }
>
> -static __attribute__ ((noinline))
> +static noinline
> bool encap_v4(struct xdp_md *xdp, struct ctl_value *cval,
> struct packet_description *pckt,
> struct real_definition *dst, __u32 pkt_bytes)
> @@ -382,7 +387,7 @@ bool encap_v4(struct xdp_md *xdp, struct ctl_value *cval,
> return 1;
> }
>
> -static __attribute__ ((noinline))
> +static noinline
> bool decap_v6(struct xdp_md *xdp, void **data, void **data_end, bool inner_v4)
> {
> struct eth_hdr *new_eth;
> @@ -403,7 +408,7 @@ bool decap_v6(struct xdp_md *xdp, void **data, void **data_end, bool inner_v4)
> return 1;
> }
>
> -static __attribute__ ((noinline))
> +static noinline
> bool decap_v4(struct xdp_md *xdp, void **data, void **data_end)
> {
> struct eth_hdr *new_eth;
> @@ -421,7 +426,7 @@ bool decap_v4(struct xdp_md *xdp, void **data, void **data_end)
> return 1;
> }
>
> -static __attribute__ ((noinline))
> +static noinline
> int swap_mac_and_send(void *data, void *data_end)
> {
> unsigned char tmp_mac[6];
> @@ -434,7 +439,7 @@ int swap_mac_and_send(void *data, void *data_end)
> return XDP_TX;
> }
>
> -static __attribute__ ((noinline))
> +static noinline
> int send_icmp_reply(void *data, void *data_end)
> {
> struct icmphdr *icmp_hdr;
> @@ -467,7 +472,7 @@ int send_icmp_reply(void *data, void *data_end)
> return swap_mac_and_send(data, data_end);
> }
>
> -static __attribute__ ((noinline))
> +static noinline
> int send_icmp6_reply(void *data, void *data_end)
> {
> struct icmp6hdr *icmp_hdr;
> @@ -491,7 +496,7 @@ int send_icmp6_reply(void *data, void *data_end)
> return swap_mac_and_send(data, data_end);
> }
>
> -static __attribute__ ((noinline))
> +static noinline
> int parse_icmpv6(void *data, void *data_end, __u64 off,
> struct packet_description *pckt)
> {
> @@ -516,7 +521,7 @@ int parse_icmpv6(void *data, void *data_end, __u64 off,
> return -1;
> }
>
> -static __attribute__ ((noinline))
> +static noinline
> int parse_icmp(void *data, void *data_end, __u64 off,
> struct packet_description *pckt)
> {
> @@ -543,7 +548,7 @@ int parse_icmp(void *data, void *data_end, __u64 off,
> return -1;
> }
>
> -static __attribute__ ((noinline))
> +static noinline
> __u32 get_packet_hash(struct packet_description *pckt,
> bool hash_16bytes)
> {
> @@ -555,11 +560,11 @@ __u32 get_packet_hash(struct packet_description *pckt,
> 24);
> }
>
> -__attribute__ ((noinline))
> -static bool get_packet_dst(struct real_definition **real,
> - struct packet_description *pckt,
> - struct vip_meta *vip_info,
> - bool is_ipv6, void *lru_map)
> +static noinline
> +bool get_packet_dst(struct real_definition **real,
> + struct packet_description *pckt,
> + struct vip_meta *vip_info,
> + bool is_ipv6, void *lru_map)
> {
> struct real_pos_lru new_dst_lru = { };
> bool hash_16bytes = is_ipv6;
> @@ -608,10 +613,10 @@ static bool get_packet_dst(struct real_definition **real,
> return 1;
> }
>
> -__attribute__ ((noinline))
> -static void connection_table_lookup(struct real_definition **real,
> - struct packet_description *pckt,
> - void *lru_map)
> +static noinline
> +void connection_table_lookup(struct real_definition **real,
> + struct packet_description *pckt,
> + void *lru_map)
> {
>
> struct real_pos_lru *dst_lru;
> @@ -635,11 +640,11 @@ static void connection_table_lookup(struct real_definition **real,
> * below function has 6 arguments whereas bpf and llvm allow maximum of 5
> * but since it's _static_ llvm can optimize one argument away
> */
> -__attribute__ ((noinline))
> -static int process_l3_headers_v6(struct packet_description *pckt,
> - __u8 *protocol, __u64 off,
> - __u16 *pkt_bytes, void *data,
> - void *data_end)
> +static noinline
> +int process_l3_headers_v6(struct packet_description *pckt,
> + __u8 *protocol, __u64 off,
> + __u16 *pkt_bytes, void *data,
> + void *data_end)
> {
> struct ipv6hdr *ip6h;
> __u64 iph_len;
> @@ -666,11 +671,11 @@ static int process_l3_headers_v6(struct packet_description *pckt,
> return -1;
> }
>
> -__attribute__ ((noinline))
> -static int process_l3_headers_v4(struct packet_description *pckt,
> - __u8 *protocol, __u64 off,
> - __u16 *pkt_bytes, void *data,
> - void *data_end)
> +static noinline
> +int process_l3_headers_v4(struct packet_description *pckt,
> + __u8 *protocol, __u64 off,
> + __u16 *pkt_bytes, void *data,
> + void *data_end)
> {
> struct iphdr *iph;
> __u64 iph_len;
> @@ -698,9 +703,9 @@ static int process_l3_headers_v4(struct packet_description *pckt,
> return -1;
> }
>
> -__attribute__ ((noinline))
> -static int process_packet(void *data, __u64 off, void *data_end,
> - bool is_ipv6, struct xdp_md *xdp)
> +static inline
s/inline/noinline/
> +int process_packet(void *data, __u64 off, void *data_end,
> + bool is_ipv6, struct xdp_md *xdp)
> {
>
> struct real_definition *dst = NULL;
>
^ permalink raw reply
* Re: [PATCH bpf-next] bpf, doc: Update bpf_jit_enable limitation for CONFIG_BPF_JIT_ALWAYS_ON
From: Daniel Borkmann @ 2018-04-27 9:59 UTC (permalink / raw)
To: Leo Yan
Cc: Alexei Starovoitov, David S. Miller, Jonathan Corbet, netdev,
linux-kernel, linux-doc
In-Reply-To: <20180427094910.GA31015@leoy-ThinkPad-X240s>
On 04/27/2018 11:49 AM, Leo Yan wrote:
> On Fri, Apr 27, 2018 at 11:44:44AM +0200, Daniel Borkmann wrote:
>> On 04/26/2018 04:26 AM, Leo Yan wrote:
>>> When CONFIG_BPF_JIT_ALWAYS_ON is enabled, kernel has limitation for
>>> bpf_jit_enable, so it has fixed value 1 and we cannot set it to 2
>>> for JIT opcode dumping; this patch is to update the doc for it.
>>>
>>> Signed-off-by: Leo Yan <leo.yan@linaro.org>
>>> ---
>>> Documentation/networking/filter.txt | 6 ++++++
>>> 1 file changed, 6 insertions(+)
>>>
>>> diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt
>>> index fd55c7d..feddab9 100644
>>> --- a/Documentation/networking/filter.txt
>>> +++ b/Documentation/networking/filter.txt
>>> @@ -483,6 +483,12 @@ Example output from dmesg:
>>> [ 3389.935851] JIT code: 00000030: 00 e8 28 94 ff e0 83 f8 01 75 07 b8 ff ff 00 00
>>> [ 3389.935852] JIT code: 00000040: eb 02 31 c0 c9 c3
>>>
>>> +When CONFIG_BPF_JIT_ALWAYS_ON is enabled, bpf_jit_enable is set to 1 by default
>>> +and it returns failure if change to any other value from proc node; this is
>>> +for security consideration to avoid leaking info to unprivileged users. In this
>>> +case, we can't directly dump JIT opcode image from kernel log, alternatively we
>>> +need to use bpf tool for the dumping.
>>> +
>>
>> Could you change this doc text a bit, I think it's slightly misleading. From the first
>> sentence one could also interpret that value 0 would leak info to unprivileged users
>> whereas here we're only talking about the case of value 2. Maybe something roughly like
>> this to make it more clear:
>>
>> When CONFIG_BPF_JIT_ALWAYS_ON is enabled, bpf_jit_enable is permanently set to 1 and
>> setting any other value than that will return in failure. This is even the case for
>> setting bpf_jit_enable to 2, since dumping the final JIT image into the kernel log
>> is discouraged and introspection through bpftool (under tools/bpf/bpftool/) is the
>> generally recommended approach instead.
>
> Yeah, your rephrasing is clearer and better. Will do this and send
> a new patch soon. Thanks for your help.
Awesome, thank you!
^ permalink raw reply
* [PATCH bpf-next v2] bpf, doc: Update bpf_jit_enable limitation for CONFIG_BPF_JIT_ALWAYS_ON
From: Leo Yan @ 2018-04-27 10:02 UTC (permalink / raw)
To: Alexei Starovoitov, Daniel Borkmann, David S. Miller,
Jonathan Corbet, netdev, linux-kernel, linux-doc
Cc: Leo Yan
When CONFIG_BPF_JIT_ALWAYS_ON is enabled, kernel has limitation for
bpf_jit_enable, so it has fixed value 1 and we cannot set it to 2
for JIT opcode dumping; this patch is to update the doc for it.
Suggested-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Leo Yan <leo.yan@linaro.org>
---
Documentation/networking/filter.txt | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt
index fd55c7d..5032e12 100644
--- a/Documentation/networking/filter.txt
+++ b/Documentation/networking/filter.txt
@@ -483,6 +483,12 @@ Example output from dmesg:
[ 3389.935851] JIT code: 00000030: 00 e8 28 94 ff e0 83 f8 01 75 07 b8 ff ff 00 00
[ 3389.935852] JIT code: 00000040: eb 02 31 c0 c9 c3
+When CONFIG_BPF_JIT_ALWAYS_ON is enabled, bpf_jit_enable is permanently set to 1 and
+setting any other value than that will return in failure. This is even the case for
+setting bpf_jit_enable to 2, since dumping the final JIT image into the kernel log
+is discouraged and introspection through bpftool (under tools/bpf/bpftool/) is the
+generally recommended approach instead.
+
In the kernel source tree under tools/bpf/, there's bpf_jit_disasm for
generating disassembly out of the kernel log's hexdump:
--
1.9.1
^ permalink raw reply related
* Re: [PATCH net-next 00/13] sctp: refactor MTU handling
From: Xin Long @ 2018-04-27 10:04 UTC (permalink / raw)
To: Marcelo Ricardo Leitner
Cc: network dev, linux-sctp, Vlad Yasevich, Neil Horman
In-Reply-To: <cover.1524772453.git.marcelo.leitner@gmail.com>
On Fri, Apr 27, 2018 at 3:58 AM, Marcelo Ricardo Leitner
<marcelo.leitner@gmail.com> wrote:
> Currently MTU handling is spread over SCTP stack. There are multiple
> places doing same/similar calculations and updating them is error prone
> as one spot can easily be left out.
>
> This patchset converges it into a more concise and consistent code. In
> general, it moves MTU handling from functions with bigger objectives,
> such as sctp_assoc_add_peer(), to specific functions.
>
> It's also a preparation for the next patchset, which removes the
> duplication between sctp_make_op_error_space and
> sctp_make_op_error_fixed and relies on sctp_mtu_payload introduced here.
>
> More details on each patch.
>
> Marcelo Ricardo Leitner (13):
> sctp: remove old and unused SCTP_MIN_PMTU
> sctp: move transport pathmtu calc away of sctp_assoc_add_peer
> sctp: remove an if() that is always true
> sctp: introduce sctp_assoc_set_pmtu
> sctp: introduce sctp_mtu_payload
> sctp: introduce sctp_assoc_update_frag_point
> sctp: remove sctp_assoc_pending_pmtu
> sctp: introduce sctp_dst_mtu
> sctp: remove sctp_transport_pmtu_check
> sctp: re-use sctp_transport_pmtu in sctp_transport_route
> sctp: honor PMTU_DISABLED when handling icmp
> sctp: consider idata chunks when setting SCTP_MAXSEG
> sctp: allow unsetting sockopt MAXSEG
>
> include/net/sctp/constants.h | 5 ++--
> include/net/sctp/sctp.h | 52 ++++++++++++++------------------------
> include/net/sctp/structs.h | 2 ++
> net/sctp/associola.c | 60 +++++++++++++++++++++++---------------------
> net/sctp/chunk.c | 12 +--------
> net/sctp/output.c | 28 ++++++++-------------
> net/sctp/socket.c | 43 ++++++++++++++-----------------
> net/sctp/transport.c | 37 ++++++++++++++-------------
> 8 files changed, 105 insertions(+), 134 deletions(-)
>
> --
> 2.14.3
>
Series
Reviewed-by: Xin Long <lucien.xin@gmail.com>
^ permalink raw reply
* Re: [PATCH 1/3] selftests/bpf: Makefile: add includes to fix broken test build
From: Daniel Borkmann @ 2018-04-27 10:04 UTC (permalink / raw)
To: Sirio Balmelli, ast; +Cc: netdev
In-Reply-To: <20180426083107.GA13908@vm4>
On 04/26/2018 10:31 AM, Sirio Balmelli wrote:
> several bpf tests fail to build with clang 7.0.0:
> ...
> In file included from ../../../include/uapi/linux/bpf.h:11:
> In file included from ./include/uapi/linux/types.h:5:
> /usr/include/asm-generic/int-ll64.h:11:10: fatal error: 'asm/bitsperlong.h' file not found
>
> /usr/include/asm-generic/int-ll64.h is from outside the kernel repo,
> probably a good idea to repoint to -I$(ROOT)/include/uapi.
> asm/bitsperlong.h is architecture-specific, cater for this with an
> architecture-specific include -I$(ROOT)/$(ARCH)/include/uapi.
>
> Re-building now yields:
> ../../../../include/uapi/linux/stddef.h:2:10: fatal error: 'linux/compiler_types.h' file not found
>
> Fix this with -I$(ROOT)/include
>
> Signed-off-by: Sirio Balmelli <sirio@b-ad.ch>
> ---
> tools/testing/selftests/bpf/Makefile | 10 ++++++++--
> 1 file changed, 8 insertions(+), 2 deletions(-)
>
> diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
> index 0b72cc7..6a8cfaf 100644
> --- a/tools/testing/selftests/bpf/Makefile
> +++ b/tools/testing/selftests/bpf/Makefile
> @@ -80,8 +80,14 @@ else
> CPU ?= generic
> endif
>
> -CLANG_FLAGS = -I. -I./include/uapi -I../../../include/uapi \
> - -Wno-compare-distinct-pointer-types
> +ARCH := arch/$(subst _64,,$(shell uname -p))
> +ROOT :=../../../..
> +TOOLS :=../../..
> +CLANG_FLAGS = -I. -I./include/uapi \
> + -I$(TOOLS)/include/uapi -I$(TOOLS)/include \
> + -I$(ROOT)/$(ARCH)/include/uapi \
> + -I$(ROOT)/include/uapi -I$(ROOT)/include \
> + -Wno-compare-distinct-pointer-types
The problem is that this will now pull in all sorts of kernel headers, whereas
before the includes were limited to and contained in tools/include/ and
tools/arch/*/include/; that is, the tools/ infrastructure specifically keeps the
headers that are needed under these locations. A bitsperlong.h is already
present there, so please change and respin your fix to reuse that one.
Thanks Sirio!
^ permalink raw reply
* Re: [PATCH bpf-next v2] bpf, doc: Update bpf_jit_enable limitation for CONFIG_BPF_JIT_ALWAYS_ON
From: Daniel Borkmann @ 2018-04-27 10:10 UTC (permalink / raw)
To: Leo Yan, Alexei Starovoitov, David S. Miller, Jonathan Corbet,
netdev, linux-kernel, linux-doc
In-Reply-To: <1524823374-6174-1-git-send-email-leo.yan@linaro.org>
On 04/27/2018 12:02 PM, Leo Yan wrote:
> When CONFIG_BPF_JIT_ALWAYS_ON is enabled, kernel has limitation for
> bpf_jit_enable, so it has fixed value 1 and we cannot set it to 2
> for JIT opcode dumping; this patch is to update the doc for it.
>
> Suggested-by: Daniel Borkmann <daniel@iogearbox.net>
> Signed-off-by: Leo Yan <leo.yan@linaro.org>
Applied to bpf-next, thanks Leo!
^ permalink raw reply
* Re: [dm-devel] [PATCH v5] fault-injection: introduce kvmalloc fallback options
From: Mikulas Patocka @ 2018-04-27 10:20 UTC (permalink / raw)
To: Michal Hocko
Cc: Michael S. Tsirkin, John Stoffel, James Bottomley, Michal,
eric.dumazet, netdev, jasowang, Randy Dunlap, linux-kernel,
Matthew Wilcox, linux-mm, dm-devel, Vlastimil Babka, Andrew,
David Rientjes, Morton, virtualization, David Miller, edumazet
In-Reply-To: <20180427082555.GC17484@dhcp22.suse.cz>
On Fri, 27 Apr 2018, Michal Hocko wrote:
> On Thu 26-04-18 18:52:05, Mikulas Patocka wrote:
> >
> >
> > On Fri, 27 Apr 2018, Michael S. Tsirkin wrote:
> [...]
> > > But assuming it's important to control this kind of
> > > fault injection to be controlled from
> > > a dedicated menuconfig option, why not the rest of
> > > faults?
> >
> > The injected faults cause damage to the user, so there's no point to
> > enable them by default. vmalloc fallback should not cause any damage
> > (assuming that the code is correctly written).
>
> But you want to find those bugs which would BUG_ON easier, so there is a
> risk of harm IIUC
Yes, I want to harm them, but I only want to harm the users using the
debugging kernel. Testers should be "harmed" by crashes - so that the
users of production kernels are harmed less.
If someone hits this, he should report it, use the kernel parameter to
turn it off and continue with the testing.
> and this is not much different than other fault injecting paths.
Fault injection causes misbehavior even on completely bug-free code (for
example, syscalls randomly returning -ENOMEM). This won't cause
misbehavior on bug-free code.
Mikulas
^ permalink raw reply
* [PATCH net-next 0/2] netns: uevent filtering
From: Christian Brauner @ 2018-04-27 10:23 UTC (permalink / raw)
To: ebiederm, davem, netdev, linux-kernel
Cc: avagin, ktkhai, serge, gregkh, Christian Brauner
Hey everyone,
This is the new approach to uevent filtering as discussed (see the
threads in [1], [2], and [3]).
This series deals with fixing up the uevent filtering logic:
- uevent filtering logic is simplified
- locking time on uevent_sock_list is minimized
- tagged and untagged kobjects are handled in separate codepaths
- permissions for userspace are fixed for network device uevents in
network namespaces owned by non-initial user namespaces
Udev is now able to see those events correctly which it wasn't before.
For example, moving a physical device into a network namespace not
owned by the initial user namespace previously gave:
root@xen1:~# udevadm --debug monitor -k
calling: monitor
monitor will print the received events for:
KERNEL - the kernel uevent
sender uid=65534, message ignored
sender uid=65534, message ignored
sender uid=65534, message ignored
sender uid=65534, message ignored
sender uid=65534, message ignored
and now after the discussion and solution in [3] correctly gives:
root@xen1:~# udevadm --debug monitor -k
calling: monitor
monitor will print the received events for:
KERNEL - the kernel uevent
KERNEL[625.301042] add /devices/pci0000:00/0000:00:02.0/0000:01:00.1/net/enp1s0f1 (net)
KERNEL[625.301109] move /devices/pci0000:00/0000:00:02.0/0000:01:00.1/net/enp1s0f1 (net)
KERNEL[625.301138] move /devices/pci0000:00/0000:00:02.0/0000:01:00.1/net/eth1 (net)
KERNEL[655.333272] remove /devices/pci0000:00/0000:00:02.0/0000:01:00.1/net/eth1 (net)
Thanks!
Christian
[1]: https://lkml.org/lkml/2018/4/4/739
[2]: https://lkml.org/lkml/2018/4/26/767
[3]: https://lkml.org/lkml/2018/4/26/738
Christian Brauner (2):
uevent: add alloc_uevent_skb() helper
netns: restrict uevents
lib/kobject_uevent.c | 175 ++++++++++++++++++++++++++++++-------------
1 file changed, 123 insertions(+), 52 deletions(-)
--
2.17.0
^ permalink raw reply
* [PATCH net-next 1/2 v3] uevent: add alloc_uevent_skb() helper
From: Christian Brauner @ 2018-04-27 10:23 UTC (permalink / raw)
To: ebiederm, davem, netdev, linux-kernel
Cc: avagin, ktkhai, serge, gregkh, Christian Brauner
In-Reply-To: <20180427102306.8617-1-christian.brauner@ubuntu.com>
This patch adds alloc_uevent_skb() in preparation for follow up patches.
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
lib/kobject_uevent.c | 39 ++++++++++++++++++++++++++-------------
1 file changed, 26 insertions(+), 13 deletions(-)
diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c
index 15ea216a67ce..c3cb110f663b 100644
--- a/lib/kobject_uevent.c
+++ b/lib/kobject_uevent.c
@@ -296,6 +296,31 @@ static void cleanup_uevent_env(struct subprocess_info *info)
}
#endif
+static struct sk_buff *alloc_uevent_skb(struct kobj_uevent_env *env,
+ const char *action_string,
+ const char *devpath)
+{
+ struct sk_buff *skb = NULL;
+ char *scratch;
+ size_t len;
+
+ /* allocate message with maximum possible size */
+ len = strlen(action_string) + strlen(devpath) + 2;
+ skb = alloc_skb(len + env->buflen, GFP_KERNEL);
+ if (!skb)
+ return NULL;
+
+ /* add header */
+ scratch = skb_put(skb, len);
+ sprintf(scratch, "%s@%s", action_string, devpath);
+
+ skb_put_data(skb, env->buf, env->buflen);
+
+ NETLINK_CB(skb).dst_group = 1;
+
+ return skb;
+}
+
static int kobject_uevent_net_broadcast(struct kobject *kobj,
struct kobj_uevent_env *env,
const char *action_string,
@@ -314,22 +339,10 @@ static int kobject_uevent_net_broadcast(struct kobject *kobj,
continue;
if (!skb) {
- /* allocate message with the maximum possible size */
- size_t len = strlen(action_string) + strlen(devpath) + 2;
- char *scratch;
-
retval = -ENOMEM;
- skb = alloc_skb(len + env->buflen, GFP_KERNEL);
+ skb = alloc_uevent_skb(env, action_string, devpath);
if (!skb)
continue;
-
- /* add header */
- scratch = skb_put(skb, len);
- sprintf(scratch, "%s@%s", action_string, devpath);
-
- skb_put_data(skb, env->buf, env->buflen);
-
- NETLINK_CB(skb).dst_group = 1;
}
retval = netlink_broadcast_filtered(uevent_sock, skb_get(skb),
--
2.17.0
^ permalink raw reply related
* [PATCH net-next 2/2 v3] netns: restrict uevents
From: Christian Brauner @ 2018-04-27 10:23 UTC (permalink / raw)
To: ebiederm, davem, netdev, linux-kernel
Cc: avagin, ktkhai, serge, gregkh, Christian Brauner
In-Reply-To: <20180427102306.8617-1-christian.brauner@ubuntu.com>
commit 07e98962fa77 ("kobject: Send hotplug events in all network namespaces")
enabled sending hotplug events into all network namespaces back in 2010.
Over time the set of uevents that get sent into all network namespaces has
shrunk. We have now reached the point where hotplug events for all devices
that carry a namespace tag are filtered according to that namespace.
Specifically, they are filtered whenever the namespace tag of the kobject
does not match the namespace tag of the netlink socket.
Currently, only network devices carry namespace tags (i.e. network
namespace tags). Hence, uevents for network devices only show up in the
network namespace such devices are created in or moved to.
However, any uevent for a kobject that does not have a namespace tag
associated with it will not be filtered and we will broadcast it into all
network namespaces. This behavior stopped making sense when user namespaces
were introduced.
This patch simplifies and fixes a couple of things:
- Split codepath for sending uevents by kobject namespace tags:
1. Untagged kobjects - uevent_net_broadcast_untagged():
Untagged kobjects will be broadcast into all uevent sockets recorded
in uevent_sock_list, i.e. into all network namespaces owned by the
initial user namespace.
2. Tagged kobjects - uevent_net_broadcast_tagged():
Tagged kobjects will only be broadcast into the network namespace they
were tagged with.
Handling of tagged kobjects in 2. does not cause any semantic changes.
This is just splitting out the filtering logic that was handled by
kobj_bcast_filter() before.
Handling of untagged kobjects in 1. will cause a semantic change. The
reasons why this is needed and ok have been discussed in [1]. Here is a
short summary:
- Userspace ignores uevents from network namespaces that are not owned by
the initial user namespace:
Uevents are filtered by userspace in a user namespace because the
received uid != 0. Instead the uid associated with the event will be
65534 == "nobody" because the global root uid is not mapped.
This means we can safely and without introducing regressions modify the
kernel to not send uevents into all network namespaces whose owning
user namespace is not the initial user namespace because we know that
userspace will ignore the message because of the uid anyway.
I have verified a) that this is true for every udev implementation out
there and b) that this behavior has been present in all udev
implementations from the very beginning.
- Thundering herd:
Broadcasting uevents into all network namespaces introduces significant
overhead.
All processes that listen to uevents running in non-initial user
namespaces will end up responding to uevents that will be meaningless
to them. Mainly, because non-initial user namespaces cannot easily
manage devices unless they have a privileged host-process helping them
out. This means that there will be a thundering herd of activity when
there shouldn't be any.
- Removing needless overhead/Increasing performance:
Currently, the uevent socket for each network namespace is added to the
global variable uevent_sock_list. The list itself needs to be protected
by a mutex. So every time a uevent is generated the mutex is taken on
the list. The mutex is held *from the creation of the uevent (memory
allocation, string creation etc.) until all uevent sockets have been
handled*. This is aggravated by the fact that for each uevent socket
that has listeners the mc_list must be walked as well which means we're
talking O(n^2) here. Given that a standard Linux workload usually has
quite a lot of network namespaces and - in the face of containers - a
lot of user namespaces this quickly becomes a performance problem (see
"Thundering herd" above). By just recording uevent sockets of network
namespaces that are owned by the initial user namespace we
significantly increase performance in this codepath.
- Injecting uevents:
There's a valid argument that containers might be interested in
receiving device events especially if they are delegated to them by a
privileged userspace process. One prime example are SR-IOV enabled
devices that are explicitly designed to be handed off to other users
such as VMs or containers.
This use-case can now be correctly handled since
commit 692ec06d7c92 ("netns: send uevent messages"). This commit
introduced the ability to send uevents from userspace. As such we can
let a sufficiently privileged (CAP_SYS_ADMIN in the owning user
namespace of the network namespace of the netlink socket) userspace
process make a decision what uevents should be sent. This removes the
need to blindly broadcast uevents into all user namespaces and provides
a performant and safe solution to this problem.
- Filtering logic:
This patch filters by *owning user namespace of the network namespace a
given task resides in* and not by user namespace of the task per se.
This means if the user namespace of a given task is unshared but the
network namespace is kept and is owned by the initial user namespace a
listener that is opening the uevent socket in that network namespace
can still listen to uevents.
- Fix permission for tagged kobjects:
Network devices that are created or moved into a network namespace that
is owned by a non-initial user namespace are currently sent with
INVALID_{G,U}ID in their credentials. This means that all current udev
implementations in userspace will ignore the uevent they receive for
them. This has led to weird bugs whereby new devices showing up in such
network namespaces were not recognized and did not get IPs assigned etc.
This patch adjusts the permission to the appropriate {g,u}id in the
respective user namespace. This way udevd is able to correctly handle
such devices.
- Simplify filtering logic:
do_one_broadcast() already ensures that only listeners in mc_list receive
uevents that have the same network namespace as the uevent socket itself.
So the filtering logic in kobj_bcast_filter is not needed (see [3]). This
patch therefore removes kobj_bcast_filter() and replaces
netlink_broadcast_filtered() with the simpler netlink_broadcast()
everywhere.
[1]: https://lkml.org/lkml/2018/4/4/739
[2]: https://lkml.org/lkml/2018/4/26/767
[3]: https://lkml.org/lkml/2018/4/26/738
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
lib/kobject_uevent.c | 140 ++++++++++++++++++++++++++++++-------------
1 file changed, 99 insertions(+), 41 deletions(-)
diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c
index c3cb110f663b..d8ce5e6d83af 100644
--- a/lib/kobject_uevent.c
+++ b/lib/kobject_uevent.c
@@ -22,6 +22,7 @@
#include <linux/socket.h>
#include <linux/skbuff.h>
#include <linux/netlink.h>
+#include <linux/uidgid.h>
#include <linux/uuid.h>
#include <linux/ctype.h>
#include <net/sock.h>
@@ -231,30 +232,6 @@ int kobject_synth_uevent(struct kobject *kobj, const char *buf, size_t count)
return r;
}
-#ifdef CONFIG_NET
-static int kobj_bcast_filter(struct sock *dsk, struct sk_buff *skb, void *data)
-{
- struct kobject *kobj = data, *ksobj;
- const struct kobj_ns_type_operations *ops;
-
- ops = kobj_ns_ops(kobj);
- if (!ops && kobj->kset) {
- ksobj = &kobj->kset->kobj;
- if (ksobj->parent != NULL)
- ops = kobj_ns_ops(ksobj->parent);
- }
-
- if (ops && ops->netlink_ns && kobj->ktype->namespace) {
- const void *sock_ns, *ns;
- ns = kobj->ktype->namespace(kobj);
- sock_ns = ops->netlink_ns(dsk);
- return sock_ns != ns;
- }
-
- return 0;
-}
-#endif
-
#ifdef CONFIG_UEVENT_HELPER
static int kobj_usermode_filter(struct kobject *kobj)
{
@@ -296,6 +273,7 @@ static void cleanup_uevent_env(struct subprocess_info *info)
}
#endif
+#ifdef CONFIG_NET
static struct sk_buff *alloc_uevent_skb(struct kobj_uevent_env *env,
const char *action_string,
const char *devpath)
@@ -321,15 +299,13 @@ static struct sk_buff *alloc_uevent_skb(struct kobj_uevent_env *env,
return skb;
}
-static int kobject_uevent_net_broadcast(struct kobject *kobj,
- struct kobj_uevent_env *env,
- const char *action_string,
- const char *devpath)
+static int uevent_net_broadcast_untagged(struct kobj_uevent_env *env,
+ const char *action_string,
+ const char *devpath)
{
- int retval = 0;
-#if defined(CONFIG_NET)
struct sk_buff *skb = NULL;
struct uevent_sock *ue_sk;
+ int retval = 0;
/* send netlink message */
list_for_each_entry(ue_sk, &uevent_sock_list, list) {
@@ -345,19 +321,95 @@ static int kobject_uevent_net_broadcast(struct kobject *kobj,
continue;
}
- retval = netlink_broadcast_filtered(uevent_sock, skb_get(skb),
- 0, 1, GFP_KERNEL,
- kobj_bcast_filter,
- kobj);
+ retval = netlink_broadcast(uevent_sock, skb_get(skb), 0, 1,
+ GFP_KERNEL);
/* ENOBUFS should be handled in userspace */
if (retval == -ENOBUFS || retval == -ESRCH)
retval = 0;
}
consume_skb(skb);
-#endif
+
return retval;
}
+static int uevent_net_broadcast_tagged(struct sock *usk,
+ struct kobj_uevent_env *env,
+ const char *action_string,
+ const char *devpath)
+{
+ struct user_namespace *owning_user_ns = sock_net(usk)->user_ns;
+ struct sk_buff *skb = NULL;
+ int ret;
+
+ skb = alloc_uevent_skb(env, action_string, devpath);
+ if (!skb)
+ return -ENOMEM;
+
+ /* fix credentials */
+ if (owning_user_ns != &init_user_ns) {
+ struct netlink_skb_parms *parms = &NETLINK_CB(skb);
+ kuid_t root_uid;
+ kgid_t root_gid;
+
+ /* fix uid */
+ root_uid = make_kuid(owning_user_ns, 0);
+ if (!uid_valid(root_uid))
+ root_uid = GLOBAL_ROOT_UID;
+ parms->creds.uid = root_uid;
+
+ /* fix gid */
+ root_gid = make_kgid(owning_user_ns, 0);
+ if (!gid_valid(root_gid))
+ root_gid = GLOBAL_ROOT_GID;
+ parms->creds.gid = root_gid;
+ }
+
+ ret = netlink_broadcast(usk, skb, 0, 1, GFP_KERNEL);
+ /* ENOBUFS should be handled in userspace */
+ if (ret == -ENOBUFS || ret == -ESRCH)
+ ret = 0;
+
+ return ret;
+}
+#endif
+
+static int kobject_uevent_net_broadcast(struct kobject *kobj,
+ struct kobj_uevent_env *env,
+ const char *action_string,
+ const char *devpath)
+{
+ int ret = 0;
+
+#ifdef CONFIG_NET
+ const struct kobj_ns_type_operations *ops;
+ const struct net *net = NULL;
+
+ ops = kobj_ns_ops(kobj);
+ if (!ops && kobj->kset) {
+ struct kobject *ksobj = &kobj->kset->kobj;
+ if (ksobj->parent != NULL)
+ ops = kobj_ns_ops(ksobj->parent);
+ }
+
+ /* kobjects currently only carry network namespace tags and they
+ * are the only tag relevant here since we want to decide which
+ * network namespaces to broadcast the uevent into.
+ */
+ if (ops && ops->netlink_ns && kobj->ktype->namespace)
+ if (ops->type == KOBJ_NS_TYPE_NET)
+ net = kobj->ktype->namespace(kobj);
+
+ if (!net)
+ ret = uevent_net_broadcast_untagged(env, action_string,
+ devpath);
+ else
+ ret = uevent_net_broadcast_tagged(net->uevent_sock->sk, env,
+ action_string, devpath);
+#endif
+
+ return ret;
+}
+
static void zap_modalias_env(struct kobj_uevent_env *env)
{
static const char modalias_prefix[] = "MODALIAS=";
@@ -716,9 +768,13 @@ static int uevent_net_init(struct net *net)
net->uevent_sock = ue_sk;
- mutex_lock(&uevent_sock_mutex);
- list_add_tail(&ue_sk->list, &uevent_sock_list);
- mutex_unlock(&uevent_sock_mutex);
+ /* Restrict uevents to initial user namespace. */
+ if (sock_net(ue_sk->sk)->user_ns == &init_user_ns) {
+ mutex_lock(&uevent_sock_mutex);
+ list_add_tail(&ue_sk->list, &uevent_sock_list);
+ mutex_unlock(&uevent_sock_mutex);
+ }
+
return 0;
}
@@ -726,9 +782,11 @@ static void uevent_net_exit(struct net *net)
{
struct uevent_sock *ue_sk = net->uevent_sock;
- mutex_lock(&uevent_sock_mutex);
- list_del(&ue_sk->list);
- mutex_unlock(&uevent_sock_mutex);
+ if (sock_net(ue_sk->sk)->user_ns == &init_user_ns) {
+ mutex_lock(&uevent_sock_mutex);
+ list_del(&ue_sk->list);
+ mutex_unlock(&uevent_sock_mutex);
+ }
netlink_kernel_release(ue_sk->sk);
kfree(ue_sk);
--
2.17.0
^ permalink raw reply related
* [net-next] ipv6: sr: Add documentation for seg_flowlabel sysctl
From: Ahmed Abdelsalam @ 2018-04-27 10:35 UTC (permalink / raw)
To: davem, linux-doc, netdev; +Cc: Ahmed Abdelsalam
This patch adds documentation for the seg6_flowlabel sysctl to
Documentation/networking/ip-sysctl.txt.
Signed-off-by: Ahmed Abdelsalam <amsalam20@gmail.com>
---
Documentation/networking/ip-sysctl.txt | 13 +++++++++++++
1 file changed, 13 insertions(+)
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 5dc1a04..7528f71 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1428,6 +1428,19 @@ ip6frag_low_thresh - INTEGER
ip6frag_time - INTEGER
Time in seconds to keep an IPv6 fragment in memory.
+IPv6 Segment Routing:
+
+seg6_flowlabel - INTEGER
+ Controls the behaviour of computing the flowlabel of outer
+ IPv6 header in case of SR T.encaps
+
+ -1 set flowlabel to zero.
+ 0 copy flowlabel from Inner packet in case of Inner IPv6
+ (Set flowlabel to 0 in case IPv4/L2)
+ 1 Compute the flowlabel using seg6_make_flowlabel()
+
+ Default is 0.
+
conf/default/*:
Change the interface-specific default settings.
--
2.1.4
^ permalink raw reply related
* Re: [PATCH net-next 03/13] sctp: remove an if() that is always true
From: Neil Horman @ 2018-04-27 10:50 UTC (permalink / raw)
To: Marcelo Ricardo Leitner; +Cc: netdev, linux-sctp, Vlad Yasevich, Xin Long
In-Reply-To: <b083bd9240e25ad84cda4d9212b886da2373ec11.1524772453.git.marcelo.leitner@gmail.com>
On Thu, Apr 26, 2018 at 04:58:52PM -0300, Marcelo Ricardo Leitner wrote:
> As noticed by Xin Long, the if() here is always true as PMTU can never
> be 0.
>
> Reported-by: Xin Long <lucien.xin@gmail.com>
> Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> ---
> net/sctp/associola.c | 6 ++----
> 1 file changed, 2 insertions(+), 4 deletions(-)
>
> diff --git a/net/sctp/associola.c b/net/sctp/associola.c
> index b3aa95222bd52113295cb246c503c903bdd5c353..c5ed09cfa8423b17546e3d45f6d06db03af66384 100644
> --- a/net/sctp/associola.c
> +++ b/net/sctp/associola.c
> @@ -1397,10 +1397,8 @@ void sctp_assoc_sync_pmtu(struct sctp_association *asoc)
> pmtu = t->pathmtu;
> }
>
> - if (pmtu) {
> - asoc->pathmtu = pmtu;
> - asoc->frag_point = sctp_frag_point(asoc, pmtu);
> - }
> + asoc->pathmtu = pmtu;
> + asoc->frag_point = sctp_frag_point(asoc, pmtu);
>
Can you double check this? Looking at it, it seems far fetched, but if someone
sends a crafted icmp dest unreach message to the host, pmtu_sending might be
able to get set for an association (which may have no transports established
yet), and if so, on the first packet send sctp_assoc_sync_pmtu can be called,
leading to a fall through in the loop over all transports, and pmtu being zero.
It seems like a far fetched set of circumstances, I know, but if it can happen,
I think you might see a crash in sctp_frag_point due to an underflow of the frag
value
Neil
> pr_debug("%s: asoc:%p, pmtu:%d, frag_point:%d\n", __func__, asoc,
> asoc->pathmtu, asoc->frag_point);
> --
> 2.14.3
>
>
^ permalink raw reply
* Re: [PATCH net-next v3] Add Common Applications Kept Enhanced (cake) qdisc
From: kbuild test robot @ 2018-04-27 10:54 UTC (permalink / raw)
To: Toke Høiland-Jørgensen
Cc: kbuild-all, netdev, cake, Toke Høiland-Jørgensen,
Dave Taht
In-Reply-To: <20180425134249.21300-1-toke@toke.dk>
[-- Attachment #1: Type: text/plain, Size: 4280 bytes --]
Hi Toke,
Thank you for the patch! Yet something to improve:
[auto build test ERROR on net-next/master]
url: https://github.com/0day-ci/linux/commits/Toke-H-iland-J-rgensen/Add-Common-Applications-Kept-Enhanced-cake-qdisc/20180427-175308
config: i386-allmodconfig (attached as .config)
compiler: gcc-7 (Debian 7.3.0-16) 7.3.0
reproduce:
# save the attached .config to linux build tree
make ARCH=i386
All errors (new ones prefixed by >>):
>> net/sched/sch_cake.c:68:10: fatal error: pkt_sched.h: No such file or directory
#include "pkt_sched.h"
^~~~~~~~~~~~~
compilation terminated.
vim +68 net/sched/sch_cake.c
2
3 /* COMMON Applications Kept Enhanced (CAKE) discipline
4 *
5 * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com>
6 * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk>
7 * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com>
8 * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de>
9 * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk>
10 * Copyright (C) 2017 Ryan Mounce <ryan@mounce.com.au>
11 *
12 * The CAKE Principles:
13 * (or, how to have your cake and eat it too)
14 *
15 * This is a combination of several shaping, AQM and FQ techniques into one
16 * easy-to-use package:
17 *
18 * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE
19 * equipment and bloated MACs. This operates in deficit mode (as in sch_fq),
20 * eliminating the need for any sort of burst parameter (eg. token bucket
21 * depth). Burst support is limited to that necessary to overcome scheduling
22 * latency.
23 *
24 * - A Diffserv-aware priority queue, giving more priority to certain classes,
25 * up to a specified fraction of bandwidth. Above that bandwidth threshold,
26 * the priority is reduced to avoid starving other tins.
27 *
28 * - Each priority tin has a separate Flow Queue system, to isolate traffic
29 * flows from each other. This prevents a burst on one flow from increasing
30 * the delay to another. Flows are distributed to queues using a
31 * set-associative hash function.
32 *
33 * - Each queue is actively managed by Cobalt, which is a combination of the
34 * Codel and Blue AQM algorithms. This serves flows fairly, and signals
35 * congestion early via ECN (if available) and/or packet drops, to keep
36 * latency low. The codel parameters are auto-tuned based on the bandwidth
37 * setting, as is necessary at low bandwidths.
38 *
39 * The configuration parameters are kept deliberately simple for ease of use.
40 * Everything has sane defaults. Complete generality of configuration is *not*
41 * a goal.
42 *
43 * The priority queue operates according to a weighted DRR scheme, combined with
44 * a bandwidth tracker which reuses the shaper logic to detect which side of the
45 * bandwidth sharing threshold the tin is operating. This determines whether a
46 * priority-based weight (high) or a bandwidth-based weight (low) is used for
47 * that tin in the current pass.
48 *
49 * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly
50 * granted us permission to leverage.
51 */
52
53 #include <linux/module.h>
54 #include <linux/types.h>
55 #include <linux/kernel.h>
56 #include <linux/jiffies.h>
57 #include <linux/string.h>
58 #include <linux/in.h>
59 #include <linux/errno.h>
60 #include <linux/init.h>
61 #include <linux/skbuff.h>
62 #include <linux/jhash.h>
63 #include <linux/slab.h>
64 #include <linux/vmalloc.h>
65 #include <linux/reciprocal_div.h>
66 #include <net/netlink.h>
67 #include <linux/version.h>
> 68 #include "pkt_sched.h"
69 #include <linux/if_vlan.h>
70 #include <net/pkt_sched.h>
71 #include <net/tcp.h>
72 #include <net/flow_dissector.h>
73
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 62952 bytes --]
^ permalink raw reply
* Re: ip6-in-ip{4,6} ipsec tunnel issues with 1280 MTU
From: Ashwanth Goli @ 2018-04-27 11:02 UTC (permalink / raw)
To: Paolo Abeni; +Cc: netdev, maloney, edumazet, David Ahern, netdev-owner
In-Reply-To: <1524743477.2658.38.camel@redhat.com>
On 2018-04-26 17:21, Paolo Abeni wrote:
> Hi,
>
> [fixed CC list]
>
> On Wed, 2018-04-25 at 21:43 +0530, Ashwanth Goli wrote:
>> Hi Pablo,
>
> Actually I'm Paolo, but yours is a recurring mistake ;)
>
>> I am noticing an issue similar to the one reported by Alexis Perez
>> [Regression for ip6-in-ip4 IPsec tunnel in 4.14.16]
>>
>> In my IPsec setup outer MTU is set to 1280, ip6_setup_cork sees an MTU
>> less than IPV6_MIN_MTU because of the tunnel headers. -EINVAL is being
>> returned as a result of the MTU check that got added with below patch.
>>
>> https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git/commit/net/ipv6/ip6_output.c?h=v4.14.34&id=8278804e05f6bcfe3fdfea4a404020752ead15a6
>>
>> Can we remove this MTU check since your recent patch [ipv6: the entire
>> IPv6 header chain must fit the first fragment] fixes a similar issue?
>
> AFAICS, RFC 2473 implies we can have MTU below 1280 for tunnel devices
> so we can probably relax the MTU check for such devices, but I think we
> would still need it in the general case.
>
> Cheers,
>
> Paolo
Should I send out the following change as a patch?
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 2e891d2..c4c3313 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1235,7 +1235,7 @@ static int ip6_setup_cork(struct sock *sk, struct
inet_cork_full *cork,
if (np->frag_size)
mtu = np->frag_size;
}
- if (mtu < IPV6_MIN_MTU)
+ if (!(rt->dst.flags & DST_XFRM_TUNNEL) && mtu < IPV6_MIN_MTU)
return -EINVAL;
cork->base.fragsize = mtu;
if (dst_allfrag(xfrm_dst_path(&rt->dst)))
^ permalink raw reply related
* RE: [PATCH] DT: net: can: rcar_canfd: document R8A77970 bindings
From: Ramesh Shanmugasundaram @ 2018-04-27 11:33 UTC (permalink / raw)
To: Sergei Shtylyov, Marc Kleine-Budde, Rob Herring,
linux-can@vger.kernel.org, netdev@vger.kernel.org,
devicetree@vger.kernel.org
Cc: Wolfgang Grandegger, Mark Rutland,
linux-renesas-soc@vger.kernel.org
In-Reply-To: <7a3d170f-1d3a-a807-9256-15fe1a78ca4e@cogentembedded.com>
Hello Sergei,
Thanks for your patch.
> Subject: [PATCH] DT: net: can: rcar_canfd: document R8A77970 bindings
>
> Document the R-Car V3M (R8A77970) SoC support in the R-Car CAN-FD
> bindings.
>
> Signed-off-by: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
Reviewed-by: Ramesh Shanmugasundaram <ramesh.shanmugasundaram@bp.renesas.com>
Thanks,
Ramesh
^ permalink raw reply
* Re: ath6kl: fix ath6kl_data_tx()'s return type
From: Kalle Valo @ 2018-04-27 11:35 UTC (permalink / raw)
To: Luc Van Oostenryck
Cc: linux-kernel, Luc Van Oostenryck, Kalle Valo, linux-wireless,
netdev
In-Reply-To: <20180424131900.5718-1-luc.vanoostenryck@gmail.com>
Luc Van Oostenryck <luc.vanoostenryck@gmail.com> wrote:
> The method ndo_start_xmit() is defined as returning an 'netdev_tx_t',
> which is a typedef for an enum type, but the implementation in this
> driver returns an 'int'.
>
> Fix this by returning 'netdev_tx_t' in this driver too.
>
> Signed-off-by: Luc Van Oostenryck <luc.vanoostenryck@gmail.com>
> Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
Patch applied to ath-next branch of ath.git, thanks.
378b1d65070f ath6kl: fix ath6kl_data_tx()'s return type
--
https://patchwork.kernel.org/patch/10359823/
https://wireless.wiki.kernel.org/en/developers/documentation/submittingpatches
^ permalink raw reply
* Re: [next] ath10k: fix spelling mistake: "servive" -> "service"
From: Kalle Valo @ 2018-04-27 11:39 UTC (permalink / raw)
To: Colin Ian King
Cc: linux-wireless, netdev, kernel-janitors, linux-kernel, ath10k,
Kalle Valo
In-Reply-To: <20180426091244.9160-1-colin.king@canonical.com>
Colin Ian King <colin.king@canonical.com> wrote:
> Trivial fix to spelling mistake in ath10k_warn warning message text
>
> Signed-off-by: Colin Ian King <colin.king@canonical.com>
> Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
Patch applied to ath-next branch of ath.git, thanks.
785281342d0c ath10k: fix spelling mistake: "servive" -> "service"
--
https://patchwork.kernel.org/patch/10365081/
https://wireless.wiki.kernel.org/en/developers/documentation/submittingpatches
^ permalink raw reply
* Re: [next] ath10k: fix spelling mistake: "servive" -> "service"
From: Kalle Valo @ 2018-04-27 11:39 UTC (permalink / raw)
To: Colin Ian King
Cc: Kalle Valo, ath10k, linux-wireless, netdev, kernel-janitors,
linux-kernel
In-Reply-To: <20180426091244.9160-1-colin.king@canonical.com>
Colin Ian King <colin.king@canonical.com> wrote:
> Trivial fix to spelling mistake in ath10k_warn warning message text
>
> Signed-off-by: Colin Ian King <colin.king@canonical.com>
> Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
Patch applied to ath-next branch of ath.git, thanks.
785281342d0c ath10k: fix spelling mistake: "servive" -> "service"
--
https://patchwork.kernel.org/patch/10365081/
https://wireless.wiki.kernel.org/en/developers/documentation/submittingpatches
^ permalink raw reply
* Re: ath10k: sdio: jump to correct label in error handling path
From: Kalle Valo @ 2018-04-27 11:40 UTC (permalink / raw)
To: Niklas Cassel
Cc: Kalle Valo, Niklas Cassel, ath10k, linux-wireless, netdev,
linux-kernel
In-Reply-To: <20180426123502.23962-1-niklas.cassel@linaro.org>
Niklas Cassel <niklas.cassel@linaro.org> wrote:
> Jump to the correct label in error handling path.
> At this point of execution create_singlethread_workqueue() has succeeded,
> so it should be properly destroyed.
>
> Jump label was renamed in commit ec2c64e20257 ("ath10k: sdio: fix memory
> leak for probe allocations").
> However, the bug was originally introduced in commit d96db25d2025
> ("ath10k: add initial SDIO support").
>
> Fixes: d96db25d2025 ("ath10k: add initial SDIO support")
> Signed-off-by: Niklas Cassel <niklas.cassel@linaro.org>
> Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
Patch applied to ath-next branch of ath.git, thanks.
e60a92590187 ath10k: sdio: jump to correct label in error handling path
--
https://patchwork.kernel.org/patch/10365901/
https://wireless.wiki.kernel.org/en/developers/documentation/submittingpatches
^ permalink raw reply
* Re: ath10k: sdio: jump to correct label in error handling path
From: Kalle Valo @ 2018-04-27 11:40 UTC (permalink / raw)
To: Niklas Cassel
Cc: netdev, linux-wireless, linux-kernel, ath10k, Kalle Valo,
Niklas Cassel
In-Reply-To: <20180426123502.23962-1-niklas.cassel@linaro.org>
Niklas Cassel <niklas.cassel@linaro.org> wrote:
> Jump to the correct label in error handling path.
> At this point of execution create_singlethread_workqueue() has succeeded,
> so it should be properly destroyed.
>
> Jump label was renamed in commit ec2c64e20257 ("ath10k: sdio: fix memory
> leak for probe allocations").
> However, the bug was originally introduced in commit d96db25d2025
> ("ath10k: add initial SDIO support").
>
> Fixes: d96db25d2025 ("ath10k: add initial SDIO support")
> Signed-off-by: Niklas Cassel <niklas.cassel@linaro.org>
> Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
Patch applied to ath-next branch of ath.git, thanks.
e60a92590187 ath10k: sdio: jump to correct label in error handling path
--
https://patchwork.kernel.org/patch/10365901/
https://wireless.wiki.kernel.org/en/developers/documentation/submittingpatches
^ permalink raw reply
* ID of (former "National") TI's PHY DP83848
From: Juergen Borleis @ 2018-04-27 11:51 UTC (permalink / raw)
To: netdev; +Cc: Andrew Lunn, Florian Fainelli
Hi,
I have worked on a DP83848 variant without an interrupt line. While at it, I
read through all datasheets of existing variants of the DP83848 phy.
Here is what I've found so far:
+--------------+--------------+--------+-----+
| Variant | Phy ID | Note | IRQ |
+--------------+--------------+--------+-----+
| DP83848H | 0x20005c90 | MINI | NO |
+--------------+--------------+--------+-----+
| DP83848J | 0x20005c90 | MINI | NO |
+--------------+--------------+--------+-----+
| DP83848K | 0x20005c90 | MINI | NO |
+--------------+--------------+--------+-----+
| DP83848M | 0x20005c90 | MINI | NO |
+--------------+--------------+--------+-----+
| DP83848T | 0x20005c90 | MINI | NO |
+--------------+--------------+--------+-----+
| DP83848EP | 0x20005c90 | | YES |
+--------------+--------------+--------+-----+
| DP83848HT | 0x20005c90 | | YES |
+--------------+--------------+--------+-----+
| DP83848Q | 0x20005ca2 | MINI | NO |
+--------------+--------------+--------+-----+
| DP83848V | 0x20005ca2 | | YES |
+--------------+--------------+--------+-----+
| DP83848C | 0x20005ca2 | | YES |
+--------------+--------------+--------+-----+
"MINI" means a lower pin count variant.
"IRQ"-"YES" means, the device has an interrupt line,
"IRQ"-"NO" means, the device has no interrupt line.
How to deal with interrupts, if the device itself has no interrupt line (and
uses the same phy ID)? How to deal with the same Phy ID used for different
devices and different features?
Cheers,
Juergen
PS: please keep me on CC as I'm not subscribed to the netdev list.
--
Pengutronix e.K. | Juergen Borleis |
Industrial Linux Solutions | http://www.pengutronix.de/ |
^ permalink raw reply
* Re: [tip:x86/cleanups] x86/bpf: Clean up non-standard comments, to make the code more readable
From: Daniel Borkmann @ 2018-04-27 12:13 UTC (permalink / raw)
To: peterz, edumazet, davem, linux-kernel, torvalds, ast, bp, hpa,
mingo, yoshfuji, tglx, linux-tip-commits, netdev
In-Reply-To: <tip-5f26c50143f58f256535bee8d93a105f36d4d2da@git.kernel.org>
Hi Ingo,
On 04/27/2018 01:00 PM, tip-bot for Ingo Molnar wrote:
> Commit-ID: 5f26c50143f58f256535bee8d93a105f36d4d2da
> Gitweb: https://git.kernel.org/tip/5f26c50143f58f256535bee8d93a105f36d4d2da
> Author: Ingo Molnar <mingo@kernel.org>
> AuthorDate: Fri, 27 Apr 2018 11:54:40 +0200
> Committer: Ingo Molnar <mingo@kernel.org>
> CommitDate: Fri, 27 Apr 2018 12:42:04 +0200
>
> x86/bpf: Clean up non-standard comments, to make the code more readable
>
> So by chance I looked into x86 assembly in arch/x86/net/bpf_jit_comp.c and
> noticed the weird and inconsistent comment style it mistakenly learned from
> the networking code:
>
> /* Multi-line comment ...
> * ... looks like this.
> */
>
> Fix this to use the standard comment style specified in Documentation/CodingStyle
> and used in arch/x86/ as well:
>
> /*
> * Multi-line comment ...
> * ... looks like this.
> */
>
> Also, to quote Linus's ... more explicit views about this:
>
> http://article.gmane.org/gmane.linux.kernel.cryptoapi/21066
>
> > But no, the networking code picked *none* of the above sane formats.
> > Instead, it picked these two models that are just half-arsed
> > shit-for-brains:
> >
> > (no)
> > /* This is disgusting drug-induced
> > * crap, and should die
> > */
> >
> > (no-no-no)
> > /* This is also very nasty
> > * and visually unbalanced */
> >
> > Please. The networking code actually has the *worst* possible comment
> > style. You can literally find that (no-no-no) style, which is just
> > really horribly disgusting and worse than the otherwise fairly similar
> > (d) in pretty much every way.
>
> Also improve the comments and some other details while at it:
>
> - Don't mix same-line and previous-line comment style on otherwise
> identical code patterns within the same function,
>
> - capitalize 'BPF' and x86 register names consistently,
>
> - capitalize sentences consistently,
>
> - instead of 'x64' use 'x86-64': x64 is a Microsoft specific term,
>
> - use more consistent punctuation,
>
> - use standard coding style in macros as well,
>
> - fix typos and a few other minor details.
>
> Consistent coding style is not optional, at least in arch/x86/.
>
> No change in functionality.
Thanks for the cleanup, looks fine to me!
> ( In case this commit causes conflicts with pending development code
> I'll be glad to help resolve any conflicts! )
Any objections if we simply route this via the bpf-next tree? Otherwise
this will indeed cause really ugly merge conflicts throughout the JIT with
pending work.
> Acked-by: Thomas Gleixner <tglx@linutronix.de>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Borislav Petkov <bp@alien8.de>
> Cc: H. Peter Anvin <hpa@zytor.com>
> Cc: Linus Torvalds <torvalds@linux-foundation.org>
> Cc: David S. Miller <davem@davemloft.net>
> Cc: Eric Dumazet <edumazet@google.com>
> Cc: Daniel Borkmann <daniel@iogearbox.net>
> Cc: Alexei Starovoitov <ast@fb.com>
> Cc: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
> Cc: netdev@vger.kernel.org
> Cc: linux-kernel@vger.kernel.org
> Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Thanks,
Daniel
^ permalink raw reply
* Re: WARNING in tcp_enter_loss (2)
From: syzbot @ 2018-04-27 12:16 UTC (permalink / raw)
To: davem, kuznet, linux-kernel, netdev, syzkaller-bugs, yoshfuji
In-Reply-To: <001a113f39820d16d50567379661@google.com>
syzbot has found reproducer for the following crash on upstream commit
0644f186fc9d77bb5bd198369e59fb28927a3692 (Thu Apr 26 23:36:11 2018 +0000)
Merge tag 'for_linus' of
git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost
syzbot dashboard link:
https://syzkaller.appspot.com/bug?extid=c5a3099b94cbdd9cd6da
So far this crash happened 2 times on net-next, upstream.
C reproducer: https://syzkaller.appspot.com/x/repro.c?id=5374384306913280
syzkaller reproducer:
https://syzkaller.appspot.com/x/repro.syz?id=4821663019433984
Raw console output:
https://syzkaller.appspot.com/x/log.txt?id=5119802469253120
Kernel config:
https://syzkaller.appspot.com/x/.config?id=7043958930931867332
compiler: gcc (GCC) 8.0.1 20180413 (experimental)
IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+c5a3099b94cbdd9cd6da@syzkaller.appspotmail.com
It will help syzbot understand when the bug is fixed.
WARNING: CPU: 0 PID: 4456 at net/ipv4/tcp_input.c:1955
tcp_enter_loss+0xe4f/0x1110 net/ipv4/tcp_input.c:1955
Kernel panic - not syncing: panic_on_warn set ...
CPU: 0 PID: 4456 Comm: syz-executor694 Not tainted 4.17.0-rc2+ #19
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
Google 01/01/2011
Call Trace:
__dump_stack lib/dump_stack.c:77 [inline]
dump_stack+0x1b9/0x294 lib/dump_stack.c:113
panic+0x22f/0x4de kernel/panic.c:184
__warn.cold.8+0x163/0x1b3 kernel/panic.c:536
report_bug+0x252/0x2d0 lib/bug.c:186
fixup_bug arch/x86/kernel/traps.c:178 [inline]
do_error_trap+0x1de/0x490 arch/x86/kernel/traps.c:296
do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:315
invalid_op+0x14/0x20 arch/x86/entry/entry_64.S:992
RIP: 0010:tcp_enter_loss+0xe4f/0x1110 net/ipv4/tcp_input.c:1955
RSP: 0018:ffff8801b66c7560 EFLAGS: 00010293
RAX: ffff8801b66686c0 RBX: 0000000000000001 RCX: ffffffff864ac155
RDX: 0000000000000000 RSI: ffffffff864ac5bf RDI: 0000000000000004
RBP: ffff8801b66c75e0 R08: ffff8801b66686c0 R09: 0000000000000000
R10: ffffed0043fff001 R11: ffff88021fff8017 R12: 0000000000000003
R13: 0000000000000002 R14: ffff8801c8c6dd30 R15: ffff8801d02e5500
WARNING: CPU: 1 PID: 4450 at net/ipv4/tcp_input.c:1955
tcp_enter_loss+0xe4f/0x1110 net/ipv4/tcp_input.c:1955
tcp_retransmit_timer+0xc34/0x3060 net/ipv4/tcp_timer.c:486
Modules linked in:
CPU: 1 PID: 4450 Comm: syz-executor694 Not tainted 4.17.0-rc2+ #19
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
Google 01/01/2011
RIP: 0010:tcp_enter_loss+0xe4f/0x1110 net/ipv4/tcp_input.c:1955
RSP: 0018:ffff8801b60b7560 EFLAGS: 00010293
RAX: ffff8801b662e500 RBX: 0000000000000001 RCX: ffffffff864ac155
RDX: 0000000000000000 RSI: ffffffff864ac5bf RDI: 0000000000000004
RBP: ffff8801b60b75e0 R08: ffff8801b662e500 R09: 0000000000000000
R10: ffffed0043fff009 R11: ffff88021fff8057 R12: 0000000000000003
R13: 0000000000000002 R14: ffff8801cc3cf870 R15: ffff8801cd4f0a80
FS: 00000000015e1880(0000) GS:ffff8801daf00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000021000000 CR3: 00000001b631c000 CR4: 00000000001406e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
tcp_write_timer_handler+0x339/0x960 net/ipv4/tcp_timer.c:573
tcp_retransmit_timer+0xc34/0x3060 net/ipv4/tcp_timer.c:486
tcp_release_cb+0x25e/0x2d0 net/ipv4/tcp_output.c:871
release_sock+0x107/0x2b0 net/core/sock.c:2856
do_tcp_setsockopt.isra.38+0x48e/0x2600 net/ipv4/tcp.c:2880
tcp_write_timer_handler+0x339/0x960 net/ipv4/tcp_timer.c:573
tcp_setsockopt+0xc1/0xe0 net/ipv4/tcp.c:2892
sock_common_setsockopt+0x9a/0xe0 net/core/sock.c:3039
tcp_release_cb+0x25e/0x2d0 net/ipv4/tcp_output.c:871
__sys_setsockopt+0x1bd/0x390 net/socket.c:1903
release_sock+0x107/0x2b0 net/core/sock.c:2856
__do_sys_setsockopt net/socket.c:1914 [inline]
__se_sys_setsockopt net/socket.c:1911 [inline]
__x64_sys_setsockopt+0xbe/0x150 net/socket.c:1911
do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:287
do_tcp_setsockopt.isra.38+0x48e/0x2600 net/ipv4/tcp.c:2880
entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x441bc9
RSP: 002b:00007ffe202bc838 EFLAGS: 00000207
ORIG_RAX: 0000000000000036
RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 0000000000441bc9
RDX: 0000000000000016 RSI: 0000000000000006 RDI: 0000000000000003
RBP: 00000000006cd018 R08: 000000002000023b R09: 0000000000000010
tcp_setsockopt+0xc1/0xe0 net/ipv4/tcp.c:2892
R10: 0000000020000040 R11: 0000000000000207 R12: 0000000000402810
sock_common_setsockopt+0x9a/0xe0 net/core/sock.c:3039
R13: 00000000004028a0 R14: 0000000000000000 R15: 0000000000000000
__sys_setsockopt+0x1bd/0x390 net/socket.c:1903
__do_sys_setsockopt net/socket.c:1914 [inline]
__se_sys_setsockopt net/socket.c:1911 [inline]
__x64_sys_setsockopt+0xbe/0x150 net/socket.c:1911
do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:287
entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x441bc9
RSP: 002b:00007ffe202bc838 EFLAGS: 00000207 ORIG_RAX: 0000000000000036
RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 0000000000441bc9
RDX: 0000000000000016 RSI: 0000000000000006 RDI: 0000000000000003
RBP: 00000000006cd018 R08: 000000002000023b R09: 0000000000000010
R10: 0000000020000040 R11: 0000000000000207 R12: 0000000000402810
R13: 00000000004028a0 R14: 0000000000000000 R15: 0000000000000000
Code: 89 a7 38 08 00 00 e9 07 fc ff ff 49 8d 87 78 09 00 00 48 89 45 88 49
8d 87 68 07 00 00 48 89 45 d0 e9 c5 f2 ff ff e8 91 6a 2e fb <0f> 0b e9 98
fb ff ff e8 55 cb 6a fb e9 de f6 ff ff 48 8b 7d d0
irq event stamp: 76541
hardirqs last enabled at (76539): [<ffffffff878009d5>]
restore_regs_and_return_to_kernel+0x0/0x2b
hardirqs last disabled at (76541): [<ffffffff87801166>]
error_entry+0x76/0xd0 arch/x86/entry/entry_64.S:1262
softirqs last enabled at (76528): [<ffffffff87a00778>]
__do_softirq+0x778/0xaf5 kernel/softirq.c:311
softirqs last disabled at (76540): [<ffffffff85d60074>] spin_lock_bh
include/linux/spinlock.h:315 [inline]
softirqs last disabled at (76540): [<ffffffff85d60074>]
release_sock+0x74/0x2b0 net/core/sock.c:2848
---[ end trace a7562162d42a707b ]---
Dumping ftrace buffer:
(ftrace buffer empty)
Kernel Offset: disabled
Rebooting in 86400 seconds..
^ permalink raw reply
* [PATCH net-next v4] Add Common Applications Kept Enhanced (cake) qdisc
From: Toke Høiland-Jørgensen @ 2018-04-27 12:17 UTC (permalink / raw)
To: netdev; +Cc: cake, Toke Høiland-Jørgensen, Dave Taht
sch_cake targets the home router use case and is intended to squeeze the
most bandwidth and latency out of even the slowest ISP links and routers,
while presenting an API simple enough that even an ISP can configure it.
Example of use on a cable ISP uplink:
tc qdisc add dev eth0 cake bandwidth 20Mbit nat docsis ack-filter
To shape a cable download link (ifb and tc-mirred setup elided)
tc qdisc add dev ifb0 cake bandwidth 200mbit nat docsis ingress wash
CAKE is filled with:
* A hybrid Codel/Blue AQM algorithm, "Cobalt", tied to an FQ_Codel
derived Flow Queuing system, which autoconfigures based on the bandwidth.
* A novel "triple-isolate" mode (the default) which balances per-host
and per-flow FQ even through NAT.
* A deficit-based shaper that can also be used in an unlimited mode.
* 8 way set associative hashing to reduce flow collisions to a minimum.
* A reasonable interpretation of various diffserv latency/loss tradeoffs.
* Support for zeroing diffserv markings for entering and exiting traffic.
* Support for interacting well with Docsis 3.0 shaper framing.
* Extensive support for DSL framing types.
* Support for ack filtering.
* Extensive statistics for measuring loss, ECN markings, and latency
variation.
A paper describing the design of CAKE is available at
https://arxiv.org/abs/1804.07617
Various versions baking have been available as an out of tree build for
kernel versions going back to 3.10, as the embedded router world has been
running a few years behind mainline Linux. A stable version has been
generally available on lede-17.01 and later.
sch_cake replaces a combination of iptables, tc filter, htb and fq_codel
in the sqm-scripts, with sane defaults and vastly simpler configuration.
CAKE's principal author is Jonathan Morton, with contributions from
Kevin Darbyshire-Bryant, Toke Høiland-Jørgensen, Sebastian Moeller,
Ryan Mounce, Guido Sarducci, Dean Scarff, Nils Andreas Svee, Dave Täht,
and Loganaden Velvindron.
Testing from Pete Heist, Georgios Amanakis, and the many other members of
the cake@lists.bufferbloat.net mailing list.
tc -s qdisc show dev eth2
qdisc cake 1: root refcnt 2 bandwidth 100Mbit diffserv3 triple-isolate rtt 100.0ms raw overhead 0
Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
backlog 0b 0p requeues 0
memory used: 0b of 5000000b
capacity estimate: 100Mbit
min/max network layer size: 65535 / 0
min/max overhead-adjusted size: 65535 / 0
average network hdr offset: 0
Bulk Best Effort Voice
thresh 6250Kbit 100Mbit 25Mbit
target 5.0ms 5.0ms 5.0ms
interval 100.0ms 100.0ms 100.0ms
pk_delay 0us 0us 0us
av_delay 0us 0us 0us
sp_delay 0us 0us 0us
pkts 0 0 0
bytes 0 0 0
way_inds 0 0 0
way_miss 0 0 0
way_cols 0 0 0
drops 0 0 0
marks 0 0 0
ack_drop 0 0 0
sp_flows 0 0 0
bk_flows 0 0 0
un_flows 0 0 0
max_len 0 0 0
quantum 300 1514 762
Tested-by: Pete Heist <peteheist@gmail.com>
Tested-by: Georgios Amanakis <gamanakis@gmail.com>
Signed-off-by: Dave Taht <dave.taht@gmail.com>
Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
---
Changelog:
v4:
- Only split GSO packets if shaping at speeds <= 1Gbps
- Fix overhead calculation code to also work for GSO packets
- Don't re-implement kvzalloc()
- Remove local header include from out-of-tree build (fixes kbuild-bot
complaint).
- Several fixes to the ACK filter:
- Check pskb_may_pull() before deref of transport headers.
- Don't run ACK filter logic on split GSO packets
- Fix TCP sequence number compare to deal with wraparounds
v3:
- Use IS_REACHABLE() macro to fix compilation when sch_cake is
built-in and conntrack is a module.
- Switch the stats output to use nested netlink attributes instead
of a versioned struct.
- Remove GPL boilerplate.
- Fix array initialisation style.
v2:
- Fix kbuild test bot complaint
- Clean up the netlink ABI
- Fix checkpatch complaints
- A few tweaks to the behaviour of cake based on testing carried out
while writing the paper.
include/uapi/linux/pkt_sched.h | 105 ++
net/sched/Kconfig | 11 +
net/sched/Makefile | 1 +
net/sched/sch_cake.c | 2641 ++++++++++++++++++++++++++++++++
4 files changed, 2758 insertions(+)
create mode 100644 net/sched/sch_cake.c
diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 37b5096ae97b..bc581473c0b0 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -934,4 +934,109 @@ enum {
#define TCA_CBS_MAX (__TCA_CBS_MAX - 1)
+/* CAKE */
+enum {
+ TCA_CAKE_UNSPEC,
+ TCA_CAKE_BASE_RATE,
+ TCA_CAKE_DIFFSERV_MODE,
+ TCA_CAKE_ATM,
+ TCA_CAKE_FLOW_MODE,
+ TCA_CAKE_OVERHEAD,
+ TCA_CAKE_RTT,
+ TCA_CAKE_TARGET,
+ TCA_CAKE_AUTORATE,
+ TCA_CAKE_MEMORY,
+ TCA_CAKE_NAT,
+ TCA_CAKE_RAW,
+ TCA_CAKE_WASH,
+ TCA_CAKE_MPU,
+ TCA_CAKE_INGRESS,
+ TCA_CAKE_ACK_FILTER,
+ TCA_CAKE_SPLIT_GSO,
+ __TCA_CAKE_MAX
+};
+#define TCA_CAKE_MAX (__TCA_CAKE_MAX - 1)
+
+enum {
+ __TCA_CAKE_STATS_INVALID,
+ TCA_CAKE_STATS_CAPACITY_ESTIMATE,
+ TCA_CAKE_STATS_MEMORY_LIMIT,
+ TCA_CAKE_STATS_MEMORY_USED,
+ TCA_CAKE_STATS_AVG_NETOFF,
+ TCA_CAKE_STATS_MIN_NETLEN,
+ TCA_CAKE_STATS_MAX_NETLEN,
+ TCA_CAKE_STATS_MIN_ADJLEN,
+ TCA_CAKE_STATS_MAX_ADJLEN,
+ TCA_CAKE_STATS_TIN_STATS,
+ __TCA_CAKE_STATS_MAX
+};
+#define TCA_CAKE_STATS_MAX (__TCA_CAKE_STATS_MAX - 1)
+
+enum {
+ __TCA_CAKE_TIN_STATS_INVALID,
+ TCA_CAKE_TIN_STATS_PAD,
+ TCA_CAKE_TIN_STATS_SENT_PACKETS,
+ TCA_CAKE_TIN_STATS_SENT_BYTES64,
+ TCA_CAKE_TIN_STATS_DROPPED_PACKETS,
+ TCA_CAKE_TIN_STATS_DROPPED_BYTES64,
+ TCA_CAKE_TIN_STATS_ACKS_DROPPED_PACKETS,
+ TCA_CAKE_TIN_STATS_ACKS_DROPPED_BYTES64,
+ TCA_CAKE_TIN_STATS_ECN_MARKED_PACKETS,
+ TCA_CAKE_TIN_STATS_ECN_MARKED_BYTES64,
+ TCA_CAKE_TIN_STATS_BACKLOG_PACKETS,
+ TCA_CAKE_TIN_STATS_BACKLOG_BYTES64,
+ TCA_CAKE_TIN_STATS_THRESHOLD_RATE,
+ TCA_CAKE_TIN_STATS_TARGET_US,
+ TCA_CAKE_TIN_STATS_INTERVAL_US,
+ TCA_CAKE_TIN_STATS_WAY_INDIRECT_HITS,
+ TCA_CAKE_TIN_STATS_WAY_MISSES,
+ TCA_CAKE_TIN_STATS_WAY_COLLISIONS,
+ TCA_CAKE_TIN_STATS_PEAK_DELAY_US,
+ TCA_CAKE_TIN_STATS_AVG_DELAY_US,
+ TCA_CAKE_TIN_STATS_BASE_DELAY_US,
+ TCA_CAKE_TIN_STATS_SPARSE_FLOWS,
+ TCA_CAKE_TIN_STATS_BULK_FLOWS,
+ TCA_CAKE_TIN_STATS_UNRESPONSIVE_FLOWS,
+ TCA_CAKE_TIN_STATS_MAX_SKBLEN,
+ TCA_CAKE_TIN_STATS_FLOW_QUANTUM,
+ __TCA_CAKE_TIN_STATS_MAX
+};
+#define TCA_CAKE_TIN_STATS_MAX (__TCA_CAKE_TIN_STATS_MAX - 1)
+#define TC_CAKE_MAX_TINS (8)
+
+enum {
+ CAKE_FLOW_NONE = 0,
+ CAKE_FLOW_SRC_IP,
+ CAKE_FLOW_DST_IP,
+ CAKE_FLOW_HOSTS, /* = CAKE_FLOW_SRC_IP | CAKE_FLOW_DST_IP */
+ CAKE_FLOW_FLOWS,
+ CAKE_FLOW_DUAL_SRC, /* = CAKE_FLOW_SRC_IP | CAKE_FLOW_FLOWS */
+ CAKE_FLOW_DUAL_DST, /* = CAKE_FLOW_DST_IP | CAKE_FLOW_FLOWS */
+ CAKE_FLOW_TRIPLE, /* = CAKE_FLOW_HOSTS | CAKE_FLOW_FLOWS */
+ CAKE_FLOW_MAX,
+};
+
+enum {
+ CAKE_DIFFSERV_DIFFSERV3 = 0,
+ CAKE_DIFFSERV_DIFFSERV4,
+ CAKE_DIFFSERV_DIFFSERV8,
+ CAKE_DIFFSERV_BESTEFFORT,
+ CAKE_DIFFSERV_PRECEDENCE,
+ CAKE_DIFFSERV_MAX
+};
+
+enum {
+ CAKE_ACK_NONE = 0,
+ CAKE_ACK_FILTER,
+ CAKE_ACK_AGGRESSIVE,
+ CAKE_ACK_MAX
+};
+
+enum {
+ CAKE_ATM_NONE = 0,
+ CAKE_ATM_ATM,
+ CAKE_ATM_PTM,
+ CAKE_ATM_MAX
+};
+
#endif
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index a01169fb5325..6e7d614b5757 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -284,6 +284,17 @@ config NET_SCH_FQ_CODEL
If unsure, say N.
+config NET_SCH_CAKE
+ tristate "Common Applications Kept Enhanced (CAKE)"
+ help
+ Say Y here if you want to use the Common Applications Kept Enhanced
+ (CAKE) queue management algorithm.
+
+ To compile this driver as a module, choose M here: the module
+ will be called sch_cake.
+
+ If unsure, say N.
+
config NET_SCH_FQ
tristate "Fair Queue"
help
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 8811d3804878..435054cee32c 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -50,6 +50,7 @@ obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o
obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o
obj-$(CONFIG_NET_SCH_CODEL) += sch_codel.o
obj-$(CONFIG_NET_SCH_FQ_CODEL) += sch_fq_codel.o
+obj-$(CONFIG_NET_SCH_CAKE) += sch_cake.o
obj-$(CONFIG_NET_SCH_FQ) += sch_fq.o
obj-$(CONFIG_NET_SCH_HHF) += sch_hhf.o
obj-$(CONFIG_NET_SCH_PIE) += sch_pie.o
diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c
new file mode 100644
index 000000000000..52af01a3f5bc
--- /dev/null
+++ b/net/sched/sch_cake.c
@@ -0,0 +1,2641 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+
+/* COMMON Applications Kept Enhanced (CAKE) discipline
+ *
+ * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com>
+ * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk>
+ * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com>
+ * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de>
+ * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk>
+ * Copyright (C) 2017 Ryan Mounce <ryan@mounce.com.au>
+ *
+ * The CAKE Principles:
+ * (or, how to have your cake and eat it too)
+ *
+ * This is a combination of several shaping, AQM and FQ techniques into one
+ * easy-to-use package:
+ *
+ * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE
+ * equipment and bloated MACs. This operates in deficit mode (as in sch_fq),
+ * eliminating the need for any sort of burst parameter (eg. token bucket
+ * depth). Burst support is limited to that necessary to overcome scheduling
+ * latency.
+ *
+ * - A Diffserv-aware priority queue, giving more priority to certain classes,
+ * up to a specified fraction of bandwidth. Above that bandwidth threshold,
+ * the priority is reduced to avoid starving other tins.
+ *
+ * - Each priority tin has a separate Flow Queue system, to isolate traffic
+ * flows from each other. This prevents a burst on one flow from increasing
+ * the delay to another. Flows are distributed to queues using a
+ * set-associative hash function.
+ *
+ * - Each queue is actively managed by Cobalt, which is a combination of the
+ * Codel and Blue AQM algorithms. This serves flows fairly, and signals
+ * congestion early via ECN (if available) and/or packet drops, to keep
+ * latency low. The codel parameters are auto-tuned based on the bandwidth
+ * setting, as is necessary at low bandwidths.
+ *
+ * The configuration parameters are kept deliberately simple for ease of use.
+ * Everything has sane defaults. Complete generality of configuration is *not*
+ * a goal.
+ *
+ * The priority queue operates according to a weighted DRR scheme, combined with
+ * a bandwidth tracker which reuses the shaper logic to detect which side of the
+ * bandwidth sharing threshold the tin is operating. This determines whether a
+ * priority-based weight (high) or a bandwidth-based weight (low) is used for
+ * that tin in the current pass.
+ *
+ * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly
+ * granted us permission to leverage.
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/string.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/jhash.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/reciprocal_div.h>
+#include <net/netlink.h>
+#include <linux/version.h>
+#include <linux/if_vlan.h>
+#include <net/pkt_sched.h>
+#include <net/tcp.h>
+#include <net/flow_dissector.h>
+
+#if IS_REACHABLE(CONFIG_NF_CONNTRACK)
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/nf_conntrack.h>
+#endif
+
+#define CAKE_SET_WAYS (8)
+#define CAKE_MAX_TINS (8)
+#define CAKE_QUEUES (1024)
+#define CAKE_SPLIT_GSO_THRESHOLD (125000000) /* 1Gbps */
+/* Convert microseconds to the nanosecond units of cobalt_time_t. The argument
+ * is parenthesised so that compound expressions expand correctly.
+ */
+#define US2TIME(a) ((a) * (u64)NSEC_PER_USEC)
+
+typedef u64 cobalt_time_t;
+typedef s64 cobalt_tdiff_t;
+
+/**
+ * struct cobalt_params - contains codel and blue parameters
+ * @interval: codel initial drop rate
+ * @target: maximum persistent sojourn time & blue update rate
+ * @mtu_time: serialisation delay of maximum-size packet
+ * @p_inc: increment of blue drop probability (0.32 fxp)
+ * @p_dec: decrement of blue drop probability (0.32 fxp)
+ */
+struct cobalt_params {
+ cobalt_time_t interval;
+ cobalt_time_t target;
+ cobalt_time_t mtu_time;
+ u32 p_inc;
+ u32 p_dec;
+};
+
+/**
+ * struct cobalt_vars - contains codel and blue variables
+ * @count: codel dropping frequency
+ * @rec_inv_sqrt: reciprocal value of sqrt(count) >> 1
+ * @drop_next: time to drop next packet, or when we dropped last
+ * @blue_timer: Blue time to next drop
+ * @p_drop: BLUE drop probability (0.32 fxp)
+ * @dropping: set if in dropping state
+ * @ecn_marked: set if marked
+ */
+struct cobalt_vars {
+ u32 count;
+ u32 rec_inv_sqrt;
+ cobalt_time_t drop_next;
+ cobalt_time_t blue_timer;
+ u32 p_drop;
+ bool dropping;
+ bool ecn_marked;
+};
+
+enum {
+ CAKE_SET_NONE = 0,
+ CAKE_SET_SPARSE,
+ CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */
+ CAKE_SET_BULK,
+ CAKE_SET_DECAYING
+};
+
+struct cake_flow {
+ /* this stuff is all needed per-flow at dequeue time */
+ struct sk_buff *head;
+ struct sk_buff *tail;
+ struct sk_buff *ackcheck;
+ struct list_head flowchain;
+ s32 deficit;
+ struct cobalt_vars cvars;
+ u16 srchost; /* index into cake_host table */
+ u16 dsthost;
+ u8 set;
+}; /* please try to keep this structure <= 64 bytes */
+
+/* Per-host bookkeeping entry, indexed via cake_flow.srchost / .dsthost.
+ * cake_hash() stores the full source/destination host hash in the *_tag
+ * fields for its set-associative lookup, and bumps the matching *_refcnt
+ * each time it attaches a flow queue to the entry.
+ */
+struct cake_host {
+ u32 srchost_tag;
+ u32 dsthost_tag;
+ u16 srchost_refcnt;
+ u16 dsthost_refcnt;
+};
+
+/* Element type of cake_sched_data.overflow_heap.
+ * NOTE(review): presumably t is a tin index (3 bits, CAKE_MAX_TINS == 8) and
+ * b a queue index (10 bits, CAKE_QUEUES == 1024) - confirm against the heap
+ * users.
+ */
+struct cake_heap_entry {
+ u16 t:3, b:10;
+};
+
+struct cake_tin_data {
+ struct cake_flow flows[CAKE_QUEUES];
+ u32 backlogs[CAKE_QUEUES];
+ u32 tags[CAKE_QUEUES]; /* for set association */
+ u16 overflow_idx[CAKE_QUEUES];
+ struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */
+ u32 perturb;
+ u16 flow_quantum;
+
+ struct cobalt_params cparams;
+ u32 drop_overlimit;
+ u16 bulk_flow_count;
+ u16 sparse_flow_count;
+ u16 decaying_flow_count;
+ u16 unresponsive_flow_count;
+
+ u32 max_skblen;
+
+ struct list_head new_flows;
+ struct list_head old_flows;
+ struct list_head decaying_flows;
+
+ /* time_next = time_this + ((len * rate_ns) >> rate_shft) */
+ u64 tin_time_next_packet;
+ u32 tin_rate_ns;
+ u32 tin_rate_bps;
+ u16 tin_rate_shft;
+
+ u16 tin_quantum_prio;
+ u16 tin_quantum_band;
+ s32 tin_deficit;
+ u32 tin_backlog;
+ u32 tin_dropped;
+ u32 tin_ecn_mark;
+
+ u32 packets;
+ u64 bytes;
+
+ u32 ack_drops;
+
+ /* moving averages */
+ cobalt_time_t avge_delay;
+ cobalt_time_t peak_delay;
+ cobalt_time_t base_delay;
+
+ /* hash function stats */
+ u32 way_directs;
+ u32 way_hits;
+ u32 way_misses;
+ u32 way_collisions;
+}; /* number of tins is small, so size of this struct doesn't matter much */
+
+struct cake_sched_data {
+ struct cake_tin_data *tins;
+
+ struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS];
+ u16 overflow_timeout;
+
+ u16 tin_cnt;
+ u8 tin_mode;
+#define CAKE_FLOW_NAT_FLAG 64
+ u8 flow_mode;
+ u8 ack_filter;
+ u8 atm_mode;
+
+ /* time_next = time_this + ((len * rate_ns) >> rate_shft) */
+ u16 rate_shft;
+ u64 time_next_packet;
+ u64 failsafe_next_packet;
+ u32 rate_ns;
+ u32 rate_bps;
+ u16 rate_flags;
+ s16 rate_overhead;
+ u16 rate_mpu;
+ u32 interval;
+ u32 target;
+
+ /* resource tracking */
+ u32 buffer_used;
+ u32 buffer_max_used;
+ u32 buffer_limit;
+ u32 buffer_config_limit;
+
+ /* indices for dequeue */
+ u16 cur_tin;
+ u16 cur_flow;
+
+ struct qdisc_watchdog watchdog;
+ const u8 *tin_index;
+ const u8 *tin_order;
+
+ /* bandwidth capacity estimate */
+ u64 last_packet_time;
+ u64 avg_packet_interval;
+ u64 avg_window_begin;
+ u32 avg_window_bytes;
+ u32 avg_peak_bandwidth;
+ u64 last_reconfig_time;
+
+ /* packet length stats */
+ u32 avg_netoff;
+ u16 max_netlen;
+ u16 max_adjlen;
+ u16 min_netlen;
+ u16 min_adjlen;
+};
+
+enum {
+ CAKE_FLAG_OVERHEAD = BIT(0),
+ CAKE_FLAG_AUTORATE_INGRESS = BIT(1),
+ CAKE_FLAG_INGRESS = BIT(2),
+ CAKE_FLAG_WASH = BIT(3),
+ CAKE_FLAG_SPLIT_GSO = BIT(4)
+};
+
+/* COBALT operates the Codel and BLUE algorithms in parallel, in order to
+ * obtain the best features of each. Codel is excellent on flows which
+ * respond to congestion signals in a TCP-like way. BLUE is more effective on
+ * unresponsive flows.
+ */
+
+/* Per-packet private data, stored in the qdisc control block of the skb and
+ * reached through get_cobalt_cb(). enqueue_time is read and written via
+ * cobalt_get_enqueue_time() / cobalt_set_enqueue_time().
+ */
+struct cobalt_skb_cb {
+ cobalt_time_t enqueue_time;
+ u32 adjusted_len;
+};
+
+/* Current time, in nanoseconds, on the clock used for Cobalt timestamps. */
+static inline cobalt_time_t cobalt_get_time(void)
+{
+	cobalt_time_t now = ktime_get_ns();
+
+	return now;
+}
+
+/* Convert a nanosecond time value to microseconds, truncated to 32 bits. */
+static inline u32 cobalt_time_to_us(cobalt_time_t val)
+{
+	u64 usecs = val;
+
+	/* do_div() divides its first argument in place */
+	do_div(usecs, NSEC_PER_USEC);
+
+	return (u32)usecs;
+}
+
+/* Return the Cobalt private area inside the qdisc control block of @skb. */
+static inline struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb)
+{
+	struct qdisc_skb_cb *qcb;
+
+	qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb));
+	qcb = qdisc_skb_cb(skb);
+
+	return (struct cobalt_skb_cb *)qcb->data;
+}
+
+/* Read back the timestamp stored by cobalt_set_enqueue_time(). */
+static inline cobalt_time_t cobalt_get_enqueue_time(const struct sk_buff *skb)
+{
+	const struct cobalt_skb_cb *cb = get_cobalt_cb(skb);
+
+	return cb->enqueue_time;
+}
+
+/* Record @now as the enqueue timestamp in the skb's Cobalt private area. */
+static inline void cobalt_set_enqueue_time(struct sk_buff *skb,
+					   cobalt_time_t now)
+{
+	struct cobalt_skb_cb *cb = get_cobalt_cb(skb);
+
+	cb->enqueue_time = now;
+}
+
+static u16 quantum_div[CAKE_QUEUES + 1] = {0};
+
+/* Diffserv lookup tables */
+
+static const u8 precedence[] = {
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ 5, 5, 5, 5, 5, 5, 5, 5,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+};
+
+static const u8 diffserv8[] = {
+ 2, 5, 1, 2, 4, 2, 2, 2,
+ 0, 2, 1, 2, 1, 2, 1, 2,
+ 5, 2, 4, 2, 4, 2, 4, 2,
+ 3, 2, 3, 2, 3, 2, 3, 2,
+ 6, 2, 3, 2, 3, 2, 3, 2,
+ 6, 2, 2, 2, 6, 2, 6, 2,
+ 7, 2, 2, 2, 2, 2, 2, 2,
+ 7, 2, 2, 2, 2, 2, 2, 2,
+};
+
+static const u8 diffserv4[] = {
+ 0, 2, 0, 0, 2, 0, 0, 0,
+ 1, 0, 0, 0, 0, 0, 0, 0,
+ 2, 0, 2, 0, 2, 0, 2, 0,
+ 2, 0, 2, 0, 2, 0, 2, 0,
+ 3, 0, 2, 0, 2, 0, 2, 0,
+ 3, 0, 0, 0, 3, 0, 3, 0,
+ 3, 0, 0, 0, 0, 0, 0, 0,
+ 3, 0, 0, 0, 0, 0, 0, 0,
+};
+
+static const u8 diffserv3[] = {
+ 0, 0, 0, 0, 2, 0, 0, 0,
+ 1, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 2, 0, 2, 0,
+ 2, 0, 0, 0, 0, 0, 0, 0,
+ 2, 0, 0, 0, 0, 0, 0, 0,
+};
+
+static const u8 besteffort[] = {
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+};
+
+/* tin priority order for stats dumping */
+
+static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7};
+static const u8 bulk_order[] = {1, 0, 2, 3};
+
+#define REC_INV_SQRT_CACHE (16)
+static u32 cobalt_rec_inv_sqrt_cache[REC_INV_SQRT_CACHE] = {0};
+
+/* One Newton-Raphson refinement of vars->rec_inv_sqrt for vars->count:
+ *
+ *   new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
+ *
+ * See http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
+ *
+ * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
+ */
+
+static void cobalt_newton_step(struct cobalt_vars *vars)
+{
+	const u32 x = vars->rec_inv_sqrt;
+	const u32 x_squared = ((u64)x * x) >> 32;
+	u64 term;
+
+	/* 3 - count * x^2, carrying 32 fractional bits */
+	term = (3LL << 32) - ((u64)vars->count * x_squared);
+
+	/* shed two bits first to avoid overflow in the following multiply */
+	term >>= 2;
+	term *= x;
+	term >>= (32 - 2 + 1);
+
+	vars->rec_inv_sqrt = term;
+}
+
+/* Refresh vars->rec_inv_sqrt after vars->count has changed: small counts are
+ * read from the precomputed cache, larger ones take a single Newton step.
+ */
+static void cobalt_invsqrt(struct cobalt_vars *vars)
+{
+	if (vars->count >= REC_INV_SQRT_CACHE) {
+		cobalt_newton_step(vars);
+		return;
+	}
+
+	vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count];
+}
+
+/* Fill cobalt_rec_inv_sqrt_cache[] for counts below REC_INV_SQRT_CACHE.
+ *
+ * There is a big difference in timing between the accurate values placed in
+ * the cache and the approximations given by a single Newton step for small
+ * count values, particularly when stepping from count 1 to 2 or vice versa.
+ * Above 16, a single Newton step gives sufficient accuracy in either
+ * direction, given the precision stored.
+ *
+ * The magnitude of the error when stepping up to count 2 is such as to give
+ * the value that *should* have been produced at count 4.
+ */
+
+static void cobalt_cache_init(void)
+{
+	struct cobalt_vars v;
+	int step;
+
+	memset(&v, 0, sizeof(v));
+	v.rec_inv_sqrt = ~0U;
+	cobalt_rec_inv_sqrt_cache[0] = v.rec_inv_sqrt;
+
+	for (v.count = 1; v.count < REC_INV_SQRT_CACHE; v.count++) {
+		/* four Newton steps per entry, seeded by the previous entry */
+		for (step = 0; step < 4; step++)
+			cobalt_newton_step(&v);
+
+		cobalt_rec_inv_sqrt_cache[v.count] = v.rec_inv_sqrt;
+	}
+}
+
+/* Zero @vars and make sure the shared inverse-sqrt cache has been filled. */
+static void cobalt_vars_init(struct cobalt_vars *vars)
+{
+	memset(vars, 0, sizeof(*vars));
+
+	/* entry 0 is non-zero once the cache has been populated */
+	if (cobalt_rec_inv_sqrt_cache[0])
+		return;
+
+	cobalt_cache_init();
+	cobalt_rec_inv_sqrt_cache[0] = ~0;
+}
+
+/* CoDel control law: next signalling time is t + interval/sqrt(count).
+ * rec_inv_sqrt holds the reciprocal of sqrt(count), so that neither sqrt()
+ * nor a divide is needed here.
+ */
+static cobalt_time_t cobalt_control(cobalt_time_t t,
+				    cobalt_time_t interval,
+				    u32 rec_inv_sqrt)
+{
+	cobalt_time_t step = reciprocal_scale(interval, rec_inv_sqrt);
+
+	return t + step;
+}
+
+/* To be called when a packet had to be dropped because the queue overflowed.
+ * Raises the BLUE drop probability (at most once per 'target' period) and
+ * forces the Codel side into the dropping state. Returns true if the BLUE
+ * state was quiescent before but active after this call.
+ */
+static bool cobalt_queue_full(struct cobalt_vars *vars,
+			      struct cobalt_params *p,
+			      cobalt_time_t now)
+{
+	cobalt_time_t since_update = now - vars->blue_timer;
+	bool became_active = false;
+
+	if (since_update > p->target) {
+		became_active = (vars->p_drop == 0);
+		vars->p_drop += p->p_inc;
+		/* saturate if the u32 addition wrapped */
+		if (vars->p_drop < p->p_inc)
+			vars->p_drop = ~0;
+		vars->blue_timer = now;
+	}
+
+	vars->dropping = true;
+	vars->drop_next = now;
+	if (vars->count == 0)
+		vars->count = 1;
+
+	return became_active;
+}
+
+/* Call this when the queue was serviced but turned out to be empty. Returns
+ * true if the BLUE state was active before but quiescent after this call.
+ *
+ * Besides decaying the BLUE drop probability (at most once per 'target'
+ * period), this leaves the Codel dropping state and, if drop_next has been
+ * reached, decrements count and reschedules drop_next accordingly.
+ */
+static bool cobalt_queue_empty(struct cobalt_vars *vars,
+			       struct cobalt_params *p,
+			       cobalt_time_t now)
+{
+	bool down = false;
+
+	if (vars->p_drop && (now - vars->blue_timer) > p->target) {
+		/* clamp at zero rather than wrapping the unsigned value */
+		if (vars->p_drop < p->p_dec)
+			vars->p_drop = 0;
+		else
+			vars->p_drop -= p->p_dec;
+		vars->blue_timer = now;
+		down = !vars->p_drop;
+	}
+	vars->dropping = false;
+
+	/* cobalt_time_t is unsigned, so compare through the signed
+	 * cobalt_tdiff_t (as cobalt_should_drop() does for 'schedule');
+	 * a plain unsigned ">= 0" test is always true.
+	 */
+	if (vars->count && (cobalt_tdiff_t)(now - vars->drop_next) >= 0) {
+		vars->count--;
+		cobalt_invsqrt(vars);
+		vars->drop_next = cobalt_control(vars->drop_next,
+						 p->interval,
+						 vars->rec_inv_sqrt);
+	}
+
+	return down;
+}
+
+/* Call this with a freshly dequeued packet for possible congestion marking.
+ * Returns true as an instruction to drop the packet, false for delivery.
+ *
+ * @vars: Codel/BLUE state to update
+ * @p: Codel/BLUE parameters (target, interval, mtu_time are read here)
+ * @now: current time, compared against the skb's enqueue timestamp
+ * @skb: the dequeued packet; may get its ECN CE mark set
+ * @bulk_flows: scales the mtu_time based sojourn threshold below
+ *
+ * vars->ecn_marked is set when the packet was CE-marked instead of dropped.
+ */
+static bool cobalt_should_drop(struct cobalt_vars *vars,
+ struct cobalt_params *p,
+ cobalt_time_t now,
+ struct sk_buff *skb,
+ u32 bulk_flows)
+{
+ bool drop = false;
+
+ /* Simplified Codel implementation */
+ cobalt_tdiff_t sojourn = now - cobalt_get_enqueue_time(skb);
+
+/* The 'schedule' variable records, in its sign, whether 'now' is before or
+ * after 'drop_next'. This allows 'drop_next' to be updated before the next
+ * scheduling decision is actually branched, without destroying that
+ * information. Similarly, the first 'schedule' value calculated is preserved
+ * in the boolean 'next_due'.
+ *
+ * As for 'drop_next', we take advantage of the fact that 'interval' is both
+ * the delay between first exceeding 'target' and the first signalling event,
+ * *and* the scaling factor for the signalling frequency. It's therefore very
+ * natural to use a single mechanism for both purposes, and eliminates a
+ * significant amount of reference Codel's spaghetti code. To help with this,
+ * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close
+ * as possible to 1.0 in fixed-point.
+ */
+
+ cobalt_tdiff_t schedule = now - vars->drop_next;
+
+ /* the sojourn time must exceed target, twice mtu_time per bulk flow,
+ * and four times mtu_time before the queue counts as over target
+ */
+ bool over_target = sojourn > p->target &&
+ sojourn > p->mtu_time * bulk_flows * 2 &&
+ sojourn > p->mtu_time * 4;
+ bool next_due = vars->count && schedule >= 0;
+
+ vars->ecn_marked = false;
+
+ if (over_target) {
+ if (!vars->dropping) {
+ vars->dropping = true;
+ vars->drop_next = cobalt_control(now,
+ p->interval,
+ vars->rec_inv_sqrt);
+ }
+ if (!vars->count)
+ vars->count = 1;
+ } else if (vars->dropping) {
+ vars->dropping = false;
+ }
+
+ if (next_due && vars->dropping) {
+ /* Use ECN mark if possible, otherwise drop */
+ drop = !(vars->ecn_marked = INET_ECN_set_ce(skb));
+
+ vars->count++;
+ /* saturate instead of letting count wrap around to zero */
+ if (!vars->count)
+ vars->count--;
+ cobalt_invsqrt(vars);
+ vars->drop_next = cobalt_control(vars->drop_next,
+ p->interval,
+ vars->rec_inv_sqrt);
+ schedule = now - vars->drop_next;
+ } else {
+ /* not signalling: decay count once for every scheduled event
+ * that is already due, until count is zero or drop_next lies
+ * in the future
+ */
+ while (next_due) {
+ vars->count--;
+ cobalt_invsqrt(vars);
+ vars->drop_next = cobalt_control(vars->drop_next,
+ p->interval,
+ vars->rec_inv_sqrt);
+ schedule = now - vars->drop_next;
+ next_due = vars->count && schedule >= 0;
+ }
+ }
+
+ /* Simple BLUE implementation. Lack of ECN is deliberate. */
+ /* p_drop is a 0.32 fixed-point probability compared to a 32-bit random */
+ if (vars->p_drop)
+ drop |= (prandom_u32() < vars->p_drop);
+
+ /* Overload the drop_next field as an activity timeout */
+ if (!vars->count)
+ vars->drop_next = now + p->interval;
+ else if (schedule > 0 && !drop)
+ vars->drop_next = now;
+
+ return drop;
+}
+
+#if IS_REACHABLE(CONFIG_NF_CONNTRACK)
+
+/* Replace the IPv4 addresses in @keys (and the ports, if the dissector found
+ * any) with the values from the packet's conntrack tuple. Called from
+ * cake_hash() when CAKE_FLOW_NAT_FLAG is set. Non-IPv4 packets and packets
+ * for which no conntrack entry can be found leave @keys untouched.
+ */
+static inline void cake_update_flowkeys(struct flow_keys *keys,
+ const struct sk_buff *skb)
+{
+ enum ip_conntrack_info ctinfo;
+ bool rev = false;
+
+ struct nf_conn *ct;
+ const struct nf_conntrack_tuple *tuple;
+
+ if (tc_skb_protocol(skb) != htons(ETH_P_IP))
+ return;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct) {
+ /* conntrack entry already attached to the skb */
+ tuple = nf_ct_tuple(ct, CTINFO2DIR(ctinfo));
+ } else {
+ /* no entry attached: build a tuple from the packet headers and
+ * look it up in the default zone
+ */
+ const struct nf_conntrack_tuple_hash *hash;
+ struct nf_conntrack_tuple srctuple;
+
+ if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
+ NFPROTO_IPV4, dev_net(skb->dev),
+ &srctuple))
+ return;
+
+ hash = nf_conntrack_find_get(dev_net(skb->dev),
+ &nf_ct_zone_dflt,
+ &srctuple);
+ if (!hash)
+ return;
+
+ /* use the tuple of the opposite direction, and swap its src
+ * and dst when copying into the keys below
+ */
+ rev = true;
+ ct = nf_ct_tuplehash_to_ctrack(hash);
+ tuple = nf_ct_tuple(ct, !hash->tuple.dst.dir);
+ }
+
+ keys->addrs.v4addrs.src = rev ? tuple->dst.u3.ip : tuple->src.u3.ip;
+ keys->addrs.v4addrs.dst = rev ? tuple->src.u3.ip : tuple->dst.u3.ip;
+
+ if (keys->ports.ports) {
+ keys->ports.src = rev ? tuple->dst.u.all : tuple->src.u.all;
+ keys->ports.dst = rev ? tuple->src.u.all : tuple->dst.u.all;
+ }
+ /* only the lookup path holds a conntrack reference of its own;
+ * presumably taken by nf_conntrack_find_get() - release it here
+ */
+ if (rev)
+ nf_ct_put(ct);
+}
+#else
+static inline void cake_update_flowkeys(struct flow_keys *keys,
+ const struct sk_buff *skb)
+{
+ /* There is nothing we can do here without CONNTRACK */
+}
+#endif
+
+/* Cake has several subtle multiple bit settings. In these cases you
+ * would be matching triple isolate mode as well.
+ */
+
+/* True if every bit of CAKE_FLOW_DUAL_SRC is set in @flow_mode. */
+static inline bool cake_dsrc(int flow_mode)
+{
+	const int mask = CAKE_FLOW_DUAL_SRC;
+
+	return (flow_mode & mask) == mask;
+}
+
+/* True if every bit of CAKE_FLOW_DUAL_DST is set in @flow_mode. */
+static inline bool cake_ddst(int flow_mode)
+{
+	const int mask = CAKE_FLOW_DUAL_DST;
+
+	return (flow_mode & mask) == mask;
+}
+
+/* Map @skb to a flow queue index within tin @q, according to @flow_mode.
+ *
+ * The flow (or host) hash is reduced modulo CAKE_QUEUES and then resolved
+ * through a CAKE_SET_WAYS-way set-associative search over q->tags[]. The
+ * chosen queue's tag is updated, the way_* statistics are counted, and in the
+ * dual/triple isolation modes the flow is attached to entries in q->hosts[].
+ *
+ * Returns the queue index; always 0 for CAKE_FLOW_NONE.
+ */
+static inline u32
+cake_hash(struct cake_tin_data *q, const struct sk_buff *skb, int flow_mode)
+{
+ struct flow_keys keys, host_keys;
+ u32 flow_hash = 0, srchost_hash, dsthost_hash;
+ u16 reduced_hash, srchost_idx, dsthost_idx;
+
+ if (unlikely(flow_mode == CAKE_FLOW_NONE))
+ return 0;
+
+ skb_flow_dissect_flow_keys(skb, &keys,
+ FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
+
+ if (flow_mode & CAKE_FLOW_NAT_FLAG)
+ cake_update_flowkeys(&keys, skb);
+
+ /* flow_hash_from_keys() sorts the addresses by value, so we have
+ * to preserve their order in a separate data structure to treat
+ * src and dst host addresses as independently selectable.
+ */
+ host_keys = keys;
+ host_keys.ports.ports = 0;
+ host_keys.basic.ip_proto = 0;
+ host_keys.keyid.keyid = 0;
+ host_keys.tags.flow_label = 0;
+
+ /* hash each host address on its own by zeroing the other one */
+ switch (host_keys.control.addr_type) {
+ case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
+ host_keys.addrs.v4addrs.src = 0;
+ dsthost_hash = flow_hash_from_keys(&host_keys);
+ host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
+ host_keys.addrs.v4addrs.dst = 0;
+ srchost_hash = flow_hash_from_keys(&host_keys);
+ break;
+
+ case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
+ memset(&host_keys.addrs.v6addrs.src, 0,
+ sizeof(host_keys.addrs.v6addrs.src));
+ dsthost_hash = flow_hash_from_keys(&host_keys);
+ host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
+ memset(&host_keys.addrs.v6addrs.dst, 0,
+ sizeof(host_keys.addrs.v6addrs.dst));
+ srchost_hash = flow_hash_from_keys(&host_keys);
+ break;
+
+ default:
+ dsthost_hash = 0;
+ srchost_hash = 0;
+ };
+
+ /* This *must* be after the above switch, since as a
+ * side-effect it sorts the src and dst addresses.
+ */
+ if (flow_mode & CAKE_FLOW_FLOWS)
+ flow_hash = flow_hash_from_keys(&keys);
+
+ /* host-only modes: build the hash from the selected host hashes */
+ if (!(flow_mode & CAKE_FLOW_FLOWS)) {
+ if (flow_mode & CAKE_FLOW_SRC_IP)
+ flow_hash ^= srchost_hash;
+
+ if (flow_mode & CAKE_FLOW_DST_IP)
+ flow_hash ^= dsthost_hash;
+ }
+
+ reduced_hash = flow_hash % CAKE_QUEUES;
+
+ /* set-associative hashing */
+ /* fast path if no hash collision (direct lookup succeeds) */
+ if (likely(q->tags[reduced_hash] == flow_hash &&
+ q->flows[reduced_hash].set)) {
+ q->way_directs++;
+ } else {
+ /* outer_hash is the first queue of the set, inner_hash the
+ * preferred way within it
+ */
+ u32 inner_hash = reduced_hash % CAKE_SET_WAYS;
+ u32 outer_hash = reduced_hash - inner_hash;
+ u32 i, k;
+ bool allocate_src = false;
+ bool allocate_dst = false;
+
+ /* check if any active queue in the set is reserved for
+ * this flow.
+ */
+ for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
+ i++, k = (k + 1) % CAKE_SET_WAYS) {
+ if (q->tags[outer_hash + k] == flow_hash) {
+ if (i)
+ q->way_hits++;
+
+ if (!q->flows[outer_hash + k].set) {
+ /* need to increment host refcnts */
+ allocate_src = cake_dsrc(flow_mode);
+ allocate_dst = cake_ddst(flow_mode);
+ }
+
+ goto found;
+ }
+ }
+
+ /* no queue is reserved for this flow, look for an
+ * empty one.
+ */
+ /* k has wrapped back to inner_hash after the full scan above */
+ for (i = 0; i < CAKE_SET_WAYS;
+ i++, k = (k + 1) % CAKE_SET_WAYS) {
+ if (!q->flows[outer_hash + k].set) {
+ q->way_misses++;
+ allocate_src = cake_dsrc(flow_mode);
+ allocate_dst = cake_ddst(flow_mode);
+ goto found;
+ }
+ }
+
+ /* With no empty queues, default to the original
+ * queue, accept the collision, update the host tags.
+ */
+ /* NOTE(review): the host refcnts are decremented here without
+ * checking whether the evicted flow ever took host references;
+ * confirm a u16 underflow cannot matter.
+ */
+ q->way_collisions++;
+ q->hosts[q->flows[reduced_hash].srchost].srchost_refcnt--;
+ q->hosts[q->flows[reduced_hash].dsthost].dsthost_refcnt--;
+ allocate_src = cake_dsrc(flow_mode);
+ allocate_dst = cake_ddst(flow_mode);
+found:
+ /* reserve queue for future packets in same flow */
+ reduced_hash = outer_hash + k;
+ q->tags[reduced_hash] = flow_hash;
+
+ /* same set-associative search over q->hosts[]: prefer an entry
+ * whose tag matches, else the first one with a zero refcnt,
+ * else fall back to the preferred way and overwrite its tag
+ */
+ if (allocate_src) {
+ srchost_idx = srchost_hash % CAKE_QUEUES;
+ inner_hash = srchost_idx % CAKE_SET_WAYS;
+ outer_hash = srchost_idx - inner_hash;
+ for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
+ i++, k = (k + 1) % CAKE_SET_WAYS) {
+ if (q->hosts[outer_hash + k].srchost_tag ==
+ srchost_hash)
+ goto found_src;
+ }
+ for (i = 0; i < CAKE_SET_WAYS;
+ i++, k = (k + 1) % CAKE_SET_WAYS) {
+ if (!q->hosts[outer_hash + k].srchost_refcnt)
+ break;
+ }
+ q->hosts[outer_hash + k].srchost_tag = srchost_hash;
+found_src:
+ srchost_idx = outer_hash + k;
+ q->hosts[srchost_idx].srchost_refcnt++;
+ q->flows[reduced_hash].srchost = srchost_idx;
+ }
+
+ /* destination host: mirror of the source host handling above */
+ if (allocate_dst) {
+ dsthost_idx = dsthost_hash % CAKE_QUEUES;
+ inner_hash = dsthost_idx % CAKE_SET_WAYS;
+ outer_hash = dsthost_idx - inner_hash;
+ for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
+ i++, k = (k + 1) % CAKE_SET_WAYS) {
+ if (q->hosts[outer_hash + k].dsthost_tag ==
+ dsthost_hash)
+ goto found_dst;
+ }
+ for (i = 0; i < CAKE_SET_WAYS;
+ i++, k = (k + 1) % CAKE_SET_WAYS) {
+ if (!q->hosts[outer_hash + k].dsthost_refcnt)
+ break;
+ }
+ q->hosts[outer_hash + k].dsthost_tag = dsthost_hash;
+found_dst:
+ dsthost_idx = outer_hash + k;
+ q->hosts[dsthost_idx].dsthost_refcnt++;
+ q->flows[reduced_hash].dsthost = dsthost_idx;
+ }
+ }
+
+ return reduced_hash;
+}
+
+/* helper functions : might be changed when/if skbs use a standard list_head */
+
+/* Unlink the skb at the head of the flow queue and return it (NULL if the
+ * queue is empty).
+ */
+static inline struct sk_buff *dequeue_head(struct cake_flow *flow)
+{
+	struct sk_buff *first = flow->head;
+
+	if (!first)
+		return NULL;
+
+	flow->head = first->next;
+	first->next = NULL;
+
+	/* forget the ackcheck position if it pointed at the departing skb */
+	if (flow->ackcheck == first)
+		flow->ackcheck = NULL;
+
+	return first;
+}
+
+/* Append @skb at the tail of the flow queue; it becomes the head as well if
+ * the queue was empty.
+ */
+
+static inline void
+flow_queue_add(struct cake_flow *flow, struct sk_buff *skb)
+{
+	if (flow->head)
+		flow->tail->next = skb;
+	else
+		flow->head = skb;
+
+	flow->tail = skb;
+	skb->next = NULL;
+}
+
+static struct sk_buff *cake_ack_filter(struct cake_sched_data *q,
+ struct cake_flow *flow)
+{
+ int seglen;
+ struct sk_buff *skb = flow->tail, *skb_check, *skb_check_prev;
+ struct iphdr *iph, *iph_check;
+ struct ipv6hdr *ipv6h, *ipv6h_check;
+ struct tcphdr *tcph, *tcph_check;
+ bool otherconn_ack_seen = false;
+ struct sk_buff *otherconn_checked_to = NULL;
+ bool thisconn_redundant_seen = false, thisconn_seen_last = false;
+ struct sk_buff *thisconn_checked_to = NULL, *thisconn_ack = NULL;
+ bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE;
+
+ /* no other possible ACKs to filter */
+ if (flow->head == skb)
+ return NULL;
+
+ iph = skb->encapsulation ? inner_ip_hdr(skb) : ip_hdr(skb);
+ ipv6h = skb->encapsulation ? inner_ipv6_hdr(skb) : ipv6_hdr(skb);
+
+ /* check that the innermost network header is v4/v6, and contains TCP */
+ if (pskb_may_pull(skb, ((unsigned char *)iph - skb->head) + sizeof(struct iphdr)) &&
+ iph->version == 4) {
+ if (iph->protocol != IPPROTO_TCP)
+ return NULL;
+ seglen = ntohs(iph->tot_len) - (4 * iph->ihl);
+ tcph = (struct tcphdr *)((void *)iph + (4 * iph->ihl));
+ if (!pskb_may_pull(skb, ((unsigned char *)tcph - skb->head) + sizeof(struct tcphdr)))
+ return NULL;
+ } else if (pskb_may_pull(skb, ((unsigned char *)ipv6h - skb->head) + sizeof(struct ipv6hdr) + sizeof(struct tcphdr)) &&
+ ipv6h->version == 6) {
+ if (ipv6h->nexthdr != IPPROTO_TCP)
+ return NULL;
+ seglen = ntohs(ipv6h->payload_len);
+ tcph = (struct tcphdr *)((void *)ipv6h +
+ sizeof(struct ipv6hdr));
+ } else {
+ return NULL;
+ }
+
+ /* the 'triggering' packet need only have the ACK flag set.
+ * also check that SYN is not set, as there won't be any previous ACKs.
+ */
+ if ((tcp_flag_word(tcph) &
+ (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK)
+ return NULL;
+
+ /* the 'triggering' ACK is at the end of the queue,
+ * we have already returned if it is the only packet in the flow.
+ * stop before last packet in queue, don't compare trigger ACK to itself
+ * start where we finished last time if recorded in ->ackcheck
+ * otherwise start from the head of the flow queue.
+ */
+ skb_check_prev = flow->ackcheck;
+ skb_check = flow->ackcheck ?: flow->head;
+
+ while (skb_check->next) {
+ bool pure_ack, thisconn;
+
+ /* don't increment if at head of flow queue (_prev == NULL) */
+ if (skb_check_prev) {
+ skb_check_prev = skb_check;
+ skb_check = skb_check->next;
+ if (!skb_check->next)
+ break;
+ } else {
+ skb_check_prev = ERR_PTR(-1);
+ }
+
+ iph_check = skb_check->encapsulation ?
+ inner_ip_hdr(skb_check) : ip_hdr(skb_check);
+ ipv6h_check = skb_check->encapsulation ?
+ inner_ipv6_hdr(skb_check) : ipv6_hdr(skb_check);
+
+ if (pskb_may_pull(skb_check, ((unsigned char *)iph_check - skb_check->head) + sizeof(struct iphdr)) &&
+ iph_check->version == 4) {
+ if (iph_check->protocol != IPPROTO_TCP)
+ continue;
+ seglen = (ntohs(iph_check->tot_len) -
+ (4 * iph_check->ihl));
+ tcph_check = (struct tcphdr *)((void *)iph_check
+ + (4 * iph_check->ihl));
+ if (iph->version == 4 &&
+ iph_check->saddr == iph->saddr &&
+ iph_check->daddr == iph->daddr) {
+ thisconn = true;
+ } else {
+ thisconn = false;
+ }
+ } else if (pskb_may_pull(skb_check, ((unsigned char *)ipv6h_check - skb_check->head) + sizeof(struct ipv6hdr)) &&
+ ipv6h_check->version == 6) {
+ if (ipv6h_check->nexthdr != IPPROTO_TCP)
+ continue;
+ seglen = ntohs(ipv6h_check->payload_len);
+ tcph_check = (struct tcphdr *)((void *)ipv6h_check +
+ sizeof(struct ipv6hdr));
+ if (ipv6h->version == 6 &&
+ ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) &&
+ ipv6_addr_cmp(&ipv6h_check->daddr,
+ &ipv6h->daddr)) {
+ thisconn = true;
+ } else {
+ thisconn = false;
+ }
+ } else {
+ continue;
+ }
+
+ if (!pskb_may_pull(skb_check, ((unsigned char *)tcph_check - skb_check->head) + sizeof(struct tcphdr)))
+ continue;
+
+ /* stricter criteria apply to ACKs that we may filter
+ * 3 reserved flags must be unset to avoid future breakage
+ * ECE/CWR/NS can be safely ignored
+ * ACK must be set
+ * All other flags URG/PSH/RST/SYN/FIN must be unset
+ * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero)
+ * 0x01C00000 = NS/CWR/ECE (safe to ignore)
+ * 0x0E3F0000 = 0x0FFF0000 & ~0x01C00000
+ * must be 'pure' ACK, contain zero bytes of segment data
+ * options are ignored
+ */
+ if ((tcp_flag_word(tcph_check) &
+ (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK) {
+ continue;
+ } else if (((tcp_flag_word(tcph_check) &
+ cpu_to_be32(0x0E3F0000)) != TCP_FLAG_ACK) ||
+ ((seglen - 4 * tcph_check->doff) != 0)) {
+ pure_ack = false;
+ } else {
+ pure_ack = true;
+ }
+
+ /* if we find an ACK belonging to a different connection
+ * continue checking for other ACKs this round however
+ * restart checking from the other connection next time.
+ */
+ if (thisconn && (tcph_check->source != tcph->source ||
+ tcph_check->dest != tcph->dest)) {
+ thisconn = false;
+ }
+
+ /* new ack sequence must be greater
+ */
+ if (thisconn &&
+ ((int32_t)(ntohl(tcph_check->ack_seq) - ntohl(tcph->ack_seq)) > 0))
+ continue;
+
+ /* DupACKs with an equal sequence number shouldn't be filtered,
+ * but we can filter if the triggering packet is a SACK
+ */
+ if (thisconn &&
+ (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq)) &&
+ pskb_may_pull(skb, ((unsigned char *)tcph - skb->head) + (tcph->doff * 4))) {
+ /* inspired by tcp_parse_options in tcp_input.c */
+ bool sack = false;
+ int length = (tcph->doff * 4) - sizeof(struct tcphdr);
+ const u8 *ptr = (const u8 *)(tcph + 1);
+
+ while (length > 0) {
+ int opcode = *ptr++;
+ int opsize;
+
+ if (opcode == TCPOPT_EOL)
+ break;
+ if (opcode == TCPOPT_NOP) {
+ length--;
+ continue;
+ }
+ opsize = *ptr++;
+ if (opsize < 2 || opsize > length)
+ break;
+ if (opcode == TCPOPT_SACK) {
+ sack = true;
+ break;
+ }
+ ptr += opsize - 2;
+ length -= opsize;
+ }
+ if (!sack)
+ continue;
+ }
+
+ /* somewhat complicated control flow for 'conservative'
+ * ACK filtering that aims to be more polite to slow-start and
+ * in the presence of packet loss.
+ * does not filter if there is one 'redundant' ACK in the queue.
+ * 'data' ACKs won't be filtered but do count as redundant ACKs.
+ */
+ if (thisconn) {
+ thisconn_seen_last = true;
+ /* if aggressive and this is a data ack we can skip
+ * checking it next time.
+ */
+ thisconn_checked_to = (aggressive && !pure_ack) ?
+ skb_check : skb_check_prev;
+ /* the first pure ack for this connection.
+ * record where it is, but only break if aggressive
+ * or already seen data ack from the same connection
+ */
+ if (pure_ack && !thisconn_ack) {
+ thisconn_ack = skb_check_prev;
+ if (aggressive || thisconn_redundant_seen)
+ break;
+ /* data ack or subsequent pure ack */
+ } else {
+ thisconn_redundant_seen = true;
+ /* this is the second ack for this connection
+ * break to filter the first pure ack
+ */
+ if (thisconn_ack)
+ break;
+ }
+ /* track packets from non-matching tcp connections that will
+ * need evaluation on the next run.
+ * if there are packets from both the matching connection and
+ * others that require checking next run, track which was updated
+ * last and return the older of the two to ensure full coverage.
+ * if a non-matching pure ack has been seen, cannot skip any
+ * further on the next run so don't update.
+ */
+ } else if (!otherconn_ack_seen) {
+ thisconn_seen_last = false;
+ if (pure_ack) {
+ otherconn_ack_seen = true;
+ /* if aggressive we don't care about old data,
+ * start from the pure ack.
+ * otherwise if there is a previous data ack,
+ * start checking from it next time.
+ */
+ if (aggressive || !otherconn_checked_to)
+ otherconn_checked_to = skb_check_prev;
+ } else {
+ otherconn_checked_to = aggressive ?
+ skb_check : skb_check_prev;
+ }
+ }
+ }
+
+ /* skb_check is reused at this point
+ * it is the pure ACK to be filtered (if any)
+ */
+ skb_check = NULL;
+
+ /* next time start checking from the older/nearest to head of unfiltered
+ * but important tcp packets from this connection and other connections.
+ * if none seen, start after the last packet evaluated in the loop.
+ */
+ if (thisconn_checked_to && otherconn_checked_to)
+ flow->ackcheck = thisconn_seen_last ?
+ otherconn_checked_to : thisconn_checked_to;
+ else if (thisconn_checked_to)
+ flow->ackcheck = thisconn_checked_to;
+ else if (otherconn_checked_to)
+ flow->ackcheck = otherconn_checked_to;
+ else
+ flow->ackcheck = skb_check_prev;
+
+ /* if filtering, the pure ACK from the flow queue */
+ if (thisconn_ack && (aggressive || thisconn_redundant_seen)) {
+ if (PTR_ERR(thisconn_ack) == -1) {
+ skb_check = flow->head;
+ flow->head = flow->head->next;
+ } else {
+ skb_check = thisconn_ack->next;
+ thisconn_ack->next = thisconn_ack->next->next;
+ }
+ }
+
+ /* we just filtered that ack, fix up the list */
+ if (flow->ackcheck == skb_check)
+ flow->ackcheck = thisconn_ack;
+ /* check the entire flow queue next time */
+ if (PTR_ERR(flow->ackcheck) == -1)
+ flow->ackcheck = NULL;
+
+ return skb_check;
+}
+
+/* Exponentially weighted moving average with weight 2^-shift: the running
+ * average gives up avg >> shift and the new sample adds sample >> shift.
+ */
+static inline cobalt_time_t cake_ewma(cobalt_time_t avg, cobalt_time_t sample,
+				      u32 shift)
+{
+	cobalt_time_t decayed = avg - (avg >> shift);
+
+	return decayed + (sample >> shift);
+}
+
+/* Compute the length used for shaping one packet.
+ *
+ * Starts from qdisc_pkt_len(). For GSO packets it instead uses the size of
+ * a single segment (gso_size plus headers) together with a segment count.
+ * The network offset is subtracted when CAKE_FLAG_OVERHEAD is set, then the
+ * configured overhead, the minimum packet unit and the ATM/PTM framing
+ * adjustment are applied. The min/max length statistics and the averaged
+ * network offset in @q are updated along the way.
+ *
+ * Stores len * segs in the skb's cobalt cb (adjusted_len) and returns the
+ * per-segment len.
+ * NOTE(review): a caller that assigns the return value back to adjusted_len
+ * overwrites the len * segs value stored here - confirm which is intended.
+ */
+static inline u32 cake_overhead(struct cake_sched_data *q, struct sk_buff *skb)
+{
+ const struct skb_shared_info *shinfo = skb_shinfo(skb);
+ u32 off = skb_network_offset(skb);
+ u32 len = qdisc_pkt_len(skb);
+ u16 segs = 1;
+
+ if (unlikely(shinfo->gso_size)) {
+ /* borrowed from qdisc_pkt_len_init() */
+ unsigned int hdr_len;
+
+ hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
+
+ /* + transport layer */
+ if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
+ const struct tcphdr *th;
+ struct tcphdr _tcphdr;
+
+ th = skb_header_pointer(skb, skb_transport_offset(skb),
+ sizeof(_tcphdr), &_tcphdr);
+ if (likely(th))
+ hdr_len += __tcp_hdrlen(th);
+ } else {
+ struct udphdr _udphdr;
+
+ if (skb_header_pointer(skb, skb_transport_offset(skb),
+ sizeof(_udphdr), &_udphdr))
+ hdr_len += sizeof(struct udphdr);
+ }
+
+ /* untrusted (DODGY) GSO metadata: derive the segment count from
+ * the payload size instead of trusting gso_segs
+ */
+ if (unlikely(shinfo->gso_type & SKB_GSO_DODGY))
+ segs = DIV_ROUND_UP(skb->len - hdr_len,
+ shinfo->gso_size);
+ else
+ segs = shinfo->gso_segs;
+
+ /* The last segment may be shorter; we ignore this, which means
+ * that we will over-estimate the size of the whole GSO segment
+ * by the difference in size. This is conservative, so we live
+ * with that to avoid the complexity of dealing with it.
+ */
+ len = shinfo->gso_size + hdr_len;
+ }
+
+ /* the offset sample is scaled by 2^16 before being averaged */
+ q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8);
+
+ if (q->rate_flags & CAKE_FLAG_OVERHEAD)
+ len -= off;
+
+ /* statistics on the length before overhead compensation */
+ if (q->max_netlen < len)
+ q->max_netlen = len;
+ if (q->min_netlen > len)
+ q->min_netlen = len;
+
+ len += q->rate_overhead;
+
+ if (len < q->rate_mpu)
+ len = q->rate_mpu;
+
+ if (q->atm_mode == CAKE_ATM_ATM) {
+ /* ceil(len / 48) units of 53 bytes each */
+ len += 47;
+ len /= 48;
+ len *= 53;
+ } else if (q->atm_mode == CAKE_ATM_PTM) {
+ /* Add one byte per 64 bytes or part thereof.
+ * This is conservative and easier to calculate than the
+ * precise value.
+ */
+ len += (len + 63) / 64;
+ }
+
+ /* statistics on the length after overhead compensation */
+ if (q->max_adjlen < len)
+ q->max_adjlen = len;
+ if (q->min_adjlen > len)
+ q->min_adjlen = len;
+
+ get_cobalt_cb(skb)->adjusted_len = len * segs;
+ return len;
+}
+
+/* Exchange overflow-heap slots i and j, then refresh the overflow_idx
+ * back-pointer of both moved entries so each flow still records the slot it
+ * now occupies.
+ */
+static inline void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j)
+{
+	struct cake_heap_entry *heap = q->overflow_heap;
+	struct cake_heap_entry was_i = heap[i];
+	struct cake_heap_entry was_j = heap[j];
+
+	heap[i] = was_j;
+	heap[j] = was_i;
+
+	q->tins[was_i.t].overflow_idx[was_i.b] = j;
+	q->tins[was_j.t].overflow_idx[was_j.b] = i;
+}
+
+/* Look up the backlog of the flow queue that overflow-heap slot i refers to. */
+static inline u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i)
+{
+	const struct cake_heap_entry *entry = &q->overflow_heap[i];
+
+	return q->tins[entry->t].backlogs[entry->b];
+}
+
+/* Sift the entry in overflow-heap slot i downwards until neither of its
+ * children has a larger backlog, restoring the max-heap property for the
+ * subtree rooted at slot i.
+ */
+static void cake_heapify(struct cake_sched_data *q, u16 i)
+{
+	static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES;
+	/* The entry being sifted travels down with i, so its backlog is the
+	 * same at every level of the walk.
+	 */
+	const u32 ib = cake_heap_get_backlog(q, i);
+	u32 m = i;
+
+	while (m < a) {
+		u32 l = m + m + 1;
+		u32 r = l + 1;
+		/* Children must be compared against the sifted entry itself.
+		 * Carrying over the backlog of the child promoted in the
+		 * previous round would stop the walk too early and leave a
+		 * larger grandchild below a smaller parent.
+		 */
+		u32 mb = ib;
+
+		if (l < a) {
+			u32 lb = cake_heap_get_backlog(q, l);
+
+			if (lb > mb) {
+				m = l;
+				mb = lb;
+			}
+		}
+
+		if (r < a) {
+			u32 rb = cake_heap_get_backlog(q, r);
+
+			if (rb > mb) {
+				m = r;
+				mb = rb;
+			}
+		}
+
+		/* neither child is larger: the entry has found its place */
+		if (m == i)
+			break;
+
+		cake_heap_swap(q, i, m);
+		i = m;
+	}
+}
+
+/* Sift the entry in overflow-heap slot i upwards while its backlog exceeds
+ * that of its parent. Slots outside the heap are ignored.
+ */
+static void cake_heapify_up(struct cake_sched_data *q, u16 i)
+{
+	while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) {
+		u16 parent = (i - 1) / 2;
+		u32 own_backlog = cake_heap_get_backlog(q, i);
+		u32 parent_backlog = cake_heap_get_backlog(q, parent);
+
+		if (own_backlog <= parent_backlog)
+			return;
+
+		cake_heap_swap(q, i, parent);
+		i = parent;
+	}
+}
+
+/* Charge the packet's adjusted length to the tin shaper and to the global
+ * shaper, pushing their next-packet times forward. The failsafe shaper is
+ * only advanced when the packet is actually sent (@drop false).
+ *
+ * Nothing is charged when no global rate is set (q->rate_ns == 0).
+ * Returns the adjusted length read from the skb's cobalt cb.
+ */
+static int cake_advance_shaper(struct cake_sched_data *q,
+			       struct cake_tin_data *b,
+			       struct sk_buff *skb,
+			       u64 now, bool drop)
+{
+	u32 len = get_cobalt_cb(skb)->adjusted_len;
+	s64 tin_lag, tin_cost, global_cost;
+
+	if (!q->rate_ns)
+		return len;
+
+	tin_lag = b->tin_time_next_packet - now;
+	tin_cost = (len * (u64)b->tin_rate_ns) >> b->tin_rate_shft;
+	global_cost = (len * (u64)q->rate_ns) >> q->rate_shft;
+
+	/* A tin whose schedule is already in the past accumulates from where
+	 * it was; one scheduled sooner than this packet's cost is pushed out
+	 * to now + cost; one scheduled further out is left alone.
+	 */
+	if (tin_lag < 0)
+		b->tin_time_next_packet += tin_cost;
+	else if (tin_lag < tin_cost)
+		b->tin_time_next_packet = now + tin_cost;
+
+	q->time_next_packet += global_cost;
+
+	/* the failsafe advances by 1.5x the global cost */
+	if (!drop)
+		q->failsafe_next_packet += global_cost + (global_cost >> 1);
+
+	return len;
+}
+
+/* Drop one packet from the head of the flow at the top of the overflow heap.
+ *
+ * The heap is rebuilt from scratch when overflow_timeout is zero; the
+ * timeout is then re-armed to 65535. After the drop, the root is sifted
+ * down again because its backlog has shrunk.
+ *
+ * Returns idx + (tin << 16) identifying the flow that was pruned, also when
+ * that flow turned out to be empty and nothing was dropped.
+ */
+static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free)
+{
+ struct cake_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb;
+ u32 idx = 0, tin = 0, len;
+ struct cake_tin_data *b;
+ struct cake_flow *flow;
+ struct cake_heap_entry qq;
+ u64 now = cobalt_get_time();
+
+ if (!q->overflow_timeout) {
+ int i;
+ /* Build fresh max-heap */
+ for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2; i >= 0; i--)
+ cake_heapify(q, i);
+ }
+ q->overflow_timeout = 65535;
+
+ /* select longest queue for pruning */
+ qq = q->overflow_heap[0];
+ tin = qq.t;
+ idx = qq.b;
+
+ b = &q->tins[tin];
+ flow = &b->flows[idx];
+ skb = dequeue_head(flow);
+ if (unlikely(!skb)) {
+ /* heap has gone wrong, rebuild it next time */
+ q->overflow_timeout = 0;
+ return idx + (tin << 16);
+ }
+
+ if (cobalt_queue_full(&flow->cvars, &b->cparams, now))
+ b->unresponsive_flow_count++;
+
+ /* undo the accounting done for this packet at enqueue time */
+ len = qdisc_pkt_len(skb);
+ q->buffer_used -= skb->truesize;
+ b->backlogs[idx] -= len;
+ b->tin_backlog -= len;
+ sch->qstats.backlog -= len;
+ qdisc_tree_reduce_backlog(sch, 1, len);
+
+ b->tin_dropped++;
+ sch->qstats.drops++;
+
+ /* in ingress mode dropped packets are still charged to the shaper */
+ if (q->rate_flags & CAKE_FLAG_INGRESS)
+ cake_advance_shaper(q, b, skb, now, true);
+
+ __qdisc_drop(skb, to_free);
+ sch->q.qlen--;
+
+ cake_heapify(q, 0);
+
+ return idx + (tin << 16);
+}
+
+/* Clear the DSCP bits of an IPv4 or IPv6 packet, keeping only the bits
+ * covered by INET_ECN_MASK. Other protocols are left untouched.
+ */
+static inline void cake_wash_diffserv(struct sk_buff *skb)
+{
+	if (skb->protocol == htons(ETH_P_IP))
+		ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
+	else if (skb->protocol == htons(ETH_P_IPV6))
+		ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
+}
+
+/* Return the packet's DSCP value (the DS field shifted right by two) and,
+ * when @wash is set and the DSCP is non-zero, clear it in the header while
+ * keeping the bits covered by INET_ECN_MASK.
+ *
+ * ARP is reported as 0x38 (CS7); any other non-IP protocol as 0.
+ */
+static inline u8 cake_handle_diffserv(struct sk_buff *skb, u16 wash)
+{
+	u8 dscp;
+
+	if (skb->protocol == htons(ETH_P_IP)) {
+		dscp = ipv4_get_dsfield(ip_hdr(skb)) >> 2;
+		if (wash && dscp)
+			ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
+		return dscp;
+	}
+
+	if (skb->protocol == htons(ETH_P_IPV6)) {
+		dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2;
+		if (wash && dscp)
+			ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
+		return dscp;
+	}
+
+	if (skb->protocol == htons(ETH_P_ARP))
+		return 0x38;	/* CS7 - Net Control */
+
+	/* If there is no Diffserv field, treat as best-effort */
+	return 0;
+}
+
+static void cake_reconfigure(struct Qdisc *sch);
+
+/* Enqueue one packet.
+ *
+ * Selects a tin from the Diffserv field (washing the DSCP if requested),
+ * hashes the packet to a flow, then either splits a GSO aggregate into
+ * segments or queues the packet whole and optionally filters a redundant
+ * ACK. Afterwards it updates the accounting, the ingress autorate estimate
+ * and the flow's scheduling set, and finally prunes queues via cake_drop()
+ * for as long as the buffer limit is exceeded.
+ *
+ * Returns NET_XMIT_SUCCESS, or the result of qdisc_drop() when GSO
+ * segmentation fails.
+ *
+ * NOTE(review): cake_overhead() stores len * segs in adjusted_len but its
+ * per-segment return value is assigned over it below; for unsplit GSO
+ * packets this looks unintended - confirm.
+ */
+static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+			struct sk_buff **to_free)
+{
+	struct cake_sched_data *q = qdisc_priv(sch);
+	u32 idx, tin;
+	struct cake_tin_data *b;
+	struct cake_flow *flow;
+	/* signed len to handle corner case filtered ACK larger than trigger */
+	int len = qdisc_pkt_len(skb);
+	u64 now = cobalt_get_time();
+	struct sk_buff *ack = NULL;
+
+	/* extract the Diffserv Precedence field, if it exists */
+	/* and clear DSCP bits if washing */
+	if (q->tin_mode != CAKE_DIFFSERV_BESTEFFORT) {
+		tin = q->tin_index[cake_handle_diffserv(skb,
+				q->rate_flags & CAKE_FLAG_WASH)];
+		if (unlikely(tin >= q->tin_cnt))
+			tin = 0;
+	} else {
+		tin = 0;
+		if (q->rate_flags & CAKE_FLAG_WASH)
+			cake_wash_diffserv(skb);
+	}
+
+	b = &q->tins[tin];
+
+	/* choose flow to insert into */
+	idx = cake_hash(b, skb, q->flow_mode);
+	flow = &b->flows[idx];
+
+	/* ensure shaper state isn't stale */
+	if (!b->tin_backlog) {
+		if (b->tin_time_next_packet < now)
+			b->tin_time_next_packet = now;
+
+		if (!sch->q.qlen) {
+			if (q->time_next_packet < now) {
+				q->failsafe_next_packet = now;
+				q->time_next_packet = now;
+			} else if (q->time_next_packet > now &&
+				   q->failsafe_next_packet > now) {
+				u64 next = min(q->time_next_packet,
+					       q->failsafe_next_packet);
+				sch->qstats.overlimits++;
+				qdisc_watchdog_schedule_ns(&q->watchdog, next);
+			}
+		}
+	}
+
+	if (unlikely(len > b->max_skblen))
+		b->max_skblen = len;
+
+	/* Split GSO aggregates if they're likely to impair flow isolation */
+
+	if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) {
+		struct sk_buff *segs, *nskb;
+		netdev_features_t features = netif_skb_features(skb);
+		unsigned int slen = 0;
+
+		segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
+		if (IS_ERR_OR_NULL(segs))
+			return qdisc_drop(skb, sch, to_free);
+
+		while (segs) {
+			nskb = segs->next;
+			segs->next = NULL;
+			qdisc_skb_cb(segs)->pkt_len = segs->len;
+			cobalt_set_enqueue_time(segs, now);
+			get_cobalt_cb(segs)->adjusted_len = cake_overhead(q,
+									  segs);
+			flow_queue_add(flow, segs);
+
+			sch->q.qlen++;
+			slen += segs->len;
+			q->buffer_used += segs->truesize;
+			b->packets++;
+			segs = nskb;
+		}
+		/* stats */
+		b->bytes += slen;
+		b->backlogs[idx] += slen;
+		b->tin_backlog += slen;
+		sch->qstats.backlog += slen;
+		q->avg_window_bytes += slen;
+
+		qdisc_tree_reduce_backlog(sch, 1, len);
+		consume_skb(skb);
+	} else {
+		/* not splitting */
+		cobalt_set_enqueue_time(skb, now);
+		get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb);
+		flow_queue_add(flow, skb);
+
+		if (q->ack_filter)
+			ack = cake_ack_filter(q, flow);
+
+		if (ack) {
+			/* the new packet replaces a filtered ACK: account for
+			 * the difference only
+			 */
+			b->ack_drops++;
+			sch->qstats.drops++;
+			b->bytes += qdisc_pkt_len(ack);
+			len -= qdisc_pkt_len(ack);
+			q->buffer_used += skb->truesize - ack->truesize;
+			if (q->rate_flags & CAKE_FLAG_INGRESS)
+				cake_advance_shaper(q, b, ack, now, true);
+
+			qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack));
+			consume_skb(ack);
+		} else {
+			sch->q.qlen++;
+			q->buffer_used += skb->truesize;
+		}
+		/* stats */
+		b->packets++;
+		b->bytes += len;
+		b->backlogs[idx] += len;
+		b->tin_backlog += len;
+		sch->qstats.backlog += len;
+		q->avg_window_bytes += len;
+	}
+
+	if (q->overflow_timeout)
+		cake_heapify_up(q, b->overflow_idx[idx]);
+
+	/* incoming bandwidth capacity estimate */
+	if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) {
+		u64 packet_interval = now - q->last_packet_time;
+
+		if (packet_interval > NSEC_PER_SEC)
+			packet_interval = NSEC_PER_SEC;
+
+		/* filter out short-term bursts, eg. wifi aggregation */
+		q->avg_packet_interval = cake_ewma(q->avg_packet_interval,
+						   packet_interval,
+			packet_interval > q->avg_packet_interval ? 2 : 8);
+
+		q->last_packet_time = now;
+
+		if (packet_interval > q->avg_packet_interval) {
+			u64 window_interval = now - q->avg_window_begin;
+			u64 bw = q->avg_window_bytes * (u64)NSEC_PER_SEC;
+
+			/* window_interval is 64 bits wide. do_div() only
+			 * accepts a 32-bit divisor and would silently
+			 * truncate it (possibly to zero), so do a full
+			 * 64-by-64 division instead.
+			 */
+			bw = div64_u64(bw, window_interval);
+			q->avg_peak_bandwidth =
+				cake_ewma(q->avg_peak_bandwidth, bw,
+					  bw > q->avg_peak_bandwidth ? 2 : 8);
+			q->avg_window_bytes = 0;
+			q->avg_window_begin = now;
+
+			if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS &&
+			    now - q->last_reconfig_time >
+				(NSEC_PER_SEC / 4)) {
+				/* shape to 15/16 of the estimated peak */
+				q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4;
+				cake_reconfigure(sch);
+			}
+		}
+	} else {
+		q->avg_window_bytes = 0;
+		q->last_packet_time = now;
+	}
+
+	/* flowchain */
+	if (!flow->set || flow->set == CAKE_SET_DECAYING) {
+		struct cake_host *srchost = &b->hosts[flow->srchost];
+		struct cake_host *dsthost = &b->hosts[flow->dsthost];
+		u16 host_load = 1;
+
+		if (!flow->set) {
+			list_add_tail(&flow->flowchain, &b->new_flows);
+		} else {
+			b->decaying_flow_count--;
+			list_move_tail(&flow->flowchain, &b->new_flows);
+		}
+		flow->set = CAKE_SET_SPARSE;
+		b->sparse_flow_count++;
+
+		if (cake_dsrc(q->flow_mode))
+			host_load = max(host_load, srchost->srchost_refcnt);
+
+		if (cake_ddst(q->flow_mode))
+			host_load = max(host_load, dsthost->dsthost_refcnt);
+
+		flow->deficit = (b->flow_quantum *
+				 quantum_div[host_load]) >> 16;
+	} else if (flow->set == CAKE_SET_SPARSE_WAIT) {
+		/* this flow was empty, accounted as a sparse flow, but actually
+		 * in the bulk rotation.
+		 */
+		flow->set = CAKE_SET_BULK;
+		b->sparse_flow_count--;
+		b->bulk_flow_count++;
+	}
+
+	if (q->buffer_used > q->buffer_max_used)
+		q->buffer_max_used = q->buffer_used;
+
+	if (q->buffer_used > q->buffer_limit) {
+		u32 dropped = 0;
+
+		while (q->buffer_used > q->buffer_limit) {
+			dropped++;
+			cake_drop(sch, to_free);
+		}
+		b->drop_overlimit += dropped;
+	}
+	return NET_XMIT_SUCCESS;
+}
+
+/* Pull the head packet from the flow selected by q->cur_tin / q->cur_flow
+ * and undo its backlog, buffer and qlen accounting. While the overflow heap
+ * is live, the flow's heap slot is sifted down to reflect the smaller
+ * backlog.
+ *
+ * Returns the packet, or NULL when that flow's queue is empty.
+ */
+static struct sk_buff *cake_dequeue_one(struct Qdisc *sch)
+{
+	struct cake_sched_data *q = qdisc_priv(sch);
+	struct cake_tin_data *b = &q->tins[q->cur_tin];
+	struct cake_flow *flow = &b->flows[q->cur_flow];
+	struct sk_buff *skb;
+	u32 len;
+
+	if (!flow->head)
+		return NULL;
+
+	skb = dequeue_head(flow);
+	len = qdisc_pkt_len(skb);
+
+	b->backlogs[q->cur_flow] -= len;
+	b->tin_backlog -= len;
+	sch->qstats.backlog -= len;
+	q->buffer_used -= skb->truesize;
+	sch->q.qlen--;
+
+	if (q->overflow_timeout)
+		cake_heapify(q, b->overflow_idx[q->cur_flow]);
+
+	return skb;
+}
+
+/* Free every packet still queued in any flow of the given tin, e.g. a tin
+ * that is no longer in use. Uses q->cur_tin and q->cur_flow as the
+ * iteration cursor, so both are overwritten.
+ */
+static void cake_clear_tin(struct Qdisc *sch, u16 tin)
+{
+	struct cake_sched_data *q = qdisc_priv(sch);
+	struct sk_buff *skb;
+
+	q->cur_tin = tin;
+	for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++) {
+		for (;;) {
+			skb = cake_dequeue_one(sch);
+			if (!skb)
+				break;
+			kfree_skb(skb);
+		}
+	}
+}
+
+/* Dequeue the next packet to transmit.
+ *
+ * Steps, with "begin" and "retry" as restart points:
+ *  - give up if the qdisc is empty or both the global and the failsafe
+ *    shaper say it is too early (the watchdog is armed in that case);
+ *  - pick a tin: DRR over the tins when unshaped, otherwise by tin
+ *    schedule;
+ *  - pick a flow from that tin's decaying/new/old lists and replenish its
+ *    deficit if exhausted (DRR++ with host isolation);
+ *  - pull packets from the flow, letting the AQM drop until one is kept;
+ *    empty flows are moved to the decaying list or removed;
+ *  - update delay statistics, charge the shaper and re-arm the watchdog.
+ *
+ * Returns the packet to send, or NULL if nothing may be sent now.
+ */
+static struct sk_buff *cake_dequeue(struct Qdisc *sch)
+{
+ struct cake_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb;
+ struct cake_tin_data *b = &q->tins[q->cur_tin];
+ struct cake_flow *flow;
+ struct cake_host *srchost, *dsthost;
+ struct list_head *head;
+ u32 len;
+ u16 host_load;
+ cobalt_time_t now = ktime_get_ns();
+ cobalt_time_t delay;
+ bool first_flow = true;
+
+begin:
+ if (!sch->q.qlen)
+ return NULL;
+
+ /* global hard shaper */
+ if (q->time_next_packet > now && q->failsafe_next_packet > now) {
+ u64 next = min(q->time_next_packet, q->failsafe_next_packet);
+
+ sch->qstats.overlimits++;
+ qdisc_watchdog_schedule_ns(&q->watchdog, next);
+ return NULL;
+ }
+
+ /* Choose a class to work on. */
+ if (!q->rate_ns) {
+ /* In unlimited mode, can't rely on shaper timings, just balance
+ * with DRR
+ */
+ while (b->tin_deficit < 0 ||
+ !(b->sparse_flow_count + b->bulk_flow_count)) {
+ if (b->tin_deficit <= 0)
+ b->tin_deficit += b->tin_quantum_band;
+
+ q->cur_tin++;
+ b++;
+ if (q->cur_tin >= q->tin_cnt) {
+ q->cur_tin = 0;
+ b = q->tins;
+ }
+ }
+ } else {
+ /* In shaped mode, choose:
+ * - Highest-priority tin with queue and meeting schedule, or
+ * - The earliest-scheduled tin with queue.
+ */
+ int tin, best_tin = 0;
+ s64 best_time = 0xFFFFFFFFFFFFUL;
+
+ for (tin = 0; tin < q->tin_cnt; tin++) {
+ b = q->tins + tin;
+ if ((b->sparse_flow_count + b->bulk_flow_count) > 0) {
+ s64 tdiff = b->tin_time_next_packet - now;
+
+ if (tdiff <= 0 || tdiff <= best_time) {
+ best_time = tdiff;
+ best_tin = tin;
+ }
+ }
+ }
+
+ q->cur_tin = best_tin;
+ b = q->tins + best_tin;
+ }
+
+retry:
+ /* service this class */
+ /* decaying flows are only tried first on the first pass; afterwards the
+ * order is new, old, then decaying as a last resort
+ */
+ head = &b->decaying_flows;
+ if (!first_flow || list_empty(head)) {
+ head = &b->new_flows;
+ if (list_empty(head)) {
+ head = &b->old_flows;
+ if (unlikely(list_empty(head))) {
+ head = &b->decaying_flows;
+ if (unlikely(list_empty(head)))
+ goto begin;
+ }
+ }
+ }
+ flow = list_first_entry(head, struct cake_flow, flowchain);
+ q->cur_flow = flow - b->flows;
+ first_flow = false;
+
+ /* triple isolation (modified DRR++) */
+ srchost = &b->hosts[flow->srchost];
+ dsthost = &b->hosts[flow->dsthost];
+ host_load = 1;
+
+ if (cake_dsrc(q->flow_mode))
+ host_load = max(host_load, srchost->srchost_refcnt);
+
+ if (cake_ddst(q->flow_mode))
+ host_load = max(host_load, dsthost->dsthost_refcnt);
+
+ WARN_ON(host_load > CAKE_QUEUES);
+
+ /* flow isolation (DRR++) */
+ if (flow->deficit <= 0) {
+ /* The shifted prandom_u32() is a way to apply dithering to
+ * avoid accumulating roundoff errors
+ */
+ flow->deficit += (b->flow_quantum * quantum_div[host_load] +
+ (prandom_u32() >> 16)) >> 16;
+ list_move_tail(&flow->flowchain, &b->old_flows);
+
+ /* Keep all flows with deficits out of the sparse and decaying
+ * rotations. No non-empty flow can go into the decaying
+ * rotation, so they can't get deficits
+ */
+ if (flow->set == CAKE_SET_SPARSE) {
+ if (flow->head) {
+ b->sparse_flow_count--;
+ b->bulk_flow_count++;
+ flow->set = CAKE_SET_BULK;
+ } else {
+ /* we've moved it to the bulk rotation for
+ * correct deficit accounting but we still want
+ * to count it as a sparse flow, not a bulk one.
+ */
+ flow->set = CAKE_SET_SPARSE_WAIT;
+ }
+ }
+ goto retry;
+ }
+
+ /* Retrieve a packet via the AQM */
+ while (1) {
+ skb = cake_dequeue_one(sch);
+ if (!skb) {
+ /* this queue was actually empty */
+ if (cobalt_queue_empty(&flow->cvars, &b->cparams, now))
+ b->unresponsive_flow_count--;
+
+ if (flow->cvars.p_drop || flow->cvars.count ||
+ now < flow->cvars.drop_next) {
+ /* keep in the flowchain until the state has
+ * decayed to rest
+ */
+ list_move_tail(&flow->flowchain,
+ &b->decaying_flows);
+ if (flow->set == CAKE_SET_BULK) {
+ b->bulk_flow_count--;
+ b->decaying_flow_count++;
+ } else if (flow->set == CAKE_SET_SPARSE ||
+ flow->set == CAKE_SET_SPARSE_WAIT) {
+ b->sparse_flow_count--;
+ b->decaying_flow_count++;
+ }
+ flow->set = CAKE_SET_DECAYING;
+ } else {
+ /* remove empty queue from the flowchain */
+ list_del_init(&flow->flowchain);
+ if (flow->set == CAKE_SET_SPARSE ||
+ flow->set == CAKE_SET_SPARSE_WAIT)
+ b->sparse_flow_count--;
+ else if (flow->set == CAKE_SET_BULK)
+ b->bulk_flow_count--;
+ else
+ b->decaying_flow_count--;
+
+ flow->set = CAKE_SET_NONE;
+ srchost->srchost_refcnt--;
+ dsthost->dsthost_refcnt--;
+ }
+ goto begin;
+ }
+
+ /* Last packet in queue may be marked, shouldn't be dropped */
+ if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb,
+ (b->bulk_flow_count *
+ !!(q->rate_flags & CAKE_FLAG_INGRESS))) ||
+ !flow->head)
+ break;
+
+ /* drop this packet, get another one */
+ if (q->rate_flags & CAKE_FLAG_INGRESS) {
+ len = cake_advance_shaper(q, b, skb,
+ now, true);
+ flow->deficit -= len;
+ b->tin_deficit -= len;
+ }
+ b->tin_dropped++;
+ qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
+ qdisc_qstats_drop(sch);
+ kfree_skb(skb);
+ /* in ingress mode the drop was charged to the deficits above, so
+ * the flow choice is re-evaluated
+ */
+ if (q->rate_flags & CAKE_FLAG_INGRESS)
+ goto retry;
+ }
+
+ b->tin_ecn_mark += !!flow->cvars.ecn_marked;
+ qdisc_bstats_update(sch, skb);
+
+ /* collect delay stats */
+ delay = now - cobalt_get_enqueue_time(skb);
+ b->avge_delay = cake_ewma(b->avge_delay, delay, 8);
+ b->peak_delay = cake_ewma(b->peak_delay, delay,
+ delay > b->peak_delay ? 2 : 8);
+ b->base_delay = cake_ewma(b->base_delay, delay,
+ delay < b->base_delay ? 2 : 8);
+
+ len = cake_advance_shaper(q, b, skb, now, false);
+ flow->deficit -= len;
+ b->tin_deficit -= len;
+
+ if (q->time_next_packet > now && sch->q.qlen) {
+ u64 next = min(q->time_next_packet, q->failsafe_next_packet);
+
+ qdisc_watchdog_schedule_ns(&q->watchdog, next);
+ } else if (!sch->q.qlen) {
+ int i;
+
+ /* queue drained: if any tin still has decaying flows, schedule
+ * a wakeup one AQM target from now
+ */
+ for (i = 0; i < q->tin_cnt; i++) {
+ if (q->tins[i].decaying_flow_count) {
+ u64 next = now + q->tins[i].cparams.target;
+
+ qdisc_watchdog_schedule_ns(&q->watchdog, next);
+ break;
+ }
+ }
+ }
+
+ if (q->overflow_timeout)
+ q->overflow_timeout--;
+
+ return skb;
+}
+
+/* Flush the qdisc: discard the queued packets of every possible tin. */
+static void cake_reset(struct Qdisc *sch)
+{
+	u32 tin = 0;
+
+	while (tin < CAKE_MAX_TINS) {
+		cake_clear_tin(sch, tin);
+		tin++;
+	}
+}
+
+/* Netlink attribute policy for the TCA_CAKE_* options. Every attribute is
+ * an unsigned 32-bit value except TCA_CAKE_OVERHEAD, which is signed.
+ */
+static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = {
+ [TCA_CAKE_BASE_RATE] = { .type = NLA_U32 },
+ [TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 },
+ [TCA_CAKE_ATM] = { .type = NLA_U32 },
+ [TCA_CAKE_FLOW_MODE] = { .type = NLA_U32 },
+ [TCA_CAKE_OVERHEAD] = { .type = NLA_S32 },
+ [TCA_CAKE_RTT] = { .type = NLA_U32 },
+ [TCA_CAKE_TARGET] = { .type = NLA_U32 },
+ [TCA_CAKE_AUTORATE] = { .type = NLA_U32 },
+ [TCA_CAKE_MEMORY] = { .type = NLA_U32 },
+ [TCA_CAKE_NAT] = { .type = NLA_U32 },
+ [TCA_CAKE_RAW] = { .type = NLA_U32 },
+ [TCA_CAKE_WASH] = { .type = NLA_U32 },
+ [TCA_CAKE_MPU] = { .type = NLA_U32 },
+ [TCA_CAKE_INGRESS] = { .type = NLA_U32 },
+ [TCA_CAKE_ACK_FILTER] = { .type = NLA_U32 },
+};
+
+/* Program one tin's shaper rate and derive its AQM parameters from it.
+ *
+ * @b:          tin to configure
+ * @rate:       byte rate of the tin; 0 means unlimited (no shaping delay)
+ * @mtu:        used as the byte target when deriving the AQM target floor
+ * @ns_target:  requested AQM target
+ * @rtt_est_ns: requested AQM interval
+ *
+ * The byte rate is converted into a time-per-byte fraction,
+ * rate_ns >> rate_shft, with rate_ns scaled down until it fits in 32 bits.
+ */
+static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
+			  cobalt_time_t ns_target, cobalt_time_t rtt_est_ns)
+{
+	/* lower bound on the divisor so that the shaper will always unwedge
+	 * in reasonable time
+	 */
+	static const u64 MIN_RATE = 64;
+	u32 byte_target = mtu;
+	cobalt_time_t byte_target_ns;
+	u64 rate_ns = 0;
+	u8 rate_shft = 0;
+
+	b->flow_quantum = 1514;
+	if (rate) {
+		b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL);
+		rate_shft = 32;
+		rate_ns = ((u64)NSEC_PER_SEC) << rate_shft;
+		/* rate is 64 bits wide. do_div() only accepts a 32-bit
+		 * divisor and would silently truncate it (possibly to zero),
+		 * so do a full 64-by-64 division instead.
+		 */
+		rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate));
+		while (rate_ns >> 32) {
+			rate_ns >>= 1;
+			rate_shft--;
+		}
+	} /* else unlimited, ie. zero delay */
+
+	b->tin_rate_bps = rate;
+	b->tin_rate_ns = rate_ns;
+	b->tin_rate_shft = rate_shft;
+
+	/* time needed to send byte_target bytes at this rate */
+	byte_target_ns = (byte_target * rate_ns) >> rate_shft;
+
+	/* the target is never below 1.5x that time; the interval grows by
+	 * the same amount the target was raised and is at least 2x target
+	 */
+	b->cparams.target = max((byte_target_ns * 3) / 2, ns_target);
+	b->cparams.interval = max(rtt_est_ns +
+				  b->cparams.target - ns_target,
+				  b->cparams.target * 2);
+	b->cparams.mtu_time = byte_target_ns;
+	b->cparams.p_inc = 1 << 24; /* 1/256 */
+	b->cparams.p_dec = 1 << 20; /* 1/4096 */
+}
+
+/* Configure a single tin running at the full configured rate.
+ * Always returns 0.
+ */
+static int cake_config_besteffort(struct Qdisc *sch)
+{
+	struct cake_sched_data *q = qdisc_priv(sch);
+	struct cake_tin_data *tin0 = &q->tins[0];
+	u32 rate = q->rate_bps;
+	u32 mtu = psched_mtu(qdisc_dev(sch));
+
+	q->tin_cnt = 1;
+	q->tin_index = besteffort;
+	q->tin_order = normal_order;
+
+	cake_set_rate(tin0, rate, mtu, US2TIME(q->target),
+		      US2TIME(q->interval));
+
+	tin0->tin_quantum_prio = 65535;
+	tin0->tin_quantum_band = 65535;
+
+	return 0;
+}
+
+/* Configure eight tins using the precedence codepoint table. Each
+ * successive tin gets 7/8 of the previous tin's rate and bandwidth quantum
+ * and 3/2 of its priority quantum. Always returns 0.
+ */
+static int cake_config_precedence(struct Qdisc *sch)
+{
+	struct cake_sched_data *q = qdisc_priv(sch);
+	u32 mtu = psched_mtu(qdisc_dev(sch));
+	u32 rate = q->rate_bps;
+	u32 prio_quantum = 256;
+	u32 band_quantum = 256;
+	struct cake_tin_data *b;
+	u32 i;
+
+	q->tin_cnt = 8;
+	q->tin_index = precedence;
+	q->tin_order = normal_order;
+
+	for (i = 0, b = q->tins; i < q->tin_cnt; i++, b++) {
+		cake_set_rate(b, rate, mtu, US2TIME(q->target),
+			      US2TIME(q->interval));
+
+		b->tin_quantum_prio = max_t(u16, 1U, prio_quantum);
+		b->tin_quantum_band = max_t(u16, 1U, band_quantum);
+
+		/* parameters for the next tin */
+		rate = (rate * 7) >> 3;
+		prio_quantum = (prio_quantum * 3) >> 1;
+		band_quantum = (band_quantum * 7) >> 3;
+	}
+
+	return 0;
+}
+
+/* List of known Diffserv codepoints:
+ *
+ * Least Effort (CS1)
+ * Best Effort (CS0)
+ * Max Reliability & LLT "Lo" (TOS1)
+ * Max Throughput (TOS2)
+ * Min Delay (TOS4)
+ * LLT "La" (TOS5)
+ * Assured Forwarding 1 (AF1x) - x3
+ * Assured Forwarding 2 (AF2x) - x3
+ * Assured Forwarding 3 (AF3x) - x3
+ * Assured Forwarding 4 (AF4x) - x3
+ * Precedence Class 2 (CS2)
+ * Precedence Class 3 (CS3)
+ * Precedence Class 4 (CS4)
+ * Precedence Class 5 (CS5)
+ * Precedence Class 6 (CS6)
+ * Precedence Class 7 (CS7)
+ * Voice Admit (VA)
+ * Expedited Forwarding (EF)
+ *
+ * Total 25 codepoints.
+ */
+
+/* List of traffic classes in RFC 4594:
+ * (roughly descending order of contended priority)
+ * (roughly ascending order of uncontended throughput)
+ *
+ * Network Control (CS6,CS7) - routing traffic
+ * Telephony (EF,VA) - aka. VoIP streams
+ * Signalling (CS5) - VoIP setup
+ * Multimedia Conferencing (AF4x) - aka. video calls
+ * Realtime Interactive (CS4) - eg. games
+ * Multimedia Streaming (AF3x) - eg. YouTube, NetFlix, Twitch
+ * Broadcast Video (CS3)
+ * Low Latency Data (AF2x,TOS4) - eg. database
+ * Ops, Admin, Management (CS2,TOS1) - eg. ssh
+ * Standard Service (CS0 & unrecognised codepoints)
+ * High Throughput Data (AF1x,TOS2) - eg. web traffic
+ * Low Priority Data (CS1) - eg. BitTorrent
+ *
+ * Total 12 traffic classes.
+ */
+
+/* Configure eight tins using the diffserv8 codepoint table.
+ *
+ * Pruned list of traffic classes for typical applications:
+ *
+ * Network Control (CS6, CS7)
+ * Minimum Latency (EF, VA, CS5, CS4)
+ * Interactive Shell (CS2, TOS1)
+ * Low Latency Transactions (AF2x, TOS4)
+ * Video Streaming (AF4x, AF3x, CS3)
+ * Bog Standard (CS0 etc.)
+ * High Throughput (AF1x, TOS2)
+ * Background Traffic (CS1)
+ *
+ * Total 8 traffic classes.
+ *
+ * Each successive tin gets 7/8 of the previous tin's rate and bandwidth
+ * quantum and 3/2 of its priority quantum. Always returns 0.
+ */
+static int cake_config_diffserv8(struct Qdisc *sch)
+{
+	struct cake_sched_data *q = qdisc_priv(sch);
+	u32 mtu = psched_mtu(qdisc_dev(sch));
+	u32 rate = q->rate_bps;
+	u32 prio_quantum = 256;
+	u32 band_quantum = 256;
+	struct cake_tin_data *b;
+	u32 i;
+
+	q->tin_cnt = 8;
+
+	/* codepoint to class mapping */
+	q->tin_index = diffserv8;
+	q->tin_order = normal_order;
+
+	/* class characteristics */
+	for (i = 0, b = q->tins; i < q->tin_cnt; i++, b++) {
+		cake_set_rate(b, rate, mtu, US2TIME(q->target),
+			      US2TIME(q->interval));
+
+		b->tin_quantum_prio = max_t(u16, 1U, prio_quantum);
+		b->tin_quantum_band = max_t(u16, 1U, band_quantum);
+
+		/* parameters for the next tin */
+		rate = (rate * 7) >> 3;
+		prio_quantum = (prio_quantum * 3) >> 1;
+		band_quantum = (band_quantum * 7) >> 3;
+	}
+
+	return 0;
+}
+
+/* Configure four tins using the diffserv4 codepoint table.
+ *
+ * Further pruned list of traffic classes for four-class system:
+ *
+ * Latency Sensitive (CS7, CS6, EF, VA, CS5, CS4)
+ * Streaming Media (AF4x, AF3x, CS3, AF2x, TOS4, CS2, TOS1)
+ * Best Effort (CS0, AF1x, TOS2, and those not specified)
+ * Background Traffic (CS1)
+ *
+ * Total 4 traffic classes.
+ *
+ * Tin 0 runs at the full rate, tins 1-3 at 1/16, 1/2 and 1/4 of it.
+ * Always returns 0.
+ */
+static int cake_config_diffserv4(struct Qdisc *sch)
+{
+	struct cake_sched_data *q = qdisc_priv(sch);
+	u32 rate = q->rate_bps;
+	u32 mtu = psched_mtu(qdisc_dev(sch));
+	u32 quantum = 1024;
+
+	q->tin_cnt = 4;
+
+	/* codepoint to class mapping */
+	q->tin_index = diffserv4;
+	q->tin_order = bulk_order;
+
+	/* tin 0: full rate */
+	cake_set_rate(&q->tins[0], rate, mtu,
+		      US2TIME(q->target), US2TIME(q->interval));
+	/* tin 1: 1/16 of the rate */
+	cake_set_rate(&q->tins[1], rate >> 4, mtu,
+		      US2TIME(q->target), US2TIME(q->interval));
+	/* tin 2: 1/2 of the rate */
+	cake_set_rate(&q->tins[2], rate >> 1, mtu,
+		      US2TIME(q->target), US2TIME(q->interval));
+	/* tin 3: 1/4 of the rate */
+	cake_set_rate(&q->tins[3], rate >> 2, mtu,
+		      US2TIME(q->target), US2TIME(q->interval));
+
+	/* per tin: priority weight, then bandwidth-sharing weight */
+	q->tins[0].tin_quantum_prio = quantum;
+	q->tins[0].tin_quantum_band = quantum;
+
+	q->tins[1].tin_quantum_prio = quantum >> 4;
+	q->tins[1].tin_quantum_band = quantum >> 4;
+
+	q->tins[2].tin_quantum_prio = quantum << 2;
+	q->tins[2].tin_quantum_band = quantum >> 1;
+
+	q->tins[3].tin_quantum_prio = quantum << 4;
+	q->tins[3].tin_quantum_band = quantum >> 2;
+
+	return 0;
+}
+
+/* Configure three tins using the diffserv3 codepoint table.
+ *
+ * Simplified Diffserv structure with 3 tins.
+ * Low Priority (CS1)
+ * Best Effort
+ * Latency Sensitive (TOS4, VA, EF, CS6, CS7)
+ *
+ * Tin 0 runs at the full rate, tins 1 and 2 at 1/16 and 1/4 of it.
+ * Always returns 0.
+ */
+static int cake_config_diffserv3(struct Qdisc *sch)
+{
+	struct cake_sched_data *q = qdisc_priv(sch);
+	u32 rate = q->rate_bps;
+	u32 mtu = psched_mtu(qdisc_dev(sch));
+	u32 quantum = 1024;
+
+	q->tin_cnt = 3;
+
+	/* codepoint to class mapping */
+	q->tin_index = diffserv3;
+	q->tin_order = bulk_order;
+
+	/* tin 0: full rate */
+	cake_set_rate(&q->tins[0], rate, mtu,
+		      US2TIME(q->target), US2TIME(q->interval));
+	/* tin 1: 1/16 of the rate */
+	cake_set_rate(&q->tins[1], rate >> 4, mtu,
+		      US2TIME(q->target), US2TIME(q->interval));
+	/* tin 2: 1/4 of the rate */
+	cake_set_rate(&q->tins[2], rate >> 2, mtu,
+		      US2TIME(q->target), US2TIME(q->interval));
+
+	/* per tin: priority weight, then bandwidth-sharing weight */
+	q->tins[0].tin_quantum_prio = quantum;
+	q->tins[0].tin_quantum_band = quantum;
+
+	q->tins[1].tin_quantum_prio = quantum >> 4;
+	q->tins[1].tin_quantum_band = quantum >> 4;
+
+	q->tins[2].tin_quantum_prio = quantum << 4;
+	q->tins[2].tin_quantum_band = quantum >> 2;
+
+	return 0;
+}
+
+/* Re-derive all internal parameters from the user-visible configuration:
+ * run the tin setup for the selected Diffserv mode (diffserv3 for unknown
+ * modes), flush tins that are no longer in use, copy the global shaper rate
+ * from the reference tin and recompute the buffer limit.
+ */
+static void cake_reconfigure(struct Qdisc *sch)
+{
+	struct cake_sched_data *q = qdisc_priv(sch);
+	int ft, tin;
+
+	if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT)
+		ft = cake_config_besteffort(sch);
+	else if (q->tin_mode == CAKE_DIFFSERV_PRECEDENCE)
+		ft = cake_config_precedence(sch);
+	else if (q->tin_mode == CAKE_DIFFSERV_DIFFSERV8)
+		ft = cake_config_diffserv8(sch);
+	else if (q->tin_mode == CAKE_DIFFSERV_DIFFSERV4)
+		ft = cake_config_diffserv4(sch);
+	else	/* CAKE_DIFFSERV_DIFFSERV3 and anything unrecognised */
+		ft = cake_config_diffserv3(sch);
+
+	/* tins beyond tin_cnt are unused: drop their packets */
+	for (tin = q->tin_cnt; tin < CAKE_MAX_TINS; tin++) {
+		cake_clear_tin(sch, tin);
+		q->tins[tin].cparams.mtu_time = q->tins[ft].cparams.mtu_time;
+	}
+
+	q->rate_ns = q->tins[ft].tin_rate_ns;
+	q->rate_shft = q->tins[ft].tin_rate_shft;
+
+	if (q->buffer_config_limit) {
+		/* explicit user setting wins */
+		q->buffer_limit = q->buffer_config_limit;
+	} else if (q->rate_bps) {
+		/* rate * interval * 4, but at least 4 MiB */
+		u64 t = (u64)q->rate_bps * q->interval;
+
+		do_div(t, USEC_PER_SEC / 4);
+		q->buffer_limit = max_t(u32, t, 4U << 20);
+	} else {
+		q->buffer_limit = ~0;
+	}
+
+	sch->flags &= ~TCQ_F_CAN_BYPASS;
+
+	/* never exceed the larger of (packet limit * MTU) and the configured
+	 * limit
+	 */
+	q->buffer_limit = min(q->buffer_limit,
+			      max(sch->limit * psched_mtu(qdisc_dev(sch)),
+				  q->buffer_config_limit));
+}
+
+/* Parse the nested netlink attributes in @opt and store the supplied
+ * settings in the qdisc's private data.  Attributes that are absent
+ * leave the corresponding setting untouched.  If the tin array is
+ * already allocated, the new settings are applied through
+ * cake_reconfigure() under the qdisc tree lock.
+ *
+ * Returns 0 on success, -EINVAL if @opt is NULL, or the negative error
+ * returned by nla_parse_nested().
+ */
+static int cake_change(struct Qdisc *sch, struct nlattr *opt,
+		       struct netlink_ext_ack *extack)
+{
+	struct cake_sched_data *q = qdisc_priv(sch);
+	struct nlattr *tb[TCA_CAKE_MAX + 1];
+	int err;
+
+	if (!opt)
+		return -EINVAL;
+
+	err = nla_parse_nested(tb, TCA_CAKE_MAX, opt, cake_policy, extack);
+	if (err < 0)
+		return err;
+
+	if (tb[TCA_CAKE_BASE_RATE])
+		q->rate_bps = nla_get_u32(tb[TCA_CAKE_BASE_RATE]);
+
+	if (tb[TCA_CAKE_DIFFSERV_MODE])
+		q->tin_mode = nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]);
+
+	if (tb[TCA_CAKE_ATM])
+		q->atm_mode = nla_get_u32(tb[TCA_CAKE_ATM]);
+
+	if (tb[TCA_CAKE_WASH]) {
+		if (!!nla_get_u32(tb[TCA_CAKE_WASH]))
+			q->rate_flags |= CAKE_FLAG_WASH;
+		else
+			q->rate_flags &= ~CAKE_FLAG_WASH;
+	}
+
+	/* The NAT setting lives in a flag bit of flow_mode but has its own
+	 * attribute.  Changing the flow mode alone must not clear it, so
+	 * keep the current NAT bit and take only the mode bits from the
+	 * attribute.
+	 */
+	if (tb[TCA_CAKE_FLOW_MODE])
+		q->flow_mode = (q->flow_mode & CAKE_FLOW_NAT_FLAG) |
+			       (nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) &
+				~CAKE_FLOW_NAT_FLAG);
+
+	if (tb[TCA_CAKE_NAT]) {
+		q->flow_mode &= ~CAKE_FLOW_NAT_FLAG;
+		q->flow_mode |= CAKE_FLOW_NAT_FLAG *
+			!!nla_get_u32(tb[TCA_CAKE_NAT]);
+	}
+
+	/* Changing the overhead settings resets the observed min/max
+	 * packet-length statistics.
+	 */
+	if (tb[TCA_CAKE_OVERHEAD]) {
+		q->rate_overhead = nla_get_s32(tb[TCA_CAKE_OVERHEAD]);
+		q->rate_flags |= CAKE_FLAG_OVERHEAD;
+
+		q->max_netlen = q->max_adjlen = 0;
+		q->min_netlen = q->min_adjlen = ~0;
+	}
+
+	if (tb[TCA_CAKE_RAW]) {
+		q->rate_flags &= ~CAKE_FLAG_OVERHEAD;
+
+		q->max_netlen = q->max_adjlen = 0;
+		q->min_netlen = q->min_adjlen = ~0;
+	}
+
+	if (tb[TCA_CAKE_MPU])
+		q->rate_mpu = nla_get_u32(tb[TCA_CAKE_MPU]);
+
+	/* A zero interval or target is bumped to 1. */
+	if (tb[TCA_CAKE_RTT]) {
+		q->interval = nla_get_u32(tb[TCA_CAKE_RTT]);
+
+		if (!q->interval)
+			q->interval = 1;
+	}
+
+	if (tb[TCA_CAKE_TARGET]) {
+		q->target = nla_get_u32(tb[TCA_CAKE_TARGET]);
+
+		if (!q->target)
+			q->target = 1;
+	}
+
+	if (tb[TCA_CAKE_AUTORATE]) {
+		if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE]))
+			q->rate_flags |= CAKE_FLAG_AUTORATE_INGRESS;
+		else
+			q->rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS;
+	}
+
+	if (tb[TCA_CAKE_INGRESS]) {
+		if (!!nla_get_u32(tb[TCA_CAKE_INGRESS]))
+			q->rate_flags |= CAKE_FLAG_INGRESS;
+		else
+			q->rate_flags &= ~CAKE_FLAG_INGRESS;
+	}
+
+	if (tb[TCA_CAKE_ACK_FILTER])
+		q->ack_filter = nla_get_u32(tb[TCA_CAKE_ACK_FILTER]);
+
+	if (tb[TCA_CAKE_MEMORY])
+		q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]);
+
+	/* GSO splitting is derived from the rate, not set by an attribute:
+	 * on for a non-zero rate at or below CAKE_SPLIT_GSO_THRESHOLD.
+	 */
+	if (q->rate_bps && q->rate_bps <= CAKE_SPLIT_GSO_THRESHOLD)
+		q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
+	else
+		q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO;
+
+	/* q->tins is still NULL when called from cake_init(), which runs
+	 * cake_reconfigure() itself once the tins are allocated.
+	 */
+	if (q->tins) {
+		sch_tree_lock(sch);
+		cake_reconfigure(sch);
+		sch_tree_unlock(sch);
+	}
+
+	return 0;
+}
+
+
+/* Release @addr with kvfree().  kvfree() accepts NULL, so no guard is
+ * needed here.
+ */
+static void cake_free(void *addr)
+{
+	kvfree(addr);
+}
+
+/* Tear down the qdisc: cancel the watchdog and release the tin array.
+ * q->tins may still be NULL here (cake_init() calls this when the
+ * allocation fails); cake_free() copes with a NULL pointer.
+ */
+static void cake_destroy(struct Qdisc *sch)
+{
+	struct cake_sched_data *q = qdisc_priv(sch);
+
+	qdisc_watchdog_cancel(&q->watchdog);
+	cake_free(q->tins);
+}
+
+/* Initialise a new cake qdisc instance.
+ *
+ * Sets the defaults (three-tin Diffserv, triple-isolate flow mode,
+ * unlimited rate, 100ms interval, 5ms target), applies any options in
+ * @opt through cake_change(), allocates and initialises all
+ * CAKE_MAX_TINS tins and finally runs cake_reconfigure().
+ *
+ * Returns 0 on success, the error from cake_change(), or -ENOMEM if the
+ * tin array cannot be allocated.
+ */
+static int cake_init(struct Qdisc *sch, struct nlattr *opt,
+		     struct netlink_ext_ack *extack)
+{
+	struct cake_sched_data *q = qdisc_priv(sch);
+	int i, j;
+
+	sch->limit = 10240;
+	q->tin_mode = CAKE_DIFFSERV_DIFFSERV3;
+	q->flow_mode  = CAKE_FLOW_TRIPLE;
+
+	q->rate_bps = 0; /* unlimited by default */
+
+	q->interval = 100000; /* 100ms default */
+	q->target   =   5000; /* 5ms: codel RFC argues
+			       * for 5 to 10% of interval
+			       */
+
+	q->cur_tin = 0;
+	q->cur_flow  = 0;
+
+	/* Set up the watchdog before anything that can fail:
+	 * cake_destroy() cancels it unconditionally and may run after an
+	 * unsuccessful init.
+	 */
+	qdisc_watchdog_init(&q->watchdog, sch);
+
+	if (opt) {
+		int err = cake_change(sch, opt, extack);
+
+		if (err)
+			return err;
+	}
+
+	/* Fill the quantum_div table: entry i holds 65535 / i, entry 0 is
+	 * all-ones.
+	 */
+	quantum_div[0] = ~0;
+	for (i = 1; i <= CAKE_QUEUES; i++)
+		quantum_div[i] = 65535 / i;
+
+	q->tins = kvzalloc(CAKE_MAX_TINS * sizeof(struct cake_tin_data),
+			   GFP_KERNEL | __GFP_NOWARN);
+	if (!q->tins)
+		goto nomem;
+
+	for (i = 0; i < CAKE_MAX_TINS; i++) {
+		struct cake_tin_data *b = q->tins + i;
+
+		b->perturb = prandom_u32();
+		INIT_LIST_HEAD(&b->new_flows);
+		INIT_LIST_HEAD(&b->old_flows);
+		INIT_LIST_HEAD(&b->decaying_flows);
+		b->sparse_flow_count = 0;
+		b->bulk_flow_count = 0;
+		b->decaying_flow_count = 0;
+
+		for (j = 0; j < CAKE_QUEUES; j++) {
+			struct cake_flow *flow = b->flows + j;
+			/* one overflow_heap slot per (tin, flow) pair */
+			u32 k = j * CAKE_MAX_TINS + i;
+
+			INIT_LIST_HEAD(&flow->flowchain);
+			cobalt_vars_init(&flow->cvars);
+
+			q->overflow_heap[k].t = i;
+			q->overflow_heap[k].b = j;
+			b->overflow_idx[j] = k;
+		}
+	}
+
+	cake_reconfigure(sch);
+	q->avg_peak_bandwidth = q->rate_bps;
+	q->min_netlen = q->min_adjlen = ~0;
+	return 0;
+
+nomem:
+	cake_destroy(sch);
+	return -ENOMEM;
+}
+
+/* Dump the current configuration as a TCA_OPTIONS nest of TCA_CAKE_*
+ * attributes.  The NAT flag is reported separately from the flow mode,
+ * and TCA_CAKE_RAW is emitted only while CAKE_FLAG_OVERHEAD is clear.
+ *
+ * Returns the result of nla_nest_end() on success, or -1 if any
+ * attribute could not be added to @skb.
+ */
+static int cake_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct cake_sched_data *q = qdisc_priv(sch);
+	struct nlattr *opts;
+
+	opts = nla_nest_start(skb, TCA_OPTIONS);
+	if (!opts)
+		goto nla_put_failure;
+
+	if (nla_put_u32(skb, TCA_CAKE_BASE_RATE, q->rate_bps))
+		goto nla_put_failure;
+
+	if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, q->tin_mode))
+		goto nla_put_failure;
+
+	if (nla_put_u32(skb, TCA_CAKE_ATM, q->atm_mode))
+		goto nla_put_failure;
+
+	if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE,
+			q->flow_mode & ~CAKE_FLOW_NAT_FLAG))
+		goto nla_put_failure;
+
+	if (nla_put_u32(skb, TCA_CAKE_NAT,
+			!!(q->flow_mode & CAKE_FLOW_NAT_FLAG)))
+		goto nla_put_failure;
+
+	if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO,
+			!!(q->rate_flags & CAKE_FLAG_SPLIT_GSO)))
+		goto nla_put_failure;
+
+	if (nla_put_u32(skb, TCA_CAKE_WASH,
+			!!(q->rate_flags & CAKE_FLAG_WASH)))
+		goto nla_put_failure;
+
+	/* Signed, to match the nla_get_s32() used when parsing it. */
+	if (nla_put_s32(skb, TCA_CAKE_OVERHEAD, q->rate_overhead))
+		goto nla_put_failure;
+
+	if (nla_put_u32(skb, TCA_CAKE_MPU, q->rate_mpu))
+		goto nla_put_failure;
+
+	if (!(q->rate_flags & CAKE_FLAG_OVERHEAD))
+		if (nla_put_u32(skb, TCA_CAKE_RAW, 0))
+			goto nla_put_failure;
+
+	if (nla_put_u32(skb, TCA_CAKE_RTT, q->interval))
+		goto nla_put_failure;
+
+	if (nla_put_u32(skb, TCA_CAKE_TARGET, q->target))
+		goto nla_put_failure;
+
+	if (nla_put_u32(skb, TCA_CAKE_AUTORATE,
+			!!(q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS)))
+		goto nla_put_failure;
+
+	if (nla_put_u32(skb, TCA_CAKE_INGRESS,
+			!!(q->rate_flags & CAKE_FLAG_INGRESS)))
+		goto nla_put_failure;
+
+	if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, q->ack_filter))
+		goto nla_put_failure;
+
+	if (nla_put_u32(skb, TCA_CAKE_MEMORY, q->buffer_config_limit))
+		goto nla_put_failure;
+
+	return nla_nest_end(skb, opts);
+
+nla_put_failure:
+	return -1;
+}
+
+/* Dump statistics as a TCA_STATS_APP nest: a set of qdisc-wide
+ * TCA_CAKE_STATS_* values followed by a TCA_CAKE_STATS_TIN_STATS nest
+ * holding one sub-nest per active tin (attribute type i + 1), walked in
+ * q->tin_order.
+ *
+ * Returns the result of nla_nest_end() on success.  On any failure the
+ * whole TCA_STATS_APP nest is cancelled and -1 is returned.
+ */
+static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+	struct cake_sched_data *q = qdisc_priv(sch);
+	struct nlattr *stats = nla_nest_start(d->skb, TCA_STATS_APP);
+	struct nlattr *tstats, *ts;
+	int i;
+
+	if (!stats)
+		return -1;
+
+	/* No trailing semicolon in the macros: the caller supplies it, so
+	 * each use expands to exactly one statement.
+	 */
+#define PUT_STAT_U32(attr, data) do {				       \
+		if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
+			goto nla_put_failure;			       \
+	} while (0)
+
+	PUT_STAT_U32(CAPACITY_ESTIMATE, q->avg_peak_bandwidth);
+	PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit);
+	PUT_STAT_U32(MEMORY_USED, q->buffer_max_used);
+	/* avg_netoff is reported rounded to the nearest integer after
+	 * dropping its low 16 bits.
+	 */
+	PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16));
+	PUT_STAT_U32(MAX_NETLEN, q->max_netlen);
+	PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen);
+	PUT_STAT_U32(MIN_NETLEN, q->min_netlen);
+	PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen);
+
+#undef PUT_STAT_U32
+
+	tstats = nla_nest_start(d->skb, TCA_CAKE_STATS_TIN_STATS);
+	if (!tstats)
+		goto nla_put_failure;
+
+#define PUT_TSTAT_U32(attr, data) do {					\
+		if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \
+			goto nla_put_failure;				\
+	} while (0)
+#define PUT_TSTAT_U64(attr, data) do {					\
+		if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \
+				      data, TCA_CAKE_TIN_STATS_PAD))	\
+			goto nla_put_failure;				\
+	} while (0)
+
+	for (i = 0; i < q->tin_cnt; i++) {
+		struct cake_tin_data *b = &q->tins[q->tin_order[i]];
+
+		ts = nla_nest_start(d->skb, i + 1);
+		if (!ts)
+			goto nla_put_failure;
+
+		PUT_TSTAT_U32(THRESHOLD_RATE, b->tin_rate_bps);
+		PUT_TSTAT_U32(TARGET_US, cobalt_time_to_us(b->cparams.target));
+		PUT_TSTAT_U32(INTERVAL_US,
+			      cobalt_time_to_us(b->cparams.interval));
+
+		PUT_TSTAT_U32(SENT_PACKETS, b->packets);
+		PUT_TSTAT_U64(SENT_BYTES64, b->bytes);
+		PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped);
+		PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark);
+		PUT_TSTAT_U64(BACKLOG_BYTES64, b->tin_backlog);
+		PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops);
+
+		PUT_TSTAT_U32(PEAK_DELAY_US, cobalt_time_to_us(b->peak_delay));
+		PUT_TSTAT_U32(AVG_DELAY_US, cobalt_time_to_us(b->avge_delay));
+		PUT_TSTAT_U32(BASE_DELAY_US, cobalt_time_to_us(b->base_delay));
+
+		PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits);
+		PUT_TSTAT_U32(WAY_MISSES, b->way_misses);
+		PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions);
+
+		/* decaying flows are counted together with sparse flows */
+		PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count +
+					    b->decaying_flow_count);
+		PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count);
+		PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count);
+		PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen);
+
+		PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum);
+		nla_nest_end(d->skb, ts);
+	}
+
+#undef PUT_TSTAT_U32
+#undef PUT_TSTAT_U64
+
+	nla_nest_end(d->skb, tstats);
+	return nla_nest_end(d->skb, stats);
+
+nla_put_failure:
+	nla_nest_cancel(d->skb, stats);
+	return -1;
+}
+
+/* Qdisc operations table for "cake"; registered and unregistered by the
+ * module init/exit functions below. Peeking uses the generic
+ * qdisc_peek_dequeued helper.
+ */
+static struct Qdisc_ops cake_qdisc_ops __read_mostly = {
+ .id = "cake",
+ .priv_size = sizeof(struct cake_sched_data),
+ .enqueue = cake_enqueue,
+ .dequeue = cake_dequeue,
+ .peek = qdisc_peek_dequeued,
+ .init = cake_init,
+ .reset = cake_reset,
+ .destroy = cake_destroy,
+ .change = cake_change,
+ .dump = cake_dump,
+ .dump_stats = cake_dump_stats,
+ .owner = THIS_MODULE,
+};
+
+/* Module load: register the cake qdisc; returns register_qdisc()'s result. */
+static int __init cake_module_init(void)
+{
+ return register_qdisc(&cake_qdisc_ops);
+}
+
+/* Module unload: unregister the cake qdisc. */
+static void __exit cake_module_exit(void)
+{
+ unregister_qdisc(&cake_qdisc_ops);
+}
+
+module_init(cake_module_init)
+module_exit(cake_module_exit)
+MODULE_AUTHOR("Jonathan Morton");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("The CAKE shaper.");
--
2.17.0
^ permalink raw reply related
* [PATCH iproute2-next v5] Add support for cake qdisc
From: Toke Høiland-Jørgensen @ 2018-04-27 12:17 UTC (permalink / raw)
To: netdev; +Cc: cake, Toke Høiland-Jørgensen, Dave Taht
In-Reply-To: <20180427121706.23273-1-toke@toke.dk>
sch_cake is intended to squeeze the most bandwidth and latency out of even
the slowest ISP links and routers, while presenting an API simple enough
that even an ISP can configure it.
Example of use on a cable ISP uplink:
tc qdisc add dev eth0 cake bandwidth 20Mbit nat docsis ack-filter
To shape a cable download link (ifb and tc-mirred setup elided)
tc qdisc add dev ifb0 cake bandwidth 200mbit nat docsis ingress wash besteffort
Cake is filled with:
* A hybrid Codel/Blue AQM algorithm, "Cobalt", tied to an FQ_Codel
derived Flow Queuing system, which autoconfigures based on the bandwidth.
* A novel "triple-isolate" mode (the default) which balances per-host
and per-flow FQ even through NAT.
* A deficit-based shaper that can also be used in an unlimited mode.
* 8 way set associative hashing to reduce flow collisions to a minimum.
* A reasonable interpretation of various diffserv latency/loss tradeoffs.
* Support for zeroing diffserv markings for entering and exiting traffic.
* Support for interacting well with Docsis 3.0 shaper framing.
* Support for DSL framing types and shapers.
* Support for ack filtering.
* Extensive statistics for measuring loss, ECN markings, and latency variation.
Various versions have been available as an out-of-tree build for
kernel versions going back to 3.10, as the embedded router world has been
running a few years behind mainline Linux. A stable version has been
generally available on lede-17.01 and later.
sch_cake replaces a combination of iptables, tc filter, htb and fq_codel
in the sqm-scripts, with sane defaults and vastly simpler configuration.
Cake's principal author is Jonathan Morton, with contributions from
Kevin Darbyshire-Bryant, Toke Høiland-Jørgensen, Sebastian Moeller,
Ryan Mounce, Guido Sarducci, Dean Scarff, Nils Andreas Svee, Dave Täht,
and Loganaden Velvindron.
Testing from Pete Heist, Georgios Amanakis, and the many other members of
the cake@lists.bufferbloat.net mailing list.
Signed-off-by: Dave Taht <dave.taht@gmail.com>
Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
---
Changelog:
v5:
- Print the SPLIT_GSO flag
- Switch to print_u64() for JSON output
- Fix a format string for mpu option output
v4:
- Switch stats parsing to use nested netlink attributes
- Tweaks to JSON stats output keys
v3:
- Remove accidentally included test flag
v2:
- Updated netlink config ABI
- Remove diffserv-llt mode
- Various tweaks and clean-ups of stats output
man/man8/tc-cake.8 | 632 ++++++++++++++++++++++++++++++++++++++
man/man8/tc.8 | 1 +
tc/Makefile | 1 +
tc/q_cake.c | 739 +++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 1373 insertions(+)
create mode 100644 man/man8/tc-cake.8
create mode 100644 tc/q_cake.c
diff --git a/man/man8/tc-cake.8 b/man/man8/tc-cake.8
new file mode 100644
index 00000000..dff2e360
--- /dev/null
+++ b/man/man8/tc-cake.8
@@ -0,0 +1,632 @@
+.TH CAKE 8 "27 April 2018" "iproute2" "Linux"
+.SH NAME
+CAKE \- Common Applications Kept Enhanced (CAKE)
+.SH SYNOPSIS
+.B tc qdisc ... cake
+.br
+[
+.BR bandwidth
+RATE |
+.BR unlimited*
+|
+.BR autorate_ingress
+]
+.br
+[
+.BR rtt
+TIME |
+.BR datacentre
+|
+.BR lan
+|
+.BR metro
+|
+.BR regional
+|
+.BR internet*
+|
+.BR oceanic
+|
+.BR satellite
+|
+.BR interplanetary
+]
+.br
+[
+.BR besteffort
+|
+.BR diffserv8
+|
+.BR diffserv4
+|
+.BR diffserv3*
+]
+.br
+[
+.BR flowblind
+|
+.BR srchost
+|
+.BR dsthost
+|
+.BR hosts
+|
+.BR flows
+|
+.BR dual-srchost
+|
+.BR dual-dsthost
+|
+.BR triple-isolate*
+]
+.br
+[
+.BR nat
+|
+.BR nonat*
+]
+.br
+[
+.BR wash
+|
+.BR nowash*
+]
+.br
+[
+.BR ack-filter
+|
+.BR ack-filter-aggressive
+|
+.BR no-ack-filter*
+]
+.br
+[
+.BR memlimit
+LIMIT ]
+.br
+[
+.BR ptm
+|
+.BR atm
+|
+.BR noatm*
+]
+.br
+[
+.BR overhead
+N |
+.BR conservative
+|
+.BR raw*
+]
+.br
+[
+.BR mpu
+N ]
+.br
+[
+.BR ingress
+|
+.BR egress*
+]
+.br
+(* marks defaults)
+
+
+.SH DESCRIPTION
+CAKE (Common Applications Kept Enhanced) is a shaping-capable queue discipline
+which uses both AQM and FQ. It combines COBALT, which is an AQM algorithm
+combining Codel and BLUE, a shaper which operates in deficit mode, and a variant
+of DRR++ for flow isolation. 8-way set-associative hashing is used to virtually
+eliminate hash collisions. Priority queuing is available through a simplified
+diffserv implementation. Overhead compensation for various encapsulation
+schemes is tightly integrated.
+
+All settings are optional; the default settings are chosen to be sensible in
+most common deployments. Most people will only need to set the
+.B bandwidth
+parameter to get useful results, but reading the
+.B Overhead Compensation
+and
+.B Round Trip Time
+sections is strongly encouraged.
+
+.SH SHAPER PARAMETERS
+CAKE uses a deficit-mode shaper, which does not exhibit the initial burst
+typical of token-bucket shapers. It will automatically burst precisely as much
+as required to maintain the configured throughput. As such, it is very
+straightforward to configure.
+.PP
+.B unlimited
+(default)
+.br
+ No limit on the bandwidth.
+.PP
+.B bandwidth
+RATE
+.br
+ Set the shaper bandwidth. See
+.BR tc(8)
+or examples below for details of the RATE value.
+.PP
+.B autorate_ingress
+.br
+ Automatic capacity estimation based on traffic arriving at this qdisc.
+This is most likely to be useful with cellular links, which tend to change
+quality randomly. A
+.B bandwidth
+parameter can be used in conjunction to specify an initial estimate. The shaper
+will periodically be set to a bandwidth slightly below the estimated rate. This
+estimator cannot estimate the bandwidth of links downstream of itself.
+
+.SH OVERHEAD COMPENSATION PARAMETERS
+The size of each packet on the wire may differ from that seen by Linux. The
+following parameters allow CAKE to compensate for this difference by internally
+considering each packet to be bigger than Linux informs it. To assist users who
+are not expert network engineers, keywords have been provided to represent a
+number of common link technologies.
+
+.SS Manual Overhead Specification
+.B overhead
+BYTES
+.br
+ Adds BYTES to the size of each packet. BYTES may be negative; values
+between -64 and 256 (inclusive) are accepted.
+.PP
+.B mpu
+BYTES
+.br
+ Rounds each packet (including overhead) up to a minimum length
+BYTES. BYTES may not be negative; values between 0 and 256 (inclusive)
+are accepted.
+.PP
+.B atm
+.br
+ Compensates for ATM cell framing, which is normally found on ADSL links.
+This is performed after the
+.B overhead
+parameter above. ATM uses fixed 53-byte cells, each of which can carry 48 bytes
+payload.
+.PP
+.B ptm
+.br
+ Compensates for PTM encoding, which is normally found on VDSL2 links and
+uses a 64b/65b encoding scheme. It is even more efficient to simply
+derate the specified shaper bandwidth by a factor of 64/65 or 0.984. See
+ITU G.992.3 Annex N and IEEE 802.3 Section 61.3 for details.
+.PP
+.B noatm
+.br
+ Disables ATM and PTM compensation.
+
+.SS Failsafe Overhead Keywords
+These two keywords are provided for quick-and-dirty setup. Use them if you
+can't be bothered to read the rest of this section.
+.PP
+.B raw
+(default)
+.br
+ Turns off all overhead compensation in CAKE. The packet size reported
+by Linux will be used directly.
+.PP
+ Other overhead keywords may be added after "raw". The effect of this is
+to make the overhead compensation operate relative to the reported packet size,
+not the underlying IP packet size.
+.PP
+.B conservative
+.br
+ Compensates for more overhead than is likely to occur on any
+widely-deployed link technology.
+.br
+ Equivalent to
+.B overhead 48 atm.
+
+.SS ADSL Overhead Keywords
+Most ADSL modems have a way to check which framing scheme is in use. Often this
+is also specified in the settings document provided by the ISP. The keywords in
+this section are intended to correspond with these sources of information. All
+of them implicitly set the
+.B atm
+flag.
+.PP
+.B pppoa-vcmux
+.br
+ Equivalent to
+.B overhead 10 atm
+.PP
+.B pppoa-llc
+.br
+ Equivalent to
+.B overhead 14 atm
+.PP
+.B pppoe-vcmux
+.br
+ Equivalent to
+.B overhead 32 atm
+.PP
+.B pppoe-llcsnap
+.br
+ Equivalent to
+.B overhead 40 atm
+.PP
+.B bridged-vcmux
+.br
+ Equivalent to
+.B overhead 24 atm
+.PP
+.B bridged-llcsnap
+.br
+ Equivalent to
+.B overhead 32 atm
+.PP
+.B ipoa-vcmux
+.br
+ Equivalent to
+.B overhead 8 atm
+.PP
+.B ipoa-llcsnap
+.br
+ Equivalent to
+.B overhead 16 atm
+.PP
+See also the Ethernet Correction Factors section below.
+
+.SS VDSL2 Overhead Keywords
+ATM was dropped from VDSL2 in favour of PTM, which is a much more
+straightforward framing scheme. Some ISPs retained PPPoE for compatibility with
+their existing back-end systems.
+.PP
+.B pppoe-ptm
+.br
+ Equivalent to
+.B overhead 30 ptm
+
+.br
+ PPPoE: 2B PPP + 6B PPPoE +
+.br
+ ETHERNET: 6B dest MAC + 6B src MAC + 2B ethertype + 4B Frame Check Sequence +
+.br
+ PTM: 1B Start of Frame (S) + 1B End of Frame (Ck) + 2B TC-CRC (PTM-FCS)
+.br
+.PP
+.B bridged-ptm
+.br
+ Equivalent to
+.B overhead 22 ptm
+.br
+ ETHERNET: 6B dest MAC + 6B src MAC + 2B ethertype + 4B Frame Check Sequence +
+.br
+ PTM: 1B Start of Frame (S) + 1B End of Frame (Ck) + 2B TC-CRC (PTM-FCS)
+.br
+.PP
+See also the Ethernet Correction Factors section below.
+
+.SS DOCSIS Cable Overhead Keyword
+DOCSIS is the universal standard for providing Internet service over cable-TV
+infrastructure.
+
+In this case, the actual on-wire overhead is less important than the packet size
+the head-end equipment uses for shaping and metering. This is specified to be
+an Ethernet frame including the CRC (aka FCS).
+.PP
+.B docsis
+.br
+ Equivalent to
+.B overhead 18 mpu 64 noatm
+
+.SS Ethernet Overhead Keywords
+.PP
+.B ethernet
+.br
+ Accounts for Ethernet's preamble, inter-frame gap, and Frame Check
+Sequence. Use this keyword when the bottleneck being shaped for is an
+actual Ethernet cable.
+.br
+ Equivalent to
+.B overhead 38 mpu 84 noatm
+.PP
+.B ether-vlan
+.br
+ Adds 4 bytes to the overhead compensation, accounting for an IEEE 802.1Q
+VLAN header appended to the Ethernet frame header. NB: Some ISPs use one or
+even two of these within PPPoE; this keyword may be repeated as necessary to
+express this.
+
+.SH ROUND TRIP TIME PARAMETERS
+Active Queue Management (AQM) consists of embedding congestion signals in the
+packet flow, which receivers use to instruct senders to slow down when the queue
+is persistently occupied. CAKE uses ECN signalling when available, and packet
+drops otherwise, according to a combination of the Codel and BLUE AQM algorithms
+called COBALT.
+
+Very short latencies require a very rapid AQM response to adequately control
+latency. However, such a rapid response tends to impair throughput when the
+actual RTT is relatively long. CAKE allows specifying the RTT it assumes for
+tuning various parameters. Actual RTTs within an order of magnitude of this
+will generally work well for both throughput and latency management.
+
+At the 'lan' setting and below, the time constants are similar in magnitude to
+the jitter in the Linux kernel itself, so congestion might be signalled
+prematurely. The flows will then become sparse and total throughput reduced,
+leaving little or no back-pressure for the fairness logic to work against. Use
+the "metro" setting for local lans unless you have a custom kernel.
+.PP
+.B rtt
+TIME
+.br
+ Manually specify an RTT.
+.PP
+.B datacentre
+.br
+ For extremely high-performance 10GigE+ networks only. Equivalent to
+.B rtt 100us.
+.PP
+.B lan
+.br
+ For pure Ethernet (not Wi-Fi) networks, at home or in the office. Don't
+use this when shaping for an Internet access link. Equivalent to
+.B rtt 1ms.
+.PP
+.B metro
+.br
+ For traffic mostly within a single city. Equivalent to
+.B rtt 10ms.
+.PP
+.B regional
+.br
+ For traffic mostly within a European-sized country. Equivalent to
+.B rtt 30ms.
+.PP
+.B internet
+(default)
+.br
+ This is suitable for most Internet traffic. Equivalent to
+.B rtt 100ms.
+.PP
+.B oceanic
+.br
+ For Internet traffic with generally above-average latency, such as that
+suffered by Australasian residents. Equivalent to
+.B rtt 300ms.
+.PP
+.B satellite
+.br
+ For traffic via geostationary satellites. Equivalent to
+.B rtt 1000ms.
+.PP
+.B interplanetary
+.br
+ So named because Jupiter is about 1 light-hour from Earth. Use this to
+(almost) completely disable AQM actions. Equivalent to
+.B rtt 3600s.
+
+.SH FLOW ISOLATION PARAMETERS
+With flow isolation enabled, CAKE places packets from different flows into
+different queues, each of which carries its own AQM state. Packets from each
+queue are then delivered fairly, according to a DRR++ algorithm which minimises
+latency for "sparse" flows. CAKE uses a set-associative hashing algorithm to
+minimise flow collisions.
+
+These keywords specify whether fairness based on source address, destination
+address, individual flows, or any combination of those is desired.
+.PP
+.B flowblind
+.br
+ Disables flow isolation; all traffic passes through a single queue for
+each tin.
+.PP
+.B srchost
+.br
+ Flows are defined only by source address. Could be useful on the egress
+path of an ISP backhaul.
+.PP
+.B dsthost
+.br
+ Flows are defined only by destination address. Could be useful on the
+ingress path of an ISP backhaul.
+.PP
+.B hosts
+.br
+ Flows are defined by source-destination host pairs. This is host
+isolation, rather than flow isolation.
+.PP
+.B flows
+.br
+ Flows are defined by the entire 5-tuple of source address, destination
+address, transport protocol, source port and destination port. This is the type
+of flow isolation performed by SFQ and fq_codel.
+.PP
+.B dual-srchost
+.br
+ Flows are defined by the 5-tuple, and fairness is applied first over
+source addresses, then over individual flows. Good for use on egress traffic
+from a LAN to the internet, where it'll prevent any one LAN host from
+monopolising the uplink, regardless of the number of flows they use.
+.PP
+.B dual-dsthost
+.br
+ Flows are defined by the 5-tuple, and fairness is applied first over
+destination addresses, then over individual flows. Good for use on ingress
+traffic to a LAN from the internet, where it'll prevent any one LAN host from
+monopolising the downlink, regardless of the number of flows they use.
+.PP
+.B triple-isolate
+(default)
+.br
+ Flows are defined by the 5-tuple, and fairness is applied over source
+*and* destination addresses intelligently (ie. not merely by host-pairs), and
+also over individual flows. Use this if you're not certain whether to use
+dual-srchost or dual-dsthost; it'll do both jobs at once, preventing any one
+host on *either* side of the link from monopolising it with a large number of
+flows.
+.PP
+.B nat
+.br
+ Instructs Cake to perform a NAT lookup before applying flow-isolation
+rules, to determine the true addresses and port numbers of the packet, to
+improve fairness between hosts "inside" the NAT. This has no practical effect
+in "flowblind" or "flows" modes, or if NAT is performed on a different host.
+.PP
+.B nonat
+(default)
+.br
+ Cake will not perform a NAT lookup. Flow isolation will be performed
+using the addresses and port numbers directly visible to the interface Cake is
+attached to.
+
+.SH PRIORITY QUEUE PARAMETERS
+CAKE can divide traffic into "tins" based on the Diffserv field. Each tin has
+its own independent set of flow-isolation queues, and is serviced based on a WRR
+algorithm. To avoid perverse Diffserv marking incentives, tin weights have a
+"priority sharing" value when bandwidth used by that tin is below a threshold,
+and a lower "bandwidth sharing" value when above. Bandwidth is compared against
+the threshold using the same algorithm as the deficit-mode shaper.
+
+Detailed customisation of tin parameters is not provided. The following presets
+perform all necessary tuning, relative to the current shaper bandwidth and RTT
+settings.
+.PP
+.B besteffort
+.br
+ Disables priority queuing by placing all traffic in one tin.
+.PP
+.B precedence
+.br
+ Enables legacy interpretation of TOS "Precedence" field. Use of this
+preset on the modern Internet is firmly discouraged.
+.PP
+.B diffserv4
+.br
+ Provides a general-purpose Diffserv implementation with four tins:
+.br
+ Bulk (CS1), 6.25% threshold, generally low priority.
+.br
+ Best Effort (general), 100% threshold.
+.br
+ Video (AF4x, AF3x, CS3, AF2x, CS2, TOS4, TOS1), 50% threshold.
+.br
+ Voice (CS7, CS6, EF, VA, CS5, CS4), 25% threshold.
+.PP
+.B diffserv3
+(default)
+.br
+ Provides a simple, general-purpose Diffserv implementation with three tins:
+.br
+ Bulk (CS1), 6.25% threshold, generally low priority.
+.br
+ Best Effort (general), 100% threshold.
+.br
+ Voice (CS7, CS6, EF, VA, TOS4), 25% threshold, reduced Codel interval.
+
+.SH OTHER PARAMETERS
+.B memlimit
+LIMIT
+.br
+ Limit the memory consumed by Cake to LIMIT bytes. Note that this does
+not translate directly to queue size (so do not size this based on bandwidth
+delay product considerations, but rather on worst case acceptable memory
+consumption), as there is some overhead in the data structures containing the
+packets, especially for small packets.
+
+ By default, the limit is calculated based on the bandwidth and RTT
+settings.
+
+.PP
+.B wash
+
+.br
+ Traffic entering your diffserv domain is frequently mis-marked in
+transit from the perspective of your network, and traffic exiting yours may be
+mis-marked from the perspective of the transiting provider.
+
+Apply the wash option to clear all extra diffserv bits (but not the ECN bits)
+after priority queuing has taken place.
+
+If you are shaping inbound, and cannot trust the diffserv markings (as is the
+case for Comcast Cable, among others), it is best to use a single queue
+"besteffort" mode with wash.
+
+.SH EXAMPLES
+# tc qdisc delete root dev eth0
+.br
+# tc qdisc add root dev eth0 cake bandwidth 100Mbit ethernet
+.br
+# tc -s qdisc show dev eth0
+.br
+qdisc cake 1: root refcnt 2 bandwidth 100Mbit diffserv3 triple-isolate rtt 100.0ms noatm overhead 38 mpu 84
+ Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
+ backlog 0b 0p requeues 0
+ memory used: 0b of 5000000b
+ capacity estimate: 100Mbit
+ min/max network layer size: 65535 / 0
+ min/max overhead-adjusted size: 65535 / 0
+ average network hdr offset: 0
+
+ Bulk Best Effort Voice
+ thresh 6250Kbit 100Mbit 25Mbit
+ target 5.0ms 5.0ms 5.0ms
+ interval 100.0ms 100.0ms 100.0ms
+ pk_delay 0us 0us 0us
+ av_delay 0us 0us 0us
+ sp_delay 0us 0us 0us
+ pkts 0 0 0
+ bytes 0 0 0
+ way_inds 0 0 0
+ way_miss 0 0 0
+ way_cols 0 0 0
+ drops 0 0 0
+ marks 0 0 0
+ ack_drop 0 0 0
+ sp_flows 0 0 0
+ bk_flows 0 0 0
+ un_flows 0 0 0
+ max_len 0 0 0
+ quantum 300 1514 762
+
+After some use:
+.br
+# tc -s qdisc show dev eth0
+
+qdisc cake 1: root refcnt 2 bandwidth 100Mbit diffserv3 triple-isolate rtt 100.0ms noatm overhead 38 mpu 84
+ Sent 44709231 bytes 31931 pkt (dropped 45, overlimits 93782 requeues 0)
+ backlog 33308b 22p requeues 0
+ memory used: 292352b of 5000000b
+ capacity estimate: 100Mbit
+ min/max network layer size: 28 / 1500
+ min/max overhead-adjusted size: 84 / 1538
+ average network hdr offset: 14
+
+ Bulk Best Effort Voice
+ thresh 6250Kbit 100Mbit 25Mbit
+ target 5.0ms 5.0ms 5.0ms
+ interval 100.0ms 100.0ms 100.0ms
+ pk_delay 8.7ms 6.9ms 5.0ms
+ av_delay 4.9ms 5.3ms 3.8ms
+ sp_delay 727us 1.4ms 511us
+ pkts 2590 21271 8137
+ bytes 3081804 30302659 11426206
+ way_inds 0 46 0
+ way_miss 3 17 4
+ way_cols 0 0 0
+ drops 20 15 10
+ marks 0 0 0
+ ack_drop 0 0 0
+ sp_flows 2 4 1
+ bk_flows 1 2 1
+ un_flows 0 0 0
+ max_len 1514 1514 1514
+ quantum 300 1514 762
+
+.SH SEE ALSO
+.BR tc (8),
+.BR tc-codel (8),
+.BR tc-fq_codel (8),
+.BR tc-red (8)
+
+.SH AUTHORS
+Cake's principal author is Jonathan Morton, with contributions from
+Tony Ambardar, Kevin Darbyshire-Bryant, Toke Høiland-Jørgensen,
+Sebastian Moeller, Ryan Mounce, Dean Scarff, Nils Andreas Svee, and Dave Täht.
+
+This manual page was written by Loganaden Velvindron. Please report corrections
+to the Linux Networking mailing list <netdev@vger.kernel.org>.
diff --git a/man/man8/tc.8 b/man/man8/tc.8
index 840880fb..716dfec5 100644
--- a/man/man8/tc.8
+++ b/man/man8/tc.8
@@ -795,6 +795,7 @@ was written by Alexey N. Kuznetsov and added in Linux 2.2.
.BR tc-basic (8),
.BR tc-bfifo (8),
.BR tc-bpf (8),
+.BR tc-cake (8),
.BR tc-cbq (8),
.BR tc-cgroup (8),
.BR tc-choke (8),
diff --git a/tc/Makefile b/tc/Makefile
index dfd00267..d9a43568 100644
--- a/tc/Makefile
+++ b/tc/Makefile
@@ -66,6 +66,7 @@ TCMODULES += q_codel.o
TCMODULES += q_fq_codel.o
TCMODULES += q_fq.o
TCMODULES += q_pie.o
+TCMODULES += q_cake.o
TCMODULES += q_hhf.o
TCMODULES += q_clsact.o
TCMODULES += e_bpf.o
diff --git a/tc/q_cake.c b/tc/q_cake.c
new file mode 100644
index 00000000..a1e66f4e
--- /dev/null
+++ b/tc/q_cake.c
@@ -0,0 +1,739 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
+/*
+ * Common Applications Kept Enhanced -- CAKE
+ *
+ * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com>
+ * Copyright (C) 2017-2018 Toke Høiland-Jørgensen <toke@toke.dk>
+ */
+
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <string.h>
+#include <inttypes.h>
+
+#include "utils.h"
+#include "tc_util.h"
+
+/* Write the cake option summary to stderr; '*' marks the default choice. */
+static void explain(void)
+{
+	static const char usage[] =
+"Usage: ... cake [ bandwidth RATE | unlimited* | autorate_ingress ]\n"
+" [ rtt TIME | datacentre | lan | metro | regional |\n"
+" internet* | oceanic | satellite | interplanetary ]\n"
+" [ besteffort | diffserv8 | diffserv4 | diffserv3* ]\n"
+" [ flowblind | srchost | dsthost | hosts | flows |\n"
+" dual-srchost | dual-dsthost | triple-isolate* ]\n"
+" [ nat | nonat* ]\n"
+" [ wash | nowash* ]\n"
+" [ ack-filter | ack-filter-aggressive | no-ack-filter* ]\n"
+" [ memlimit LIMIT ]\n"
+" [ ptm | atm | noatm* ] [ overhead N | conservative | raw* ]\n"
+" [ mpu N ] [ ingress | egress* ]\n"
+" (* marks defaults)\n";
+
+	/* The text holds no conversion specifiers, so emit it verbatim. */
+	fputs(usage, stderr);
+}
+
+/*
+ * Parse a complete base-10 integer from arg and range-check it.
+ *
+ * The check is done on the long returned by strtol(), before narrowing to
+ * int, so out-of-range input cannot wrap into the accepted interval.
+ *
+ * Returns 0 and stores the value in *val when arg is a non-empty decimal
+ * number within [lo, hi]; returns -1 and leaves *val untouched otherwise.
+ */
+static int cake_get_bounded_int(int *val, const char *arg, long lo, long hi)
+{
+	char *end = NULL;
+	long v;
+
+	if (!arg || !*arg)
+		return -1;
+
+	v = strtol(arg, &end, 10);
+	if (!end || *end || v < lo || v > hi)
+		return -1;
+
+	*val = (int)v;
+	return 0;
+}
+
+/*
+ * Parse the cake command-line keywords in argv and append the matching
+ * TCA_CAKE_* attributes to the netlink message n, nested inside a
+ * TCA_OPTIONS attribute.
+ *
+ * Only options that were given (or changed from their "unset" marker of
+ * -1 / 0) produce an attribute; everything else is left out of the message.
+ *
+ * Returns 0 on success, -1 on an illegal value, an unknown keyword or
+ * "help" (the usage text is printed for the latter two).
+ */
+static int cake_parse_opt(struct qdisc_util *qu, int argc, char **argv,
+			  struct nlmsghdr *n, const char *dev)
+{
+	int unlimited = 0;
+	unsigned bandwidth = 0;
+	unsigned interval = 0;
+	unsigned target = 0;
+	unsigned diffserv = 0;
+	unsigned memlimit = 0;
+	int overhead = 0;
+	bool overhead_set = false;
+	bool overhead_override = false;
+	int mpu = 0;
+	int flowmode = -1;
+	int nat = -1;
+	int atm = -1;
+	int autorate = -1;
+	int wash = -1;
+	int ingress = -1;
+	int ack_filter = -1;
+	struct rtattr *tail;
+
+	while (argc > 0) {
+		if (strcmp(*argv, "bandwidth") == 0) {
+			NEXT_ARG();
+			if (get_rate(&bandwidth, *argv)) {
+				fprintf(stderr, "Illegal \"bandwidth\"\n");
+				return -1;
+			}
+			unlimited = 0;
+			autorate = 0;
+		} else if (strcmp(*argv, "unlimited") == 0) {
+			bandwidth = 0;
+			unlimited = 1;
+			autorate = 0;
+		} else if (strcmp(*argv, "autorate_ingress") == 0) {
+			autorate = 1;
+
+		} else if (strcmp(*argv, "rtt") == 0) {
+			NEXT_ARG();
+			if (get_time(&interval, *argv)) {
+				fprintf(stderr, "Illegal \"rtt\"\n");
+				return -1;
+			}
+			/* target is 5% of the interval, but never zero */
+			target = interval / 20;
+			if (!target)
+				target = 1;
+		} else if (strcmp(*argv, "datacentre") == 0) {
+			interval = 100;
+			target = 5;
+		} else if (strcmp(*argv, "lan") == 0) {
+			interval = 1000;
+			target = 50;
+		} else if (strcmp(*argv, "metro") == 0) {
+			interval = 10000;
+			target = 500;
+		} else if (strcmp(*argv, "regional") == 0) {
+			interval = 30000;
+			target = 1500;
+		} else if (strcmp(*argv, "internet") == 0) {
+			interval = 100000;
+			target = 5000;
+		} else if (strcmp(*argv, "oceanic") == 0) {
+			interval = 300000;
+			target = 15000;
+		} else if (strcmp(*argv, "satellite") == 0) {
+			interval = 1000000;
+			target = 50000;
+		} else if (strcmp(*argv, "interplanetary") == 0) {
+			interval = 1000000000;
+			target = 50000000;
+
+		} else if (strcmp(*argv, "besteffort") == 0) {
+			diffserv = CAKE_DIFFSERV_BESTEFFORT;
+		} else if (strcmp(*argv, "precedence") == 0) {
+			diffserv = CAKE_DIFFSERV_PRECEDENCE;
+		} else if (strcmp(*argv, "diffserv8") == 0) {
+			diffserv = CAKE_DIFFSERV_DIFFSERV8;
+		} else if (strcmp(*argv, "diffserv4") == 0) {
+			diffserv = CAKE_DIFFSERV_DIFFSERV4;
+		} else if (strcmp(*argv, "diffserv") == 0) {
+			/* bare "diffserv" is an alias for diffserv4 */
+			diffserv = CAKE_DIFFSERV_DIFFSERV4;
+		} else if (strcmp(*argv, "diffserv3") == 0) {
+			diffserv = CAKE_DIFFSERV_DIFFSERV3;
+
+		} else if (strcmp(*argv, "nowash") == 0) {
+			wash = 0;
+		} else if (strcmp(*argv, "wash") == 0) {
+			wash = 1;
+
+		} else if (strcmp(*argv, "flowblind") == 0) {
+			flowmode = CAKE_FLOW_NONE;
+		} else if (strcmp(*argv, "srchost") == 0) {
+			flowmode = CAKE_FLOW_SRC_IP;
+		} else if (strcmp(*argv, "dsthost") == 0) {
+			flowmode = CAKE_FLOW_DST_IP;
+		} else if (strcmp(*argv, "hosts") == 0) {
+			flowmode = CAKE_FLOW_HOSTS;
+		} else if (strcmp(*argv, "flows") == 0) {
+			flowmode = CAKE_FLOW_FLOWS;
+		} else if (strcmp(*argv, "dual-srchost") == 0) {
+			flowmode = CAKE_FLOW_DUAL_SRC;
+		} else if (strcmp(*argv, "dual-dsthost") == 0) {
+			flowmode = CAKE_FLOW_DUAL_DST;
+		} else if (strcmp(*argv, "triple-isolate") == 0) {
+			flowmode = CAKE_FLOW_TRIPLE;
+
+		} else if (strcmp(*argv, "nat") == 0) {
+			nat = 1;
+		} else if (strcmp(*argv, "nonat") == 0) {
+			nat = 0;
+
+		} else if (strcmp(*argv, "ptm") == 0) {
+			atm = CAKE_ATM_PTM;
+		} else if (strcmp(*argv, "atm") == 0) {
+			atm = CAKE_ATM_ATM;
+		} else if (strcmp(*argv, "noatm") == 0) {
+			atm = CAKE_ATM_NONE;
+
+		} else if (strcmp(*argv, "raw") == 0) {
+			atm = CAKE_ATM_NONE;
+			overhead = 0;
+			overhead_set = true;
+			overhead_override = true;
+		} else if (strcmp(*argv, "conservative") == 0) {
+			/*
+			 * Deliberately over-estimate overhead:
+			 * one whole ATM cell plus ATM framing.
+			 * A safe choice if the actual overhead is unknown.
+			 */
+			atm = CAKE_ATM_ATM;
+			overhead = 48;
+			overhead_set = true;
+
+		/* Various ADSL framing schemes, all over ATM cells */
+		} else if (strcmp(*argv, "ipoa-vcmux") == 0) {
+			atm = CAKE_ATM_ATM;
+			overhead += 8;
+			overhead_set = true;
+		} else if (strcmp(*argv, "ipoa-llcsnap") == 0) {
+			atm = CAKE_ATM_ATM;
+			overhead += 16;
+			overhead_set = true;
+		} else if (strcmp(*argv, "bridged-vcmux") == 0) {
+			atm = CAKE_ATM_ATM;
+			overhead += 24;
+			overhead_set = true;
+		} else if (strcmp(*argv, "bridged-llcsnap") == 0) {
+			atm = CAKE_ATM_ATM;
+			overhead += 32;
+			overhead_set = true;
+		} else if (strcmp(*argv, "pppoa-vcmux") == 0) {
+			atm = CAKE_ATM_ATM;
+			overhead += 10;
+			overhead_set = true;
+		} else if (strcmp(*argv, "pppoa-llc") == 0) {
+			atm = CAKE_ATM_ATM;
+			overhead += 14;
+			overhead_set = true;
+		} else if (strcmp(*argv, "pppoe-vcmux") == 0) {
+			atm = CAKE_ATM_ATM;
+			overhead += 32;
+			overhead_set = true;
+		} else if (strcmp(*argv, "pppoe-llcsnap") == 0) {
+			atm = CAKE_ATM_ATM;
+			overhead += 40;
+			overhead_set = true;
+
+		/* Typical VDSL2 framing schemes, both over PTM */
+		/* PTM has 64b/65b coding which absorbs some bandwidth */
+		} else if (strcmp(*argv, "pppoe-ptm") == 0) {
+			/* 2B PPP + 6B PPPoE + 6B dest MAC + 6B src MAC
+			 * + 2B ethertype + 4B Frame Check Sequence
+			 * + 1B Start of Frame (S) + 1B End of Frame (Ck)
+			 * + 2B TC-CRC (PTM-FCS) = 30B
+			 */
+			atm = CAKE_ATM_PTM;
+			overhead += 30;
+			overhead_set = true;
+		} else if (strcmp(*argv, "bridged-ptm") == 0) {
+			/* 6B dest MAC + 6B src MAC + 2B ethertype
+			 * + 4B Frame Check Sequence
+			 * + 1B Start of Frame (S) + 1B End of Frame (Ck)
+			 * + 2B TC-CRC (PTM-FCS) = 22B
+			 */
+			atm = CAKE_ATM_PTM;
+			overhead += 22;
+			overhead_set = true;
+
+		} else if (strcmp(*argv, "via-ethernet") == 0) {
+			/*
+			 * We used to use this flag to manually compensate for
+			 * Linux including the Ethernet header on Ethernet-type
+			 * interfaces, but not on IP-type interfaces.
+			 *
+			 * It is no longer needed, because Cake now adjusts for
+			 * that automatically, and is thus ignored.
+			 *
+			 * It would be deleted entirely, but it appears in the
+			 * stats output when the automatic compensation is
+			 * active.
+			 */
+
+		} else if (strcmp(*argv, "ethernet") == 0) {
+			/* Ethernet preamble, interframe gap and FCS;
+			 * you may need to add a VLAN tag as well.
+			 */
+			overhead += 38;
+			overhead_set = true;
+			mpu = 84;
+
+		/* Additional Ethernet-related overhead used by some ISPs */
+		} else if (strcmp(*argv, "ether-vlan") == 0) {
+			/* 802.1q VLAN tag - may be repeated */
+			overhead += 4;
+			overhead_set = true;
+
+		/*
+		 * DOCSIS cable shapers account for Ethernet frame with FCS,
+		 * but not interframe gap or preamble.
+		 */
+		} else if (strcmp(*argv, "docsis") == 0) {
+			atm = CAKE_ATM_NONE;
+			overhead += 18;
+			overhead_set = true;
+			mpu = 64;
+
+		} else if (strcmp(*argv, "overhead") == 0) {
+			NEXT_ARG();
+			if (cake_get_bounded_int(&overhead, *argv, -64, 256)) {
+				fprintf(stderr, "Illegal \"overhead\", valid range is -64 to 256\n");
+				return -1;
+			}
+			overhead_set = true;
+
+		} else if (strcmp(*argv, "mpu") == 0) {
+			NEXT_ARG();
+			if (cake_get_bounded_int(&mpu, *argv, 0, 256)) {
+				fprintf(stderr, "Illegal \"mpu\", valid range is 0 to 256\n");
+				return -1;
+			}
+
+		} else if (strcmp(*argv, "ingress") == 0) {
+			ingress = 1;
+		} else if (strcmp(*argv, "egress") == 0) {
+			ingress = 0;
+
+		} else if (strcmp(*argv, "no-ack-filter") == 0) {
+			ack_filter = CAKE_ACK_NONE;
+		} else if (strcmp(*argv, "ack-filter") == 0) {
+			ack_filter = CAKE_ACK_FILTER;
+		} else if (strcmp(*argv, "ack-filter-aggressive") == 0) {
+			ack_filter = CAKE_ACK_AGGRESSIVE;
+
+		} else if (strcmp(*argv, "memlimit") == 0) {
+			NEXT_ARG();
+			if (get_size(&memlimit, *argv)) {
+				fprintf(stderr, "Illegal value for \"memlimit\": \"%s\"\n", *argv);
+				return -1;
+			}
+
+		} else if (strcmp(*argv, "help") == 0) {
+			explain();
+			return -1;
+		} else {
+			fprintf(stderr, "What is \"%s\"?\n", *argv);
+			explain();
+			return -1;
+		}
+		argc--; argv++;
+	}
+
+	/* Open the TCA_OPTIONS nest; its length is patched in at the end. */
+	tail = NLMSG_TAIL(n);
+	addattr_l(n, 1024, TCA_OPTIONS, NULL, 0);
+	/* "unlimited" is sent as an explicit base rate of 0 */
+	if (bandwidth || unlimited)
+		addattr_l(n, 1024, TCA_CAKE_BASE_RATE, &bandwidth, sizeof(bandwidth));
+	if (diffserv)
+		addattr_l(n, 1024, TCA_CAKE_DIFFSERV_MODE, &diffserv, sizeof(diffserv));
+	if (atm != -1)
+		addattr_l(n, 1024, TCA_CAKE_ATM, &atm, sizeof(atm));
+	if (flowmode != -1)
+		addattr_l(n, 1024, TCA_CAKE_FLOW_MODE, &flowmode, sizeof(flowmode));
+	if (overhead_set)
+		addattr_l(n, 1024, TCA_CAKE_OVERHEAD, &overhead, sizeof(overhead));
+	if (overhead_override) {
+		unsigned zero = 0;
+
+		addattr_l(n, 1024, TCA_CAKE_RAW, &zero, sizeof(zero));
+	}
+	if (mpu > 0)
+		addattr_l(n, 1024, TCA_CAKE_MPU, &mpu, sizeof(mpu));
+	if (interval)
+		addattr_l(n, 1024, TCA_CAKE_RTT, &interval, sizeof(interval));
+	if (target)
+		addattr_l(n, 1024, TCA_CAKE_TARGET, &target, sizeof(target));
+	if (autorate != -1)
+		addattr_l(n, 1024, TCA_CAKE_AUTORATE, &autorate, sizeof(autorate));
+	if (memlimit)
+		addattr_l(n, 1024, TCA_CAKE_MEMORY, &memlimit, sizeof(memlimit));
+	if (nat != -1)
+		addattr_l(n, 1024, TCA_CAKE_NAT, &nat, sizeof(nat));
+	if (wash != -1)
+		addattr_l(n, 1024, TCA_CAKE_WASH, &wash, sizeof(wash));
+	if (ingress != -1)
+		addattr_l(n, 1024, TCA_CAKE_INGRESS, &ingress, sizeof(ingress));
+	if (ack_filter != -1)
+		addattr_l(n, 1024, TCA_CAKE_ACK_FILTER, &ack_filter, sizeof(ack_filter));
+
+	/* Close the nest: its length spans everything appended since tail. */
+	tail->rta_len = (void *) NLMSG_TAIL(n) - (void *) tail;
+	return 0;
+}
+
+
+/*
+ * Print the cake configuration carried in the nested attribute opt.
+ *
+ * Each known TCA_CAKE_* attribute is read only when present and at least
+ * four bytes long; absent attributes keep a value of 0. Output goes through
+ * the print_*() helpers, so plain-text and JSON variants are selected by
+ * the PRINT_FP / PRINT_JSON / PRINT_ANY argument of each call.
+ *
+ * Always returns 0.
+ */
+static int cake_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt)
+{
+	struct rtattr *tb[TCA_CAKE_MAX + 1];
+	unsigned bandwidth = 0;
+	unsigned diffserv = 0;
+	unsigned flowmode = 0;
+	unsigned interval = 0;
+	unsigned memlimit = 0;
+	int overhead = 0;
+	int raw = 0;
+	int mpu = 0;
+	int atm = 0;
+	int nat = 0;
+	int autorate = 0;
+	int wash = 0;
+	int ingress = 0;
+	int ack_filter = 0;
+	int split_gso = 0;
+	SPRINT_BUF(b1);
+	SPRINT_BUF(b2);
+
+	if (opt == NULL)
+		return 0;
+
+	parse_rtattr_nested(tb, TCA_CAKE_MAX, opt);
+
+	if (tb[TCA_CAKE_BASE_RATE] &&
+	    RTA_PAYLOAD(tb[TCA_CAKE_BASE_RATE]) >= sizeof(__u32)) {
+		bandwidth = rta_getattr_u32(tb[TCA_CAKE_BASE_RATE]);
+		if (bandwidth) {
+			print_uint(PRINT_JSON, "bandwidth", NULL, bandwidth);
+			print_string(PRINT_FP, NULL, "bandwidth %s ", sprint_rate(bandwidth, b1));
+		} else
+			/* a base rate of 0 means no shaping */
+			print_string(PRINT_ANY, "bandwidth", "bandwidth %s ", "unlimited");
+	}
+	if (tb[TCA_CAKE_AUTORATE] &&
+	    RTA_PAYLOAD(tb[TCA_CAKE_AUTORATE]) >= sizeof(__u32)) {
+		autorate = rta_getattr_u32(tb[TCA_CAKE_AUTORATE]);
+		if (autorate == 1)
+			print_string(PRINT_ANY, "autorate", "autorate_%s ", "ingress");
+		else if (autorate)
+			print_string(PRINT_ANY, "autorate", "(?autorate?) ", "unknown");
+	}
+	if (tb[TCA_CAKE_DIFFSERV_MODE] &&
+	    RTA_PAYLOAD(tb[TCA_CAKE_DIFFSERV_MODE]) >= sizeof(__u32)) {
+		diffserv = rta_getattr_u32(tb[TCA_CAKE_DIFFSERV_MODE]);
+		switch (diffserv) {
+		case CAKE_DIFFSERV_DIFFSERV3:
+			print_string(PRINT_ANY, "diffserv", "%s ", "diffserv3");
+			break;
+		case CAKE_DIFFSERV_DIFFSERV4:
+			print_string(PRINT_ANY, "diffserv", "%s ", "diffserv4");
+			break;
+		case CAKE_DIFFSERV_DIFFSERV8:
+			print_string(PRINT_ANY, "diffserv", "%s ", "diffserv8");
+			break;
+		case CAKE_DIFFSERV_BESTEFFORT:
+			print_string(PRINT_ANY, "diffserv", "%s ", "besteffort");
+			break;
+		case CAKE_DIFFSERV_PRECEDENCE:
+			print_string(PRINT_ANY, "diffserv", "%s ", "precedence");
+			break;
+		default:
+			print_string(PRINT_ANY, "diffserv", "(?diffserv?) ", "unknown");
+			break;
+		}
+	}
+	if (tb[TCA_CAKE_FLOW_MODE] &&
+	    RTA_PAYLOAD(tb[TCA_CAKE_FLOW_MODE]) >= sizeof(__u32)) {
+		flowmode = rta_getattr_u32(tb[TCA_CAKE_FLOW_MODE]);
+		switch (flowmode) {
+		case CAKE_FLOW_NONE:
+			print_string(PRINT_ANY, "flowmode", "%s ", "flowblind");
+			break;
+		case CAKE_FLOW_SRC_IP:
+			print_string(PRINT_ANY, "flowmode", "%s ", "srchost");
+			break;
+		case CAKE_FLOW_DST_IP:
+			print_string(PRINT_ANY, "flowmode", "%s ", "dsthost");
+			break;
+		case CAKE_FLOW_HOSTS:
+			print_string(PRINT_ANY, "flowmode", "%s ", "hosts");
+			break;
+		case CAKE_FLOW_FLOWS:
+			print_string(PRINT_ANY, "flowmode", "%s ", "flows");
+			break;
+		case CAKE_FLOW_DUAL_SRC:
+			print_string(PRINT_ANY, "flowmode", "%s ", "dual-srchost");
+			break;
+		case CAKE_FLOW_DUAL_DST:
+			print_string(PRINT_ANY, "flowmode", "%s ", "dual-dsthost");
+			break;
+		case CAKE_FLOW_TRIPLE:
+			print_string(PRINT_ANY, "flowmode", "%s ", "triple-isolate");
+			break;
+		default:
+			print_string(PRINT_ANY, "flowmode", "(?flowmode?) ", "unknown");
+			break;
+		}
+
+	}
+
+	if (tb[TCA_CAKE_NAT] &&
+	    RTA_PAYLOAD(tb[TCA_CAKE_NAT]) >= sizeof(__u32)) {
+		nat = rta_getattr_u32(tb[TCA_CAKE_NAT]);
+	}
+
+	if (nat)
+		print_string(PRINT_FP, NULL, "nat ", NULL);
+	print_bool(PRINT_JSON, "nat", NULL, nat);
+
+	if (tb[TCA_CAKE_WASH] &&
+	    RTA_PAYLOAD(tb[TCA_CAKE_WASH]) >= sizeof(__u32)) {
+		wash = rta_getattr_u32(tb[TCA_CAKE_WASH]);
+	}
+	if (tb[TCA_CAKE_ATM] &&
+	    RTA_PAYLOAD(tb[TCA_CAKE_ATM]) >= sizeof(__u32)) {
+		atm = rta_getattr_u32(tb[TCA_CAKE_ATM]);
+	}
+	if (tb[TCA_CAKE_OVERHEAD] &&
+	    RTA_PAYLOAD(tb[TCA_CAKE_OVERHEAD]) >= sizeof(__s32)) {
+		/* overhead is signed, so it is read directly as __s32 */
+		overhead = *(__s32 *) RTA_DATA(tb[TCA_CAKE_OVERHEAD]);
+	}
+	if (tb[TCA_CAKE_MPU] &&
+	    RTA_PAYLOAD(tb[TCA_CAKE_MPU]) >= sizeof(__u32)) {
+		mpu = rta_getattr_u32(tb[TCA_CAKE_MPU]);
+	}
+	if (tb[TCA_CAKE_INGRESS] &&
+	    RTA_PAYLOAD(tb[TCA_CAKE_INGRESS]) >= sizeof(__u32)) {
+		ingress = rta_getattr_u32(tb[TCA_CAKE_INGRESS]);
+	}
+	if (tb[TCA_CAKE_ACK_FILTER] &&
+	    RTA_PAYLOAD(tb[TCA_CAKE_ACK_FILTER]) >= sizeof(__u32)) {
+		ack_filter = rta_getattr_u32(tb[TCA_CAKE_ACK_FILTER]);
+	}
+	if (tb[TCA_CAKE_SPLIT_GSO] &&
+	    RTA_PAYLOAD(tb[TCA_CAKE_SPLIT_GSO]) >= sizeof(__u32)) {
+		split_gso = rta_getattr_u32(tb[TCA_CAKE_SPLIT_GSO]);
+	}
+	if (tb[TCA_CAKE_RAW]) {
+		/* presence alone signals raw mode; the payload is not read */
+		raw = 1;
+	}
+	if (tb[TCA_CAKE_RTT] &&
+	    RTA_PAYLOAD(tb[TCA_CAKE_RTT]) >= sizeof(__u32)) {
+		interval = rta_getattr_u32(tb[TCA_CAKE_RTT]);
+	}
+	/*
+	 * Without this read memlimit stays 0 and the "memlimit" output
+	 * below can never be reached.
+	 */
+	if (tb[TCA_CAKE_MEMORY] &&
+	    RTA_PAYLOAD(tb[TCA_CAKE_MEMORY]) >= sizeof(__u32)) {
+		memlimit = rta_getattr_u32(tb[TCA_CAKE_MEMORY]);
+	}
+
+	if (wash)
+		print_string(PRINT_FP, NULL, "wash ", NULL);
+	print_bool(PRINT_JSON, "wash", NULL, wash);
+
+	if (ingress)
+		print_string(PRINT_FP, NULL, "ingress ", NULL);
+	print_bool(PRINT_JSON, "ingress", NULL, ingress);
+
+	if (ack_filter == CAKE_ACK_AGGRESSIVE)
+		print_string(PRINT_ANY, "ack-filter", "ack-filter-%s ", "aggressive");
+	else if (ack_filter == CAKE_ACK_FILTER)
+		print_string(PRINT_ANY, "ack-filter", "ack-filter ", "enabled");
+	else
+		print_string(PRINT_JSON, "ack-filter", NULL, "disabled");
+
+	if (split_gso)
+		print_string(PRINT_FP, NULL, "split-gso ", NULL);
+	print_bool(PRINT_JSON, "split_gso", NULL, split_gso);
+
+	if (interval)
+		print_string(PRINT_FP, NULL, "rtt %s ", sprint_time(interval, b2));
+	print_uint(PRINT_JSON, "rtt", NULL, interval);
+
+	if (raw)
+		print_string(PRINT_FP, NULL, "raw ", NULL);
+	print_bool(PRINT_JSON, "raw", NULL, raw);
+
+	if (atm == CAKE_ATM_ATM)
+		print_string(PRINT_ANY, "atm", "%s ", "atm");
+	else if (atm == CAKE_ATM_PTM)
+		print_string(PRINT_ANY, "atm", "%s ", "ptm");
+	else if (!raw)
+		print_string(PRINT_ANY, "atm", "%s ", "noatm");
+
+	print_int(PRINT_ANY, "overhead", "overhead %d ", overhead);
+
+	if (mpu)
+		print_uint(PRINT_ANY, "mpu", "mpu %u ", mpu);
+
+	if (memlimit) {
+		print_uint(PRINT_JSON, "memlimit", NULL, memlimit);
+		print_string(PRINT_FP, NULL, "memlimit %s", sprint_size(memlimit, b1));
+	}
+
+	return 0;
+}
+
+/*
+ * Emit one unnamed JSON object with the statistics of a single tin.
+ *
+ * tstat is indexed by TCA_CAKE_TIN_STATS_*; attributes that are absent
+ * (NULL entries) are left out of the object.
+ */
+static void cake_print_json_tin(struct rtattr **tstat)
+{
+#define TIN_ATTR(attr)	(tstat[TCA_CAKE_TIN_STATS_ ## attr])
+#define PRINT_TSTAT_JSON(type, name, attr) do {				\
+		if (TIN_ATTR(attr))					\
+			print_u64(PRINT_JSON, name, NULL,		\
+				  rta_getattr_ ## type(TIN_ATTR(attr)));	\
+	} while (0)
+
+	open_json_object(NULL);
+
+	/* rate and delay figures */
+	PRINT_TSTAT_JSON(u32, "threshold_rate", THRESHOLD_RATE);
+	PRINT_TSTAT_JSON(u32, "target_us", TARGET_US);
+	PRINT_TSTAT_JSON(u32, "interval_us", INTERVAL_US);
+	PRINT_TSTAT_JSON(u32, "peak_delay_us", PEAK_DELAY_US);
+	PRINT_TSTAT_JSON(u32, "avg_delay_us", AVG_DELAY_US);
+	PRINT_TSTAT_JSON(u32, "base_delay_us", BASE_DELAY_US);
+
+	/* traffic counters; the byte counter is the only 64-bit attribute */
+	PRINT_TSTAT_JSON(u32, "sent_packets", SENT_PACKETS);
+	PRINT_TSTAT_JSON(u64, "sent_bytes", SENT_BYTES64);
+
+	PRINT_TSTAT_JSON(u32, "way_indirect_hits", WAY_INDIRECT_HITS);
+	PRINT_TSTAT_JSON(u32, "way_misses", WAY_MISSES);
+	PRINT_TSTAT_JSON(u32, "way_collisions", WAY_COLLISIONS);
+	PRINT_TSTAT_JSON(u32, "drops", DROPPED_PACKETS);
+	PRINT_TSTAT_JSON(u32, "ecn_mark", ECN_MARKED_PACKETS);
+	PRINT_TSTAT_JSON(u32, "ack_drops", ACKS_DROPPED_PACKETS);
+	PRINT_TSTAT_JSON(u32, "sparse_flows", SPARSE_FLOWS);
+	PRINT_TSTAT_JSON(u32, "bulk_flows", BULK_FLOWS);
+	PRINT_TSTAT_JSON(u32, "unresponsive_flows", UNRESPONSIVE_FLOWS);
+	PRINT_TSTAT_JSON(u32, "max_pkt_len", MAX_SKBLEN);
+	PRINT_TSTAT_JSON(u32, "flow_quantum", FLOW_QUANTUM);
+
+	close_json_object();
+
+#undef PRINT_TSTAT_JSON
+#undef TIN_ATTR
+}
+
+/*
+ * Print the extended cake statistics carried in the nested attribute
+ * xstats: the global figures first, then one column (plain text) or one
+ * JSON object per tin.
+ *
+ * Each group of global figures is printed only when its attributes are
+ * present. The per-tin part is skipped when no tin attribute is found.
+ *
+ * Always returns 0.
+ */
+static int cake_print_xstats(struct qdisc_util *qu, FILE *f,
+			     struct rtattr *xstats)
+{
+	SPRINT_BUF(b1);
+	struct rtattr *st[TCA_CAKE_STATS_MAX + 1];
+	int i;
+
+	if (xstats == NULL)
+		return 0;
+
+#define GET_STAT_U32(attr) rta_getattr_u32(st[TCA_CAKE_STATS_ ## attr])
+
+	parse_rtattr_nested(st, TCA_CAKE_STATS_MAX, xstats);
+
+	if (st[TCA_CAKE_STATS_MEMORY_USED] &&
+	    st[TCA_CAKE_STATS_MEMORY_LIMIT]) {
+		/* b1 is reused: each call consumes it before the next fills it */
+		print_string(PRINT_FP, NULL, " memory used: %s",
+			     sprint_size(GET_STAT_U32(MEMORY_USED), b1));
+
+		print_string(PRINT_FP, NULL, " of %s\n",
+			     sprint_size(GET_STAT_U32(MEMORY_LIMIT), b1));
+
+		print_uint(PRINT_JSON, "memory_used", NULL,
+			   GET_STAT_U32(MEMORY_USED));
+		print_uint(PRINT_JSON, "memory_limit", NULL,
+			   GET_STAT_U32(MEMORY_LIMIT));
+	}
+
+	if (st[TCA_CAKE_STATS_CAPACITY_ESTIMATE]) {
+		print_string(PRINT_FP, NULL, " capacity estimate: %s\n",
+			     sprint_rate(GET_STAT_U32(CAPACITY_ESTIMATE), b1));
+		print_uint(PRINT_JSON, "capacity_estimate", NULL,
+			   GET_STAT_U32(CAPACITY_ESTIMATE));
+	}
+
+	if (st[TCA_CAKE_STATS_MIN_NETLEN] &&
+	    st[TCA_CAKE_STATS_MAX_NETLEN]) {
+		print_uint(PRINT_ANY, "min_network_size",
+			   " min/max network layer size: %12u",
+			   GET_STAT_U32(MIN_NETLEN));
+		print_uint(PRINT_ANY, "max_network_size",
+			   " /%8u\n", GET_STAT_U32(MAX_NETLEN));
+	}
+
+	if (st[TCA_CAKE_STATS_MIN_ADJLEN] &&
+	    st[TCA_CAKE_STATS_MAX_ADJLEN]) {
+		print_uint(PRINT_ANY, "min_adj_size",
+			   " min/max overhead-adjusted size: %8u",
+			   GET_STAT_U32(MIN_ADJLEN));
+		print_uint(PRINT_ANY, "max_adj_size",
+			   " /%8u\n", GET_STAT_U32(MAX_ADJLEN));
+	}
+
+	if (st[TCA_CAKE_STATS_AVG_NETOFF])
+		print_uint(PRINT_ANY, "avg_hdr_offset",
+			   " average network hdr offset: %12u\n\n",
+			   GET_STAT_U32(AVG_NETOFF));
+
+#undef GET_STAT_U32
+
+	if (st[TCA_CAKE_STATS_TIN_STATS]) {
+		struct rtattr *tins[TC_CAKE_MAX_TINS + 1];
+		struct rtattr *tstat[TC_CAKE_MAX_TINS][TCA_CAKE_TIN_STATS_MAX + 1];
+		int num_tins = 0;
+
+		parse_rtattr_nested(tins, TC_CAKE_MAX_TINS, st[TCA_CAKE_STATS_TIN_STATS]);
+
+		/* tin attributes are numbered from 1; tstat rows from 0 */
+		for (i = 1; i <= TC_CAKE_MAX_TINS && tins[i]; i++) {
+			parse_rtattr_nested(tstat[i-1], TCA_CAKE_TIN_STATS_MAX, tins[i]);
+			num_tins++;
+		}
+
+		if (!num_tins)
+			return 0;
+
+		if (is_json_context()) {
+			open_json_array(PRINT_JSON, "tins");
+			for (i = 0; i < num_tins; i++)
+				cake_print_json_tin(tstat[i]);
+			close_json_array(PRINT_JSON, NULL);
+
+			return 0;
+		}
+
+
+		/* column headings: named for 3 and 4 tins, numbered otherwise */
+		switch (num_tins) {
+		case 3:
+			fprintf(f, " Bulk Best Effort Voice\n");
+			break;
+
+		case 4:
+			fprintf(f, " Bulk Best Effort Video Voice\n");
+			break;
+
+		default:
+			fprintf(f, " ");
+			for (i = 0; i < num_tins; i++)
+				fprintf(f, " Tin %u", i);
+			fprintf(f, "\n");
+		}
+
+/*
+ * One table row per statistic: a row is printed only when the attribute is
+ * present for the first tin, and val is evaluated once per tin with the
+ * loop variable i in scope.
+ */
+#define GET_TSTAT(i, attr) (tstat[i][TCA_CAKE_TIN_STATS_ ## attr])
+#define PRINT_TSTAT(name, attr, fmts, val)	do {		\
+		if (GET_TSTAT(0, attr)) {			\
+			fprintf(f, name);			\
+			for (i = 0; i < num_tins; i++)		\
+				fprintf(f, " %12" fmts, val);	\
+			fprintf(f, "\n");			\
+		}						\
+	} while (0)
+
+#define SPRINT_TSTAT(pfunc, name, attr) PRINT_TSTAT(		\
+		name, attr, "s", sprint_ ## pfunc(		\
+			rta_getattr_u32(GET_TSTAT(i, attr)), b1))
+
+#define PRINT_TSTAT_U32(name, attr) PRINT_TSTAT(		\
+		name, attr, "u", rta_getattr_u32(GET_TSTAT(i, attr)))
+
+#define PRINT_TSTAT_U64(name, attr) PRINT_TSTAT(		\
+		name, attr, "llu", rta_getattr_u64(GET_TSTAT(i, attr)))
+
+		SPRINT_TSTAT(rate, " thresh ", THRESHOLD_RATE);
+		SPRINT_TSTAT(time, " target ", TARGET_US);
+		SPRINT_TSTAT(time, " interval", INTERVAL_US);
+		SPRINT_TSTAT(time, " pk_delay", PEAK_DELAY_US);
+		SPRINT_TSTAT(time, " av_delay", AVG_DELAY_US);
+		SPRINT_TSTAT(time, " sp_delay", BASE_DELAY_US);
+
+		PRINT_TSTAT_U32(" pkts ", SENT_PACKETS);
+		PRINT_TSTAT_U64(" bytes ", SENT_BYTES64);
+
+		PRINT_TSTAT_U32(" way_inds", WAY_INDIRECT_HITS);
+		PRINT_TSTAT_U32(" way_miss", WAY_MISSES);
+		PRINT_TSTAT_U32(" way_cols", WAY_COLLISIONS);
+		PRINT_TSTAT_U32(" drops ", DROPPED_PACKETS);
+		PRINT_TSTAT_U32(" marks ", ECN_MARKED_PACKETS);
+		PRINT_TSTAT_U32(" ack_drop", ACKS_DROPPED_PACKETS);
+		PRINT_TSTAT_U32(" sp_flows", SPARSE_FLOWS);
+		PRINT_TSTAT_U32(" bk_flows", BULK_FLOWS);
+		PRINT_TSTAT_U32(" un_flows", UNRESPONSIVE_FLOWS);
+		PRINT_TSTAT_U32(" max_len ", MAX_SKBLEN);
+		PRINT_TSTAT_U32(" quantum ", FLOW_QUANTUM);
+
+/* Undefine every helper defined above, including GET_TSTAT. */
+#undef GET_TSTAT
+#undef PRINT_TSTAT
+#undef SPRINT_TSTAT
+#undef PRINT_TSTAT_U32
+#undef PRINT_TSTAT_U64
+	}
+	return 0;
+}
+
+/*
+ * Descriptor for the "cake" qdisc: ties the qdisc name to the option
+ * parser, the option printer and the extended-statistics printer defined
+ * in this file.
+ */
+struct qdisc_util cake_qdisc_util = {
+ .id = "cake",
+ .parse_qopt = cake_parse_opt,
+ .print_qopt = cake_print_opt,
+ .print_xstats = cake_print_xstats,
+};
--
2.17.0
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox