* Help with the BPF verifier
From: Arnaldo Carvalho de Melo @ 2018-11-01 18:52 UTC (permalink / raw)
To: Yonghong Song
Cc: Daniel Borkmann, Jiri Olsa, Martin Lau, Alexei Starovoitov,
Linux Networking Development Mailing List
tl;dr: I seem to be trying to get past clang optimizations that get the
verifier to accept my proggie.
Hi,
So I'm moving to use raw_syscalls:sys_exit to collect pointer
contents, using maps to tell the bpf program what to copy, how many
bytes, filters, etc.
I'm at the start of it at this point I need to use an index to
get to the right syscall arg that is a filename, starting just with
"open" and "openat", that have the filename in different args, so to get
this first part working I'm doing it directly in the bpf restricted C
program, later this will be to maps, etc, so if I set the index as a
constant, just for testing, it works, look at the "open" and "openat"
calls below, later we'll see why openat is failing to augment its
"filename" arg while "open" works:
[root@seventh perf]# trace -e tools/perf/examples/bpf/augmented_raw_syscalls.c sleep 1
? ( ): sleep/10152 ... [continued]: execve()) = 0
0.045 ( 0.004 ms): sleep/10152 brk() = 0x55ccff356000
0.074 ( 0.007 ms): sleep/10152 access(filename: , mode: R) = -1 ENOENT No such file or directory
0.089 ( 0.006 ms): sleep/10152 openat(dfd: CWD, filename: , flags: CLOEXEC) = 3
0.097 ( 0.003 ms): sleep/10152 fstat(fd: 3, statbuf: 0x7ffecdd283f0) = 0
0.103 ( 0.006 ms): sleep/10152 mmap(len: 103334, prot: READ, flags: PRIVATE, fd: 3) = 0x7f8ffee9c000
0.111 ( 0.002 ms): sleep/10152 close(fd: 3) = 0
0.135 ( 0.007 ms): sleep/10152 openat(dfd: CWD, filename: , flags: CLOEXEC) = 3
0.144 ( 0.003 ms): sleep/10152 read(fd: 3, buf: 0x7ffecdd285b8, count: 832) = 832
0.150 ( 0.002 ms): sleep/10152 fstat(fd: 3, statbuf: 0x7ffecdd28450) = 0
0.155 ( 0.005 ms): sleep/10152 mmap(len: 8192, prot: READ|WRITE, flags: PRIVATE|ANONYMOUS) = 0x7f8ffee9a000
0.166 ( 0.007 ms): sleep/10152 mmap(len: 3889792, prot: EXEC|READ, flags: PRIVATE|DENYWRITE, fd: 3) = 0x7f8ffe8dc000
0.175 ( 0.010 ms): sleep/10152 mprotect(start: 0x7f8ffea89000, len: 2093056) = 0
0.188 ( 0.010 ms): sleep/10152 mmap(addr: 0x7f8ffec88000, len: 24576, prot: READ|WRITE, flags: PRIVATE|FIXED|DENYWRITE, fd: 3, off: 1753088) = 0x7f8ffec88000
0.204 ( 0.005 ms): sleep/10152 mmap(addr: 0x7f8ffec8e000, len: 14976, prot: READ|WRITE, flags: PRIVATE|FIXED|ANONYMOUS) = 0x7f8ffec8e000
0.218 ( 0.002 ms): sleep/10152 close(fd: 3) = 0
0.239 ( 0.002 ms): sleep/10152 arch_prctl(option: 4098, arg2: 140256433779968) = 0
0.312 ( 0.009 ms): sleep/10152 mprotect(start: 0x7f8ffec88000, len: 16384, prot: READ) = 0
0.343 ( 0.005 ms): sleep/10152 mprotect(start: 0x55ccff1c6000, len: 4096, prot: READ) = 0
0.354 ( 0.006 ms): sleep/10152 mprotect(start: 0x7f8ffeeb6000, len: 4096, prot: READ) = 0
0.362 ( 0.019 ms): sleep/10152 munmap(addr: 0x7f8ffee9c000, len: 103334) = 0
0.476 ( 0.002 ms): sleep/10152 brk() = 0x55ccff356000
0.480 ( 0.004 ms): sleep/10152 brk(brk: 0x55ccff377000) = 0x55ccff377000
0.487 ( 0.002 ms): sleep/10152 brk() = 0x55ccff377000
0.497 ( 0.008 ms): sleep/10152 open(filename: /usr/lib/locale/locale-archive, flags: CLOEXEC) = 3
0.507 ( 0.002 ms): sleep/10152 fstat(fd: 3, statbuf: 0x7f8ffec8daa0) = 0
0.511 ( 0.006 ms): sleep/10152 mmap(len: 113045344, prot: READ, flags: PRIVATE, fd: 3) = 0x7f8ff7d0d000
0.524 ( 0.002 ms): sleep/10152 close(fd: 3) = 0
0.574 (1000.140 ms): sleep/10152 nanosleep(rqtp: 0x7ffecdd29130) = 0
1000.753 ( 0.007 ms): sleep/10152 close(fd: 1) = 0
1000.767 ( 0.004 ms): sleep/10152 close(fd: 2) = 0
1000.781 ( ): sleep/10152 exit_group()
[root@seventh perf]#
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Augment the raw_syscalls tracepoints with the contents of the pointer arguments.
4 *
5 * Test it with:
6 *
7 * perf trace -e tools/perf/examples/bpf/augmented_raw_syscalls.c cat /etc/passwd > /dev/null
8 *
9 * This exactly matches what is marshalled into the raw_syscall:sys_enter
10 * payload expected by the 'perf trace' beautifiers.
11 *
12 * For now it just uses the existing tracepoint augmentation code in 'perf
13 * trace', in the next csets we'll hook up these with the sys_enter/sys_exit
14 * code that will combine entry/exit in a strace like way.
15 */
16 #include <stdio.h>
17 #include <linux/socket.h>
18 /* bpf-output associated map */
19 struct bpf_map SEC("maps") __augmented_syscalls__ = {
20 .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
21 .key_size = sizeof(int),
22 .value_size = sizeof(u32),
23 .max_entries = __NR_CPUS__,
24 };
25 struct syscall_enter_args {
26 unsigned long long common_tp_fields;
27 long syscall_nr;
28 unsigned long args[6];
29 };
30 struct syscall_exit_args {
31 unsigned long long common_tp_fields;
32 long syscall_nr;
33 long ret;
34 };
35 struct augmented_filename {
36 unsigned int size;
37 int reserved;
38 char value[256];
39 };
40 #define SYS_OPEN 2
41 #define SYS_OPENAT 257
42 SEC("raw_syscalls:sys_enter")
43 int sys_enter(struct syscall_enter_args *args)
44 {
45 struct {
46 struct syscall_enter_args args;
47 struct augmented_filename filename;
48 } augmented_args;
49 unsigned int len = sizeof(augmented_args);
50 unsigned int filename_arg = 6;
51 probe_read(&augmented_args.args, sizeof(augmented_args.args), args);
52 switch (augmented_args.args.syscall_nr) {
53 case SYS_OPEN: filename_arg = 0; break;
54 case SYS_OPENAT: filename_arg = 1; break;
55 }
56 if (filename_arg <= 5) {
57 augmented_args.filename.reserved = 0;
58 augmented_args.filename.size = probe_read_str(&augmented_args.filename.value,
59 sizeof(augmented_args.filename.value),
60 (const void *)args->args[0]);
61 if (augmented_args.filename.size < sizeof(augmented_args.filename.value)) {
62 len -= sizeof(augmented_args.filename.value) - augmented_args.filename.size;
63 len &= sizeof(augmented_args.filename.value) - 1;
64 }
65 } else {
66 len = sizeof(augmented_args.args);
67 }
68 perf_event_output(args, &__augmented_syscalls__, BPF_F_CURRENT_CPU, &augmented_args, len);
69 return 0;
70 }
71 SEC("raw_syscalls:sys_exit")
72 int sys_exit(struct syscall_exit_args *args)
73 {
74 return 1; /* 0 as soon as we start copying data returned by the kernel, e.g. 'read' */
75 }
76 license(GPL);
In line #60 if I change that to 1, then "openat" works and "open"
doesn't, so what I wanted was to use filename_arg there as the index,
now it comes from that switch, but really it'll come from userspace,
that knows the syscall tables for each arch, etc.
But if I do that, i.e. apply this patch to that program:
--- /wb/augmented_raw_syscalls.c.old 2018-11-01 15:43:55.000394234 -0300
+++ /wb/augmented_raw_syscalls.c 2018-11-01 15:44:15.102367838 -0300
@@ -67,7 +67,7 @@
augmented_args.filename.reserved = 0;
augmented_args.filename.size = probe_read_str(&augmented_args.filename.value,
sizeof(augmented_args.filename.value),
- (const void *)args->args[0]);
+ (const void *)args->args[filename_arg]);
if (augmented_args.filename.size < sizeof(augmented_args.filename.value)) {
len -= sizeof(augmented_args.filename.value) - augmented_args.filename.size;
len &= sizeof(augmented_args.filename.value) - 1;
Then I end up with the verifier complying, I tried various ways to get
around the compiler about filename_arg being safe to use as an index,
but I couldn't find the right trick, ideas?
This is what I end up with when I apply that patch:
[root@seventh perf]# trace -e tools/perf/examples/bpf/augmented_raw_syscalls.c sleep 1
event syntax error: 'tools/perf/examples/bpf/augmented_raw_syscalls.c'
\___ Kernel verifier blocks program loading
(add -v to see detail)
Run 'perf list' for a list of valid events
Usage: perf trace [<options>] [<command>]
or: perf trace [<options>] -- <command> [<options>]
or: perf trace record [<options>] [<command>]
or: perf trace record [<options>] -- <command> [<options>]
-e, --event <event> event/syscall selector. use 'perf list' to list available events
[root@seventh perf]#
Using -v, as suggested, I get:
[root@seventh perf]# trace -v -e tools/perf/examples/bpf/augmented_raw_syscalls.c sleep 1
bpf: builtin compilation failed: -95, try external compiler
Kernel build dir is set to /lib/modules/4.19.0-rc8-00014-gc0cff31be705/build
set env: KBUILD_DIR=/lib/modules/4.19.0-rc8-00014-gc0cff31be705/build
unset env: KBUILD_OPTS
include option is set to -nostdinc -isystem /usr/lib/gcc/x86_64-redhat-linux/7/include -I/home/acme/git/linux/arch/x86/include -I./arch/x86/include/generated -I/home/acme/git/linux/include -I./include -I/home/acme/git/linux/arch/x86/include/uapi -I./arch/x86/include/generated/uapi -I/home/acme/git/linux/include/uapi -I./include/generated/uapi -include /home/acme/git/linux/include/linux/kconfig.h
set env: NR_CPUS=4
set env: LINUX_VERSION_CODE=0x41300
set env: CLANG_EXEC=/usr/local/bin/clang
unset env: CLANG_OPTIONS
set env: KERNEL_INC_OPTIONS= -nostdinc -isystem /usr/lib/gcc/x86_64-redhat-linux/7/include -I/home/acme/git/linux/arch/x86/include -I./arch/x86/include/generated -I/home/acme/git/linux/include -I./include -I/home/acme/git/linux/arch/x86/include/uapi -I./arch/x86/include/generated/uapi -I/home/acme/git/linux/include/uapi -I./include/generated/uapi -include /home/acme/git/linux/include/linux/kconfig.h
set env: PERF_BPF_INC_OPTIONS=-I/home/acme/lib/perf/include/bpf
set env: WORKING_DIR=/lib/modules/4.19.0-rc8-00014-gc0cff31be705/build
set env: CLANG_SOURCE=/home/acme/git/perf/tools/perf/examples/bpf/augmented_raw_syscalls.c
llvm compiling command template: $CLANG_EXEC -D__KERNEL__ -D__NR_CPUS__=$NR_CPUS -DLINUX_VERSION_CODE=$LINUX_VERSION_CODE $CLANG_OPTIONS $PERF_BPF_INC_OPTIONS $KERNEL_INC_OPTIONS -Wno-unused-value -Wno-pointer-sign -working-directory $WORKING_DIR -c "$CLANG_SOURCE" -target bpf $CLANG_EMIT_LLVM -O2 -o - $LLVM_OPTIONS_PIPE
llvm compiling command : /usr/local/bin/clang -D__KERNEL__ -D__NR_CPUS__=4 -DLINUX_VERSION_CODE=0x41300 -I/home/acme/lib/perf/include/bpf -nostdinc -isystem /usr/lib/gcc/x86_64-redhat-linux/7/include -I/home/acme/git/linux/arch/x86/include -I./arch/x86/include/generated -I/home/acme/git/linux/include -I./include -I/home/acme/git/linux/arch/x86/include/uapi -I./arch/x86/include/generated/uapi -I/home/acme/git/linux/include/uapi -I./include/generated/uapi -include /home/acme/git/linux/include/linux/kconfig.h -Wno-unused-value -Wno-pointer-sign -working-directory /lib/modules/4.19.0-rc8-00014-gc0cff31be705/build -c /home/acme/git/perf/tools/perf/examples/bpf/augmented_raw_syscalls.c -target bpf -O2 -o -
libbpf: loading object 'tools/perf/examples/bpf/augmented_raw_syscalls.c' from buffer
libbpf: section(1) .strtab, size 168, link 0, flags 0, type=3
libbpf: skip section(1) .strtab
libbpf: section(2) .text, size 0, link 0, flags 6, type=1
libbpf: skip section(2) .text
libbpf: section(3) raw_syscalls:sys_enter, size 376, link 0, flags 6, type=1
libbpf: found program raw_syscalls:sys_enter
libbpf: section(4) .relraw_syscalls:sys_enter, size 16, link 10, flags 0, type=9
libbpf: section(5) raw_syscalls:sys_exit, size 16, link 0, flags 6, type=1
libbpf: found program raw_syscalls:sys_exit
libbpf: section(6) maps, size 56, link 0, flags 3, type=1
libbpf: section(7) license, size 4, link 0, flags 3, type=1
libbpf: license of tools/perf/examples/bpf/augmented_raw_syscalls.c is GPL
libbpf: section(8) version, size 4, link 0, flags 3, type=1
libbpf: kernel version of tools/perf/examples/bpf/augmented_raw_syscalls.c is 41300
libbpf: section(9) .llvm_addrsig, size 6, link 10, flags 80000000, type=1879002115
libbpf: skip section(9) .llvm_addrsig
libbpf: section(10) .symtab, size 240, link 1, flags 0, type=2
libbpf: maps in tools/perf/examples/bpf/augmented_raw_syscalls.c: 2 maps in 56 bytes
libbpf: map 0 is "__augmented_syscalls__"
libbpf: map 1 is "__bpf_stdout__"
libbpf: collecting relocating info for: 'raw_syscalls:sys_enter'
libbpf: relo for 4 value 28 name 124
libbpf: relocation: insn_idx=39
libbpf: relocation: find map 1 (__augmented_syscalls__) for insn 39
bpf: config program 'raw_syscalls:sys_enter'
bpf: config program 'raw_syscalls:sys_exit'
libbpf: create map __bpf_stdout__: fd=3
libbpf: create map __augmented_syscalls__: fd=4
libbpf: load bpf program failed: Permission denied
libbpf: -- BEGIN DUMP LOG ---
libbpf:
0: (bf) r6 = r1
1: (bf) r1 = r10
2: (07) r1 += -328
3: (b7) r7 = 64
4: (b7) r2 = 64
5: (bf) r3 = r6
6: (85) call bpf_probe_read#4
7: (b7) r2 = 1
8: (79) r3 = *(u64 *)(r10 -320)
9: (15) if r3 == 0x101 goto pc+1
R0=inv(id=0) R2=inv1 R3=inv(id=0) R6=ctx(id=0,off=0,imm=0) R7=inv64 R10=fp0,call_-1
10: (b7) r2 = 6
11: (b7) r1 = 0
12: (15) if r3 == 0x2 goto pc+1
R0=inv(id=0) R1=inv0 R2=inv6 R3=inv(id=0) R6=ctx(id=0,off=0,imm=0) R7=inv64 R10=fp0,call_-1
13: (bf) r1 = r2
14: (25) if r1 > 0x5 goto pc+21
R0=inv(id=0) R1=inv6 R2=inv6 R3=inv(id=0) R6=ctx(id=0,off=0,imm=0) R7=inv64 R10=fp0,call_-1
15: (b7) r2 = 0
16: (63) *(u32 *)(r10 -260) = r2
17: (67) r1 <<= 32
18: (77) r1 >>= 32
19: (67) r1 <<= 3
20: (bf) r2 = r6
21: (0f) r2 += r1
22: (79) r3 = *(u64 *)(r2 +16)
R2 invalid mem access 'inv'
libbpf: -- END LOG --
libbpf: failed to load program 'raw_syscalls:sys_enter'
libbpf: failed to load object 'tools/perf/examples/bpf/augmented_raw_syscalls.c'
bpf: load objects failed: err=-4007: (Kernel verifier blocks program loading)
event syntax error: 'tools/perf/examples/bpf/augmented_raw_syscalls.c'
\___ Kernel verifier blocks program loading
(add -v to see detail)
Run 'perf list' for a list of valid events
Usage: perf trace [<options>] [<command>]
or: perf trace [<options>] -- <command> [<options>]
or: perf trace record [<options>] [<command>]
or: perf trace record [<options>] -- <command> [<options>]
-e, --event <event> event/syscall selector. use 'perf list' to list available events
[root@seventh perf]#
^ permalink raw reply
* Re: [RFC 0/2] Delayed binding of UDP sockets for Quic per-connection sockets
From: Eric Dumazet @ 2018-11-01 18:21 UTC (permalink / raw)
To: Leif Hedstrom; +Cc: Christoph Paasch, netdev, Ian Swett, Jana Iyengar
In-Reply-To: <60615C27-057A-4215-9D1E-3A164B72757B@apple.com>
On 11/01/2018 10:58 AM, Leif Hedstrom wrote:
>
>
>> On Oct 31, 2018, at 6:53 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>>
>>
>>
>> On 10/31/2018 04:26 PM, Christoph Paasch wrote:
>>> Implementations of Quic might want to create a separate socket for each
>>> Quic-connection by creating a connected UDP-socket.
>>>
>>
>> Nice proposal, but I doubt a QUIC server can afford having one UDP socket per connection ?
>
> First thing: This is an idea we’ve been floating, and it’s not completed yet, so we don’t have any performance numbers etc. to share. The ideas for the implementation came up after a discussion with Ian and Jana re: their implementation of a QUIC server.
>
> That much said, the general rationale for this is that having a socket for each QUIC connection could simplify integrating QUIC into existing software that already does epoll() over TCP sockets. This is how e.g. Apache Traffic Server works, which is our target implementation for QUIC.
>
>
>
>>
>> It would add a huge overhead in term of memory usage in the kernel,
>> and lots of epoll events to manage (say a QUIC server with one million flows, receiving
>> very few packets per second per flow)
>
> Our use case is not millions of sockets, rather, 10’s of thousands. There would be one socket for each QUIC Connection, not per stream (obviously). At ~80Gbps on a box, we definitely see much less than 100k TCP connections.
>
> Question: is there additional memory overhead here for the UDP sockets vs a normal TCP socket for e.g. HTTP or HTTP/2 ?
TCP sockets have a lot of state. We can understand spending 2 or 3 KB per socket.
UDP sockets really have no state. The receive queue anchor is only 24 bytes.
Still, memory cost for one UDP socket are :
1344 bytes for UDP socket,
320 bytes for the "struct file"
192 bytes for the struct dentry
704 bytes for inode
512 bytes for the two dst (connected socket)
200 bytes for eventpoll structures
104 bytes for the fq flow
That is about 3.1KB per socket (but you probably can round this to 4KB due to kmalloc roundings)
One million sockets -> 4GB of memory.
This really does not scale.
^ permalink raw reply
* Re: [PATCH v3 1/2] kretprobe: produce sane stack traces
From: Masami Hiramatsu @ 2018-11-02 3:04 UTC (permalink / raw)
To: Aleksa Sarai
Cc: Naveen N. Rao, Anil S Keshavamurthy, David S. Miller,
Jonathan Corbet, Peter Zijlstra, Ingo Molnar,
Arnaldo Carvalho de Melo, Alexander Shishkin, Jiri Olsa,
Namhyung Kim, Steven Rostedt, Shuah Khan, Alexei Starovoitov,
Daniel Borkmann, Brendan Gregg, Christian Brauner, Aleksa Sarai,
netdev, linux-doc, linux-kernel
In-Reply-To: <20181101211343.yooxwqfnoloprb5h@yavin>
On Fri, 2 Nov 2018 08:13:43 +1100
Aleksa Sarai <cyphar@cyphar.com> wrote:
> On 2018-11-02, Masami Hiramatsu <mhiramat@kernel.org> wrote:
> > Please split the test case as an independent patch.
>
> Will do. Should the Documentation/ change also be a separate patch?
I think the Documentation change can be coupled with code change
if the change is small. But selftests is different, that can be
backported soon for testing the stable kernels.
> > > new file mode 100644
> > > index 000000000000..03146c6a1a3c
> > > --- /dev/null
> > > +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_stacktrace.tc
> > > @@ -0,0 +1,25 @@
> > > +#!/bin/sh
> > > +# SPDX-License-Identifier: GPL-2.0+
> > > +# description: Kretprobe dynamic event with a stacktrace
> > > +
> > > +[ -f kprobe_events ] || exit_unsupported # this is configurable
> > > +
> > > +echo 0 > events/enable
> > > +echo 1 > options/stacktrace
> > > +
> > > +echo 'r:teststackprobe sched_fork $retval' > kprobe_events
> > > +grep teststackprobe kprobe_events
> > > +test -d events/kprobes/teststackprobe
> >
> > Hmm, what happen if we have 2 or more kretprobes on same stack?
> > It seems you just save stack in pre_handler, but that stack can already
> > includes another kretprobe's trampline address...
>
> Yeah, you're quite right...
>
> My first instinct was to do something awful like iterate over the set of
> "kretprobe_instance"s with ->task == current, and then correct
> kretprobe_trampoline entries using ->ret_addr. (I think this would be
> correct because each task can only be in one context at once, and in
> order to get to a particular kretprobe all of your caller kretprobes
> were inserted before you and any sibling or later kretprobe_instances
> will have been removed. But I might be insanely wrong on this front.)
yeah, you are right.
>
> However (as I noted in the other thread), there is a problem where
> kretprobe_trampoline actually stops the unwinder in its tracks and thus
> you only get the first kretprobe_trampoline. This is something I'm going
> to look into some more (despite not having made progress on it last
> time) since now it's something that actually needs to be fixed (and
> as I mentioned in the other thread, show_stack() actually works on x86
> in this context unlike the other stack_trace users).
I should read the unwinder code, but anyway, fixing it up in kretprobe
handler context is not hard. Since each instance is on an hlist, so when
we hit the kretprobe_trampoline, we can search it. However, the problem
is the case where the out of kretprobe handler context. In that context
we need to try to lock the hlist and search the list, which will be a
costful operation.
On the other hand, func-graph tracer introduces a shadow return address
stack for each task (thread), and when we hit its trampoline on the stack,
we can easily search the entry from "current" task without locking the
shadow stack (and we already did it). This may need to pay a cost (1 page)
for each task, but smarter than kretprobe, which makes a system-wide
hash-table and need to search from hlist which has return addresses
of other context coexist.
Thank you,
--
Masami Hiramatsu <mhiramat@kernel.org>
^ permalink raw reply
* Re: [RFC PATCH] lib: Introduce generic __cmpxchg_u64() and use it where needed
From: Paul E. McKenney @ 2018-11-01 17:43 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Eric Dumazet, Trond Myklebust, mark.rutland@arm.com,
linux-kernel@vger.kernel.org, ralf@linux-mips.org,
jlayton@kernel.org, linuxppc-dev@lists.ozlabs.org,
bfields@fieldses.org, linux-mips@linux-mips.org,
linux@roeck-us.net, linux-nfs@vger.kernel.org,
akpm@linux-foundation.org, will.deacon@arm.com,
boqun.feng@gmail.com, paul.burton@mips.com,
"anna.schumaker@netapp.com" <an
In-Reply-To: <20181101171432.GH3178@hirez.programming.kicks-ass.net>
On Thu, Nov 01, 2018 at 06:14:32PM +0100, Peter Zijlstra wrote:
> On Thu, Nov 01, 2018 at 09:59:38AM -0700, Eric Dumazet wrote:
> > On 11/01/2018 09:32 AM, Peter Zijlstra wrote:
> >
> > >> Anyhow, if the atomic maintainers are willing to stand up and state for
> > >> the record that the atomic counters are guaranteed to wrap modulo 2^n
> > >> just like unsigned integers, then I'm happy to take Paul's patch.
> > >
> > > I myself am certainly relying on it.
> >
> > Could we get uatomic_t support maybe ?
>
> Whatever for; it'd be the exact identical same functions as for
> atomic_t, except for a giant amount of code duplication to deal with the
> new type.
>
> That is; today we merged a bunch of scripts that generates most of
> atomic*_t, so we could probably script uatomic*_t wrappers with minimal
> effort, but it would add several thousand lines of code to each compile
> for absolutely no reason what so ever.
>
> > This reminds me of this sooooo silly patch :/
> >
> > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=adb03115f4590baa280ddc440a8eff08a6be0cb7
>
> Yes, that's stupid. UBSAN is just wrong there.
It would be good for UBSAN to treat atomic operations as guaranteed
2s complement with no UB for signed integer overflow. After all, if
even the C standard is willing to do this...
Ah, but don't we disable interrupts and fall back to normal arithmetic
for UP systems? Hmmm... We do so for atomic_add_return() even on
x86, it turns out:
static __always_inline int arch_atomic_add_return(int i, atomic_t *v)
{
return i + xadd(&v->counter, i);
}
So UBSAN actually did have a point. :-(
Thanx, Paul
^ permalink raw reply
* Re: [PATCH net] qmi_wwan: Support dynamic config on Quectel EP06
From: Kristian Evensen @ 2018-11-01 17:40 UTC (permalink / raw)
To: Network Development, Bjørn Mork
In-Reply-To: <20180908115048.12667-1-kristian.evensen@gmail.com>
Hi,
On Sat, Sep 8, 2018 at 1:50 PM Kristian Evensen
<kristian.evensen@gmail.com> wrote:
> Quectel EP06 (and EM06/EG06) supports dynamic configuration of USB
> interfaces, without the device changing VID/PID or configuration number.
> When the configuration is updated and interfaces are added/removed, the
> interface numbers change. This means that the current code for matching
> EP06 does not work.
Would it be possible to have this patch added to stable? I checked
both the 4.14-tree and the stable queue, but could not find the
patch/upstream commit.
Thanks,
Kristian
^ permalink raw reply
* Re: [RFC PATCH] lib: Introduce generic __cmpxchg_u64() and use it where needed
From: Paul E. McKenney @ 2018-11-01 17:34 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Trond Myklebust, mark.rutland@arm.com,
linux-kernel@vger.kernel.org, ralf@linux-mips.org,
jlayton@kernel.org, linuxppc-dev@lists.ozlabs.org,
bfields@fieldses.org, linux-mips@linux-mips.org,
linux@roeck-us.net, linux-nfs@vger.kernel.org,
akpm@linux-foundation.org, will.deacon@arm.com,
boqun.feng@gmail.com, paul.burton@mips.com,
anna.schumaker@netapp.com, "jhogan@kerne
In-Reply-To: <20181101171846.GI3178@hirez.programming.kicks-ass.net>
On Thu, Nov 01, 2018 at 06:18:46PM +0100, Peter Zijlstra wrote:
> On Thu, Nov 01, 2018 at 10:01:46AM -0700, Paul E. McKenney wrote:
> > On Thu, Nov 01, 2018 at 05:32:12PM +0100, Peter Zijlstra wrote:
> > > On Thu, Nov 01, 2018 at 03:22:15PM +0000, Trond Myklebust wrote:
> > > > On Thu, 2018-11-01 at 15:59 +0100, Peter Zijlstra wrote:
> > > > > On Thu, Nov 01, 2018 at 01:18:46PM +0000, Mark Rutland wrote:
> > >
> > > > > > > My one question (and the reason why I went with cmpxchg() in the
> > > > > > > first place) would be about the overflow behaviour for
> > > > > > > atomic_fetch_inc() and friends. I believe those functions should
> > > > > > > be OK on x86, so that when we overflow the counter, it behaves
> > > > > > > like an unsigned value and wraps back around. Is that the case
> > > > > > > for all architectures?
> > > > > > >
> > > > > > > i.e. are atomic_t/atomic64_t always guaranteed to behave like
> > > > > > > u32/u64 on increment?
> > > > > > >
> > > > > > > I could not find any documentation that explicitly stated that
> > > > > > > they should.
> > > > > >
> > > > > > Peter, Will, I understand that the atomic_t/atomic64_t ops are
> > > > > > required to wrap per 2's-complement. IIUC the refcount code relies
> > > > > > on this.
> > > > > >
> > > > > > Can you confirm?
> > > > >
> > > > > There is quite a bit of core code that hard assumes 2s-complement.
> > > > > Not only for atomics but for any signed integer type. Also see the
> > > > > kernel using -fno-strict-overflow which implies -fwrapv, which
> > > > > defines signed overflow to behave like 2s-complement (and rids us of
> > > > > that particular UB).
> > > >
> > > > Fair enough, but there have also been bugfixes to explicitly fix unsafe
> > > > C standards assumptions for signed integers. See, for instance commit
> > > > 5a581b367b5d "jiffies: Avoid undefined behavior from signed overflow"
> > > > from Paul McKenney.
> > >
> > > Yes, I feel Paul has been to too many C/C++ committee meetings and got
> > > properly paranoid. Which isn't always a bad thing :-)
> >
> > Even the C standard defines 2s complement for atomics.
>
> Ooh good to know.
Must be some mistake, right? ;-)
> > Just not for
> > normal arithmetic, where yes, signed overflow is UB. And yes, I do
> > know about -fwrapv, but I would like to avoid at least some copy-pasta
> > UB from my kernel code to who knows what user-mode environment. :-/
> >
> > At least where it is reasonably easy to do so.
>
> Fair enough I suppose; I just always make sure to include the same
> -fknobs for the userspace thing when I lift code.
Agreed! But when it is other people lifting the code...
> > And there is a push to define C++ signed arithmetic as 2s complement,
> > but there are still 1s complement systems with C compilers. Just not
> > C++ compilers. Legacy...
>
> *groan*; how about those ancient hardwares keep using ancient compilers
> and we all move on to the 70s :-)
Hey!!! Some of that 70s (and 60s!) 1s-complement hardware helped pay
my way through university the first time around!!! ;-)
Though where it once filled a room it is now on a single small chip.
Go figure...
> > > But for us using -fno-strict-overflow which actually defines signed
> > > overflow, I myself am really not worried. I'm also not sure if KASAN has
> > > been taught about this, or if it will still (incorrectly) warn about UB
> > > for signed types.
> >
> > UBSAN gave me a signed-overflow warning a few days ago. Which I have
> > fixed, even though 2s complement did the right thing. I am also taking
> > advantage of the change to use better naming.
>
> Oh too many *SANs I suppose; and yes, if you can make the code better,
> why not.
Yeah, when INT_MIN was confined to a single function, no problem.
But thanks to the RCU flavor consolidation, it has to be spread out a
bit more... Plus there is now INT_MAX, INT_MAX/2, ...
> > > > Anyhow, if the atomic maintainers are willing to stand up and state for
> > > > the record that the atomic counters are guaranteed to wrap modulo 2^n
> > > > just like unsigned integers, then I'm happy to take Paul's patch.
> > >
> > > I myself am certainly relying on it.
> >
> > Color me confused. My 5a581b367b5d is from 2013. Or is "Paul" instead
> > intended to mean Paul Mackerras, who happens to be on CC?
>
> Paul Burton I think, on a part of the thread before we joined :-)
Couldn't be bothered to look up the earlier part of the thread. Getting
lazy in my old age. ;-)
Thanx, Paul
^ permalink raw reply
* Re: Kernel 4.19 network performance - forwarding/routing normal users traffic
From: Paweł Staszewski @ 2018-11-01 17:30 UTC (permalink / raw)
To: David Ahern, Jesper Dangaard Brouer; +Cc: netdev, Yoel Caspersen
In-Reply-To: <7141e1e0-93e4-ab20-bce6-17f1e14682f1@gmail.com>
W dniu 01.11.2018 o 18:23, David Ahern pisze:
> On 11/1/18 7:52 AM, Paweł Staszewski wrote:
>>
>> W dniu 01.11.2018 o 11:55, Jesper Dangaard Brouer pisze:
>>> On Wed, 31 Oct 2018 21:37:16 -0600 David Ahern <dsahern@gmail.com> wrote:
>>>
>>>> This is mainly a forwarding use case? Seems so based on the perf report.
>>>> I suspect forwarding with XDP would show pretty good improvement.
>>> Yes, significant performance improvements.
>>>
>>> Notice Davids talk: "Leveraging Kernel Tables with XDP"
>>> http://vger.kernel.org/lpc-networking2018.html#session-1
>> It will be rly interesting
> It's pushing the exact use case you have: FRR manages the FIB, XDP
> programs get access to updates as they happen for fast path forwarding.
Cant wait then :)
>>> It looks like that you are doing "pure" IP-routing, without any
>>> iptables conntrack stuff (from your perf report data). That will
>>> actually be a really good use-case for accelerating this with XDP.
>> Yes pure IP routing
>> iptables used only for some local input filtering.
>>
>>
>>> I want you to understand the philosophy behind how David and I want
>>> people to leverage XDP. Think of XDP as a software offload layer for
>>> the kernel network stack. Setup and use Linux kernel network stack, but
>>> accelerate parts of it with XDP, e.g. the route FIB lookup.
>>>
>>> Sample code avail here:
>>>
>>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/samples/bpf/xdp_fwd_kern.c
>>>
>> I can try some tests on same hw but testlab configuration - will give it
>> a try :)
>>
> That version does not work with VLANs. I have patches for it but it
> needs a bit more work before sending out. Perhaps I can get back to it
> next week.
>
Will be nice - next week i will be able to replace network controller
and install separate two 100Gbit nics into two pciex x16 slots - so can
test without hitting pcie bandwidth limits.
^ permalink raw reply
* Re: Kernel 4.19 network performance - forwarding/routing normal users traffic
From: David Ahern @ 2018-11-01 17:23 UTC (permalink / raw)
To: Paweł Staszewski, Jesper Dangaard Brouer, David Ahern
Cc: netdev, Yoel Caspersen
In-Reply-To: <63198d68-6752-3695-f406-d86fb395c12b@itcare.pl>
On 11/1/18 7:52 AM, Paweł Staszewski wrote:
>
>
> W dniu 01.11.2018 o 11:55, Jesper Dangaard Brouer pisze:
>> On Wed, 31 Oct 2018 21:37:16 -0600 David Ahern <dsahern@gmail.com> wrote:
>>
>>> This is mainly a forwarding use case? Seems so based on the perf report.
>>> I suspect forwarding with XDP would show pretty good improvement.
>> Yes, significant performance improvements.
>>
>> Notice Davids talk: "Leveraging Kernel Tables with XDP"
>> http://vger.kernel.org/lpc-networking2018.html#session-1
> It will be rly interesting
It's pushing the exact use case you have: FRR manages the FIB, XDP
programs get access to updates as they happen for fast path forwarding.
>
>> It looks like that you are doing "pure" IP-routing, without any
>> iptables conntrack stuff (from your perf report data). That will
>> actually be a really good use-case for accelerating this with XDP.
> Yes pure IP routing
> iptables used only for some local input filtering.
>
>
>>
>> I want you to understand the philosophy behind how David and I want
>> people to leverage XDP. Think of XDP as a software offload layer for
>> the kernel network stack. Setup and use Linux kernel network stack, but
>> accelerate parts of it with XDP, e.g. the route FIB lookup.
>>
>> Sample code avail here:
>>
>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/samples/bpf/xdp_fwd_kern.c
>>
> I can try some tests on same hw but testlab configuration - will give it
> a try :)
>
That version does not work with VLANs. I have patches for it but it
needs a bit more work before sending out. Perhaps I can get back to it
next week.
^ permalink raw reply
* Re: [RFC bpf-next] libbpf: increase rlimit before trying to create BPF maps
From: Quentin Monnet @ 2018-11-01 17:18 UTC (permalink / raw)
To: Alexei Starovoitov, Daniel Borkmann; +Cc: netdev, oss-drivers
In-Reply-To: <1540913023-8903-1-git-send-email-quentin.monnet@netronome.com>
2018-10-30 15:23 UTC+0000 ~ Quentin Monnet <quentin.monnet@netronome.com>
> The limit for memory locked in the kernel by a process is usually set to
> 64 bytes by default. This can be an issue when creating large BPF maps.
> A workaround is to raise this limit for the current process before
> trying to create a new BPF map. Changing the hard limit requires the
> CAP_SYS_RESOURCE and can usually only be done by root user (but then
> only root can create BPF maps).
Sorry, the parenthesis is not correct: non-root users can in fact create
BPF maps as well. If a non-root user calls the function to create a map,
setrlimit() will fail silently (but set errno), and the program will
simply go on with its rlimit unchanged.
>
> As far as I know there is not API to get the current amount of memory
> locked for a user, therefore we cannot raise the limit only when
> required. One solution, used by bcc, is to try to create the map, and on
> getting a EPERM error, raising the limit to infinity before giving
> another try. Another approach, used in iproute, is to raise the limit in
> all cases, before trying to create the map.
>
> Here we do the same as in iproute2: the rlimit is raised to infinity
> before trying to load the map.
>
> I send this patch as a RFC to see if people would prefer the bcc
> approach instead, or the rlimit change to be in bpftool rather than in
> libbpf.
>
> Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
> ---
> tools/lib/bpf/bpf.c | 5 +++++
> 1 file changed, 5 insertions(+)
>
> diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
> index 03f9bcc4ef50..456a5a7b112c 100644
> --- a/tools/lib/bpf/bpf.c
> +++ b/tools/lib/bpf/bpf.c
> @@ -26,6 +26,8 @@
> #include <unistd.h>
> #include <asm/unistd.h>
> #include <linux/bpf.h>
> +#include <sys/resource.h>
> +#include <sys/types.h>
> #include "bpf.h"
> #include "libbpf.h"
> #include <errno.h>
> @@ -68,8 +70,11 @@ static inline int sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr,
> int bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr)
> {
> __u32 name_len = create_attr->name ? strlen(create_attr->name) : 0;
> + struct rlimit rinf = { RLIM_INFINITY, RLIM_INFINITY };
> union bpf_attr attr;
>
> + setrlimit(RLIMIT_MEMLOCK, &rinf);
> +
> memset(&attr, '\0', sizeof(attr));
>
> attr.map_type = create_attr->map_type;
>
^ permalink raw reply
* KASAN: use-after-free Read in ip6_tnl_parse_tlv_enc_lim
From: syzbot @ 2018-11-01 16:51 UTC (permalink / raw)
To: davem, kuznet, linux-kernel, netdev, syzkaller-bugs, yoshfuji
Hello,
syzbot found the following crash on:
HEAD commit: 44adbac8f721 Merge branch 'work.tty-ioctl' of git://git.ke..
git tree: bpf-next
console output: https://syzkaller.appspot.com/x/log.txt?x=16043183400000
kernel config: https://syzkaller.appspot.com/x/.config?x=9cb981ee01463dea
dashboard link: https://syzkaller.appspot.com/bug?extid=2c8d8fccd6cece4ede2a
compiler: gcc (GCC) 8.0.1 20180413 (experimental)
Unfortunately, I don't have any reproducer for this crash yet.
IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+2c8d8fccd6cece4ede2a@syzkaller.appspotmail.com
��6��a�~IyM�\x12�: renamed from nr0\x01
==================================================================
BUG: KASAN: use-after-free in ip6_tnl_parse_tlv_enc_lim+0x5df/0x660
net/ipv6/ip6_tunnel.c:417
Read of size 1 at addr ffff8801ca21227f by task syz-executor0/11239
CPU: 1 PID: 11239 Comm: syz-executor0 Not tainted 4.19.0+ #135
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
Google 01/01/2011
Call Trace:
__dump_stack lib/dump_stack.c:77 [inline]
dump_stack+0x244/0x39d lib/dump_stack.c:113
print_address_description.cold.7+0x9/0x1ff mm/kasan/report.c:256
kasan_report_error mm/kasan/report.c:354 [inline]
kasan_report.cold.8+0x242/0x309 mm/kasan/report.c:412
__asan_report_load1_noabort+0x14/0x20 mm/kasan/report.c:430
ip6_tnl_parse_tlv_enc_lim+0x5df/0x660 net/ipv6/ip6_tunnel.c:417
ip6ip6_tnl_xmit net/ipv6/ip6_tunnel.c:1348 [inline]
ip6_tnl_start_xmit+0x49f/0x25a0 net/ipv6/ip6_tunnel.c:1412
__netdev_start_xmit include/linux/netdevice.h:4336 [inline]
netdev_start_xmit include/linux/netdevice.h:4345 [inline]
xmit_one net/core/dev.c:3252 [inline]
dev_hard_start_xmit+0x295/0xc90 net/core/dev.c:3268
__dev_queue_xmit+0x2f71/0x3ad0 net/core/dev.c:3838
dev_queue_xmit+0x17/0x20 net/core/dev.c:3871
__bpf_tx_skb net/core/filter.c:2017 [inline]
__bpf_redirect_common net/core/filter.c:2055 [inline]
__bpf_redirect+0x5cf/0xb20 net/core/filter.c:2062
____bpf_clone_redirect net/core/filter.c:2095 [inline]
bpf_clone_redirect+0x2f6/0x490 net/core/filter.c:2067
bpf_prog_759a992c578a3894+0x7d7/0x1000
Allocated by task 10771:
save_stack+0x43/0xd0 mm/kasan/kasan.c:448
set_track mm/kasan/kasan.c:460 [inline]
kasan_kmalloc+0xc7/0xe0 mm/kasan/kasan.c:553
kasan_slab_alloc+0x12/0x20 mm/kasan/kasan.c:490
kmem_cache_alloc+0x12e/0x730 mm/slab.c:3554
getname_flags+0xd0/0x5a0 fs/namei.c:140
getname fs/namei.c:211 [inline]
user_path_create fs/namei.c:3693 [inline]
do_symlinkat+0xe9/0x2d0 fs/namei.c:4147
__do_sys_symlink fs/namei.c:4173 [inline]
__se_sys_symlink fs/namei.c:4171 [inline]
__x64_sys_symlink+0x59/0x80 fs/namei.c:4171
do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290
entry_SYSCALL_64_after_hwframe+0x49/0xbe
Freed by task 10771:
save_stack+0x43/0xd0 mm/kasan/kasan.c:448
set_track mm/kasan/kasan.c:460 [inline]
__kasan_slab_free+0x102/0x150 mm/kasan/kasan.c:521
kasan_slab_free+0xe/0x10 mm/kasan/kasan.c:528
__cache_free mm/slab.c:3498 [inline]
kmem_cache_free+0x83/0x290 mm/slab.c:3756
putname+0xf2/0x130 fs/namei.c:261
filename_create+0x2b2/0x5b0 fs/namei.c:3658
user_path_create fs/namei.c:3693 [inline]
do_symlinkat+0xfe/0x2d0 fs/namei.c:4147
__do_sys_symlink fs/namei.c:4173 [inline]
__se_sys_symlink fs/namei.c:4171 [inline]
__x64_sys_symlink+0x59/0x80 fs/namei.c:4171
do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290
entry_SYSCALL_64_after_hwframe+0x49/0xbe
The buggy address belongs to the object at ffff8801ca212140
which belongs to the cache names_cache of size 4096
The buggy address is located 319 bytes inside of
4096-byte region [ffff8801ca212140, ffff8801ca213140)
The buggy address belongs to the page:
page:ffffea0007288480 count:1 mapcount:0 mapping:ffff8801da973d80 index:0x0
compound_mapcount: 0
flags: 0x2fffc0000008100(slab|head)
raw: 02fffc0000008100 ffffea0006dd6288 ffffea0007223988 ffff8801da973d80
raw: 0000000000000000 ffff8801ca212140 0000000100000001 0000000000000000
page dumped because: kasan: bad access detected
Memory state around the buggy address:
ffff8801ca212100: fc fc fc fc fc fc fc fc fb fb fb fb fb fb fb fb
ffff8801ca212180: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
> ffff8801ca212200: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
^
ffff8801ca212280: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
ffff8801ca212300: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
==================================================================
---
This bug is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.
syzbot will keep track of this bug report. See:
https://goo.gl/tpsmEJ#bug-status-tracking for how to communicate with
syzbot.
^ permalink raw reply
* Re: Kernel 4.19 network performance - forwarding/routing normal users traffic
From: Paweł Staszewski @ 2018-11-01 16:49 UTC (permalink / raw)
To: Saeed Mahameed, netdev@vger.kernel.org
In-Reply-To: <6486d01d-7a50-33c4-e27f-4ace8aa8e150@itcare.pl>
W dniu 01.11.2018 o 12:09, Paweł Staszewski pisze:
>>> rx_cqe_compress_pkts: 0
>> If this is a pcie bottleneck it might be useful to enable CQE
>> compression (to reduce PCIe completion descriptors transactions)
>> you should see the above rx_cqe_compress_pkts increasing when enabled.
>>
>> $ ethtool --set-priv-flags enp175s0f1 rx_cqe_compress on
>> $ ethtool --show-priv-flags enp175s0f1
>> Private flags for p6p1:
>> rx_cqe_moder : on
>> cqe_moder : off
>> rx_cqe_compress : on
>> ...
>>
>> try this on both interfaces.
> Done
> ethtool --show-priv-flags enp175s0f1
> Private flags for enp175s0f1:
> rx_cqe_moder : on
> tx_cqe_moder : off
> rx_cqe_compress : on
> rx_striding_rq : off
> rx_no_csum_complete: off
>
> ethtool --show-priv-flags enp175s0f0
> Private flags for enp175s0f0:
> rx_cqe_moder : on
> tx_cqe_moder : off
> rx_cqe_compress : on
> rx_striding_rq : off
> rx_no_csum_complete: off
Enabling cqe compress changes nothing after reaching 64Gbit RX /
64Gbit/s TX on interfaces cpu's are saturated at 100%
ethtool -S enp175s0f1 | grep rx_cqe_compress
rx_cqe_compress_blks: 5657836379
rx_cqe_compress_pkts: 13153761080
ethtool -S enp175s0f0 | grep rx_cqe_compress
rx_cqe_compress_blks: 5994612500
rx_cqe_compress_pkts: 13579014869
bwm-ng v0.6.1 (probing every 1.000s), press 'h' for help
input: /proc/net/dev type: rate
- iface Rx Tx Total
==============================================================================
enp175s0f1: 27.03 Gb/s 37.09 Gb/s
64.12 Gb/s
enp175s0f0: 36.84 Gb/s 26.82 Gb/s
63.66 Gb/s
------------------------------------------------------------------------------
total: 63.85 Gb/s 63.87 Gb/s 127.72 Gb/s
bwm-ng v0.6.1 (probing every 1.000s), press 'h' for help
input: /proc/net/dev type: rate
/ iface Rx Tx Total
==============================================================================
enp175s0f1: 3.22 GB/s 4.26 GB/s
7.48 GB/s
enp175s0f0: 4.24 GB/s 3.21 GB/s
7.45 GB/s
------------------------------------------------------------------------------
total: 7.46 GB/s 7.47 GB/s
14.93 GB/s
mpstat
Average: CPU %usr %nice %sys %iowait %irq %soft %steal
%guest %gnice %idle
Average: all 0.05 0.00 0.19 0.02 0.00 42.74 0.00
0.00 0.00 56.99
Average: 0 0.00 0.00 0.00 0.00 0.00 0.00 0.00
0.00 0.00 100.00
Average: 1 0.00 0.00 0.30 0.00 0.00 0.00 0.00
0.00 0.00 99.70
Average: 2 0.00 0.00 0.20 0.00 0.00 0.00 0.00
0.00 0.00 99.80
Average: 3 0.00 0.00 0.20 1.20 0.00 0.00 0.00
0.00 0.00 98.60
Average: 4 0.10 0.00 0.00 0.00 0.00 0.00 0.00
0.00 0.00 99.90
Average: 5 0.00 0.00 0.10 0.00 0.00 0.00 0.00
0.00 0.00 99.90
Average: 6 0.10 0.00 0.20 0.00 0.00 0.00 0.00
0.00 0.00 99.70
Average: 7 0.00 0.00 0.00 0.00 0.00 0.00 0.00
0.00 0.00 100.00
Average: 8 0.00 0.00 0.00 0.00 0.00 0.00 0.00
0.00 0.00 100.00
Average: 9 0.00 0.00 0.00 0.00 0.00 0.00 0.00
0.00 0.00 100.00
Average: 10 0.00 0.00 0.00 0.00 0.00 0.00 0.00
0.00 0.00 100.00
Average: 11 0.00 0.00 0.00 0.00 0.00 0.00 0.00
0.00 0.00 100.00
Average: 12 1.40 0.00 4.50 0.00 0.00 0.00 0.00
0.00 0.00 94.10
Average: 13 0.00 0.00 1.60 0.00 0.00 0.00 0.00
0.00 0.00 98.40
Average: 14 0.00 0.00 0.00 0.00 0.00 84.10 0.00
0.00 0.00 15.90
Average: 15 0.00 0.00 0.10 0.00 0.00 93.70 0.00
0.00 0.00 6.20
Average: 16 0.00 0.00 0.10 0.00 0.00 94.31 0.00
0.00 0.00 5.59
Average: 17 0.00 0.00 0.00 0.00 0.00 95.30 0.00
0.00 0.00 4.70
Average: 18 0.00 0.00 0.00 0.00 0.00 62.80 0.00
0.00 0.00 37.20
Average: 19 0.00 0.00 0.10 0.00 0.00 98.90 0.00
0.00 0.00 1.00
Average: 20 0.00 0.00 0.00 0.00 0.00 99.30 0.00
0.00 0.00 0.70
Average: 21 0.00 0.00 0.00 0.00 0.00 100.00 0.00
0.00 0.00 0.00
Average: 22 0.00 0.00 0.00 0.00 0.00 99.90 0.00
0.00 0.00 0.10
Average: 23 0.00 0.00 0.10 0.00 0.00 99.90 0.00
0.00 0.00 0.00
Average: 24 0.00 0.00 0.10 0.00 0.00 97.10 0.00
0.00 0.00 2.80
Average: 25 0.00 0.00 0.00 0.00 0.00 64.06 0.00
0.00 0.00 35.94
Average: 26 0.00 0.00 0.10 0.00 0.00 88.50 0.00
0.00 0.00 11.40
Average: 27 0.00 0.00 0.00 0.00 0.00 94.10 0.00
0.00 0.00 5.90
Average: 28 0.80 0.00 0.00 0.00 0.00 0.00 0.00
0.00 0.00 99.20
Average: 29 0.00 0.00 0.10 0.00 0.00 0.00 0.00
0.00 0.00 99.90
Average: 30 0.00 0.00 0.00 0.00 0.00 0.00 0.00
0.00 0.00 100.00
Average: 31 0.20 0.00 0.80 0.00 0.00 0.00 0.00
0.00 0.00 99.00
Average: 32 0.00 0.00 0.00 0.00 0.00 0.00 0.00
0.00 0.00 100.00
Average: 33 0.00 0.00 0.00 0.00 0.00 0.00 0.00
0.00 0.00 100.00
Average: 34 0.00 0.00 0.00 0.00 0.00 0.00 0.00
0.00 0.00 100.00
Average: 35 0.00 0.00 0.00 0.00 0.00 0.00 0.00
0.00 0.00 100.00
Average: 36 0.20 0.00 0.40 0.00 0.00 0.00 0.00
0.00 0.00 99.40
Average: 37 0.00 0.00 0.10 0.00 0.00 0.00 0.00
0.00 0.00 99.90
Average: 38 0.00 0.00 0.00 0.00 0.00 0.00 0.00
0.00 0.00 100.00
Average: 39 0.00 0.00 0.00 0.00 0.00 0.00 0.00
0.00 0.00 100.00
Average: 40 0.10 0.00 0.00 0.00 0.00 0.00 0.00
0.00 0.00 99.90
Average: 41 0.10 0.00 1.20 0.00 0.00 0.00 0.00
0.00 0.00 98.70
Average: 42 0.00 0.00 0.10 0.00 0.00 78.92 0.00
0.00 0.00 20.98
Average: 43 0.00 0.00 0.00 0.00 0.00 81.00 0.00
0.00 0.00 19.00
Average: 44 0.00 0.00 0.00 0.00 0.00 82.58 0.00
0.00 0.00 17.42
Average: 45 0.00 0.00 0.00 0.00 0.00 68.97 0.00
0.00 0.00 31.03
Average: 46 0.00 0.00 0.10 0.00 0.00 79.20 0.00
0.00 0.00 20.70
Average: 47 0.00 0.00 0.00 0.00 0.00 71.33 0.00
0.00 0.00 28.67
Average: 48 0.00 0.00 0.10 0.00 0.00 72.40 0.00
0.00 0.00 27.50
Average: 49 0.00 0.00 0.00 0.00 0.00 90.79 0.00
0.00 0.00 9.21
Average: 50 0.00 0.00 0.10 0.00 0.00 93.20 0.00
0.00 0.00 6.70
Average: 51 0.00 0.00 0.00 0.00 0.00 91.70 0.00
0.00 0.00 8.30
Average: 52 0.00 0.00 0.10 0.00 0.00 79.90 0.00
0.00 0.00 20.00
Average: 53 0.00 0.00 0.00 0.00 0.00 76.20 0.00
0.00 0.00 23.80
Average: 54 0.00 0.00 0.00 0.00 0.00 89.59 0.00
0.00 0.00 10.41
Average: 55 0.00 0.00 0.10 0.00 0.00 65.97 0.00
0.00 0.00 33.93
So yes it looks like pcie x16 limit and pcie is 8/8GB/s both directions
- 16GB/s one direction
So 100Gbit/s network controller need to have pcie x32 :)
I understand in normal server host scenario (no forwarding) most traffic
is outbound or inbound - not many situations where we have 100G input
and 100G output.
I will replace then this 2 port 100G nic with two connect-x 5 100G nic's
installed in two different pcie x16
^ permalink raw reply
* Re: [RFC PATCH] lib: Introduce generic __cmpxchg_u64() and use it where needed
From: Peter Zijlstra @ 2018-11-01 16:32 UTC (permalink / raw)
To: Trond Myklebust
Cc: mark.rutland@arm.com, linux-kernel@vger.kernel.org,
ralf@linux-mips.org, jlayton@kernel.org,
linuxppc-dev@lists.ozlabs.org, bfields@fieldses.org,
linux-mips@linux-mips.org, linux@roeck-us.net,
linux-nfs@vger.kernel.org, akpm@linux-foundation.org,
will.deacon@arm.com, boqun.feng@gmail.com, paul.burton@mips.com,
anna.schumaker@netapp.com, jhogan@kernel.org, "netdev@vger.ke
In-Reply-To: <f38e272f7a96e983549e4281aa9fd02833a4277a.camel@hammerspace.com>
On Thu, Nov 01, 2018 at 03:22:15PM +0000, Trond Myklebust wrote:
> On Thu, 2018-11-01 at 15:59 +0100, Peter Zijlstra wrote:
> > On Thu, Nov 01, 2018 at 01:18:46PM +0000, Mark Rutland wrote:
> > > > My one question (and the reason why I went with cmpxchg() in the
> > > > first place) would be about the overflow behaviour for
> > > > atomic_fetch_inc() and friends. I believe those functions should
> > > > be OK on x86, so that when we overflow the counter, it behaves
> > > > like an unsigned value and wraps back around. Is that the case
> > > > for all architectures?
> > > >
> > > > i.e. are atomic_t/atomic64_t always guaranteed to behave like
> > > > u32/u64 on increment?
> > > >
> > > > I could not find any documentation that explicitly stated that
> > > > they should.
> > >
> > > Peter, Will, I understand that the atomic_t/atomic64_t ops are
> > > required to wrap per 2's-complement. IIUC the refcount code relies
> > > on this.
> > >
> > > Can you confirm?
> >
> > There is quite a bit of core code that hard assumes 2s-complement.
> > Not only for atomics but for any signed integer type. Also see the
> > kernel using -fno-strict-overflow which implies -fwrapv, which
> > defines signed overflow to behave like 2s-complement (and rids us of
> > that particular UB).
>
> Fair enough, but there have also been bugfixes to explicitly fix unsafe
> C standards assumptions for signed integers. See, for instance commit
> 5a581b367b5d "jiffies: Avoid undefined behavior from signed overflow"
> from Paul McKenney.
Yes, I feel Paul has been to too many C/C++ committee meetings and got
properly paranoid. Which isn't always a bad thing :-)
But for us using -fno-strict-overflow which actually defines signed
overflow, I myself am really not worried. I'm also not sure if KASAN has
been taught about this, or if it will still (incorrectly) warn about UB
for signed types.
> Anyhow, if the atomic maintainers are willing to stand up and state for
> the record that the atomic counters are guaranteed to wrap modulo 2^n
> just like unsigned integers, then I'm happy to take Paul's patch.
I myself am certainly relying on it.
^ permalink raw reply
* Re: [PATCH net] net: hns3: Fix for out-of-bounds access when setting pfc back pressure
From: Yunsheng Lin @ 2018-11-02 0:48 UTC (permalink / raw)
To: John Garry, davem
Cc: huangdaode, xuwei5, liguozhu, Yisen.Zhuang, linuxarm, salil.mehta,
lipeng321, netdev, linux-kernel
In-Reply-To: <4b5d0e04-df53-6123-0562-7f0146d33ae9@huawei.com>
On 2018/11/1 20:49, John Garry wrote:
> On 01/11/2018 12:12, Yunsheng Lin wrote:
>> The vport should be initialized to hdev->vport for each bp group,
>> otherwise it will cause out-of-bounds access and bp setting not
>> correct problem.
>>
>> [ 35.254124] BUG: KASAN: slab-out-of-bounds in hclge_pause_setup_hw+0x2a0/0x3f8 [hclge]
>> [ 35.254126] Read of size 2 at addr ffff803b6651581a by task kworker/0:1/14
>>
>> [ 35.254132] CPU: 0 PID: 14 Comm: kworker/0:1 Not tainted 4.19.0-rc7-hulk+ #85
>> [ 35.254133] Hardware name: Huawei D06/D06, BIOS Hisilicon D06 UEFI RC0 - B052 (V0.52) 09/14/2018
>> [ 35.254141] Workqueue: events work_for_cpu_fn
>> [ 35.254144] Call trace:
>> [ 35.254147] dump_backtrace+0x0/0x2f0
>> [ 35.254149] show_stack+0x24/0x30
>> [ 35.254154] dump_stack+0x110/0x184
>> [ 35.254157] print_address_description+0x168/0x2b0
>> [ 35.254160] kasan_report+0x184/0x310
>> [ 35.254162] __asan_load2+0x7c/0xa0
>> [ 35.254170] hclge_pause_setup_hw+0x2a0/0x3f8 [hclge]
>> [ 35.254177] hclge_tm_init_hw+0x794/0x9f0 [hclge]
>> [ 35.254184] hclge_tm_schd_init+0x48/0x58 [hclge]
>> [ 35.254191] hclge_init_ae_dev+0x778/0x1168 [hclge]
>> [ 35.254196] hnae3_register_ae_dev+0x14c/0x298 [hnae3]
>> [ 35.254206] hns3_probe+0x88/0xa8 [hns3]
>> [ 35.254210] local_pci_probe+0x7c/0xf0
>> [ 35.254212] work_for_cpu_fn+0x34/0x50
>> [ 35.254214] process_one_work+0x4d4/0xa38
>> [ 35.254216] worker_thread+0x55c/0x8d8
>> [ 35.254219] kthread+0x1b0/0x1b8
>> [ 35.254222] ret_from_fork+0x10/0x1c
>>
>> [ 35.254224] The buggy address belongs to the page:
>> [ 35.254228] page:ffff7e00ed994400 count:1 mapcount:0 mapping:0000000000000000 index:0x0 compound_mapcount: 0
>> [ 35.273835] flags: 0xfffff8000008000(head)
>> [ 35.282007] raw: 0fffff8000008000 dead000000000100 dead000000000200 0000000000000000
>> [ 35.282010] raw: 0000000000000000 0000000000000000 00000001ffffffff 0000000000000000
>> [ 35.282012] page dumped because: kasan: bad access detected
>>
>> [ 35.282014] Memory state around the buggy address:
>> [ 35.282017] ffff803b66515700: fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe
>> [ 35.282019] ffff803b66515780: fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe
>> [ 35.282021] >ffff803b66515800: fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe
>> [ 35.282022] ^
>> [ 35.282024] ffff803b66515880: fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe
>> [ 35.282026] ffff803b66515900: fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe
>> [ 35.282028] ==================================================================
>> [ 35.282029] Disabling lock debugging due to kernel taint
>> [ 35.282747] hclge driver initialization finished.
>>
>> Fixes: 67bf2541f4b9 ("net: hns3: Fixes the back pressure setting when sriov is enabled")
>> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
>> ---
>> drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c | 3 ++-
>> 1 file changed, 2 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c
>> index aa5cb98..0c9c6ae 100644
>> --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c
>> +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c
>> @@ -1168,11 +1168,12 @@ static int hclge_pfc_setup_hw(struct hclge_dev *hdev)
>> */
>> static int hclge_bp_setup_hw(struct hclge_dev *hdev, u8 tc)
>> {
>> - struct hclge_vport *vport = hdev->vport;
>> u32 i, k, qs_bitmap;
>> int ret;
>>
>> for (i = 0; i < HCLGE_BP_GRP_NUM; i++) {
>> + struct hclge_vport *vport = hdev->vport;
>> +
>
> It seems to me that the bug is fixed but the code is not much better.
>
> Since variable vport is only referenced in the inner loop, I would define it there.
I am agree that it is always better to have better and safer code.
I will provide another verion to limit local variable as much as possible.
Thanks for the reviewing and suggestion.
>
>> qs_bitmap = 0;
>>
>> for (k = 0; k < hdev->num_alloc_vport; k++) {
>>
>
> static int hclge_bp_setup_hw(struct hclge_dev *hdev, u8 tc)
> {
> - struct hclge_vport *vport = hdev->vport;
> - u32 i, k, qs_bitmap;
> - int ret;
> + int ret, i;
>
> for (i = 0; i < HCLGE_BP_GRP_NUM; i++) {
> - qs_bitmap = 0;
> + u32 qs_bitmap = 0;
> + int k;
>
> for (k = 0; k < hdev->num_alloc_vport; k++) {
> + struct hclge_vport *vport = &hdev->vport[k];
> u16 qs_id = vport->qs_offset + tc;
> u8 grp, sub_grp;
>
> ...
>
> if (i == grp)
> qs_bitmap |= (1 << sub_grp);
> -
> - vport++;
>
>
> ... so this seems better and safer, as you're explicitly limiting vport within bounds of hdev->num_alloc_vport.
>
>
>
> .
>
^ permalink raw reply
* Re: [PATCH v3 1/2] kretprobe: produce sane stack traces
From: Steven Rostedt @ 2018-11-02 0:47 UTC (permalink / raw)
To: Aleksa Sarai
Cc: Naveen N. Rao, Anil S Keshavamurthy, David S. Miller,
Masami Hiramatsu, Jonathan Corbet, Peter Zijlstra, Ingo Molnar,
Arnaldo Carvalho de Melo, Alexander Shishkin, Jiri Olsa,
Namhyung Kim, Shuah Khan, Alexei Starovoitov, Daniel Borkmann,
Brendan Gregg, Christian Brauner, Aleksa Sarai, netdev, linux-doc,
linux-kernel
In-Reply-To: <20181101083551.3805-2-cyphar@cyphar.com>
On Thu, 1 Nov 2018 19:35:50 +1100
Aleksa Sarai <cyphar@cyphar.com> wrote:
> Historically, kretprobe has always produced unusable stack traces
> (kretprobe_trampoline is the only entry in most cases, because of the
> funky stack pointer overwriting). This has caused quite a few annoyances
> when using tracing to debug problems[1] -- since return values are only
> available with kretprobes but stack traces were only usable for kprobes,
> users had to probe both and then manually associate them.
>
> With the advent of bpf_trace, users would have been able to do this
> association in bpf, but this was less than ideal (because
> bpf_get_stackid would still produce rubbish and programs that didn't
> know better would get silly results). The main usecase for stack traces
> (at least with bpf_trace) is for DTrace-style aggregation on stack
> traces (both entry and exit). Therefore we cannot simply correct the
> stack trace on exit -- we must stash away the stack trace and return the
> entry stack trace when it is requested.
>
> [1]: https://github.com/iovisor/bpftrace/issues/101
>
> Cc: Brendan Gregg <bgregg@netflix.com>
> Cc: Christian Brauner <christian@brauner.io>
> Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
> ---
> Documentation/kprobes.txt | 6 +-
> include/linux/kprobes.h | 27 +++++
> kernel/events/callchain.c | 8 +-
> kernel/kprobes.c | 101 +++++++++++++++++-
> kernel/trace/trace.c | 11 +-
> .../test.d/kprobe/kretprobe_stacktrace.tc | 25 +++++
> 6 files changed, 173 insertions(+), 5 deletions(-)
> create mode 100644 tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_stacktrace.tc
>
> diff --git a/Documentation/kprobes.txt b/Documentation/kprobes.txt
> index 10f4499e677c..1965585848f4 100644
> --- a/Documentation/kprobes.txt
> +++ b/Documentation/kprobes.txt
> @@ -597,7 +597,11 @@ address with the trampoline's address, stack backtraces and calls
> to __builtin_return_address() will typically yield the trampoline's
> address instead of the real return address for kretprobed functions.
> (As far as we can tell, __builtin_return_address() is used only
> -for instrumentation and error reporting.)
> +for instrumentation and error reporting.) However, since return probes
> +are used extensively in tracing (where stack backtraces are useful),
> +return probes will stash away the stack backtrace during function entry
> +so that return probe handlers can use the entry backtrace instead of
> +having a trace with just kretprobe_trampoline.
>
> If the number of times a function is called does not match the number
> of times it returns, registering a return probe on that function may
> diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
> index e909413e4e38..1a1629544e56 100644
> --- a/include/linux/kprobes.h
> +++ b/include/linux/kprobes.h
> @@ -40,6 +40,8 @@
> #include <linux/rcupdate.h>
> #include <linux/mutex.h>
> #include <linux/ftrace.h>
> +#include <linux/stacktrace.h>
> +#include <linux/perf_event.h>
> #include <asm/kprobes.h>
>
> #ifdef CONFIG_KPROBES
> @@ -168,11 +170,18 @@ struct kretprobe {
> raw_spinlock_t lock;
> };
>
> +#define KRETPROBE_TRACE_SIZE 127
> +struct kretprobe_trace {
> + int nr_entries;
> + unsigned long entries[KRETPROBE_TRACE_SIZE];
> +};
> +
> struct kretprobe_instance {
> struct hlist_node hlist;
> struct kretprobe *rp;
> kprobe_opcode_t *ret_addr;
> struct task_struct *task;
> + struct kretprobe_trace entry;
> char data[0];
> };
>
> @@ -371,6 +380,12 @@ void unregister_kretprobe(struct kretprobe *rp);
> int register_kretprobes(struct kretprobe **rps, int num);
> void unregister_kretprobes(struct kretprobe **rps, int num);
>
> +struct kretprobe_instance *current_kretprobe_instance(void);
> +void kretprobe_save_stack_trace(struct kretprobe_instance *ri,
> + struct stack_trace *trace);
> +void kretprobe_perf_callchain_kernel(struct kretprobe_instance *ri,
> + struct perf_callchain_entry_ctx *ctx);
> +
> void kprobe_flush_task(struct task_struct *tk);
> void recycle_rp_inst(struct kretprobe_instance *ri, struct hlist_head *head);
>
> @@ -397,6 +412,18 @@ static inline struct kprobe *kprobe_running(void)
> {
> return NULL;
> }
> +static inline struct kretprobe_instance *current_kretprobe_instance(void)
> +{
> + return NULL;
> +}
> +static inline void kretprobe_save_stack_trace(struct kretprobe_instance *ri,
> + struct stack_trace *trace)
> +{
> +}
> +static inline void kretprobe_perf_callchain_kernel(struct kretprobe_instance *ri,
> + struct perf_callchain_entry_ctx *ctx)
> +{
> +}
> static inline int register_kprobe(struct kprobe *p)
> {
> return -ENOSYS;
> diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
> index 24a77c34e9ad..98edcd8a6987 100644
> --- a/kernel/events/callchain.c
> +++ b/kernel/events/callchain.c
> @@ -12,6 +12,7 @@
> #include <linux/perf_event.h>
> #include <linux/slab.h>
> #include <linux/sched/task_stack.h>
> +#include <linux/kprobes.h>
>
> #include "internal.h"
>
> @@ -197,9 +198,14 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
> ctx.contexts_maxed = false;
>
> if (kernel && !user_mode(regs)) {
> + struct kretprobe_instance *ri = current_kretprobe_instance();
> +
> if (add_mark)
> perf_callchain_store_context(&ctx, PERF_CONTEXT_KERNEL);
> - perf_callchain_kernel(&ctx, regs);
> + if (ri)
> + kretprobe_perf_callchain_kernel(ri, &ctx);
> + else
> + perf_callchain_kernel(&ctx, regs);
> }
>
> if (user) {
> diff --git a/kernel/kprobes.c b/kernel/kprobes.c
> index 90e98e233647..fca3964d18cd 100644
> --- a/kernel/kprobes.c
> +++ b/kernel/kprobes.c
> @@ -1206,6 +1206,16 @@ __releases(hlist_lock)
> }
> NOKPROBE_SYMBOL(kretprobe_table_unlock);
>
> +static bool kretprobe_hash_is_locked(struct task_struct *tsk)
> +{
> + unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
> + raw_spinlock_t *hlist_lock;
> +
> + hlist_lock = kretprobe_table_lock_ptr(hash);
> + return raw_spin_is_locked(hlist_lock);
> +}
> +NOKPROBE_SYMBOL(kretprobe_hash_is_locked);
> +
> /*
> * This function is called from finish_task_switch when task tk becomes dead,
> * so that we can recycle any function-return probe instances associated
> @@ -1800,6 +1810,13 @@ unsigned long __weak arch_deref_entry_point(void *entry)
> return (unsigned long)entry;
> }
>
> +static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs);
> +
> +static inline bool kprobe_is_retprobe(struct kprobe *kp)
> +{
> + return kp->pre_handler == pre_handler_kretprobe;
> +}
> +
> #ifdef CONFIG_KRETPROBES
> /*
> * This kprobe pre_handler is registered with every kretprobe. When probe
> @@ -1826,6 +1843,8 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
> hash = hash_ptr(current, KPROBE_HASH_BITS);
> raw_spin_lock_irqsave(&rp->lock, flags);
> if (!hlist_empty(&rp->free_instances)) {
> + struct stack_trace trace = {};
> +
> ri = hlist_entry(rp->free_instances.first,
> struct kretprobe_instance, hlist);
> hlist_del(&ri->hlist);
> @@ -1834,6 +1853,11 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
> ri->rp = rp;
> ri->task = current;
>
> + trace.entries = &ri->entry.entries[0];
> + trace.max_entries = KRETPROBE_TRACE_SIZE;
> + save_stack_trace_regs(regs, &trace);
> + ri->entry.nr_entries = trace.nr_entries;
> +
So basically your saving a copy of the stack trace for each probe, for
something that may not ever be used?
This adds quite a lot of overhead for something not used often.
I think the real answer is to fix kretprobes (and I just checked, the
return call of function graph tracer stack trace doesn't go past the
return_to_handler function either. It's just that a stack trace was
never done on the return side).
The more I look at this patch, the less I like it.
-- Steve
> if (rp->entry_handler && rp->entry_handler(ri, regs)) {
> raw_spin_lock_irqsave(&rp->lock, flags);
> hlist_add_head(&ri->hlist, &rp->free_instances);
> @@ -1856,6 +1880,65 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
> }
> NOKPROBE_SYMBOL(pre_handler_kretprobe);
>
> +/*
> + * Return the kretprobe_instance associated with the current_kprobe. Calling
> + * this is only reasonable from within a kretprobe handler context (otherwise
> + * return NULL).
> + *
> + * Must be called within a kretprobe_hash_lock(current, ...) context.
> + */
> +struct kretprobe_instance *current_kretprobe_instance(void)
> +{
> + struct kprobe *kp;
> + struct kretprobe *rp;
> + struct kretprobe_instance *ri;
> + struct hlist_head *head;
> + unsigned long hash = hash_ptr(current, KPROBE_HASH_BITS);
> +
> + kp = kprobe_running();
> + if (!kp || !kprobe_is_retprobe(kp))
> + return NULL;
> + if (WARN_ON(!kretprobe_hash_is_locked(current)))
> + return NULL;
> +
> + rp = container_of(kp, struct kretprobe, kp);
> + head = &kretprobe_inst_table[hash];
> +
> + hlist_for_each_entry(ri, head, hlist) {
> + if (ri->task == current && ri->rp == rp)
> + return ri;
> + }
> + return NULL;
> +}
> +EXPORT_SYMBOL_GPL(current_kretprobe_instance);
> +NOKPROBE_SYMBOL(current_kretprobe_instance);
> +
> +void kretprobe_save_stack_trace(struct kretprobe_instance *ri,
> + struct stack_trace *trace)
> +{
> + int i;
> + struct kretprobe_trace *krt = &ri->entry;
> +
> + for (i = trace->skip; i < krt->nr_entries; i++) {
> + if (trace->nr_entries >= trace->max_entries)
> + break;
> + trace->entries[trace->nr_entries++] = krt->entries[i];
> + }
> +}
> +
> +void kretprobe_perf_callchain_kernel(struct kretprobe_instance *ri,
> + struct perf_callchain_entry_ctx *ctx)
> +{
> + int i;
> + struct kretprobe_trace *krt = &ri->entry;
> +
> + for (i = 0; i < krt->nr_entries; i++) {
> + if (krt->entries[i] == ULONG_MAX)
> + break;
> + perf_callchain_store(ctx, (u64) krt->entries[i]);
> + }
> +}
> +
> bool __weak arch_kprobe_on_func_entry(unsigned long offset)
> {
> return !offset;
> @@ -2005,6 +2088,22 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
> }
> NOKPROBE_SYMBOL(pre_handler_kretprobe);
>
> +struct kretprobe_instance *current_kretprobe_instance(void)
> +{
> + return NULL;
> +}
> +EXPORT_SYMBOL_GPL(current_kretprobe_instance);
> +NOKPROBE_SYMBOL(current_kretprobe_instance);
> +
> +void kretprobe_save_stack_trace(struct kretprobe_instance *ri,
> + struct stack_trace *trace)
> +{
> +}
> +
> +void kretprobe_perf_callchain_kernel(struct kretprobe_instance *ri,
> + struct perf_callchain_entry_ctx *ctx)
> +{
> +}
> #endif /* CONFIG_KRETPROBES */
>
> /* Set the kprobe gone and remove its instruction buffer. */
> @@ -2241,7 +2340,7 @@ static void report_probe(struct seq_file *pi, struct kprobe *p,
> char *kprobe_type;
> void *addr = p->addr;
>
> - if (p->pre_handler == pre_handler_kretprobe)
> + if (kprobe_is_retprobe(p))
> kprobe_type = "r";
> else
> kprobe_type = "k";
> diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
> index bf6f1d70484d..2210d38a4dbf 100644
> --- a/kernel/trace/trace.c
> +++ b/kernel/trace/trace.c
> @@ -42,6 +42,7 @@
> #include <linux/nmi.h>
> #include <linux/fs.h>
> #include <linux/trace.h>
> +#include <linux/kprobes.h>
> #include <linux/sched/clock.h>
> #include <linux/sched/rt.h>
>
> @@ -2590,6 +2591,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
> struct ring_buffer_event *event;
> struct stack_entry *entry;
> struct stack_trace trace;
> + struct kretprobe_instance *ri = current_kretprobe_instance();
> int use_stack;
> int size = FTRACE_STACK_ENTRIES;
>
> @@ -2626,7 +2628,9 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
> trace.entries = this_cpu_ptr(ftrace_stack.calls);
> trace.max_entries = FTRACE_STACK_MAX_ENTRIES;
>
> - if (regs)
> + if (ri)
> + kretprobe_save_stack_trace(ri, &trace);
> + else if (regs)
> save_stack_trace_regs(regs, &trace);
> else
> save_stack_trace(&trace);
> @@ -2653,7 +2657,10 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
> else {
> trace.max_entries = FTRACE_STACK_ENTRIES;
> trace.entries = entry->caller;
> - if (regs)
> +
> + if (ri)
> + kretprobe_save_stack_trace(ri, &trace);
> + else if (regs)
> save_stack_trace_regs(regs, &trace);
> else
> save_stack_trace(&trace);
> diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_stacktrace.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_stacktrace.tc
> new file mode 100644
> index 000000000000..03146c6a1a3c
> --- /dev/null
> +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_stacktrace.tc
> @@ -0,0 +1,25 @@
> +#!/bin/sh
> +# SPDX-License-Identifier: GPL-2.0+
> +# description: Kretprobe dynamic event with a stacktrace
> +
> +[ -f kprobe_events ] || exit_unsupported # this is configurable
> +
> +echo 0 > events/enable
> +echo 1 > options/stacktrace
> +
> +echo 'r:teststackprobe sched_fork $retval' > kprobe_events
> +grep teststackprobe kprobe_events
> +test -d events/kprobes/teststackprobe
> +
> +clear_trace
> +echo 1 > events/kprobes/teststackprobe/enable
> +( echo "forked")
> +echo 0 > events/kprobes/teststackprobe/enable
> +
> +# Make sure we don't see kretprobe_trampoline and we see _do_fork.
> +! grep 'kretprobe' trace
> +grep '_do_fork' trace
> +
> +echo '-:teststackprobe' >> kprobe_events
> +clear_trace
> +test -d events/kprobes/teststackprobe && exit_fail || exit_pass
^ permalink raw reply
* [PATCH] iwlegacy: fix small typo
From: Yangtao Li @ 2018-11-01 15:40 UTC (permalink / raw)
To: sgruszka, kvalo, davem; +Cc: linux-wireless, netdev, linux-kernel, Yangtao Li
Signed-off-by: Yangtao Li <tiny.windzz@gmail.com>
---
drivers/net/wireless/intel/iwlegacy/3945-rs.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/net/wireless/intel/iwlegacy/3945-rs.c b/drivers/net/wireless/intel/iwlegacy/3945-rs.c
index e8983c6a2b7b..a697edd46e7f 100644
--- a/drivers/net/wireless/intel/iwlegacy/3945-rs.c
+++ b/drivers/net/wireless/intel/iwlegacy/3945-rs.c
@@ -781,7 +781,7 @@ il3945_rs_get_rate(void *il_r, struct ieee80211_sta *sta, void *il_sta,
switch (scale_action) {
case -1:
- /* Decrese rate */
+ /* Decrease rate */
if (low != RATE_INVALID)
idx = low;
break;
--
2.17.0
^ permalink raw reply related
* Re: Kernel 4.19 network performance - forwarding/routing normal users traffic
From: Aaron Lu @ 2018-11-01 15:27 UTC (permalink / raw)
To: Jesper Dangaard Brouer
Cc: Paweł Staszewski, Eric Dumazet, netdev, Tariq Toukan,
Ilias Apalodimas, Yoel Caspersen, Mel Gorman
In-Reply-To: <20181101102213.2fa2643d@redhat.com>
On Thu, Nov 01, 2018 at 10:22:13AM +0100, Jesper Dangaard Brouer wrote:
... ...
> Section copied out:
>
> mlx5e_poll_tx_cq
> |
> --16.34%--napi_consume_skb
> |
> |--12.65%--__free_pages_ok
> | |
> | --11.86%--free_one_page
> | |
> | |--10.10%--queued_spin_lock_slowpath
> | |
> | --0.65%--_raw_spin_lock
This callchain looks like it is freeing higher order pages than order 0:
__free_pages_ok is only called for pages whose order are bigger than 0.
> |
> |--1.55%--page_frag_free
> |
> --1.44%--skb_release_data
>
>
> Let me explain what (I think) happens. The mlx5 driver RX-page recycle
> mechanism is not effective in this workload, and pages have to go
> through the page allocator. The lock contention happens during mlx5
> DMA TX completion cycle. And the page allocator cannot keep up at
> these speeds.
>
> One solution is extend page allocator with a bulk free API. (This have
> been on my TODO list for a long time, but I don't have a
> micro-benchmark that trick the driver page-recycle to fail). It should
> fit nicely, as I can see that kmem_cache_free_bulk() does get
> activated (bulk freeing SKBs), which means that DMA TX completion do
> have a bulk of packets.
>
> We can (and should) also improve the page recycle scheme in the driver.
> After LPC, I have a project with Tariq and Ilias (Cc'ed) to improve the
> page_pool, and we will (attempt) to generalize this, for both high-end
> mlx5 and more low-end ARM64-boards (macchiatobin and espressobin).
>
> The MM-people is in parallel working to improve the performance of
> order-0 page returns. Thus, the explicit page bulk free API might
> actually become less important. I actually think (Cc.) Aaron have a
> patchset he would like you to test, which removes the (zone->)lock
> you hit in free_one_page().
Thanks Jesper.
Yes, the said patchset is in this branch:
https://github.com/aaronlu/linux no_merge_cluster_alloc_4.19-rc5
But as I said above, I think the lock contention here is for
order > 0 pages so my current patchset will not work here, unfortunately.
BTW, Mel Gorman has suggested an alternative way to improve page
allocator's scalability and I'm working on it right now, it will
improve page allocator's scalability for all order pages. I might be
able to post it some time next week, will CC all of you when it's ready.
^ permalink raw reply
* Re: [PATCH 12/31] netfilter: cttimeout: remove superfluous check on layer 4 netlink functions
From: Eric Dumazet @ 2018-11-01 14:57 UTC (permalink / raw)
To: Pablo Neira Ayuso, netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20181008230125.2330-13-pablo@netfilter.org>
On 10/08/2018 04:01 PM, Pablo Neira Ayuso wrote:
> We assume they are always set accordingly since a874752a10da
> ("netfilter: conntrack: timeout interface depend on
> CONFIG_NF_CONNTRACK_TIMEOUT"), so we can get rid of this checks.
>
> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
> ---
> net/netfilter/nfnetlink_cttimeout.c | 48 ++++++++++++++-----------------------
> net/netfilter/nft_ct.c | 3 ---
> 2 files changed, 18 insertions(+), 33 deletions(-)
>
> diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c
> index a30f8ba4b89a..6ca0df7f416f 100644
> --- a/net/netfilter/nfnetlink_cttimeout.c
> +++ b/net/netfilter/nfnetlink_cttimeout.c
> @@ -53,9 +53,6 @@ ctnl_timeout_parse_policy(void *timeout,
> struct nlattr **tb;
> int ret = 0;
>
> - if (!l4proto->ctnl_timeout.nlattr_to_obj)
> - return 0;
> -
> tb = kcalloc(l4proto->ctnl_timeout.nlattr_max + 1, sizeof(*tb),
> GFP_KERNEL);
>
> @@ -167,6 +164,8 @@ ctnl_timeout_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
> struct nfgenmsg *nfmsg;
> unsigned int flags = portid ? NLM_F_MULTI : 0;
> const struct nf_conntrack_l4proto *l4proto = timeout->timeout.l4proto;
> + struct nlattr *nest_parms;
> + int ret;
>
> event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_TIMEOUT, event);
> nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
> @@ -186,22 +185,15 @@ ctnl_timeout_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
> htonl(refcount_read(&timeout->refcnt))))
> goto nla_put_failure;
>
> - if (likely(l4proto->ctnl_timeout.obj_to_nlattr)) {
> - struct nlattr *nest_parms;
> - int ret;
> -
> - nest_parms = nla_nest_start(skb,
> - CTA_TIMEOUT_DATA | NLA_F_NESTED);
> - if (!nest_parms)
> - goto nla_put_failure;
> + nest_parms = nla_nest_start(skb, CTA_TIMEOUT_DATA | NLA_F_NESTED);
> + if (!nest_parms)
> + goto nla_put_failure;
>
> - ret = l4proto->ctnl_timeout.obj_to_nlattr(skb,
> - &timeout->timeout.data);
> - if (ret < 0)
> - goto nla_put_failure;
> + ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, &timeout->timeout.data);
> + if (ret < 0)
> + goto nla_put_failure;
>
> - nla_nest_end(skb, nest_parms);
> - }
> + nla_nest_end(skb, nest_parms);
>
> nlmsg_end(skb, nlh);
> return skb->len;
> @@ -397,6 +389,8 @@ cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid,
> struct nlmsghdr *nlh;
> struct nfgenmsg *nfmsg;
> unsigned int flags = portid ? NLM_F_MULTI : 0;
> + struct nlattr *nest_parms;
> + int ret;
>
> event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_TIMEOUT, event);
> nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
> @@ -412,21 +406,15 @@ cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid,
> nla_put_u8(skb, CTA_TIMEOUT_L4PROTO, l4proto->l4proto))
> goto nla_put_failure;
>
> - if (likely(l4proto->ctnl_timeout.obj_to_nlattr)) {
> - struct nlattr *nest_parms;
> - int ret;
> -
> - nest_parms = nla_nest_start(skb,
> - CTA_TIMEOUT_DATA | NLA_F_NESTED);
> - if (!nest_parms)
> - goto nla_put_failure;
> + nest_parms = nla_nest_start(skb, CTA_TIMEOUT_DATA | NLA_F_NESTED);
> + if (!nest_parms)
> + goto nla_put_failure;
>
> - ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, NULL);
> - if (ret < 0)
> - goto nla_put_failure;
> + ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, NULL);
Hi Pablo
None of the obj_to_nlattr handlers can handle a NULL pointer.
What is the intent here ?
netlink: 24 bytes leftover after parsing attributes in process `syz-executor2'.
kasan: CONFIG_KASAN_INLINE enabled
kasan: GPF could be caused by NULL-ptr deref or user memory access
general protection fault: 0000 [#1] PREEMPT SMP KASAN
CPU: 0 PID: 9575 Comm: syz-executor1 Not tainted 4.19.0+ #312
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
RIP: 0010:icmp_timeout_obj_to_nlattr+0x77/0x170 net/netfilter/nf_conntrack_proto_icmp.c:297
kobject: 'loop5' (00000000d8ff612b): kobject_uevent_env
Code: b5 41 c7 00 f1 f1 f1 f1 c7 40 04 04 f2 f2 f2 65 48 8b 04 25 28 00 00 00 48 89 45 d0 31 c0 e8 c0 06 26 fb 4c 89 e8 48 c1 e8 03 <42> 0f b6 14 38 4c 89 e8 83 e0 07 83 c0 03 38 d0 7c 08 84 d2 0f 85
kobject: 'loop5' (00000000d8ff612b): fill_kobj_path: path = '/devices/virtual/block/loop5'
RSP: 0018:ffff88017def7220 EFLAGS: 00010246
RAX: 0000000000000000 RBX: 1ffff1002fbdee45 RCX: ffffc90004561000
RDX: 000000000000064f RSI: ffffffff865970c0 RDI: ffff8801ba602bc0
RBP: ffff88017def72b0 R08: ffff8801879e2400 R09: ffff880188d264a8
R10: ffffed00311a4c94 R11: ffff880188d264a0 R12: ffff8801ba602bc0
R13: 0000000000000000 R14: ffff88017def7288 R15: dffffc0000000000
FS: 00007fb495a6f700(0000) GS:ffff8801dae00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
kobject: 'loop3' (00000000bc48af2d): kobject_uevent_env
CR2: 00007f4879a55000 CR3: 00000001b7b96000 CR4: 00000000001406f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
kobject: 'loop3' (00000000bc48af2d): fill_kobj_path: path = '/devices/virtual/block/loop3'
cttimeout_default_fill_info net/netfilter/nfnetlink_cttimeout.c:411 [inline]
cttimeout_default_get+0x644/0x9e0 net/netfilter/nfnetlink_cttimeout.c:457
nfnetlink_rcv_msg+0xdd3/0x10c0 net/netfilter/nfnetlink.c:228
kobject: 'loop3' (00000000bc48af2d): kobject_uevent_env
kobject: 'loop3' (00000000bc48af2d): fill_kobj_path: path = '/devices/virtual/block/loop3'
netlink_rcv_skb+0x172/0x440 net/netlink/af_netlink.c:2477
nfnetlink_rcv+0x1c0/0x4d0 net/netfilter/nfnetlink.c:560
netlink_unicast_kernel net/netlink/af_netlink.c:1310 [inline]
netlink_unicast+0x5a5/0x760 net/netlink/af_netlink.c:1336
netlink_sendmsg+0xa18/0xfc0 net/netlink/af_netlink.c:1917
sock_sendmsg_nosec net/socket.c:621 [inline]
sock_sendmsg+0xd5/0x120 net/socket.c:631
___sys_sendmsg+0x7fd/0x930 net/socket.c:2116
__sys_sendmsg+0x11d/0x280 net/socket.c:2154
__do_sys_sendmsg net/socket.c:2163 [inline]
__se_sys_sendmsg net/socket.c:2161 [inline]
__x64_sys_sendmsg+0x78/0xb0 net/socket.c:2161
do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290
> + if (ret < 0)
> + goto nla_put_failure;
>
> - nla_nest_end(skb, nest_parms);
> - }
> + nla_nest_end(skb, nest_parms);
>
> nlmsg_end(skb, nlh);
> return skb->len;
> diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
> index 5dd87748afa8..17ae5059c312 100644
> --- a/net/netfilter/nft_ct.c
> +++ b/net/netfilter/nft_ct.c
> @@ -776,9 +776,6 @@ nft_ct_timeout_parse_policy(void *timeouts,
> struct nlattr **tb;
> int ret = 0;
>
> - if (!l4proto->ctnl_timeout.nlattr_to_obj)
> - return 0;
> -
> tb = kcalloc(l4proto->ctnl_timeout.nlattr_max + 1, sizeof(*tb),
> GFP_KERNEL);
>
>
^ permalink raw reply
* Re: [PULL] vhost: cleanups and fixes
From: Michael S. Tsirkin @ 2018-11-01 23:55 UTC (permalink / raw)
To: Linus Torvalds
Cc: Kees Cook, kvm, virtualization, netdev, Linux Kernel Mailing List,
Andrew Morton, bijan.mottahedeh, gedwards, joe, lenaic,
liang.z.li, mhocko, mhocko, stefanha, wei.w.wang
In-Reply-To: <CAHk-=wicvws38JqzVF6oNEZ0jGzP6RecR6yAGtyzX3AkoJ321g@mail.gmail.com>
On Thu, Nov 01, 2018 at 04:06:19PM -0700, Linus Torvalds wrote:
> On Thu, Nov 1, 2018 at 4:00 PM Kees Cook <keescook@chromium.org> wrote:
> >
> > + memset(&rsp, 0, sizeof(rsp));
> > + rsp.response = VIRTIO_SCSI_S_FUNCTION_REJECTED;
> > + resp = vq->iov[out].iov_base;
> > + ret = __copy_to_user(resp, &rsp, sizeof(rsp));
> >
> > Is it actually safe to trust that iov_base has passed an earlier
> > access_ok() check here? Why not just use copy_to_user() instead?
>
> Good point.
>
> We really should have removed those double-underscore things ages ago.
Well in case of vhost there are a bunch of reasons to keep them.
One is that all access_ok checks take place
way earlier in context of the owner task. Result is saved
and then used for access repeatedly. Skipping reding access_ok twice
did seem to give a small speedup in the past.
In fact I am looking
into switching some of the uses to unsafe_put_user/unsafe_get_user
after doing something like barrier_nospec after the
access_ok checks. Seems to give a measureable speedup.
Another is that the double underscore things actually can be done
in softirq context if you do preempt_disable/preempt_enable.
IIUC Jason's looking into using that to cut down the latency
for when the access is very small.
> Also, apart from the address, what about the size? Wouldn't it be
> better to use copy_to_iter() rather than implement it badly by hand?
>
> Linus
Generally size is checked when we retrieve the iov but I will recheck
this case and reply here.
^ permalink raw reply
* Re: [PULL] vhost: cleanups and fixes
From: Michael S. Tsirkin @ 2018-11-01 23:38 UTC (permalink / raw)
To: Kees Cook
Cc: lenaic, Michal Hocko, bijan.mottahedeh, KVM, Network Development,
liang.z.li, LKML, virtualization, Stefan Hajnoczi, Joe Perches,
Andrew Morton, Michal Hocko, Linus Torvalds
In-Reply-To: <CAGXu5jJ0HgV2qN=wohEgro6ixqXHOHBTsvS5a9Dcpz8gxVo3bA@mail.gmail.com>
On Thu, Nov 01, 2018 at 04:00:23PM -0700, Kees Cook wrote:
> On Thu, Nov 1, 2018 at 2:19 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
> > The following changes since commit 84df9525b0c27f3ebc2ebb1864fa62a97fdedb7d:
> >
> > Linux 4.19 (2018-10-22 07:37:37 +0100)
> >
> > are available in the Git repository at:
> >
> > git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git tags/for_linus
> >
> > for you to fetch changes up to 79f800b2e76923cd8ce0aa659cb5c019d9643bc9:
> >
> > MAINTAINERS: remove reference to bogus vsock file (2018-10-24 21:16:14 -0400)
> >
> > ----------------------------------------------------------------
> > virtio, vhost: fixes, tweaks
> >
> > virtio balloon page hinting support
> > vhost scsi control queue
> >
> > misc fixes.
> >
> > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> >
> > ----------------------------------------------------------------
> > Bijan Mottahedeh (3):
> > vhost/scsi: Respond to control queue operations
>
> +static void
> +vhost_scsi_send_tmf_resp(struct vhost_scsi *vs,
> + struct vhost_virtqueue *vq,
> + int head, unsigned int out)
> +{
> + struct virtio_scsi_ctrl_tmf_resp __user *resp;
> + struct virtio_scsi_ctrl_tmf_resp rsp;
> + int ret;
> +
> + pr_debug("%s\n", __func__);
> + memset(&rsp, 0, sizeof(rsp));
> + rsp.response = VIRTIO_SCSI_S_FUNCTION_REJECTED;
> + resp = vq->iov[out].iov_base;
> + ret = __copy_to_user(resp, &rsp, sizeof(rsp));
>
> Is it actually safe to trust that iov_base has passed an earlier
> access_ok() check here? Why not just use copy_to_user() instead?
>
> -Kees
I am not sure copy_to_user will do the right thing here, because all
this runs in context of a kernel thread. We do need access_ok which
takes place way earlier in context of the task.
Another reason it is safe is because the address is not
coming from userspace at all.
> > vhost/scsi: Extract common handling code from control queue handler
> > vhost/scsi: Use common handling code in request queue handler
> >
> > Greg Edwards (1):
> > vhost/scsi: truncate T10 PI iov_iter to prot_bytes
> >
> > Lénaïc Huard (1):
> > kvm_config: add CONFIG_VIRTIO_MENU
> >
> > Stefan Hajnoczi (1):
> > MAINTAINERS: remove reference to bogus vsock file
> >
> > Wei Wang (3):
> > virtio-balloon: VIRTIO_BALLOON_F_FREE_PAGE_HINT
> > mm/page_poison: expose page_poisoning_enabled to kernel modules
> > virtio-balloon: VIRTIO_BALLOON_F_PAGE_POISON
> >
> > MAINTAINERS | 1 -
> > drivers/vhost/scsi.c | 426 ++++++++++++++++++++++++++++--------
> > drivers/virtio/virtio_balloon.c | 380 +++++++++++++++++++++++++++++---
> > include/uapi/linux/virtio_balloon.h | 8 +
> > kernel/configs/kvm_guest.config | 1 +
> > mm/page_poison.c | 6 +
> > 6 files changed, 688 insertions(+), 134 deletions(-)
>
>
>
> --
> Kees Cook
^ permalink raw reply
* Re: [net 1/8] igb: shorten maximum PHC timecounter update interval
From: Miroslav Lichvar @ 2018-11-01 14:12 UTC (permalink / raw)
To: Jeff Kirsher
Cc: davem, netdev, nhorman, sassmann, Jacob Keller, Richard Cochran
In-Reply-To: <20181031194254.16417-2-jeffrey.t.kirsher@intel.com>
On Wed, Oct 31, 2018 at 12:42:47PM -0700, Jeff Kirsher wrote:
> From: Miroslav Lichvar <mlichvar@redhat.com>
>
> The timecounter needs to be updated at least once per ~550 seconds in
> order to avoid a 40-bit SYSTIM timestamp to be misinterpreted as an old
> timestamp.
>
> Since commit 500462a9d ("timers: Switch to a non-cascading wheel"),
> scheduling of delayed work seems to be less accurate and a requested
> delay of 540 seconds may actually be longer than 550 seconds. Shorten
> the delay to 480 seconds to be sure the timecounter is updated in time.
It looks like this is the v1 of the patch. There was a v2 I sent on
Oct 26, which made the interval even shorter. I can send a separate
patch for that change.
--
Miroslav Lichvar
^ permalink raw reply
* Re: [PULL] vhost: cleanups and fixes
From: Linus Torvalds @ 2018-11-01 23:06 UTC (permalink / raw)
To: Kees Cook
Cc: mst, kvm, virtualization, netdev, Linux Kernel Mailing List,
Andrew Morton, bijan.mottahedeh, gedwards, joe, lenaic,
liang.z.li, mhocko, mhocko, stefanha, wei.w.wang
In-Reply-To: <CAGXu5jJ0HgV2qN=wohEgro6ixqXHOHBTsvS5a9Dcpz8gxVo3bA@mail.gmail.com>
On Thu, Nov 1, 2018 at 4:00 PM Kees Cook <keescook@chromium.org> wrote:
>
> + memset(&rsp, 0, sizeof(rsp));
> + rsp.response = VIRTIO_SCSI_S_FUNCTION_REJECTED;
> + resp = vq->iov[out].iov_base;
> + ret = __copy_to_user(resp, &rsp, sizeof(rsp));
>
> Is it actually safe to trust that iov_base has passed an earlier
> access_ok() check here? Why not just use copy_to_user() instead?
Good point.
We really should have removed those double-underscore things ages ago.
Also, apart from the address, what about the size? Wouldn't it be
better to use copy_to_iter() rather than implement it badly by hand?
Linus
^ permalink raw reply
* [PATCH net 2/2] net: systemport: Protect stop from timeout
From: Florian Fainelli @ 2018-11-01 22:55 UTC (permalink / raw)
To: netdev; +Cc: jaedon.shin, opendmb, Florian Fainelli, David S. Miller,
open list
In-Reply-To: <20181101225538.18632-1-f.fainelli@gmail.com>
A timing hazard exists when the network interface is stopped that
allows a watchdog timeout to be processed by a separate core in
parallel. This creates the potential for the timeout handler to
wake the queues while the driver is shutting down, or access
registers after their clocks have been removed.
The more common case is that the watchdog timeout will produce a
warning message which doesn't lead to a crash. The chances of this
are greatly increased by the fact that bcm_sysport_netif_stop stops
the transmit queues which can easily precipitate a watchdog time-
out because of stale trans_start data in the queues.
This commit corrects the behavior by ensuring that the watchdog
timeout is disabled before enterring bcm_sysport_netif_stop. There
are currently only two users of the bcm_sysport_netif_stop function:
close and suspend.
The close case already handles the issue by exiting the RUNNING
state before invoking the driver close service.
The suspend case now performs the netif_device_detach to exit the
PRESENT state before the call to bcm_sysport_netif_stop rather than
after it.
These behaviors prevent any future scheduling of the driver timeout
service during the window. The netif_tx_stop_all_queues function
in bcm_sysport_netif_stop is replaced with netif_tx_disable to ensure
synchronization with any transmit or timeout threads that may
already be executing on other cores.
For symmetry, the netif_device_attach call upon resume is moved to
after the call to bcm_sysport_netif_start. Since it wakes the transmit
queues it is not necessary to invoke netif_tx_start_all_queues from
bcm_sysport_netif_start so it is moved into the driver open service.
Fixes: 40755a0fce17 ("net: systemport: add suspend and resume support")
Fixes: 80105befdb4b ("net: systemport: add Broadcom SYSTEMPORT Ethernet MAC driver")
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
drivers/net/ethernet/broadcom/bcmsysport.c | 15 +++++++--------
1 file changed, 7 insertions(+), 8 deletions(-)
diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c
index 4122553e224b..0e2d99c737e3 100644
--- a/drivers/net/ethernet/broadcom/bcmsysport.c
+++ b/drivers/net/ethernet/broadcom/bcmsysport.c
@@ -1902,9 +1902,6 @@ static void bcm_sysport_netif_start(struct net_device *dev)
intrl2_1_mask_clear(priv, 0xffffffff);
else
intrl2_0_mask_clear(priv, INTRL2_0_TDMA_MBDONE_MASK);
-
- /* Last call before we start the real business */
- netif_tx_start_all_queues(dev);
}
static void rbuf_init(struct bcm_sysport_priv *priv)
@@ -2048,6 +2045,8 @@ static int bcm_sysport_open(struct net_device *dev)
bcm_sysport_netif_start(dev);
+ netif_tx_start_all_queues(dev);
+
return 0;
out_clear_rx_int:
@@ -2071,7 +2070,7 @@ static void bcm_sysport_netif_stop(struct net_device *dev)
struct bcm_sysport_priv *priv = netdev_priv(dev);
/* stop all software from updating hardware */
- netif_tx_stop_all_queues(dev);
+ netif_tx_disable(dev);
napi_disable(&priv->napi);
cancel_work_sync(&priv->dim.dim.work);
phy_stop(dev->phydev);
@@ -2658,12 +2657,12 @@ static int __maybe_unused bcm_sysport_suspend(struct device *d)
if (!netif_running(dev))
return 0;
+ netif_device_detach(dev);
+
bcm_sysport_netif_stop(dev);
phy_suspend(dev->phydev);
- netif_device_detach(dev);
-
/* Disable UniMAC RX */
umac_enable_set(priv, CMD_RX_EN, 0);
@@ -2746,8 +2745,6 @@ static int __maybe_unused bcm_sysport_resume(struct device *d)
goto out_free_rx_ring;
}
- netif_device_attach(dev);
-
/* RX pipe enable */
topctrl_writel(priv, 0, RX_FLUSH_CNTL);
@@ -2788,6 +2785,8 @@ static int __maybe_unused bcm_sysport_resume(struct device *d)
bcm_sysport_netif_start(dev);
+ netif_device_attach(dev);
+
return 0;
out_free_rx_ring:
--
2.17.1
^ permalink raw reply related
* [PATCH net 1/2] net: bcmgenet: protect stop from timeout
From: Florian Fainelli @ 2018-11-01 22:55 UTC (permalink / raw)
To: netdev; +Cc: jaedon.shin, opendmb, Florian Fainelli, David S. Miller,
open list
In-Reply-To: <20181101225538.18632-1-f.fainelli@gmail.com>
From: Doug Berger <opendmb@gmail.com>
A timing hazard exists when the network interface is stopped that
allows a watchdog timeout to be processed by a separate core in
parallel. This creates the potential for the timeout handler to
wake the queues while the driver is shutting down, or access
registers after their clocks have been removed.
The more common case is that the watchdog timeout will produce a
warning message which doesn't lead to a crash. The chances of this
are greatly increased by the fact that bcmgenet_netif_stop stops
the transmit queues which can easily precipitate a watchdog time-
out because of stale trans_start data in the queues.
This commit corrects the behavior by ensuring that the watchdog
timeout is disabled before enterring bcmgenet_netif_stop. There
are currently only two users of the bcmgenet_netif_stop function:
close and suspend.
The close case already handles the issue by exiting the RUNNING
state before invoking the driver close service.
The suspend case now performs the netif_device_detach to exit the
PRESENT state before the call to bcmgenet_netif_stop rather than
after it.
These behaviors prevent any future scheduling of the driver timeout
service during the window. The netif_tx_stop_all_queues function
in bcmgenet_netif_stop is replaced with netif_tx_disable to ensure
synchronization with any transmit or timeout threads that may
already be executing on other cores.
For symmetry, the netif_device_attach call upon resume is moved to
after the call to bcmgenet_netif_start. Since it wakes the transmit
queues it is not necessary to invoke netif_tx_start_all_queues from
bcmgenet_netif_start so it is moved into the driver open service.
Fixes: 1c1008c793fa ("net: bcmgenet: add main driver file")
Signed-off-by: Doug Berger <opendmb@gmail.com>
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
drivers/net/ethernet/broadcom/genet/bcmgenet.c | 13 +++++++------
1 file changed, 7 insertions(+), 6 deletions(-)
diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
index 20c1681bb1af..2d6f090bf644 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
@@ -2855,7 +2855,6 @@ static void bcmgenet_netif_start(struct net_device *dev)
umac_enable_set(priv, CMD_TX_EN | CMD_RX_EN, true);
- netif_tx_start_all_queues(dev);
bcmgenet_enable_tx_napi(priv);
/* Monitor link interrupts now */
@@ -2937,6 +2936,8 @@ static int bcmgenet_open(struct net_device *dev)
bcmgenet_netif_start(dev);
+ netif_tx_start_all_queues(dev);
+
return 0;
err_irq1:
@@ -2958,7 +2959,7 @@ static void bcmgenet_netif_stop(struct net_device *dev)
struct bcmgenet_priv *priv = netdev_priv(dev);
bcmgenet_disable_tx_napi(priv);
- netif_tx_stop_all_queues(dev);
+ netif_tx_disable(dev);
/* Disable MAC receive */
umac_enable_set(priv, CMD_RX_EN, false);
@@ -3620,13 +3621,13 @@ static int bcmgenet_suspend(struct device *d)
if (!netif_running(dev))
return 0;
+ netif_device_detach(dev);
+
bcmgenet_netif_stop(dev);
if (!device_may_wakeup(d))
phy_suspend(dev->phydev);
- netif_device_detach(dev);
-
/* Prepare the device for Wake-on-LAN and switch to the slow clock */
if (device_may_wakeup(d) && priv->wolopts) {
ret = bcmgenet_power_down(priv, GENET_POWER_WOL_MAGIC);
@@ -3700,8 +3701,6 @@ static int bcmgenet_resume(struct device *d)
/* Always enable ring 16 - descriptor ring */
bcmgenet_enable_dma(priv, dma_ctrl);
- netif_device_attach(dev);
-
if (!device_may_wakeup(d))
phy_resume(dev->phydev);
@@ -3710,6 +3709,8 @@ static int bcmgenet_resume(struct device *d)
bcmgenet_netif_start(dev);
+ netif_device_attach(dev);
+
return 0;
out_clk_disable:
--
2.17.1
^ permalink raw reply related
* Re: Kernel 4.19 network performance - forwarding/routing normal users traffic
From: Paweł Staszewski @ 2018-11-01 13:52 UTC (permalink / raw)
To: Jesper Dangaard Brouer, David Ahern; +Cc: netdev, Yoel Caspersen
In-Reply-To: <20181101115522.10b0dd0a@redhat.com>
W dniu 01.11.2018 o 11:55, Jesper Dangaard Brouer pisze:
> On Wed, 31 Oct 2018 21:37:16 -0600 David Ahern <dsahern@gmail.com> wrote:
>
>> This is mainly a forwarding use case? Seems so based on the perf report.
>> I suspect forwarding with XDP would show pretty good improvement.
> Yes, significant performance improvements.
>
> Notice Davids talk: "Leveraging Kernel Tables with XDP"
> http://vger.kernel.org/lpc-networking2018.html#session-1
It will be rly interesting
> It looks like that you are doing "pure" IP-routing, without any
> iptables conntrack stuff (from your perf report data). That will
> actually be a really good use-case for accelerating this with XDP.
Yes pure IP routing
iptables used only for some local input filtering.
>
> I want you to understand the philosophy behind how David and I want
> people to leverage XDP. Think of XDP as a software offload layer for
> the kernel network stack. Setup and use Linux kernel network stack, but
> accelerate parts of it with XDP, e.g. the route FIB lookup.
>
> Sample code avail here:
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/samples/bpf/xdp_fwd_kern.c
I can try some tests on same hw but testlab configuration - will give it
a try :)
> (I do warn, what we just found a bug/crash in setup+tairdown for the
> mlx5 driver you are using, that we/mlx _will_ fix soon)
Ok
>
>
>> You need the vlan changes I have queued up though.
> I know Yoel will be very interested in those changes too! I've
> convinced Yoel to write an XDP program for his Border Network Gateway
> (BNG) production system[1], and his is a heavy VLAN user. And the plan
> is to Open Source this when he have-something-working.
>
> [1] https://www.version2.dk/blog/software-router-del-5-linux-bng-1086060
Ok - for now i need to split traffic into two separate 100G ports placed
in two different x16 pciexpress slots to check if the problem is mainly
caused by no more pciex x16 bandwidth available.
^ permalink raw reply
* Re: [PATCH net-next 5/6] net/ncsi: Reset channel state in ncsi_start_dev()
From: Samuel Mendoza-Jonas @ 2018-11-01 22:53 UTC (permalink / raw)
To: Justin.Lee1, netdev; +Cc: davem, linux-kernel, openbmc
In-Reply-To: <96158616f0774e3d9fae0e99bb16e605@AUSX13MPS306.AMER.DELL.COM>
On Tue, 2018-10-30 at 21:26 +0000, Justin.Lee1@Dell.com wrote:
> > +int ncsi_reset_dev(struct ncsi_dev *nd)
> > +{
> > + struct ncsi_dev_priv *ndp = TO_NCSI_DEV_PRIV(nd);
> > + struct ncsi_channel *nc, *active;
> > + struct ncsi_package *np;
> > + unsigned long flags;
> > + bool enabled;
> > + int state;
> > +
> > + active = NULL;
> > + NCSI_FOR_EACH_PACKAGE(ndp, np) {
> > + NCSI_FOR_EACH_CHANNEL(np, nc) {
> > + spin_lock_irqsave(&nc->lock, flags);
> > + enabled = nc->monitor.enabled;
> > + state = nc->state;
> > + spin_unlock_irqrestore(&nc->lock, flags);
> > +
> > + if (enabled)
> > + ncsi_stop_channel_monitor(nc);
> > + if (state == NCSI_CHANNEL_ACTIVE) {
> > + active = nc;
> > + break;
>
> Is the original intention to process the channel one by one?
> If it is the case, there are two loops and we might need to use
> "goto found" instead.
Yes we'll need to break out of the package loop here as well.
>
> > + }
> > + }
> > + }
> > +
>
> found: ?
>
> > + if (!active) {
> > + /* Done */
> > + spin_lock_irqsave(&ndp->lock, flags);
> > + ndp->flags &= ~NCSI_DEV_RESET;
> > + spin_unlock_irqrestore(&ndp->lock, flags);
> > + return ncsi_choose_active_channel(ndp);
> > + }
> > +
> > + spin_lock_irqsave(&ndp->lock, flags);
> > + ndp->flags |= NCSI_DEV_RESET;
> > + ndp->active_channel = active;
> > + ndp->active_package = active->package;
> > + spin_unlock_irqrestore(&ndp->lock, flags);
> > +
> > + nd->state = ncsi_dev_state_suspend;
> > + schedule_work(&ndp->work);
> > + return 0;
> > +}
>
> Also similar issue in ncsi_choose_active_channel() function below.
>
> > @@ -916,32 +1045,49 @@ static int ncsi_choose_active_channel(struct ncsi_dev_priv *ndp)
> >
> > ncm = &nc->modes[NCSI_MODE_LINK];
> > if (ncm->data[2] & 0x1) {
> > - spin_unlock_irqrestore(&nc->lock, flags);
> > found = nc;
> > - goto out;
> > + with_link = true;
> > }
> >
> > - spin_unlock_irqrestore(&nc->lock, flags);
> > + /* If multi_channel is enabled configure all valid
> > + * channels whether or not they currently have link
> > + * so they will have AENs enabled.
> > + */
> > + if (with_link || np->multi_channel) {
>
> I notice that there is a case that we will misconfigure the interface.
> For example below, multi-channel is not enable for package 1.
> But we enable the channel for ncsi2 below (package 1 channel 0) as that interface is the first
> channel for that package with link.
I don't think I see the issue here; multi-channel is not set on package
1, but both channels are in the channel whitelist. Channel 0 is
configured since it's the first found on package 1, and channel 1 is not
since channel 0 is already found. Are you expecting something different?
>
> cat /sys/kernel/debug/ncsi_protocol/ncsi_device_
> IFIDX IFNAME NAME PID CID RX TX MP MC WP WC PC CS PS LS RU CR NQ HA
> =====================================================================
> 2 eth2 ncsi0 000 000 1 1 1 1 1 1 0 2 1 1 1 1 0 1
> 2 eth2 ncsi1 000 001 1 0 1 1 1 1 0 2 1 1 1 1 0 1
> 2 eth2 ncsi2 001 000 1 0 1 0 1 1 0 2 1 1 1 1 0 1
> 2 eth2 ncsi3 001 001 0 0 1 0 1 1 0 1 0 1 1 1 0 1
> =====================================================================
> MP: Multi-mode Package WP: Whitelist Package
> MC: Multi-mode Channel WC: Whitelist Channel
> PC: Primary Channel CS: Channel State IA/A/IV 1/2/3
> PS: Poll Status LS: Link Status
> RU: Running CR: Carrier OK
> NQ: Queue Stopped HA: Hardware Arbitration
>
> I temporally change to the following to avoid that.
> if ((with_link &&
> !np->multi_channel &&
> list_empty(&ndp->channel_queue)) || np->multi_channel) {
>
> > + spin_lock_irqsave(&ndp->lock, flags);
> > + list_add_tail_rcu(&nc->link,
> > + &ndp->channel_queue);
> > + spin_unlock_irqrestore(&ndp->lock, flags);
> > +
> > + netdev_dbg(ndp->ndev.dev,
> > + "NCSI: Channel %u added to queue (link %s)\n",
> > + nc->id,
> > + ncm->data[2] & 0x1 ? "up" : "down");
> > + }
> > +
> > + spin_unlock_irqrestore(&nc->lock, cflags);
> > +
> > + if (with_link && !np->multi_channel)
> > + break;
>
> Similar issue here. As we are using break, so each package will configure one active TX.
>
I believe this is handled properly in ncsi_channel_is_tx() in the most
recent revision.
> > }
> > + if (with_link && !ndp->multi_package)
> > + break;
> > }
> >
> > - if (!found) {
> > + if (list_empty(&ndp->channel_queue) && found) {
> > + netdev_info(ndp->ndev.dev,
> > + "NCSI: No channel with link found, configuring channel %u\n",
> > + found->id);
> > + spin_lock_irqsave(&ndp->lock, flags);
> > + list_add_tail_rcu(&found->link, &ndp->channel_queue);
> > + spin_unlock_irqrestore(&ndp->lock, flags);
> > + } else if (!found) {
> > netdev_warn(ndp->ndev.dev,
> > - "NCSI: No channel found with link\n");
> > + "NCSI: No channel found to configure!\n");
> > ncsi_report_link(ndp, true);
> > return -ENODEV;
> > }
>
> Also, for deselect package handler function, do we want to set to inactive here?
> If we just change the state, the cached data still keeps the old value. If the new
> ncsi_reset_dev() function is handling one by one, can we skip this part?
Technically yes we could skip the state change here since
ncsi_reset_dev() will have already done it. However if we send a DP
command via some other means then it is probably best to ensure we treat
all channels on that package as inactive.
>
> static int ncsi_rsp_handler_dp(struct ncsi_request *nr)
> {
> struct ncsi_rsp_pkt *rsp;
> struct ncsi_dev_priv *ndp = nr->ndp;
> struct ncsi_package *np;
> struct ncsi_channel *nc;
> unsigned long flags;
>
> /* Find the package */
> rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp);
> ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel,
> &np, NULL);
> if (!np)
> return -ENODEV;
>
> /* Change state of all channels attached to the package */
> NCSI_FOR_EACH_CHANNEL(np, nc) {
> spin_lock_irqsave(&nc->lock, flags);
> nc->state = NCSI_CHANNEL_INACTIVE;
>
> spin_unlock_irqrestore(&nc->lock, flags);
> }
>
> return 0;
> }
>
>
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox