* RE: [PATCH v2] hv_netvsc: Add per-cpu ethtool stats for netvsc
From: Yidong Ren @ 2018-06-13 22:03 UTC (permalink / raw)
To: Stephen Hemminger, Yidong Ren
Cc: Stephen Hemminger, netdev@vger.kernel.org, Haiyang Zhang,
linux-kernel@vger.kernel.org, devel@linuxdriverproject.org,
David S. Miller
In-Reply-To: <20180613144813.708f26bf@xeon-e3>
> From: devel <driverdev-devel-bounces@linuxdriverproject.org> On Behalf
> Of Stephen Hemminger
> > +/* statistics per queue (rx/tx packets/bytes) */ #define
> > +NETVSC_PCPU_STATS_LEN (num_present_cpus() *
> ARRAY_SIZE(pcpu_stats))
>
> Even though Hyper-V/Azure does not support hot plug cpu's it might be
> better to num_cpu_possible to avoid any possible future surprises.
That will create a very long output (num_cpu_possible = 128 on my machine) for ethtool,
While doesn't provide additional info.
num_present_cpus() would cause problem only if someone removed cpu
between netvsc_get_sset_count() and netvsc_get_strings() and netvsc_get_ethtool_stats().
An alternative way could be: Check all stats, and only output if not zero.
This need to be done in two pass. First pass to get the correct count, second pass to print the number.
Is there an elegant way to do this?
^ permalink raw reply
* Re: [PATCH v2] hv_netvsc: Add per-cpu ethtool stats for netvsc
From: Stephen Hemminger @ 2018-06-13 21:48 UTC (permalink / raw)
To: Yidong Ren
Cc: K. Y. Srinivasan, Haiyang Zhang, Stephen Hemminger,
David S. Miller, devel, netdev, linux-kernel
In-Reply-To: <20180613193608.444-1-yidren@linuxonhyperv.com>
On Wed, 13 Jun 2018 12:36:08 -0700
Yidong Ren <yidren@linuxonhyperv.com> wrote:
> From: Yidong Ren <yidren@microsoft.com>
>
> This patch implements following ethtool stats fields for netvsc:
> cpu<n>_tx/rx_packets/bytes
> cpu<n>_vf_tx/rx_packets/bytes
>
> Corresponding per-cpu counters exist in current code. Exposing these
> counters will help troubleshooting performance issues.
>
> Signed-off-by: Yidong Ren <yidren@microsoft.com>
> ---
> Changes in v2:
> - Remove cpp style comment
> - Resubmit after freeze
>
> drivers/net/hyperv/hyperv_net.h | 11 +++++
> drivers/net/hyperv/netvsc_drv.c | 104 +++++++++++++++++++++++++++++++++++++++-
> 2 files changed, 113 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
> index 23304ac..c825353 100644
> --- a/drivers/net/hyperv/hyperv_net.h
> +++ b/drivers/net/hyperv/hyperv_net.h
> @@ -873,6 +873,17 @@ struct netvsc_ethtool_stats {
> unsigned long wake_queue;
> };
>
> +struct netvsc_ethtool_pcpu_stats {
> + u64 rx_packets;
> + u64 rx_bytes;
> + u64 tx_packets;
> + u64 tx_bytes;
> + u64 vf_rx_packets;
> + u64 vf_rx_bytes;
> + u64 vf_tx_packets;
> + u64 vf_tx_bytes;
> +};
> +
> struct netvsc_vf_pcpu_stats {
> u64 rx_packets;
> u64 rx_bytes;
> diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
> index 7b18a8c..6803aae 100644
> --- a/drivers/net/hyperv/netvsc_drv.c
> +++ b/drivers/net/hyperv/netvsc_drv.c
> @@ -1105,6 +1105,66 @@ static void netvsc_get_vf_stats(struct net_device *net,
> }
> }
>
> +static void netvsc_get_pcpu_stats(struct net_device *net,
> + struct netvsc_ethtool_pcpu_stats
> + __percpu *pcpu_tot)
> +{
> + struct net_device_context *ndev_ctx = netdev_priv(net);
> + struct netvsc_device *nvdev = rcu_dereference_rtnl(ndev_ctx->nvdev);
> + int i;
> +
> + /* fetch percpu stats of vf */
> + for_each_possible_cpu(i) {
> + const struct netvsc_vf_pcpu_stats *stats =
> + per_cpu_ptr(ndev_ctx->vf_stats, i);
> + struct netvsc_ethtool_pcpu_stats *this_tot =
> + per_cpu_ptr(pcpu_tot, i);
> + unsigned int start;
> +
> + do {
> + start = u64_stats_fetch_begin_irq(&stats->syncp);
> + this_tot->vf_rx_packets = stats->rx_packets;
> + this_tot->vf_tx_packets = stats->tx_packets;
> + this_tot->vf_rx_bytes = stats->rx_bytes;
> + this_tot->vf_tx_bytes = stats->tx_bytes;
> + } while (u64_stats_fetch_retry_irq(&stats->syncp, start));
> + this_tot->rx_packets = this_tot->vf_rx_packets;
> + this_tot->tx_packets = this_tot->vf_tx_packets;
> + this_tot->rx_bytes = this_tot->vf_rx_bytes;
> + this_tot->tx_bytes = this_tot->vf_tx_bytes;
> + }
> +
> + /* fetch percpu stats of netvsc */
> + for (i = 0; i < nvdev->num_chn; i++) {
> + const struct netvsc_channel *nvchan = &nvdev->chan_table[i];
> + const struct netvsc_stats *stats;
> + struct netvsc_ethtool_pcpu_stats *this_tot =
> + per_cpu_ptr(pcpu_tot, nvchan->channel->target_cpu);
> + u64 packets, bytes;
> + unsigned int start;
> +
> + stats = &nvchan->tx_stats;
> + do {
> + start = u64_stats_fetch_begin_irq(&stats->syncp);
> + packets = stats->packets;
> + bytes = stats->bytes;
> + } while (u64_stats_fetch_retry_irq(&stats->syncp, start));
> +
> + this_tot->tx_bytes += bytes;
> + this_tot->tx_packets += packets;
> +
> + stats = &nvchan->rx_stats;
> + do {
> + start = u64_stats_fetch_begin_irq(&stats->syncp);
> + packets = stats->packets;
> + bytes = stats->bytes;
> + } while (u64_stats_fetch_retry_irq(&stats->syncp, start));
> +
> + this_tot->rx_bytes += bytes;
> + this_tot->rx_packets += packets;
> + }
> +}
> +
> static void netvsc_get_stats64(struct net_device *net,
> struct rtnl_link_stats64 *t)
> {
> @@ -1202,6 +1262,23 @@ static const struct {
> { "rx_no_memory", offsetof(struct netvsc_ethtool_stats, rx_no_memory) },
> { "stop_queue", offsetof(struct netvsc_ethtool_stats, stop_queue) },
> { "wake_queue", offsetof(struct netvsc_ethtool_stats, wake_queue) },
> +}, pcpu_stats[] = {
> + { "cpu%u_rx_packets",
> + offsetof(struct netvsc_ethtool_pcpu_stats, rx_packets) },
> + { "cpu%u_rx_bytes",
> + offsetof(struct netvsc_ethtool_pcpu_stats, rx_bytes) },
> + { "cpu%u_tx_packets",
> + offsetof(struct netvsc_ethtool_pcpu_stats, tx_packets) },
> + { "cpu%u_tx_bytes",
> + offsetof(struct netvsc_ethtool_pcpu_stats, tx_bytes) },
> + { "cpu%u_vf_rx_packets",
> + offsetof(struct netvsc_ethtool_pcpu_stats, vf_rx_packets) },
> + { "cpu%u_vf_rx_bytes",
> + offsetof(struct netvsc_ethtool_pcpu_stats, vf_rx_bytes) },
> + { "cpu%u_vf_tx_packets",
> + offsetof(struct netvsc_ethtool_pcpu_stats, vf_tx_packets) },
> + { "cpu%u_vf_tx_bytes",
> + offsetof(struct netvsc_ethtool_pcpu_stats, vf_tx_bytes) },
> }, vf_stats[] = {
> { "vf_rx_packets", offsetof(struct netvsc_vf_pcpu_stats, rx_packets) },
> { "vf_rx_bytes", offsetof(struct netvsc_vf_pcpu_stats, rx_bytes) },
> @@ -1213,6 +1290,9 @@ static const struct {
> #define NETVSC_GLOBAL_STATS_LEN ARRAY_SIZE(netvsc_stats)
> #define NETVSC_VF_STATS_LEN ARRAY_SIZE(vf_stats)
>
> +/* statistics per queue (rx/tx packets/bytes) */
> +#define NETVSC_PCPU_STATS_LEN (num_present_cpus() * ARRAY_SIZE(pcpu_stats))
Even though Hyper-V/Azure does not support hot plug cpu's it might be better
to num_cpu_possible to avoid any possible future surprises.
^ permalink raw reply
* More manual-page fixups.
From: Eric S. Raymond @ 2018-06-13 21:31 UTC (permalink / raw)
To: netdev
[-- Attachment #1: Type: text/plain, Size: 772 bytes --]
John Linville asked me to ship the ethtool.8 patch to this list.
That's the first 0001 patch in the enclosures and should be applied
to the ethtool repo.
The others are more syntax fixups for the iproute2 repo. Some
are things like list syntax errors that you don't notice when
rendering via groff because it has no error checking. Some others
unroll the pseudo-BNF used in some synopses to the standard form
described by DocBook synopsis markup.
There's more work to be done on this. Expect more patches, now
that I know where to send them.
--
<a href="http://www.catb.org/~esr/">Eric S. Raymond</a>
My work is funded by the Internet Civil Engineering Institute: https://icei.org
Please visit their site and donate: the civilization you save might be your own.
[-- Attachment #2: 0001-In-ethtool.8-remove-superfluous-and-incorrect-c.patch --]
[-- Type: text/x-diff, Size: 724 bytes --]
>From 9496a4d44cbc569e15bf656e30bfd2fc0dfc64ce Mon Sep 17 00:00:00 2001
From: "Eric S. Raymond" <esr@thyrsus.com>
Date: Wed, 13 Jun 2018 17:16:21 -0400
Subject: [PATCH] In ethtool.8, remove superfluous and incorrect \c.
Signed-off-by: Eric S. Raymond <esr@thyrsus.com>
---
ethtool.8.in | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/ethtool.8.in b/ethtool.8.in
index 04c53c9..57b2893 100644
--- a/ethtool.8.in
+++ b/ethtool.8.in
@@ -684,7 +684,7 @@ Disable (wake on nothing). This option clears all previous options.
T}
.TE
.TP
-.B sopass \*(MA\c
+.B sopass \*(MA
Sets the SecureOn\[tm] password. The argument to this option must be 6
bytes in Ethernet MAC hex format (\*(MA).
.PP
--
2.17.1
[-- Attachment #3: 0001-Clean-up-markup-in-tc-bpf.8.patch --]
[-- Type: text/x-diff, Size: 15668 bytes --]
>From 15472b7be6c619e5eb35037da70b35d066544838 Mon Sep 17 00:00:00 2001
From: "Eric S. Raymond" <esr@thyrsus.com>
Date: Tue, 12 Jun 2018 22:32:24 -0400
Subject: [PATCH 1/5] Clean up markup in tc-bpf.8
This page had multiple issues:
1. .in +4/.nf....fi/.in was used where .EX/.EE was called for.
2. .SS and running text shouldn't have been used in Synopsis section.
Inline text has been moved to PARAMETERS.
3. Under PATAMETERS, adjacent .SS tags could not be lifted to XML.
.TP is now used in that section instead.
Signed-off-by: Eric S. Raymond <esr@thyrsus.com>
---
man/man8/tc-bpf.8 | 239 +++++++++++++++++++++++-----------------------
1 file changed, 117 insertions(+), 122 deletions(-)
diff --git a/man/man8/tc-bpf.8 b/man/man8/tc-bpf.8
index d311f295..f883a912 100644
--- a/man/man8/tc-bpf.8
+++ b/man/man8/tc-bpf.8
@@ -3,7 +3,6 @@
BPF \- BPF programmable classifier and actions for ingress/egress
queueing disciplines
.SH SYNOPSIS
-.SS eBPF classifier (filter) or action:
.B tc filter ... bpf
[
.B object-file
@@ -28,7 +27,7 @@ POLICE_SPEC ] [
ACTION_SPEC ] [
.B classid
CLASSID ]
-.br
+
.B tc action ... bpf
[
.B object-file
@@ -40,7 +39,6 @@ UDS_FILE ] [
.B verbose
]
-.SS cBPF classifier (filter) or action:
.B tc filter ... bpf
[
.B bytecode-file
@@ -53,7 +51,7 @@ POLICE_SPEC ] [
ACTION_SPEC ] [
.B classid
CLASSID ]
-.br
+
.B tc action ... bpf
[
.B bytecode-file
@@ -110,7 +108,9 @@ are pushed into one map and use another one for dynamically load balancing
traffic based on the determined load, just to provide a few examples.
.SH PARAMETERS
-.SS object-file
+The first pair of filter/action invocations is for eBPF, the second for cBPF.
+.TP
+object-file
points to an object file that has an executable and linkable format (ELF)
and contains eBPF opcodes and eBPF map definitions. The LLVM compiler
infrastructure with
@@ -120,16 +120,16 @@ files that can be passed to the eBPF classifier (more details in the
.B EXAMPLES
section). This option is mandatory when an eBPF classifier or action is
to be loaded.
-
-.SS section
+.TP
+section
is the name of the ELF section from the object file, where the eBPF
classifier or action resides. By default the section name for the
classifier is called "classifier", and for the action "action". Given
that a single object file can contain multiple classifier and actions,
the corresponding section name needs to be specified, if it differs
from the defaults.
-
-.SS export
+.TP
+export
points to a Unix domain socket file. In case the eBPF object file also
contains a section named "maps" with eBPF map specifications, then the
map file descriptors can be handed off via the Unix domain socket to
@@ -139,18 +139,18 @@ import, that uses them for calling into
.B bpf(2)
system call to read out or update eBPF map data from user space, for
example, for monitoring purposes or to push down new policies.
-
-.SS verbose
+.TP
+verbose
if set, it will dump the eBPF verifier output, even if loading the eBPF
program was successful. By default, only on error, the verifier log is
being emitted to the user.
-
-.SS direct-action | da
+.TP
+direct-action | da
instructs eBPF classifier to not invoke external TC actions, instead use the
TC actions return codes (\fBTC_ACT_OK\fR, \fBTC_ACT_SHOT\fR etc.) for
classifiers.
-
-.SS skip_hw | skip_sw
+.TP
+skip_hw | skip_sw
hardware offload control flags. By default TC will try to offload
filters to hardware if possible.
.B skip_hw
@@ -159,21 +159,22 @@ explicitly disables the attempt to offload.
forces the offload and disables running the eBPF program in the kernel.
If hardware offload is not possible and this flag was set kernel will
report an error and filter will not be installed at all.
-
-.SS police
+.TP
+police
is an optional parameter for an eBPF/cBPF classifier that specifies a
police in
.B tc(1)
which is attached to the classifier, for example, on an ingress qdisc.
-
-.SS action
+.TP
+action
is an optional parameter for an eBPF/cBPF classifier that specifies a
subsequent action in
.B tc(1)
which is attached to a classifier.
-
-.SS classid
-.SS flowid
+.TP
+classid
+.TP
+flowid
provides the default traffic control class identifier for this eBPF/cBPF
classifier. The default class identifier can also be overwritten by the
return code of the eBPF/cBPF program. A default return code of
@@ -184,8 +185,8 @@ a return code other than these two will override the default classid. This
allows for efficient, non-linear classification with only a single eBPF/cBPF
program as opposed to having multiple individual programs for various class
identifiers which would need to reparse packet contents.
-
-.SS bytecode
+.TP
+bytecode
is being used for loading cBPF classifier and actions only. The cBPF bytecode
is directly passed as a text string in the form of
.B \'s,c t f k,c t f k,c t f k,...\'
@@ -211,8 +212,8 @@ that ships with the Linux kernel source tree under
or
.B bytecode-file
option is mandatory when a cBPF classifier or action is to be loaded.
-
-.SS bytecode-file
+.TP
+bytecode-file
also being used to load a cBPF classifier or action. It's effectively the
same as
.B bytecode
@@ -224,7 +225,7 @@ rather resides in a text file.
A full blown example including eBPF agent code can be found inside the
iproute2 source package under:
.B examples/bpf/
-
+.sp
As prerequisites, the kernel needs to have the eBPF system call namely
.B bpf(2)
enabled and ships with
@@ -234,9 +235,11 @@ and
kernel modules for the traffic control subsystem. To enable eBPF/eBPF JIT
support, depending which of the two the given architecture supports:
-.in +4n
+.RS
+.EX
.B echo 1 > /proc/sys/net/core/bpf_jit_enable
-.in
+.EE
+.RE
A given restricted C file can be compiled via LLVM as:
@@ -247,24 +250,24 @@ A given restricted C file can be compiled via LLVM as:
The compiler invocation might still simplify in future, so for now,
it's quite handy to alias this construct in one way or another, for
example:
-.in +4n
-.nf
-.sp
+.RS
+.EX
+
__bcc() {
clang -O2 -emit-llvm -c $1 -o - | \\
llc -march=bpf -filetype=obj -o "`basename $1 .c`.o"
}
alias bcc=__bcc
-.fi
-.in
+.EE
+.RE
A minimal, stand-alone unit, which matches on all traffic with the
default classid (return code of -1) looks like:
-.in +4n
-.nf
-.sp
+.RS
+.EX
+
#include <linux/bpf.h>
#ifndef __section
@@ -277,8 +280,8 @@ __section("classifier") int cls_main(struct __sk_buff *skb)
}
char __license[] __section("license") = "GPL";
-.fi
-.in
+.EE
+.RE
More examples can be found further below in subsection
.B eBPF PROGRAMMING
@@ -299,9 +302,9 @@ example
.B objdump(1)
for inspecting ELF section headers:
-.in +4n
-.nf
-.sp
+.RS
+.EX
+
objdump -h bpf.o
[...]
3 classifier 000007f8 0000000000000000 0000000000000000 00000040 2**3
@@ -315,56 +318,56 @@ objdump -h bpf.o
7 license 00000004 0000000000000000 0000000000000000 00000988 2**0
CONTENTS, ALLOC, LOAD, DATA
[...]
-.fi
-.in
+.EE
+.RE
Adding an eBPF classifier from an object file that contains a classifier
in the default ELF section is trivial (note that instead of "object-file"
also shortcuts such as "obj" can be used):
-.in +4n
+.RS
.B bcc bpf.c
.br
.B tc filter add dev em1 parent 1: bpf obj bpf.o flowid 1:1
-.in
+.RE
In case the classifier resides in ELF section "mycls", then that same
command needs to be invoked as:
-.in +4n
+.RS
.B tc filter add dev em1 parent 1: bpf obj bpf.o sec mycls flowid 1:1
-.in
+.RE
Dumping the classifier configuration will tell the location of the
classifier, in other words that it's from object file "bpf.o" under
section "mycls":
-.in +4n
+.RS
.B tc filter show dev em1
.br
.B filter parent 1: protocol all pref 49152 bpf
.br
.B filter parent 1: protocol all pref 49152 bpf handle 0x1 flowid 1:1 bpf.o:[mycls]
-.in
+.RE
The same program can also be installed on ingress qdisc side as opposed
to egress ...
-.in +4n
+.RS
.B tc qdisc add dev em1 handle ffff: ingress
.br
.B tc filter add dev em1 parent ffff: bpf obj bpf.o sec mycls flowid ffff:1
-.in
+.RE
\&... and again dumped from there:
-.in +4n
+.RS
.B tc filter show dev em1 parent ffff:
.br
.B filter protocol all pref 49152 bpf
.br
.B filter protocol all pref 49152 bpf handle 0x1 flowid ffff:1 bpf.o:[mycls]
-.in
+.RE
Attaching a classifier and action on ingress has the restriction that
it doesn't have an actual underlying queueing discipline. What ingress
@@ -382,15 +385,13 @@ object file within various sections. In that case, non-default section
names must be provided, which is the case for both actions in this
example:
-.in +4n
-.B tc filter add dev em1 parent 1: bpf obj bpf.o flowid 1:1 \e
-.br
-.in +25n
-.B action bpf obj bpf.o sec action-mark \e
-.br
-.B action bpf obj bpf.o sec action-rand ok
-.in -25n
-.in -4n
+.RS
+.EX
+tc filter add dev em1 parent 1: bpf obj bpf.o flowid 1:1 \e
+ action bpf obj bpf.o sec action-mark \e
+ action bpf obj bpf.o sec action-rand ok
+.EE
+.RE
The advantage of this is that the classifier and the two actions can
then share eBPF maps with each other, if implemented in the programs.
@@ -421,17 +422,14 @@ this fd-owner shell, they can terminate and restart without losing eBPF
maps file descriptors. Example invocation with the previous classifier and
action mixture:
-.in +4n
-.B tc exec bpf imp /tmp/bpf
-.br
-.B tc filter add dev em1 parent 1: bpf obj bpf.o exp /tmp/bpf flowid 1:1 \e
-.br
-.in +25n
-.B action bpf obj bpf.o sec action-mark \e
-.br
-.B action bpf obj bpf.o sec action-rand ok
-.in -25n
-.in -4n
+.RS
+.EX
+tc exec bpf imp /tmp/bpf
+tc filter add dev em1 parent 1: bpf obj bpf.o exp /tmp/bpf flowid 1:1 \e
+ action bpf obj bpf.o sec action-mark \e
+ action bpf obj bpf.o sec action-rand ok
+.EE
+.RE
Assuming that eBPF maps are shared with classifier and actions, it's
enough to export them once, for example, from within the classifier
@@ -454,9 +452,8 @@ member of
The environment in this example looks as follows:
-.in +4n
-.nf
-.sp
+.RS
+.EX
sh# env | grep BPF
BPF_NUM_MAPS=3
BPF_MAP1=6
@@ -468,8 +465,8 @@ sh# ls -la /proc/self/fd
lrwx------. 1 root root 64 Apr 14 16:46 6 -> anon_inode:bpf-map
lrwx------. 1 root root 64 Apr 14 16:46 7 -> anon_inode:bpf-map
sh# my_bpf_agent
-.fi
-.in
+.EE
+.RE
eBPF agents are very useful in that they can prepopulate eBPF maps from
user space, monitor statistics via maps and based on that feedback, for
@@ -495,7 +492,7 @@ from the iproute2 source package for a fully fledged flow dissector
example to better demonstrate some of the possibilities with eBPF.
Supported 32 bit classifier return codes from the C program and their meanings:
-.in +4n
+.RS
.B 0
, denotes a mismatch
.br
@@ -505,12 +502,12 @@ Supported 32 bit classifier return codes from the C program and their meanings:
.B else
, everything else will override the default classid to provide a facility for
non-linear matching
-.in
+.RE
Supported 32 bit action return codes from the C program and their meanings (
.B linux/pkt_cls.h
):
-.in +4n
+.RS
.B TC_ACT_OK (0)
, will terminate the packet processing pipeline and allows the packet to
proceed
@@ -532,7 +529,7 @@ from the beginning
.br
.B else
, everything else is an unspecified return code
-.in
+.RE
Both classifier and action return codes are supported in eBPF and cBPF
programs.
@@ -543,9 +540,8 @@ from a container, have previously been marked in interval [0, 255]. The
program keeps statistics on different marks for user space and maps the
classid to the root qdisc with the marking itself as the minor handle:
-.in +4n
-.nf
-.sp
+.RS
+.EX
#include <stdint.h>
#include <asm/types.h>
@@ -595,17 +591,17 @@ __section("cls") int cls_main(struct __sk_buff *skb)
}
char __license[] __section("license") = "GPL";
-.fi
-.in
+.EE
+.RE
Another small example is a port redirector which demuxes destination port
80 into the interval [8080, 8087] steered by RSS, that can then be attached
to ingress qdisc. The exercise of adding the egress counterpart and IPv6
support is left to the reader:
-.in +4n
-.nf
-.sp
+.RS
+.EX
+
#include <asm/types.h>
#include <asm/byteorder.h>
@@ -664,16 +660,15 @@ __section("lb") int lb_main(struct __sk_buff *skb)
}
char __license[] __section("license") = "GPL";
-.fi
-.in
+.EE
+.RE
The related helper header file
.B helpers.h
in both examples was:
-.in +4n
-.nf
-.sp
+.RS
+.EX
/* Misc helper macros. */
#define __section(x) __attribute__((section(x), used))
#define offsetof(x, y) __builtin_offsetof(x, y)
@@ -704,8 +699,8 @@ unsigned long long load_byte(void *skb, unsigned long long off)
asm ("llvm.bpf.load.byte");
unsigned long long load_half(void *skb, unsigned long long off)
asm ("llvm.bpf.load.half");
-.fi
-.in
+.EE
+.RE
Best practice, we recommend to only have a single eBPF classifier loaded
in tc and perform
@@ -733,9 +728,11 @@ the kernel log, which can be read via
.B dmesg(1)
:
-.in +4n
+.RS
+.EX
.B echo 2 > /proc/sys/net/core/bpf_jit_enable
-.in
+.EE
+.RE
The Linux kernel source tree ships additionally under
.B tools/net/
@@ -744,18 +741,18 @@ a small helper called
that reads out the opcode image dump from the kernel log and dumps the
resulting disassembly:
-.in +4n
+.RS
.B bpf_jit_disasm -o
-.in
+.RE
Other than that, the Linux kernel also contains an extensive eBPF/cBPF
test suite module called
.B test_bpf
\&. Upon ...
-.in +4n
+.RS
.B modprobe test_bpf
-.in
+.RE
\&... it performs a diversity of test cases and dumps the results into
the kernel log that can be inspected with
@@ -786,9 +783,9 @@ The raw interface with tc takes opcodes directly. For example, the most
minimal classifier matching on every packet resulting in the default
classid of 1:1 looks like:
-.in +4n
+.RS
.B tc filter add dev em1 parent 1: bpf bytecode '1,6 0 0 4294967295,' flowid 1:1
-.in
+.RE
The first decimal of the bytecode sequence denotes the number of subsequent
4-tuples of cBPF opcodes. As mentioned, such a 4-tuple consists of
@@ -813,9 +810,8 @@ internal classic BPF compiler, his code derived here for usage with
.B tc(8)
:
-.in +4n
-.nf
-.sp
+.RS
+.EX
#include <pcap.h>
#include <stdio.h>
@@ -850,25 +846,25 @@ int main(int argc, char **argv)
pcap_freecode(&prog);
return 0;
}
-.fi
-.in
+.EE
+.RE
Given this small helper, any
.B tcpdump(8)
filter expression can be abused as a classifier where a match will
result in the default classid:
-.in +4n
+.RS
.B bpftool EN10MB 'tcp[tcpflags] & tcp-syn != 0' > /var/bpf/tcp-syn
.br
.B tc filter add dev em1 parent 1: bpf bytecode-file /var/bpf/tcp-syn flowid 1:1
-.in
+.RE
Basically, such a minimal generator is equivalent to:
-.in +4n
+.RS
.B tcpdump -iem1 -ddd 'tcp[tcpflags] & tcp-syn != 0' | tr '\\\\n' ',' > /var/bpf/tcp-syn
-.in
+.RE
Since
.B libpcap
@@ -888,25 +884,24 @@ for classifying IPv4/TCP packets, saved in a text file called
.B foobar
:
-.in +4n
-.nf
-.sp
+.RS
+.EX
ldh [12]
jne #0x800, drop
ldb [23]
jneq #6, drop
ret #-1
drop: ret #0
-.fi
-.in
+.EE
+.RE
Similarly, such a classifier can be loaded as:
-.in +4n
+.RS
.B bpf_asm foobar > /var/bpf/tcp-syn
.br
.B tc filter add dev em1 parent 1: bpf bytecode-file /var/bpf/tcp-syn flowid 1:1
-.in
+.RE
For BPF classifiers, the Linux kernel provides additionally under
.B tools/net/
--
2.17.1
[-- Attachment #4: 0002-In-tcp-nat.8-change-command-synopsis-to-a-form-that-.patch --]
[-- Type: text/x-diff, Size: 2853 bytes --]
>From 69842ef64825014c9c7f6054783d5af172f7879d Mon Sep 17 00:00:00 2001
From: "Eric S. Raymond" <esr@thyrsus.com>
Date: Tue, 12 Jun 2018 22:58:39 -0400
Subject: [PATCH 2/5] In tcp-nat.8, change command synopsis to a form that can
be parsed.
(This means getting rid of the pseudo-BNF := notation.)
Also, correct a misspelling.
Signed-off-by: Eric S. Raymond <esr@thyrsus.com>
---
man/man8/tc-nat.8 | 45 ++++++++++++++++++++-------------------------
1 file changed, 20 insertions(+), 25 deletions(-)
diff --git a/man/man8/tc-nat.8 b/man/man8/tc-nat.8
index fdcc052a..3617ac6c 100644
--- a/man/man8/tc-nat.8
+++ b/man/man8/tc-nat.8
@@ -6,22 +6,9 @@ nat - stateless native address translation action
.in +8
.ti -8
.BR tc " ... " "action nat"
-.I DIRECTION OLD NEW
+.RB "{ " ingress " | " egress " }"
+old-addr new-addr
-.ti -8
-.IR DIRECTION " := { "
-.BR ingress " | " egress " }"
-
-.ti -8
-.IR OLD " := " IPV4_ADDR_SPEC
-
-.ti -8
-.IR NEW " := " IPV4_ADDR_SPEC
-
-.ti -8
-.IR IPV4_ADDR_SPEC " := { "
-.BR default " | " any " | " all " | "
-\fIin_addr\fR[\fB/\fR{\fIprefix\fR|\fInetmask\fR}]
.SH DESCRIPTION
The
.B nat
@@ -39,38 +26,46 @@ Translate destination addresses, i.e. perform DNAT.
.B egress
Translate source addresses, i.e. perform SNAT.
.TP
-.I OLD
+.I old-addr
Specifies addresses which should be translated.
.TP
-.I NEW
+.I new-addr
Specifies addresses which
-.I OLD
+.I old-addr
should be translated into.
.SH NOTES
The accepted address format in
-.IR OLD " and " NEW
+.IR old-addr " and " new-addr
is quite flexible. It may either consist of one of the keywords
.BR default ", " any " or " all ,
representing the all-zero IP address or a combination of IP address and netmask
or prefix length separated by a slash
.RB ( / )
sign. In any case, the mask (or prefix length) value of
-.I OLD
+.I old-addr
is used for
-.I NEW
+.I new-addr
as well so that a one-to-one mapping of addresses is assured.
+.PP
+The most general form is
+
+.RS
+.BR default " | " any " | " all " | "
+\fIin_addr\fR[\fB/\fR{\fIprefix\fR|\fInetmask\fR}]
+.RE
+
Address translation is done using a combination of binary operations. First, the
original (source or destination) address is matched against the value of
-.IR OLD .
+.IR old-addr .
If the original address fits, the new address is created by taking the leading
bits from
-.I NEW
+.I new-addr
(defined by the netmask of
-.IR OLD )
+.IR old-addr )
and taking the remaining bits from the original address.
-There is rudimental support for upper layer protocols, namely TCP, UDP and ICMP.
+There is rudimentary support for upper layer protocols, namely TCP, UDP and ICMP.
While for the first two only checksum recalculation is performed, the action
also takes care of embedded IP headers in ICMP packets by translating the
respective address therein, too.
--
2.17.1
[-- Attachment #5: 0003-In-tc-pie.8-fix-up-list-and-example-syntax.patch --]
[-- Type: text/x-diff, Size: 4086 bytes --]
>From 4d787557880a25967da7dfdb070eea8c6cf9c635 Mon Sep 17 00:00:00 2001
From: "Eric S. Raymond" <esr@thyrsus.com>
Date: Wed, 13 Jun 2018 00:08:55 -0400
Subject: [PATCH 3/5] In tc-pie.8, fix up list and example syntax.
Signed-off-by: Eric S. Raymond <esr@thyrsus.com>
---
man/man8/tc-pie.8 | 56 ++++++++++++++++++++++++++++-------------------
1 file changed, 33 insertions(+), 23 deletions(-)
diff --git a/man/man8/tc-pie.8 b/man/man8/tc-pie.8
index 278293bd..64192b86 100644
--- a/man/man8/tc-pie.8
+++ b/man/man8/tc-pie.8
@@ -27,10 +27,14 @@ int ] [
Proportional Integral controller-Enhanced (PIE) is a control theoretic active
queue management scheme. It is based on the proportional integral controller but
aims to control delay. The main design goals are
- o Low latency control
- o High link utilization
- o Simple implementation
- o Guaranteed stability and fast responsiveness
+.IP
+Low latency control
+.IP
+High link utilization
+.IP
+Simple implementation
+.IP
+Guaranteed stability and fast responsiveness
.SH ALGORITHM
PIE is designed to control delay effectively. First, an average dequeue rate is
@@ -43,10 +47,11 @@ PIE makes adjustments to the probability based on the trend of the delay i.e.
whether it is going up or down.The delay converges quickly to the target value
specified.
-alpha and beta are statically chosen parameters chosen to control the drop probability
-growth and are determined through control theoretic approaches. alpha determines how
-the deviation between the current and target latency changes probability. beta exerts
-additional adjustments depending on the latency trend.
+alpha and beta are statically chosen parameters chosen to control the
+drop probability growth and are determined through control theoretic
+approaches. alpha determines how the deviation between the current and
+target latency changes probability. beta exerts additional adjustments
+depending on the latency trend.
The drop probabilty is used to mark packets in ecn mode. However, as in RED,
beyond 10% packets are dropped based on this probability. The bytemode is used
@@ -55,22 +60,24 @@ to drop packets proportional to the packet size.
Additional details can be found in the paper cited below.
.SH PARAMETERS
-.SS limit
+.TP
+limit
limit on the queue size in packets. Incoming packets are dropped when this limit
is reached. Default is 1000 packets.
-
-.SS target
+.TP
+target
is the expected queue delay. The default target delay is 20ms.
-
-.SS tupdate
+.TP
+tupdate
is the frequency at which the system drop probability is calculated. The default is 30ms.
-
-.SS alpha
-.SS beta
+.TP
+alpha
+.TP
+beta
alpha and beta are parameters chosen to control the drop probability. These
should be in the range between 0 and 32.
-
-.SS ecn | noecn
+.TP
+ecn | noecn
is used to mark packets instead of dropping
.B ecn
to turn on ecn mode,
@@ -78,8 +85,8 @@ to turn on ecn mode,
to turn off ecn mode. By default,
.B ecn
is turned off.
-
-.SS bytemode | nobytemode
+.TP
+bytemode | nobytemode
is used to scale drop probability proportional to packet size
.B bytemode
to turn on bytemode,
@@ -89,6 +96,7 @@ to turn off bytemode. By default,
is turned off.
.SH EXAMPLES
+.EX
# tc qdisc add dev eth0 root pie
# tc -s qdisc show
qdisc pie 8034: dev eth0 root refcnt 2 limit 200p target 19000us tupdate 29000us alpha 2 beta 20
@@ -113,7 +121,7 @@ is turned off.
backlog 33728b 32p requeues 0
prob 0.102262 delay 24000us avg_dq_rate 1464840
pkts_in 2468 overlimit 214 dropped 0 maxq 192 ecn_mark 71
-
+.EE
.SH SEE ALSO
.BR tc (8),
@@ -121,8 +129,10 @@ is turned off.
.BR tc-red (8)
.SH SOURCES
- o IETF draft submission is at http://tools.ietf.org/html/draft-pan-tsvwg-pie-00
- o IEEE Conference on High Performance Switching and Routing 2013 : "PIE: A
+.IP
+IETF draft submission is at http://tools.ietf.org/html/draft-pan-tsvwg-pie-00
+.IP
+IEEE Conference on High Performance Switching and Routing 2013 : "PIE: A
Lightweight Control Scheme to Address the Bufferbloat Problem"
.SH AUTHORS
--
2.17.1
[-- Attachment #6: 0004-Fix-list-syntax-errors-in-tc-pedit.8.patch --]
[-- Type: text/x-diff, Size: 892 bytes --]
>From 053a817b72cb942d47bb3fbb45385e04d8b6f378 Mon Sep 17 00:00:00 2001
From: "Eric S. Raymond" <esr@thyrsus.com>
Date: Wed, 13 Jun 2018 09:49:12 -0400
Subject: [PATCH 4/5] Fix list syntax errors in tc-pedit.8.
Signed-off-by: Eric S. Raymond <esr@thyrsus.com>
---
man/man8/tc-pedit.8 | 13 +++++++------
1 file changed, 7 insertions(+), 6 deletions(-)
diff --git a/man/man8/tc-pedit.8 b/man/man8/tc-pedit.8
index bbd725c4..53d07fec 100644
--- a/man/man8/tc-pedit.8
+++ b/man/man8/tc-pedit.8
@@ -224,17 +224,17 @@ The supported keywords for
.I IP6HDR_FIELD
are:
.RS
-.TP
+.IP ""
.B src
-.TQ
+.IP ""
.B dst
-.TQ
+.IP ""
.B flow_lbl
-.TQ
+.IP ""
.B payload_len
-.TQ
+.IP ""
.B nexthdr
-.TQ
+.IP ""
.B hoplimit
.RE
.TP
@@ -250,6 +250,7 @@ are:
Source or destination TCP port number, a 16-bit value.
.TP
.B flags
+(To be documented)
.RE
.TP
.BI udp " UDPHDR_FIELD"
--
2.17.1
[-- Attachment #7: 0005-In-devlink.8-translate-unparseable-callout-syntax-to.patch --]
[-- Type: text/x-diff, Size: 1099 bytes --]
>From 643db903b3f6cd90dce5cc90daafcd6830e83a8a Mon Sep 17 00:00:00 2001
From: "Eric S. Raymond" <esr@thyrsus.com>
Date: Wed, 13 Jun 2018 12:45:24 -0400
Subject: [PATCH 5/5] In devlink.8, translate unparseable callout syntax to
parseable form.
Signed-off-by: Eric S. Raymond <esr@thyrsus.com>
---
man/man8/devlink.8 | 14 +-------------
1 file changed, 1 insertion(+), 13 deletions(-)
diff --git a/man/man8/devlink.8 b/man/man8/devlink.8
index 7986310f..efc6e625 100644
--- a/man/man8/devlink.8
+++ b/man/man8/devlink.8
@@ -7,7 +7,7 @@ devlink \- Devlink tool
.in +8
.ti -8
.B devlink
-.RI "[ " OPTIONS " ] " OBJECT " { " COMMAND " | "
+.RI "[ " OPTIONS " ] { " dev | port | monitor | sb | resource " } { " COMMAND " | "
.BR help " }"
.sp
@@ -17,18 +17,6 @@ devlink \- Devlink tool
.BI "-batch " filename
.sp
-.ti -8
-.IR OBJECT " := { "
-.BR dev " | " port " | " monitor " | " sb " | " resource " }"
-.sp
-
-.ti -8
-.IR OPTIONS " := { "
-\fB\-V\fR[\fIersion\fR] |
-\fB\-n\fR[\fIno-nice-names\fR] }
-\fB\-j\fR[\fIjson\fR] }
-\fB\-p\fR[\fIpretty\fR] }
-
.SH OPTIONS
.TP
--
2.17.1
^ permalink raw reply related
* RE: [PATCH v2] hv_netvsc: Add per-cpu ethtool stats for netvsc
From: Yidong Ren @ 2018-06-13 21:07 UTC (permalink / raw)
To: Eric Dumazet, Yidong Ren, KY Srinivasan, Haiyang Zhang,
Stephen Hemminger, David S. Miller, devel@linuxdriverproject.org,
netdev@vger.kernel.org, linux-kernel@vger.kernel.org
In-Reply-To: <4c853799-44e0-ff33-3555-41982d601ebb@gmail.com>
> From: Eric Dumazet <eric.dumazet@gmail.com>
> You actually want to allocate memory local to this cpu, possibly in one chunk,
> not spread all over the places.
>
> kvmalloc(nr_cpu_ids * sizeof(struct netvsc_ethtool_pcpu_stats)) should be
> really better, since it would most of the time be satisfied by a single kmalloc()
Got it. I'm just trying to allocate memory for each cpu. It doesn't have to be __percpu variable.
^ permalink raw reply
* Re: [PATCH 0/9] Netfilter fixes for net
From: David Miller @ 2018-06-13 21:05 UTC (permalink / raw)
To: pablo; +Cc: netfilter-devel, netdev
In-Reply-To: <20180613105700.12894-1-pablo@netfilter.org>
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 13 Jun 2018 12:56:51 +0200
> The following patchset contains Netfilter patches for your net tree:
>
> 1) Fix NULL pointer dereference from nf_nat_decode_session() if NAT is
> not loaded, from Prashant Bhole.
>
> 2) Fix socket extension module autoload.
>
> 3) Don't bogusly reject sets with the NFT_SET_EVAL flag set on from
> the dynset extension.
>
> 4) Fix races with nf_tables module removal and netns exit path,
> patches from Florian Westphal.
>
> 5) Don't hit BUG_ON if jumpstack goes too deep, instead hit
> WARN_ON_ONCE, from Taehee Yoo.
>
> 6) Another NULL pointer dereference from ctnetlink, again if NAT is
> not loaded, from Florian Westphal.
>
> 7) Fix x_tables match list corruption in xt_connmark module removal
> path, also from Florian.
>
> 8) nf_conncount doesn't properly deal with conntrack zones, hence
> garbage collector may get rid of entries in a different zone.
> From Yi-Hung Wei.
>
> You can pull these changes from:
>
> git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf.git
Pulled, thank you.
^ permalink raw reply
* Re: [RFC nf-next 0/5] netfilter: add ebpf translation infrastructure
From: Florian Westphal @ 2018-06-13 20:59 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Florian Westphal, netfilter-devel, ast, daniel, netdev,
David S. Miller, ecree
In-Reply-To: <20180613004324.x2xdyj2qlbkkpccy@ast-mbp.dhcp.thefacebook.com>
Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> On Tue, Jun 12, 2018 at 11:28:12AM +0200, Florian Westphal wrote:
> > I think its important user(space) can see which rules are jitted, and
> > which ebpf prog corresponds to which rule(s), using an expression as
> > container allows to re-use existing nft config plane code to serialze
> > this via netlink attributes.
>
> In my mind it would be all or nothing. I don't think it helps
> to convert some rules and not all.
Ok. Still, even in that case I think it would be good if we'd be able to tell
userspace the ebpf program id that corresponds to the ruleset.
> > Step 1: 1:1 mapping, an nft rule has at most one ebpf prog.
> > Step 2: figure out how to handle maps, sets, and how to cope with
> > not-yet-translateable expressions
> > Step 3: m:n mapping: kernel provides adjacent rules to the UMH for
> > jitting. Example: user appends rules a, b, c. UMH creates
> > single ebpf prog from a/b/c.
> > nft-pseudo-expression replaces a/b/c in the
> > packet path, original rules a/b/c are linked from the pseudo
> > expression for tracking. If user deletes rule b, we provide
> > a/c to UMH to create new epbf prog that replaces new
> > sequence a/c.
> > Step 4: always provide entire future base chain and all reachable chains
> > to the umh. Ideally all of it is replaced by single program.
[..]
> > Does that make sense to you?
> >
> > If you see this as flawed, please let me know, but as I have no idea
> > how to resolve these issues going from 0 to 4 makes no sense to me.
>
> I think the challenge is how to implement 4 without doing step 1, right?
Yes.
> imo doing such 1:1 (single rule to single bpf prog) translation does not
> help to break hard problem into smaller pieces. Such 1:1 is great
> for prototype, but not to land upstream.
> For the same reasons in bpfilter we did single iptable rule to single
> bpf prog translation, but such code doesn't belong in upstream tree,
> since it's not a scalable approach.
[..]
> > Okay, but without any idea how to consider existing expressions,
> > sets, maps etc. I'm not sure it makes sense to work on that at this
> > point.
>
> I think sets and ipset (in case of iptables) fit well into trie model.
Yes, but thats going to be a lot of effort to handle properly
without breaking (or replacing) userland plumbing.
For nft we could aim for full-translation for the ingress hook
initially as that takes stateful filering out of the picture (ingress
occurs before conntrack).
We could also ignore sets for now and only deal with anonymous sets (they
are immutable and data stored in such sets can be made available to
UMH).
I can rework the RFC to emit "future table" to UMH instead of
individual rules, but I don't know yet when i will have time to work on
it again.
Thanks,
Florian
^ permalink raw reply
* Re: [PATCH v2] hv_netvsc: Add per-cpu ethtool stats for netvsc
From: Eric Dumazet @ 2018-06-13 20:57 UTC (permalink / raw)
To: Yidong Ren, K. Y. Srinivasan, Haiyang Zhang, Stephen Hemminger,
David S. Miller, devel, netdev, linux-kernel
In-Reply-To: <20180613193608.444-1-yidren@linuxonhyperv.com>
On 06/13/2018 12:36 PM, Yidong Ren wrote:
> From: Yidong Ren <yidren@microsoft.com>
>
> This patch implements following ethtool stats fields for netvsc:
> cpu<n>_tx/rx_packets/bytes
> cpu<n>_vf_tx/rx_packets/bytes
...
>
> + pcpu_sum = alloc_percpu(struct netvsc_ethtool_pcpu_stats);
> + netvsc_get_pcpu_stats(dev, pcpu_sum);
> + for_each_present_cpu(cpu) {
> + struct netvsc_ethtool_pcpu_stats *this_sum =
> + per_cpu_ptr(pcpu_sum, cpu);
> + for (j = 0; j < ARRAY_SIZE(pcpu_stats); j++)
> + data[i++] = *(u64 *)((void *)this_sum
> + + pcpu_stats[j].offset);
> + }
> + free_percpu(pcpu_sum);
>
Using alloc_percpu() / free_percpu() for a short section of code makes no sense.
You actually want to allocate memory local to this cpu, possibly in one chunk,
not spread all over the places.
kvmalloc(nr_cpu_ids * sizeof(struct netvsc_ethtool_pcpu_stats)) should be really better,
since it would most of the time be satisfied by a single kmalloc()
^ permalink raw reply
* [BUG] net: stmmac: socfpga ethernet no longer working on linux-next
From: Dinh Nguyen @ 2018-06-13 20:46 UTC (permalink / raw)
To: netdev; +Cc: David Miller, clabbe, joabreu, Dinh Nguyen, Marek Vasut
Hi,
The stmmac ethernet has stopped working in linux-next and linus/master
branch(v4.17-11782-gbe779f03d563)
It appears that the stmmac ethernet has stopped working after these 2 commits:
4dbbe8dde848 net: stmmac: Add support for U32 TC filter using Flexible RX Parser
5f0456b43140 net: stmmac: Implement logic to automatically select HW Interface
If I move to this commit "565020aaeebf net: stmmac: Disable ACS
Feature for GMAC >= 4", then the stmmac works again on SoCFPGA.
I was following this thread:
https://www.spinics.net/lists/netdev/msg502858.html
Was wondering if there was a patch to fix dwmac-sun8i that the socfpga
platform needs as well?
Thanks,
Dinh
^ permalink raw reply
* Re: [PATCH net] net: qcom/emac: Add missing of_node_put()
From: Timur Tabi @ 2018-06-13 20:45 UTC (permalink / raw)
To: YueHaibing, davem; +Cc: linux-kernel, netdev, Hemanth Puranik
In-Reply-To: <20180611130345.15172-1-yuehaibing@huawei.com>
On 06/11/2018 08:03 AM, YueHaibing wrote:
> Add missing of_node_put() call for device node returned by
> of_parse_phandle().
>
> Signed-off-by: YueHaibing<yuehaibing@huawei.com>
Acked-by: Timur Tabi <timur@codeaurora.org>
This seems legit. The comment for of_find_device_by_node() that says
the np needs to be released was added after the code was written, so
it's possible that I didn't know at the time that this was a requirement.
However, I no longer have the ability to test EMAC on device tree
platforms, so I can't verify this code.
--
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm
Technologies, Inc. Qualcomm Technologies, Inc. is a member of the
Code Aurora Forum, a Linux Foundation Collaborative Project.
^ permalink raw reply
* [PATCH v2] hv_netvsc: Add per-cpu ethtool stats for netvsc
From: Yidong Ren @ 2018-06-13 19:36 UTC (permalink / raw)
To: K. Y. Srinivasan, Haiyang Zhang, Stephen Hemminger,
David S. Miller, devel, netdev, linux-kernel
From: Yidong Ren <yidren@microsoft.com>
This patch implements following ethtool stats fields for netvsc:
cpu<n>_tx/rx_packets/bytes
cpu<n>_vf_tx/rx_packets/bytes
Corresponding per-cpu counters exist in current code. Exposing these
counters will help troubleshooting performance issues.
Signed-off-by: Yidong Ren <yidren@microsoft.com>
---
Changes in v2:
- Remove cpp style comment
- Resubmit after freeze
drivers/net/hyperv/hyperv_net.h | 11 +++++
drivers/net/hyperv/netvsc_drv.c | 104 +++++++++++++++++++++++++++++++++++++++-
2 files changed, 113 insertions(+), 2 deletions(-)
diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index 23304ac..c825353 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -873,6 +873,17 @@ struct netvsc_ethtool_stats {
unsigned long wake_queue;
};
+struct netvsc_ethtool_pcpu_stats {
+ u64 rx_packets;
+ u64 rx_bytes;
+ u64 tx_packets;
+ u64 tx_bytes;
+ u64 vf_rx_packets;
+ u64 vf_rx_bytes;
+ u64 vf_tx_packets;
+ u64 vf_tx_bytes;
+};
+
struct netvsc_vf_pcpu_stats {
u64 rx_packets;
u64 rx_bytes;
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 7b18a8c..6803aae 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -1105,6 +1105,66 @@ static void netvsc_get_vf_stats(struct net_device *net,
}
}
+static void netvsc_get_pcpu_stats(struct net_device *net,
+ struct netvsc_ethtool_pcpu_stats
+ __percpu *pcpu_tot)
+{
+ struct net_device_context *ndev_ctx = netdev_priv(net);
+ struct netvsc_device *nvdev = rcu_dereference_rtnl(ndev_ctx->nvdev);
+ int i;
+
+ /* fetch percpu stats of vf */
+ for_each_possible_cpu(i) {
+ const struct netvsc_vf_pcpu_stats *stats =
+ per_cpu_ptr(ndev_ctx->vf_stats, i);
+ struct netvsc_ethtool_pcpu_stats *this_tot =
+ per_cpu_ptr(pcpu_tot, i);
+ unsigned int start;
+
+ do {
+ start = u64_stats_fetch_begin_irq(&stats->syncp);
+ this_tot->vf_rx_packets = stats->rx_packets;
+ this_tot->vf_tx_packets = stats->tx_packets;
+ this_tot->vf_rx_bytes = stats->rx_bytes;
+ this_tot->vf_tx_bytes = stats->tx_bytes;
+ } while (u64_stats_fetch_retry_irq(&stats->syncp, start));
+ this_tot->rx_packets = this_tot->vf_rx_packets;
+ this_tot->tx_packets = this_tot->vf_tx_packets;
+ this_tot->rx_bytes = this_tot->vf_rx_bytes;
+ this_tot->tx_bytes = this_tot->vf_tx_bytes;
+ }
+
+ /* fetch percpu stats of netvsc */
+ for (i = 0; i < nvdev->num_chn; i++) {
+ const struct netvsc_channel *nvchan = &nvdev->chan_table[i];
+ const struct netvsc_stats *stats;
+ struct netvsc_ethtool_pcpu_stats *this_tot =
+ per_cpu_ptr(pcpu_tot, nvchan->channel->target_cpu);
+ u64 packets, bytes;
+ unsigned int start;
+
+ stats = &nvchan->tx_stats;
+ do {
+ start = u64_stats_fetch_begin_irq(&stats->syncp);
+ packets = stats->packets;
+ bytes = stats->bytes;
+ } while (u64_stats_fetch_retry_irq(&stats->syncp, start));
+
+ this_tot->tx_bytes += bytes;
+ this_tot->tx_packets += packets;
+
+ stats = &nvchan->rx_stats;
+ do {
+ start = u64_stats_fetch_begin_irq(&stats->syncp);
+ packets = stats->packets;
+ bytes = stats->bytes;
+ } while (u64_stats_fetch_retry_irq(&stats->syncp, start));
+
+ this_tot->rx_bytes += bytes;
+ this_tot->rx_packets += packets;
+ }
+}
+
static void netvsc_get_stats64(struct net_device *net,
struct rtnl_link_stats64 *t)
{
@@ -1202,6 +1262,23 @@ static const struct {
{ "rx_no_memory", offsetof(struct netvsc_ethtool_stats, rx_no_memory) },
{ "stop_queue", offsetof(struct netvsc_ethtool_stats, stop_queue) },
{ "wake_queue", offsetof(struct netvsc_ethtool_stats, wake_queue) },
+}, pcpu_stats[] = {
+ { "cpu%u_rx_packets",
+ offsetof(struct netvsc_ethtool_pcpu_stats, rx_packets) },
+ { "cpu%u_rx_bytes",
+ offsetof(struct netvsc_ethtool_pcpu_stats, rx_bytes) },
+ { "cpu%u_tx_packets",
+ offsetof(struct netvsc_ethtool_pcpu_stats, tx_packets) },
+ { "cpu%u_tx_bytes",
+ offsetof(struct netvsc_ethtool_pcpu_stats, tx_bytes) },
+ { "cpu%u_vf_rx_packets",
+ offsetof(struct netvsc_ethtool_pcpu_stats, vf_rx_packets) },
+ { "cpu%u_vf_rx_bytes",
+ offsetof(struct netvsc_ethtool_pcpu_stats, vf_rx_bytes) },
+ { "cpu%u_vf_tx_packets",
+ offsetof(struct netvsc_ethtool_pcpu_stats, vf_tx_packets) },
+ { "cpu%u_vf_tx_bytes",
+ offsetof(struct netvsc_ethtool_pcpu_stats, vf_tx_bytes) },
}, vf_stats[] = {
{ "vf_rx_packets", offsetof(struct netvsc_vf_pcpu_stats, rx_packets) },
{ "vf_rx_bytes", offsetof(struct netvsc_vf_pcpu_stats, rx_bytes) },
@@ -1213,6 +1290,9 @@ static const struct {
#define NETVSC_GLOBAL_STATS_LEN ARRAY_SIZE(netvsc_stats)
#define NETVSC_VF_STATS_LEN ARRAY_SIZE(vf_stats)
+/* statistics per queue (rx/tx packets/bytes) */
+#define NETVSC_PCPU_STATS_LEN (num_present_cpus() * ARRAY_SIZE(pcpu_stats))
+
/* 4 statistics per queue (rx/tx packets/bytes) */
#define NETVSC_QUEUE_STATS_LEN(dev) ((dev)->num_chn * 4)
@@ -1228,6 +1308,7 @@ static int netvsc_get_sset_count(struct net_device *dev, int string_set)
case ETH_SS_STATS:
return NETVSC_GLOBAL_STATS_LEN
+ NETVSC_VF_STATS_LEN
+ + NETVSC_PCPU_STATS_LEN
+ NETVSC_QUEUE_STATS_LEN(nvdev);
default:
return -EINVAL;
@@ -1242,9 +1323,10 @@ static void netvsc_get_ethtool_stats(struct net_device *dev,
const void *nds = &ndc->eth_stats;
const struct netvsc_stats *qstats;
struct netvsc_vf_pcpu_stats sum;
+ struct netvsc_ethtool_pcpu_stats __percpu *pcpu_sum;
unsigned int start;
u64 packets, bytes;
- int i, j;
+ int i, j, cpu;
if (!nvdev)
return;
@@ -1256,6 +1338,17 @@ static void netvsc_get_ethtool_stats(struct net_device *dev,
for (j = 0; j < NETVSC_VF_STATS_LEN; j++)
data[i++] = *(u64 *)((void *)&sum + vf_stats[j].offset);
+ pcpu_sum = alloc_percpu(struct netvsc_ethtool_pcpu_stats);
+ netvsc_get_pcpu_stats(dev, pcpu_sum);
+ for_each_present_cpu(cpu) {
+ struct netvsc_ethtool_pcpu_stats *this_sum =
+ per_cpu_ptr(pcpu_sum, cpu);
+ for (j = 0; j < ARRAY_SIZE(pcpu_stats); j++)
+ data[i++] = *(u64 *)((void *)this_sum
+ + pcpu_stats[j].offset);
+ }
+ free_percpu(pcpu_sum);
+
for (j = 0; j < nvdev->num_chn; j++) {
qstats = &nvdev->chan_table[j].tx_stats;
@@ -1283,7 +1376,7 @@ static void netvsc_get_strings(struct net_device *dev, u32 stringset, u8 *data)
struct net_device_context *ndc = netdev_priv(dev);
struct netvsc_device *nvdev = rtnl_dereference(ndc->nvdev);
u8 *p = data;
- int i;
+ int i, cpu;
if (!nvdev)
return;
@@ -1300,6 +1393,13 @@ static void netvsc_get_strings(struct net_device *dev, u32 stringset, u8 *data)
p += ETH_GSTRING_LEN;
}
+ for_each_present_cpu(cpu) {
+ for (i = 0; i < ARRAY_SIZE(pcpu_stats); i++) {
+ sprintf(p, pcpu_stats[i].name, cpu);
+ p += ETH_GSTRING_LEN;
+ }
+ }
+
for (i = 0; i < nvdev->num_chn; i++) {
sprintf(p, "tx_queue_%u_packets", i);
p += ETH_GSTRING_LEN;
--
2.7.4
^ permalink raw reply related
* Re: KASAN: slab-out-of-bounds Read in bpf_skb_vlan_push
From: syzbot @ 2018-06-13 18:49 UTC (permalink / raw)
To: ast, daniel, davem, linux-kernel, netdev, syzkaller-bugs
In-Reply-To: <26c434ee-0a0a-fbba-282c-dabddfac652e@iogearbox.net>
Hello,
syzbot has tested the proposed patch and the reproducer did not trigger
crash:
Reported-and-tested-by:
syzbot+76de61614cb1abdd73fc@syzkaller.appspotmail.com
Tested on:
commit: be779f03d563 Merge tag 'kbuild-v4.18-2' of git://git.kerne..
git tree: upstream
kernel config: https://syzkaller.appspot.com/x/.config?x=68d8eba98e3f8e88
compiler: gcc (GCC) 8.0.1 20180413 (experimental)
Note: testing is done by a robot and is best-effort only.
^ permalink raw reply
* Re: [PATCH ethtool 1/6] ethtool: fix uninitialized return value
From: John W. Linville @ 2018-06-13 18:31 UTC (permalink / raw)
To: Ivan Vecera; +Cc: netdev
In-Reply-To: <20180608092010.13041-1-cera@cera.cz>
On Fri, Jun 08, 2018 at 11:20:05AM +0200, Ivan Vecera wrote:
> Fixes: b0fe96d ("Ethtool: Implements ETHTOOL_PHY_GTUNABLE/ETHTOOL_PHY_STUNABLE and PHY downshift")
> Signed-off-by: Ivan Vecera <cera@cera.cz>
LGTM -- I have queued the series for the next release, including
extending the commit IDs...
Thanks!
John
--
John W. Linville Someday the world will need a hero, and you
linville@tuxdriver.com might be all we have. Be ready.
^ permalink raw reply
* Re: KASAN: slab-out-of-bounds Read in bpf_skb_vlan_push
From: Daniel Borkmann @ 2018-06-13 18:42 UTC (permalink / raw)
To: syzbot, ast, davem, linux-kernel, netdev, syzkaller-bugs
In-Reply-To: <0000000000000c58c2056e8a3a27@google.com>
On 06/13/2018 08:34 PM, syzbot wrote:
> Hello,
>
> syzbot has tested the proposed patch and the reproducer did not trigger crash:
>
> Reported-and-tested-by: syzbot+76de61614cb1abdd73fc@syzkaller.appspotmail.com
>
> Tested on:
>
> commit: be779f03d563 Merge tag 'kbuild-v4.18-2' of git://git.kerne..
> git tree: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/--
> kernel config: https://syzkaller.appspot.com/x/.config?x=68d8eba98e3f8e88
> compiler: gcc (GCC) 8.0.1 20180413 (experimental)
>
> Note: testing is done by a robot and is best-effort only.
#syz fix: bpf: reject passing modified ctx to helper functions
^ permalink raw reply
* Re: KASAN: slab-out-of-bounds Read in bpf_skb_vlan_push
From: syzbot @ 2018-06-13 18:34 UTC (permalink / raw)
To: ast, daniel, davem, linux-kernel, netdev, syzkaller-bugs
In-Reply-To: <b3af0f35-b0f8-859d-4f8f-b919d35ebaaa@iogearbox.net>
Hello,
syzbot has tested the proposed patch and the reproducer did not trigger
crash:
Reported-and-tested-by:
syzbot+76de61614cb1abdd73fc@syzkaller.appspotmail.com
Tested on:
commit: be779f03d563 Merge tag 'kbuild-v4.18-2' of git://git.kerne..
git tree:
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/--
kernel config: https://syzkaller.appspot.com/x/.config?x=68d8eba98e3f8e88
compiler: gcc (GCC) 8.0.1 20180413 (experimental)
Note: testing is done by a robot and is best-effort only.
^ permalink raw reply
* Re: KASAN: slab-out-of-bounds Read in bpf_skb_vlan_push
From: Daniel Borkmann @ 2018-06-13 18:15 UTC (permalink / raw)
To: syzbot; +Cc: ast, davem, linux-kernel, netdev, syzkaller-bugs
In-Reply-To: <000000000000402477056e89f067@google.com>
On 06/13/2018 08:13 PM, syzbot wrote:
>> On 06/13/2018 06:17 PM, syzbot wrote:
>>> Hello,
>
>>> syzbot found the following crash on:
>
>>> HEAD commit: 75d4e704fa8d netdev-FAQ: clarify DaveM's position for stab..
>>> git tree: bpf-next
>>> console output: https://syzkaller.appspot.com/x/log.txt?x=1754783f800000
>>> kernel config: https://syzkaller.appspot.com/x/.config?x=a601a80fec461d44
>>> dashboard link: https://syzkaller.appspot.com/bug?extid=76de61614cb1abdd73fc
>>> compiler: gcc (GCC) 8.0.1 20180413 (experimental)
>>> syzkaller repro:https://syzkaller.appspot.com/x/repro.syz?x=12c1e1bf800000
>
>>> IMPORTANT: if you fix the bug, please add the following tag to the commit:
>>> Reported-by: syzbot+76de61614cb1abdd73fc@syzkaller.appspotmail.com
>
>>> IPv6: ADDRCONF(NETDEV_CHANGE): veth1: link becomes ready
>>> IPv6: ADDRCONF(NETDEV_CHANGE): veth0: link becomes ready
>>> 8021q: adding VLAN 0 to HW filter on device team0
>>> 8021q: adding VLAN 0 to HW filter on device team0
>>> ==================================================================
>>> BUG: KASAN: slab-out-of-bounds in skb_at_tc_ingress include/net/sch_generic.h:535 [inline]
>>> BUG: KASAN: slab-out-of-bounds in bpf_push_mac_rcsum net/core/filter.c:1625 [inline]
>>> BUG: KASAN: slab-out-of-bounds in ____bpf_skb_vlan_push net/core/filter.c:2446 [inline]
>>> BUG: KASAN: slab-out-of-bounds in bpf_skb_vlan_push+0x6b7/0x720 net/core/filter.c:2437
>>> Read of size 5 at addr ffff8801b77347d0 by task syz-executor6/6529
>
>> Should be fixed already by:
>
>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=58990d1ff3f7896ee341030e9a7c2e4002570683
>
>
>> #syz test: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
>
> want 2 args (repo, branch), got 1
Fair enough ... assumed default would have been master. ;-)
#syz test: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git master
^ permalink raw reply
* Re: 4b66af2d("af_key: Always verify length of provided sadb_key")
From: Greg KH @ 2018-06-13 18:13 UTC (permalink / raw)
To: Zubin Mithra; +Cc: linux-netdev, groeck, stable
In-Reply-To: <20180613180252.GA34929@zsmcros.c.googlers.com>
On Wed, Jun 13, 2018 at 02:02:54PM -0400, Zubin Mithra wrote:
> Hello,
>
> Syzkaller has reported a crash here[1] for a slab OOB read in pfkey_add.
>
> Could the following patch be applied to stable kernels for 4.14, 4.4, 3.18, 3.14, 3.10 and 3.8?
>
> 4b66af2d("af_key: Always verify length of provided sadb_key")
>
> [1] https://syzkaller.appspot.com/bug?id=26cb120b31cd24d984fc16da67f50fb375c432a7
Now queued up, thanks.
greg k-h
^ permalink raw reply
* Re: Re: KASAN: slab-out-of-bounds Read in bpf_skb_vlan_push
From: syzbot @ 2018-06-13 18:13 UTC (permalink / raw)
To: Daniel Borkmann; +Cc: ast, daniel, davem, linux-kernel, netdev, syzkaller-bugs
In-Reply-To: <b3af0f35-b0f8-859d-4f8f-b919d35ebaaa@iogearbox.net>
> On 06/13/2018 06:17 PM, syzbot wrote:
>> Hello,
>> syzbot found the following crash on:
>> HEAD commit: 75d4e704fa8d netdev-FAQ: clarify DaveM's position for
>> stab..
>> git tree: bpf-next
>> console output: https://syzkaller.appspot.com/x/log.txt?x=1754783f800000
>> kernel config:
>> https://syzkaller.appspot.com/x/.config?x=a601a80fec461d44
>> dashboard link:
>> https://syzkaller.appspot.com/bug?extid=76de61614cb1abdd73fc
>> compiler: gcc (GCC) 8.0.1 20180413 (experimental)
>> syzkaller
>> repro:https://syzkaller.appspot.com/x/repro.syz?x=12c1e1bf800000
>> IMPORTANT: if you fix the bug, please add the following tag to the
>> commit:
>> Reported-by: syzbot+76de61614cb1abdd73fc@syzkaller.appspotmail.com
>> IPv6: ADDRCONF(NETDEV_CHANGE): veth1: link becomes ready
>> IPv6: ADDRCONF(NETDEV_CHANGE): veth0: link becomes ready
>> 8021q: adding VLAN 0 to HW filter on device team0
>> 8021q: adding VLAN 0 to HW filter on device team0
>> ==================================================================
>> BUG: KASAN: slab-out-of-bounds in skb_at_tc_ingress
>> include/net/sch_generic.h:535 [inline]
>> BUG: KASAN: slab-out-of-bounds in bpf_push_mac_rcsum
>> net/core/filter.c:1625 [inline]
>> BUG: KASAN: slab-out-of-bounds in ____bpf_skb_vlan_push
>> net/core/filter.c:2446 [inline]
>> BUG: KASAN: slab-out-of-bounds in bpf_skb_vlan_push+0x6b7/0x720
>> net/core/filter.c:2437
>> Read of size 5 at addr ffff8801b77347d0 by task syz-executor6/6529
> Should be fixed already by:
>
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=58990d1ff3f7896ee341030e9a7c2e4002570683
> #syz test:
> git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
want 2 args (repo, branch), got 1
^ permalink raw reply
* Re: KASAN: slab-out-of-bounds Read in bpf_skb_vlan_push
From: Daniel Borkmann @ 2018-06-13 18:13 UTC (permalink / raw)
To: syzbot, ast, davem, linux-kernel, netdev, syzkaller-bugs
In-Reply-To: <000000000000219be6056e8850fa@google.com>
On 06/13/2018 06:17 PM, syzbot wrote:
> Hello,
>
> syzbot found the following crash on:
>
> HEAD commit: 75d4e704fa8d netdev-FAQ: clarify DaveM's position for stab..
> git tree: bpf-next
> console output: https://syzkaller.appspot.com/x/log.txt?x=1754783f800000
> kernel config: https://syzkaller.appspot.com/x/.config?x=a601a80fec461d44
> dashboard link: https://syzkaller.appspot.com/bug?extid=76de61614cb1abdd73fc
> compiler: gcc (GCC) 8.0.1 20180413 (experimental)
> syzkaller repro:https://syzkaller.appspot.com/x/repro.syz?x=12c1e1bf800000
>
> IMPORTANT: if you fix the bug, please add the following tag to the commit:
> Reported-by: syzbot+76de61614cb1abdd73fc@syzkaller.appspotmail.com
>
> IPv6: ADDRCONF(NETDEV_CHANGE): veth1: link becomes ready
> IPv6: ADDRCONF(NETDEV_CHANGE): veth0: link becomes ready
> 8021q: adding VLAN 0 to HW filter on device team0
> 8021q: adding VLAN 0 to HW filter on device team0
> ==================================================================
> BUG: KASAN: slab-out-of-bounds in skb_at_tc_ingress include/net/sch_generic.h:535 [inline]
> BUG: KASAN: slab-out-of-bounds in bpf_push_mac_rcsum net/core/filter.c:1625 [inline]
> BUG: KASAN: slab-out-of-bounds in ____bpf_skb_vlan_push net/core/filter.c:2446 [inline]
> BUG: KASAN: slab-out-of-bounds in bpf_skb_vlan_push+0x6b7/0x720 net/core/filter.c:2437
> Read of size 5 at addr ffff8801b77347d0 by task syz-executor6/6529
Should be fixed already by:
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=58990d1ff3f7896ee341030e9a7c2e4002570683
#syz test: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
^ permalink raw reply
* 4b66af2d("af_key: Always verify length of provided sadb_key")
From: Zubin Mithra @ 2018-06-13 18:02 UTC (permalink / raw)
To: linux-netdev; +Cc: groeck, stable
Hello,
Syzkaller has reported a crash here[1] for a slab OOB read in pfkey_add.
Could the following patch be applied to stable kernels for 4.14, 4.4, 3.18, 3.14, 3.10 and 3.8?
4b66af2d("af_key: Always verify length of provided sadb_key")
[1] https://syzkaller.appspot.com/bug?id=26cb120b31cd24d984fc16da67f50fb375c432a7
Thanks,
- Zubin
^ permalink raw reply
* Re: [bpf PATCH] bpf: selftest fix for sockmap
From: John Fastabend @ 2018-06-13 17:52 UTC (permalink / raw)
To: Daniel Borkmann, ast; +Cc: netdev
In-Reply-To: <0e4b873d-7e2d-7683-c7cb-0a0112d123a5@iogearbox.net>
On 06/12/2018 05:31 PM, Daniel Borkmann wrote:
> On 06/11/2018 08:47 PM, John Fastabend wrote:
>> In selftest test_maps the sockmap test case attempts to add a socket
>> in listening state to the sockmap. This is no longer a valid operation
>> so it fails as expected. However, the test wrongly reports this as an
>> error now. Fix the test to avoid adding sockets in listening state.
>>
>> Fixes: 945ae430aa44 ("bpf: sockmap only allow ESTABLISHED sock state")
>> Signed-off-by: John Fastabend <john.fastabend@gmail.com>
>
> (fyi, discussed with John that this will be enrolled into the set of
> fixes he has pending for bpf since the test is related to the one
> restricting to ESTABLISHED state.)
>
Patch part of this series now,
https://patchwork.ozlabs.org/project/netdev/list/?series=49988
^ permalink raw reply
* [bpf PATCH 6/6] bpf: selftest remove attempts to add LISTEN sockets to sockmap
From: John Fastabend @ 2018-06-13 17:50 UTC (permalink / raw)
To: ast, daniel; +Cc: netdev
In-Reply-To: <20180613174202.4264.59970.stgit@john-Precision-Tower-5810>
In selftest test_maps the sockmap test case attempts to add a socket
in listening state to the sockmap. This is no longer a valid operation
so it fails as expected. However, the test wrongly reports this as an
error now. Fix the test to avoid adding sockets in listening state.
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
tools/testing/selftests/bpf/test_maps.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c
index 6c25334..9fed5f0 100644
--- a/tools/testing/selftests/bpf/test_maps.c
+++ b/tools/testing/selftests/bpf/test_maps.c
@@ -564,7 +564,7 @@ static void test_sockmap(int tasks, void *data)
}
/* Test update without programs */
- for (i = 0; i < 6; i++) {
+ for (i = 2; i < 6; i++) {
err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_ANY);
if (err) {
printf("Failed noprog update sockmap '%i:%i'\n",
@@ -727,7 +727,7 @@ static void test_sockmap(int tasks, void *data)
}
/* Test map update elem afterwards fd lives in fd and map_fd */
- for (i = 0; i < 6; i++) {
+ for (i = 2; i < 6; i++) {
err = bpf_map_update_elem(map_fd_rx, &i, &sfd[i], BPF_ANY);
if (err) {
printf("Failed map_fd_rx update sockmap %i '%i:%i'\n",
^ permalink raw reply related
* [bpf PATCH 5/6] bpf: sockhash, add release routine
From: John Fastabend @ 2018-06-13 17:50 UTC (permalink / raw)
To: ast, daniel; +Cc: netdev
In-Reply-To: <20180613174202.4264.59970.stgit@john-Precision-Tower-5810>
Add map_release_uref pointer to hashmap ops. This was dropped when
original sockhash code was ported into bpf-next before initial
commit.
Fixes: 81110384441a ("bpf: sockmap, add hash map support")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
kernel/bpf/sockmap.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 91c7b47..57fa2fa 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -2516,6 +2516,7 @@ struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key)
.map_get_next_key = sock_hash_get_next_key,
.map_update_elem = sock_hash_update_elem,
.map_delete_elem = sock_hash_delete_elem,
+ .map_release_uref = sock_map_release,
};
static bool bpf_is_valid_sock(struct bpf_sock_ops_kern *ops)
^ permalink raw reply related
* [bpf PATCH 4/6] bpf: sockmap, tcp_disconnect to listen transition
From: John Fastabend @ 2018-06-13 17:50 UTC (permalink / raw)
To: ast, daniel; +Cc: netdev
In-Reply-To: <20180613174202.4264.59970.stgit@john-Precision-Tower-5810>
After adding checks to ensure TCP is in ESTABLISHED state when a
sock is added we need to also ensure that user does not transition
through tcp_disconnect() and back into ESTABLISHED state without
sockmap removing the sock.
To do this add unhash hook and remove sock from map there.
Reported-by: Eric Dumazet <edumazet@google.com>
Fixes: 81110384441a ("bpf: sockmap, add hash map support")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
kernel/bpf/sockmap.c | 67 +++++++++++++++++++++++++++++++++++++-------------
1 file changed, 50 insertions(+), 17 deletions(-)
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 2e848cd..91c7b47 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -130,6 +130,7 @@ struct smap_psock {
struct proto *sk_proto;
void (*save_close)(struct sock *sk, long timeout);
+ void (*save_unhash)(struct sock *sk);
void (*save_data_ready)(struct sock *sk);
void (*save_write_space)(struct sock *sk);
};
@@ -141,6 +142,7 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
static int bpf_tcp_sendpage(struct sock *sk, struct page *page,
int offset, size_t size, int flags);
static void bpf_tcp_close(struct sock *sk, long timeout);
+static void bpf_tcp_unhash(struct sock *sk);
static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
{
@@ -182,6 +184,7 @@ static void build_protos(struct proto prot[SOCKMAP_NUM_CONFIGS],
{
prot[SOCKMAP_BASE] = *base;
prot[SOCKMAP_BASE].close = bpf_tcp_close;
+ prot[SOCKMAP_BASE].unhash = bpf_tcp_unhash;
prot[SOCKMAP_BASE].recvmsg = bpf_tcp_recvmsg;
prot[SOCKMAP_BASE].stream_memory_read = bpf_tcp_stream_read;
@@ -215,6 +218,7 @@ static int bpf_tcp_init(struct sock *sk)
}
psock->save_close = sk->sk_prot->close;
+ psock->save_unhash = sk->sk_prot->unhash;
psock->sk_proto = sk->sk_prot;
/* Build IPv6 sockmap whenever the address of tcpv6_prot changes */
@@ -302,28 +306,12 @@ struct smap_psock_map_entry *psock_map_pop(struct sock *sk,
return e;
}
-static void bpf_tcp_close(struct sock *sk, long timeout)
+static void bpf_tcp_remove(struct sock *sk, struct smap_psock *psock)
{
- void (*close_fun)(struct sock *sk, long timeout);
struct smap_psock_map_entry *e;
struct sk_msg_buff *md, *mtmp;
- struct smap_psock *psock;
struct sock *osk;
- rcu_read_lock();
- psock = smap_psock_sk(sk);
- if (unlikely(!psock)) {
- rcu_read_unlock();
- return sk->sk_prot->close(sk, timeout);
- }
-
- /* The psock may be destroyed anytime after exiting the RCU critial
- * section so by the time we use close_fun the psock may no longer
- * be valid. However, bpf_tcp_close is called with the sock lock
- * held so the close hook and sk are still valid.
- */
- close_fun = psock->save_close;
-
if (psock->cork) {
free_start_sg(psock->sock, psock->cork);
kfree(psock->cork);
@@ -378,6 +366,51 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
}
e = psock_map_pop(sk, psock);
}
+}
+
+static void bpf_tcp_unhash(struct sock *sk)
+{
+ void (*unhash_fun)(struct sock *sk);
+ struct smap_psock *psock;
+
+ rcu_read_lock();
+ psock = smap_psock_sk(sk);
+ if (unlikely(!psock)) {
+ rcu_read_unlock();
+ return sk->sk_prot->unhash(sk);
+ }
+
+ /* The psock may be destroyed anytime after exiting the RCU critial
+ * section so by the time we use close_fun the psock may no longer
+ * be valid. However, bpf_tcp_close is called with the sock lock
+ * held so the close hook and sk are still valid.
+ */
+ unhash_fun = psock->save_unhash;
+ bpf_tcp_remove(sk, psock);
+ rcu_read_unlock();
+ unhash_fun(sk);
+
+}
+
+static void bpf_tcp_close(struct sock *sk, long timeout)
+{
+ void (*close_fun)(struct sock *sk, long timeout);
+ struct smap_psock *psock;
+
+ rcu_read_lock();
+ psock = smap_psock_sk(sk);
+ if (unlikely(!psock)) {
+ rcu_read_unlock();
+ return sk->sk_prot->close(sk, timeout);
+ }
+
+ /* The psock may be destroyed anytime after exiting the RCU critial
+ * section so by the time we use close_fun the psock may no longer
+ * be valid. However, bpf_tcp_close is called with the sock lock
+ * held so the close hook and sk are still valid.
+ */
+ close_fun = psock->save_close;
+ bpf_tcp_remove(sk, psock);
rcu_read_unlock();
close_fun(sk, timeout);
}
^ permalink raw reply related
* [bpf PATCH 3/6] bpf: sockhash fix omitted bucket lock in sock_close
From: John Fastabend @ 2018-06-13 17:50 UTC (permalink / raw)
To: ast, daniel; +Cc: netdev
In-Reply-To: <20180613174202.4264.59970.stgit@john-Precision-Tower-5810>
First in tcp_close, reduce scope of sk_callback_lock() the lock is
only needed for protecting smap_release_sock() the ingress and cork
lists are protected by sock lock. Having the lock in wider scope is
harmless but may confuse the reader who may infer it is in fact
needed.
Next, in sock_hash_delete_elem() the pattern is as follows,
sock_hash_delete_elem()
[...]
spin_lock(bucket_lock)
l = lookup_elem_raw()
if (l)
hlist_del_rcu()
write_lock(sk_callback_lock)
.... destroy psock ...
write_unlock(sk_callback_lock)
spin_unlock(bucket_lock)
The ordering is necessary because we only know the {p}sock after
dereferencing the hash table which we can't do unless we have the
bucket lock held. Once we have the bucket lock and the psock element
it is deleted from the hashmap to ensure any other path doing a lookup
will fail. Finally, the refcnt is decremented and if zero the psock
is destroyed.
In parallel with the above (or free'ing the map) a tcp close event
may trigger tcp_close(). Which at the moment omits the bucket lock
altogether (oops!) where the flow looks like this,
bpf_tcp_close()
[...]
write_lock(sk_callback_lock)
for each psock->maps // list of maps this sock is part of
hlist_del_rcu(ref_hash_node);
.... destroy psock ...
write_unlock(sk_callback_lock)
Obviously, and demonstrated by syzbot, this is broken because
we can have multiple threads deleting entries via hlist_del_rcu().
To fix this we might be tempted to wrap the hlist operation in a
bucket lock but that would create a lock inversion problem. In
summary to follow locking rules maps needs the sk_callback_lock but we
need the bucket lock to do the hlist_del_rcu. To resolve the lock
inversion problem note that when bpf_tcp_close is called no updates
can happen in parallel, due to ESTABLISH state check in update logic,
so pop the head of the list repeatedly and remove the reference until
no more are left. If a delete happens in parallel from the BPF API
that is OK as well because it will do a similar action, lookup the
sock in the map/hash, delete it from the map/hash, and dec the refcnt.
We check for this case before doing a destroy on the psock to ensure
we don't have two threads tearing down a psock. The new logic is
as follows,
bpf_tcp_close()
e = psock_map_pop(psock->maps) // done with sk_callback_lock
bucket_lock() // lock hash list bucket
l = lookup_elem_raw(head, hash, key, key_size);
if (l) {
//only get here if elmnt was not already removed
hlist_del_rcu()
... destroy psock...
}
bucket_unlock()
And finally for all the above to work add missing sk_callback_lock
around smap_list_remove in sock_hash_ctx_update_elem(). Otherwise
delete and update may corrupt maps list.
(As an aside the sk_callback_lock serves two purposes. The
first, is to update the sock callbacks sk_data_ready, sk_write_space,
etc. The second is to protect the psock 'maps' list. The 'maps' list
is used to (as shown above) to delete all map/hash references to a
sock when the sock is closed)
(If we did not have the ESTABLISHED state guarantee from tcp_close
then we could not ensure completion because updates could happen
forever and pin thread in delete loop.)
Reported-by: syzbot+0ce137753c78f7b6acc1@syzkaller.appspotmail.com
Fixes: 81110384441a ("bpf: sockmap, add hash map support")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
kernel/bpf/sockmap.c | 112 ++++++++++++++++++++++++++++++++++++--------------
1 file changed, 80 insertions(+), 32 deletions(-)
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index e5bab52..2e848cd 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -258,16 +258,54 @@ static void bpf_tcp_release(struct sock *sk)
rcu_read_unlock();
}
+static struct htab_elem *lookup_elem_raw(struct hlist_head *head,
+ u32 hash, void *key, u32 key_size)
+{
+ struct htab_elem *l;
+
+ hlist_for_each_entry_rcu(l, head, hash_node) {
+ if (l->hash == hash && !memcmp(&l->key, key, key_size))
+ return l;
+ }
+
+ return NULL;
+}
+
+static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
+{
+ return &htab->buckets[hash & (htab->n_buckets - 1)];
+}
+
+static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
+{
+ return &__select_bucket(htab, hash)->head;
+}
+
static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
{
atomic_dec(&htab->count);
kfree_rcu(l, rcu);
}
+struct smap_psock_map_entry *psock_map_pop(struct sock *sk,
+ struct smap_psock *psock)
+{
+ struct smap_psock_map_entry *e;
+
+ write_lock_bh(&sk->sk_callback_lock);
+ e = list_first_entry_or_null(&psock->maps,
+ struct smap_psock_map_entry,
+ list);
+ if (e)
+ list_del(&e->list);
+ write_unlock_bh(&sk->sk_callback_lock);
+ return e;
+}
+
static void bpf_tcp_close(struct sock *sk, long timeout)
{
void (*close_fun)(struct sock *sk, long timeout);
- struct smap_psock_map_entry *e, *tmp;
+ struct smap_psock_map_entry *e;
struct sk_msg_buff *md, *mtmp;
struct smap_psock *psock;
struct sock *osk;
@@ -286,7 +324,6 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
*/
close_fun = psock->save_close;
- write_lock_bh(&sk->sk_callback_lock);
if (psock->cork) {
free_start_sg(psock->sock, psock->cork);
kfree(psock->cork);
@@ -299,20 +336,48 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
kfree(md);
}
- list_for_each_entry_safe(e, tmp, &psock->maps, list) {
+ /* Sock is in TCP_CLOSE state so any concurrent adds or updates will be
+ * blocked by ESTABLISHED check. However, tcp_close() + delete + free
+ * can all run at the same time. If a tcp_close + delete happens each
+ * code path will remove the entry for the map/hash before deleting it.
+ * In the map case a xchg and then check to verify we have a sk protects
+ * two paths from tearing down the same object. For hash map we lock the
+ * bucket and remove the object from the hash map before destroying to
+ * ensure that only one reference exists. By pulling object off the head
+ * of the list with (with sk_callback_lock) if multiple deleters are
+ * running we avoid duplicate references.
+ */
+ e = psock_map_pop(sk, psock);
+ while (e) {
if (e->entry) {
osk = cmpxchg(e->entry, sk, NULL);
if (osk == sk) {
- list_del(&e->list);
smap_release_sock(psock, sk);
}
} else {
- hlist_del_rcu(&e->hash_link->hash_node);
- smap_release_sock(psock, e->hash_link->sk);
- free_htab_elem(e->htab, e->hash_link);
+ struct htab_elem *link = e->hash_link;
+ struct hlist_head *head;
+ struct htab_elem *l;
+ struct bucket *b;
+
+ b = __select_bucket(e->htab, link->hash);
+ head = &b->head;
+ raw_spin_lock_bh(&b->lock);
+ l = lookup_elem_raw(head,
+ link->hash, link->key,
+ e->htab->elem_size);
+ /* If another thread deleted this object skip deletion.
+ * The refcnt on psock may or may not be zero.
+ */
+ if (l) {
+ hlist_del_rcu(&e->hash_link->hash_node);
+ smap_release_sock(psock, e->hash_link->sk);
+ free_htab_elem(e->htab, e->hash_link);
+ }
+ raw_spin_unlock_bh(&b->lock);
}
+ e = psock_map_pop(sk, psock);
}
- write_unlock_bh(&sk->sk_callback_lock);
rcu_read_unlock();
close_fun(sk, timeout);
}
@@ -2085,16 +2150,6 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
return ERR_PTR(err);
}
-static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
-{
- return &htab->buckets[hash & (htab->n_buckets - 1)];
-}
-
-static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
-{
- return &__select_bucket(htab, hash)->head;
-}
-
static void sock_hash_free(struct bpf_map *map)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
@@ -2111,10 +2166,13 @@ static void sock_hash_free(struct bpf_map *map)
*/
rcu_read_lock();
for (i = 0; i < htab->n_buckets; i++) {
- struct hlist_head *head = select_bucket(htab, i);
+ struct bucket *b = __select_bucket(htab, i);
+ struct hlist_head *head;
struct hlist_node *n;
struct htab_elem *l;
+ raw_spin_lock_bh(&b->lock);
+ head = &b->head;
hlist_for_each_entry_safe(l, n, head, hash_node) {
struct sock *sock = l->sk;
struct smap_psock *psock;
@@ -2134,6 +2192,7 @@ static void sock_hash_free(struct bpf_map *map)
write_unlock_bh(&sock->sk_callback_lock);
kfree(l);
}
+ raw_spin_unlock_bh(&b->lock);
}
rcu_read_unlock();
bpf_map_area_free(htab->buckets);
@@ -2164,19 +2223,6 @@ static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab,
return l_new;
}
-static struct htab_elem *lookup_elem_raw(struct hlist_head *head,
- u32 hash, void *key, u32 key_size)
-{
- struct htab_elem *l;
-
- hlist_for_each_entry_rcu(l, head, hash_node) {
- if (l->hash == hash && !memcmp(&l->key, key, key_size))
- return l;
- }
-
- return NULL;
-}
-
static inline u32 htab_map_hash(const void *key, u32 key_len)
{
return jhash(key, key_len, 0);
@@ -2308,8 +2354,10 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops,
psock = smap_psock_sk(l_old->sk);
hlist_del_rcu(&l_old->hash_node);
+ write_lock_bh(&l_old->sk->sk_callback_lock);
smap_list_remove(psock, NULL, l_old);
smap_release_sock(psock, l_old->sk);
+ write_unlock_bh(&l_old->sk->sk_callback_lock);
free_htab_elem(htab, l_old);
}
raw_spin_unlock_bh(&b->lock);
^ permalink raw reply related
* [bpf PATCH 2/6] bpf: sockmap only allow ESTABLISHED sock state
From: John Fastabend @ 2018-06-13 17:50 UTC (permalink / raw)
To: ast, daniel; +Cc: netdev
In-Reply-To: <20180613174202.4264.59970.stgit@john-Precision-Tower-5810>
Per the note in the TLS ULP (which is actually a generic statement
regarding ULPs)
/* The TLS ulp is currently supported only for TCP sockets
* in ESTABLISHED state.
* Supporting sockets in LISTEN state will require us
* to modify the accept implementation to clone rather then
* share the ulp context.
*/
After this patch we only allow socks that are in ESTABLISHED state or
are being added via a sock_ops event that is transitioning into an
ESTABLISHED state. By allowing sock_ops events we allow users to
manage sockmaps directly from sock ops programs. The two supported
sock_ops ops are BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB and
BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB.
Also tested with 'netserver -6' and 'netperf -H [IPv6]' as well as
'netperf -H [IPv4]'.
Reported-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
kernel/bpf/sockmap.c | 28 +++++++++++++++++++++++++++-
1 file changed, 27 insertions(+), 1 deletion(-)
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index f6dd4cd..e5bab52 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -1976,8 +1976,12 @@ static int sock_map_update_elem(struct bpf_map *map,
return -EINVAL;
}
+ /* ULPs are currently supported only for TCP sockets in ESTABLISHED
+ * state.
+ */
if (skops.sk->sk_type != SOCK_STREAM ||
- skops.sk->sk_protocol != IPPROTO_TCP) {
+ skops.sk->sk_protocol != IPPROTO_TCP ||
+ skops.sk->sk_state != TCP_ESTABLISHED) {
fput(socket->file);
return -EOPNOTSUPP;
}
@@ -2338,6 +2342,16 @@ static int sock_hash_update_elem(struct bpf_map *map,
return -EINVAL;
}
+ /* ULPs are currently supported only for TCP sockets in ESTABLISHED
+ * state.
+ */
+ if (skops.sk->sk_type != SOCK_STREAM ||
+ skops.sk->sk_protocol != IPPROTO_TCP ||
+ skops.sk->sk_state != TCP_ESTABLISHED) {
+ fput(socket->file);
+ return -EOPNOTSUPP;
+ }
+
err = sock_hash_ctx_update_elem(&skops, map, key, flags);
fput(socket->file);
return err;
@@ -2423,10 +2437,19 @@ struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key)
.map_delete_elem = sock_hash_delete_elem,
};
+static bool bpf_is_valid_sock(struct bpf_sock_ops_kern *ops)
+{
+ return ops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB ||
+ ops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB;
+}
+
BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,
struct bpf_map *, map, void *, key, u64, flags)
{
WARN_ON_ONCE(!rcu_read_lock_held());
+
+ if (!bpf_is_valid_sock(bpf_sock))
+ return -EOPNOTSUPP;
return sock_map_ctx_update_elem(bpf_sock, map, key, flags);
}
@@ -2445,6 +2468,9 @@ struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key)
struct bpf_map *, map, void *, key, u64, flags)
{
WARN_ON_ONCE(!rcu_read_lock_held());
+
+ if (!bpf_is_valid_sock(bpf_sock))
+ return -EOPNOTSUPP;
return sock_hash_ctx_update_elem(bpf_sock, map, key, flags);
}
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox