Netdev List
 help / color / mirror / Atom feed
* Re: [PATCH net-next] net: bridge: Lock before br_fdb_find()
From: David Miller @ 2018-05-30 16:42 UTC (permalink / raw)
  To: petrm; +Cc: netdev, bridge
In-Reply-To: <ff84c00d882dc646a7d69dcbdfe8417b4c0bdf91.1527522008.git.petrm@mellanox.com>

From: Petr Machata <petrm@mellanox.com>
Date: Mon, 28 May 2018 17:44:16 +0200

> Callers of br_fdb_find() need to hold the hash lock, which
> br_fdb_find_port() doesn't do. Add the missing lock/unlock
> pair.
> 
> Signed-off-by: Petr Machata <petrm@mellanox.com>

If all of the these uses of br_fdb_find_port() are safe, then it
should use the RCU fdb lookup variant.

So I basically agree with Stephen that this locking doesn't make any
sense.

The lock is needed when you are going to add or delete an FDB entry.

Here we are doing a lookup and returning a device pointer via the FDB
entry found in the lookup.

The RTNL assertion assures that the device returned won't disappear.

If the device can disappear, the spinlock added by this patch doesn't
change that at all.

^ permalink raw reply

* [PATCH] samples/bpf: Add xdp_sample_pkts example
From: Toke Høiland-Jørgensen @ 2018-05-30 16:45 UTC (permalink / raw)
  To: netdev; +Cc: Toke Høiland-Jørgensen

This adds an example program showing how to sample packets from XDP using
the perf event buffer. The example userspace program just prints the
ethernet header for every packet sampled.

Most of the userspace code is borrowed from other examples, most notably
trace_output.

Note that the example only works when everything runs on CPU0; so
suitable smp_affinity needs to be set on the device. Some drivers seem
to reset smp_affinity when loading an XDP program, so it may be
necessary to change it after starting the example userspace program.

Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
---
 samples/bpf/Makefile               |   4 +
 samples/bpf/xdp_sample_pkts_kern.c |  48 ++++++++++++
 samples/bpf/xdp_sample_pkts_user.c | 147 +++++++++++++++++++++++++++++++++++++
 3 files changed, 199 insertions(+)
 create mode 100644 samples/bpf/xdp_sample_pkts_kern.c
 create mode 100644 samples/bpf/xdp_sample_pkts_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 1303af1..6f0c6d2 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -52,6 +52,7 @@ hostprogs-y += xdp_adjust_tail
 hostprogs-y += xdpsock
 hostprogs-y += xdp_fwd
 hostprogs-y += task_fd_query
+hostprogs-y += xdp_sample_pkts
 
 # Libbpf dependencies
 LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a
@@ -107,6 +108,7 @@ xdp_adjust_tail-objs := xdp_adjust_tail_user.o
 xdpsock-objs := bpf_load.o xdpsock_user.o
 xdp_fwd-objs := bpf_load.o xdp_fwd_user.o
 task_fd_query-objs := bpf_load.o task_fd_query_user.o $(TRACE_HELPERS)
+xdp_sample_pkts-objs := bpf_load.o xdp_sample_pkts_user.o $(TRACE_HELPERS)
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -163,6 +165,7 @@ always += xdp_adjust_tail_kern.o
 always += xdpsock_kern.o
 always += xdp_fwd_kern.o
 always += task_fd_query_kern.o
+always += xdp_sample_pkts_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 HOSTCFLAGS += -I$(srctree)/tools/lib/
@@ -179,6 +182,7 @@ HOSTCFLAGS_spintest_user.o += -I$(srctree)/tools/lib/bpf/
 HOSTCFLAGS_trace_event_user.o += -I$(srctree)/tools/lib/bpf/
 HOSTCFLAGS_sampleip_user.o += -I$(srctree)/tools/lib/bpf/
 HOSTCFLAGS_task_fd_query_user.o += -I$(srctree)/tools/lib/bpf/
+HOSTCFLAGS_xdp_sample_pkts_user.o += -I$(srctree)/tools/lib/bpf/
 
 HOST_LOADLIBES		+= $(LIBBPF) -lelf
 HOSTLOADLIBES_tracex4		+= -lrt
diff --git a/samples/bpf/xdp_sample_pkts_kern.c b/samples/bpf/xdp_sample_pkts_kern.c
new file mode 100644
index 0000000..c58183a
--- /dev/null
+++ b/samples/bpf/xdp_sample_pkts_kern.c
@@ -0,0 +1,48 @@
+#include <linux/ptrace.h>
+#include <linux/version.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+#define SAMPLE_SIZE 64ul
+
+struct bpf_map_def SEC("maps") my_map = {
+	.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(u32),
+	.max_entries = 2,
+};
+
+SEC("xdp_sample")
+int xdp_sample_prog(struct xdp_md *ctx)
+{
+	void *data_end = (void *)(long)ctx->data_end;
+	void *data = (void *)(long)ctx->data;
+
+        /* Metadata will be in the perf event before the packet data. */
+	struct S {
+		u16 cookie;
+		u16 pkt_len;
+	} __attribute__((packed)) metadata;
+
+	if (data + SAMPLE_SIZE < data_end) {
+		/* The XDP perf_event_output handler will use the upper 32 bits
+		 * of the flags argument as a number of bytes to include of the
+		 * packet payload in the event data. If the size is too big, the
+		 * call to bpf_perf_event_output will fail and return -EFAULT.
+		 *
+		 * See bpf_xdp_event_output in net/core/filter.c.
+		 */
+		u64 flags = SAMPLE_SIZE << 32;
+
+		metadata.cookie = 0xdead;
+		metadata.pkt_len = (u16)(data_end - data);
+
+		bpf_perf_event_output(ctx, &my_map, flags,
+				      &metadata, sizeof(metadata));
+	}
+
+	return XDP_PASS;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/xdp_sample_pkts_user.c b/samples/bpf/xdp_sample_pkts_user.c
new file mode 100644
index 0000000..f996917
--- /dev/null
+++ b/samples/bpf/xdp_sample_pkts_user.c
@@ -0,0 +1,147 @@
+/* This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <linux/perf_event.h>
+#include <linux/bpf.h>
+#include <net/if.h>
+#include <errno.h>
+#include <assert.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <time.h>
+#include <signal.h>
+#include <libbpf.h>
+#include "bpf_load.h"
+#include "bpf_util.h"
+#include <bpf/bpf.h>
+
+#include "perf-sys.h"
+#include "trace_helpers.h"
+
+static int pmu_fd, if_idx = 0;
+static char *if_name;
+
+static int do_attach(int idx, int fd, const char *name)
+{
+	int err;
+
+	err = bpf_set_link_xdp_fd(idx, fd, 0);
+	if (err < 0)
+		printf("ERROR: failed to attach program to %s\n", name);
+
+	return err;
+}
+
+static int do_detach(int idx, const char *name)
+{
+	int err;
+
+	err = bpf_set_link_xdp_fd(idx, -1, 0);
+	if (err < 0)
+		printf("ERROR: failed to detach program from %s\n", name);
+
+	return err;
+}
+
+#define SAMPLE_SIZE 64
+
+static int print_bpf_output(void *data, int size)
+{
+	struct {
+		__u16 cookie;
+		__u16 pkt_len;
+		__u8  pkt_data[SAMPLE_SIZE];
+	} __attribute__((packed)) *e = data;
+	int i;
+
+	if (e->cookie != 0xdead) {
+		printf("BUG cookie %x sized %d\n",
+		       e->cookie, size);
+		return LIBBPF_PERF_EVENT_ERROR;
+	}
+
+	printf("Pkt len: %-5d bytes. Ethernet hdr: ", e->pkt_len);
+	for (i = 0; i < 14 && i < e->pkt_len; i++)
+		printf("%02x ", e->pkt_data[i]);
+	printf("\n");
+
+	return LIBBPF_PERF_EVENT_CONT;
+}
+
+static void test_bpf_perf_event(void)
+{
+	struct perf_event_attr attr = {
+		.sample_type = PERF_SAMPLE_RAW,
+		.type = PERF_TYPE_SOFTWARE,
+		.config = PERF_COUNT_SW_BPF_OUTPUT,
+	};
+	int key = 0;
+
+	pmu_fd = sys_perf_event_open(&attr, -1/*pid*/, 0/*cpu*/, -1/*group_fd*/, 0);
+
+	assert(pmu_fd >= 0);
+	assert(bpf_map_update_elem(map_fd[0], &key, &pmu_fd, BPF_ANY) == 0);
+	ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0);
+}
+
+static void sig_handler(int signo)
+{
+	do_detach(if_idx, if_name);
+	exit(0);
+}
+
+int main(int argc, char **argv)
+{
+	char filename[256];
+	int ret, err;
+
+	if (argc < 2) {
+		printf("Usage: %s <ifname>\n", argv[0]);
+		return 1;
+	}
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	if (load_bpf_file(filename)) {
+		printf("%s", bpf_log_buf);
+		return 1;
+	}
+
+	if_idx = if_nametoindex(argv[1]);
+	if (!if_idx)
+		if_idx = strtoul(argv[1], NULL, 0);
+
+	if (!if_idx) {
+		fprintf(stderr, "Invalid ifname\n");
+		return 1;
+	}
+	if_name = argv[1];
+	err = do_attach(if_idx, prog_fd[0], argv[1]);
+	if (err)
+		return err;
+
+	if (signal(SIGINT, sig_handler) ||
+	    signal(SIGHUP, sig_handler) ||
+	    signal(SIGTERM, sig_handler)) {
+		perror("signal");
+		return 1;
+	}
+
+	test_bpf_perf_event();
+
+	if (perf_event_mmap(pmu_fd) < 0)
+		return 1;
+
+	ret = perf_event_poller(pmu_fd, print_bpf_output);
+	kill(0, SIGINT);
+	return ret;
+}
-- 
2.7.4

^ permalink raw reply related

* Re: [PATCH bpf-next 09/11] bpf: fix context access in tracing progs on 32 bit archs
From: Song Liu @ 2018-05-30 16:46 UTC (permalink / raw)
  To: Daniel Borkmann; +Cc: Alexei Starovoitov, Networking
In-Reply-To: <20180528004344.3606-10-daniel@iogearbox.net>

On Sun, May 27, 2018 at 5:43 PM, Daniel Borkmann <daniel@iogearbox.net> wrote:
> Wang reported that all the testcases for BPF_PROG_TYPE_PERF_EVENT
> program type in test_verifier report the following errors on x86_32:
>
>   172/p unpriv: spill/fill of different pointers ldx FAIL
>   Unexpected error message!
>   0: (bf) r6 = r10
>   1: (07) r6 += -8
>   2: (15) if r1 == 0x0 goto pc+3
>   R1=ctx(id=0,off=0,imm=0) R6=fp-8,call_-1 R10=fp0,call_-1
>   3: (bf) r2 = r10
>   4: (07) r2 += -76
>   5: (7b) *(u64 *)(r6 +0) = r2
>   6: (55) if r1 != 0x0 goto pc+1
>   R1=ctx(id=0,off=0,imm=0) R2=fp-76,call_-1 R6=fp-8,call_-1 R10=fp0,call_-1 fp-8=fp
>   7: (7b) *(u64 *)(r6 +0) = r1
>   8: (79) r1 = *(u64 *)(r6 +0)
>   9: (79) r1 = *(u64 *)(r1 +68)
>   invalid bpf_context access off=68 size=8
>
>   378/p check bpf_perf_event_data->sample_period byte load permitted FAIL
>   Failed to load prog 'Permission denied'!
>   0: (b7) r0 = 0
>   1: (71) r0 = *(u8 *)(r1 +68)
>   invalid bpf_context access off=68 size=1
>
>   379/p check bpf_perf_event_data->sample_period half load permitted FAIL
>   Failed to load prog 'Permission denied'!
>   0: (b7) r0 = 0
>   1: (69) r0 = *(u16 *)(r1 +68)
>   invalid bpf_context access off=68 size=2
>
>   380/p check bpf_perf_event_data->sample_period word load permitted FAIL
>   Failed to load prog 'Permission denied'!
>   0: (b7) r0 = 0
>   1: (61) r0 = *(u32 *)(r1 +68)
>   invalid bpf_context access off=68 size=4
>
>   381/p check bpf_perf_event_data->sample_period dword load permitted FAIL
>   Failed to load prog 'Permission denied'!
>   0: (b7) r0 = 0
>   1: (79) r0 = *(u64 *)(r1 +68)
>   invalid bpf_context access off=68 size=8
>
> Reason is that struct pt_regs on x86_32 doesn't fully align to 8 byte
> boundary due to its size of 68 bytes.
>
> Therefore, bpf_ctx_narrow_access_ok() will then bail out saying that
> off & (size_default - 1) which is 68 & 7 doesn't cleanly align in the
> case of sample_period access from struct bpf_perf_event_data, hence
> verifier wrongly thinks we might be doing an unaligned access here.
> Therefore adjust this down to machine size and check the offset for
> narrow access on that basis.
>
> We also need to fix pe_prog_is_valid_access(), since we hit the check
> for off % size != 0 (e.g. 68 % 8 -> 4) in the first and last test.
>
> Reported-by: Wang YanQing <udknight@gmail.com>
> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
> Acked-by: Alexei Starovoitov <ast@kernel.org>
> ---
>  include/linux/filter.h   | 30 ++++++++++++++++++++++++------
>  kernel/trace/bpf_trace.c | 10 ++++++++--
>  2 files changed, 32 insertions(+), 8 deletions(-)
>
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index d407ede..89903d2 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -639,16 +639,34 @@ static inline bool bpf_prog_was_classic(const struct bpf_prog *prog)
>         return prog->type == BPF_PROG_TYPE_UNSPEC;
>  }
>
> -static inline bool
> -bpf_ctx_narrow_access_ok(u32 off, u32 size, const u32 size_default)
> +static inline u32 bpf_ctx_off_adjust_machine(u32 size)
> +{
> +       const u32 size_machine = sizeof(unsigned long);
> +
> +       if (size > size_machine && size % size_machine == 0)
> +               size = size_machine;

Not sure whether I understand this correctly. I guess we only need:
    if (size % size_machine == 0)
               size = size_machine;

Or, is this function equivalent to
    if (size == 8 && size_machine == 4)
         size = 4;

If this is the case, maybe we can make bpf_ctx_narrow_align_ok()
simpler?

Thanks,
Song

> +
> +       return size;
> +}
> +
> +static inline bool bpf_ctx_narrow_align_ok(u32 off, u32 size_access,
> +                                          u32 size_default)
>  {
> -       bool off_ok;
> +       size_default = bpf_ctx_off_adjust_machine(size_default);
> +       size_access  = bpf_ctx_off_adjust_machine(size_access);
> +
>  #ifdef __LITTLE_ENDIAN
> -       off_ok = (off & (size_default - 1)) == 0;
> +       return (off & (size_default - 1)) == 0;
>  #else
> -       off_ok = (off & (size_default - 1)) + size == size_default;
> +       return (off & (size_default - 1)) + size_access == size_default;
>  #endif
> -       return off_ok && size <= size_default && (size & (size - 1)) == 0;
> +}
> +
> +static inline bool
> +bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default)
> +{
> +       return bpf_ctx_narrow_align_ok(off, size, size_default) &&
> +              size <= size_default && (size & (size - 1)) == 0;
>  }
>
>  #define bpf_classic_proglen(fprog) (fprog->len * sizeof(fprog->filter[0]))
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index 81fdf2f..7269530 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -880,8 +880,14 @@ static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type
>                 return false;
>         if (type != BPF_READ)
>                 return false;
> -       if (off % size != 0)
> -               return false;
> +       if (off % size != 0) {
> +               if (sizeof(unsigned long) != 4)
> +                       return false;
> +               if (size != 8)
> +                       return false;
> +               if (off % size != 4)
> +                       return false;
> +       }
>
>         switch (off) {
>         case bpf_ctx_range(struct bpf_perf_event_data, sample_period):
> --
> 2.9.5
>

^ permalink raw reply

* [PATCH iproute2 net-next] iproute: ip route get support for sport, dport and ipproto match
From: Roopa Prabhu @ 2018-05-30 17:06 UTC (permalink / raw)
  To: dsahern; +Cc: netdev

From: Roopa Prabhu <roopa@cumulusnetworks.com>

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
---
 ip/iproute.c           | 26 +++++++++++++++++++++++++-
 man/man8/ip-route.8.in | 20 +++++++++++++++++++-
 2 files changed, 44 insertions(+), 2 deletions(-)

diff --git a/ip/iproute.c b/ip/iproute.c
index 56dd9f2..ef04d27 100644
--- a/ip/iproute.c
+++ b/ip/iproute.c
@@ -69,7 +69,8 @@ static void usage(void)
 		"                            [ from ADDRESS iif STRING ]\n"
 		"                            [ oif STRING ] [ tos TOS ]\n"
 		"                            [ mark NUMBER ] [ vrf NAME ]\n"
-		"                            [ uid NUMBER ]\n"
+		"                            [ uid NUMBER ] [ ipproto PROTOCOL ]\n"
+		"                            [ sport NUMBER ] [ dport NUMBER ]\n"
 		"       ip route { add | del | change | append | replace } ROUTE\n"
 		"SELECTOR := [ root PREFIX ] [ match PREFIX ] [ exact PREFIX ]\n"
 		"            [ table TABLE_ID ] [ vrf NAME ] [ proto RTPROTO ]\n"
@@ -1994,6 +1995,29 @@ static int iproute_get(int argc, char **argv)
 				req.r.rtm_family = addr.family;
 			addattr_l(&req.n, sizeof(req), RTA_NEWDST,
 				  &addr.data, addr.bytelen);
+		} else if (matches(*argv, "sport") == 0) {
+			__be16 sport;
+
+			NEXT_ARG();
+			if (get_be16(&sport, *argv, 0))
+				invarg("invalid sport\n", *argv);
+			addattr16(&req.n, sizeof(req), RTA_SPORT, sport);
+		} else if (matches(*argv, "dport") == 0) {
+			__be16 dport;
+
+			NEXT_ARG();
+			if (get_be16(&dport, *argv, 0))
+				invarg("invalid dport\n", *argv);
+			addattr16(&req.n, sizeof(req), RTA_DPORT, dport);
+		} else if (matches(*argv, "ipproto") == 0) {
+			int ipproto;
+
+			NEXT_ARG();
+			ipproto = inet_proto_a2n(*argv);
+			if (ipproto < 0)
+				invarg("Invalid \"ipproto\" value\n",
+				       *argv);
+			addattr8(&req.n, sizeof(req), RTA_IP_PROTO, ipproto);
 		} else {
 			inet_prefix addr;
 
diff --git a/man/man8/ip-route.8.in b/man/man8/ip-route.8.in
index b28f3d2..b21a847 100644
--- a/man/man8/ip-route.8.in
+++ b/man/man8/ip-route.8.in
@@ -38,7 +38,13 @@ ip-route \- routing table management
 .B  tos
 .IR TOS " ] [ "
 .B  vrf
-.IR NAME " ] "
+.IR NAME " ] [ "
+.B  ipproto
+.IR PROTOCOL " ] [ "
+.B  sport
+.IR NUMBER " ] [ "
+.B  dport
+.IR NUMBER " ] "
 
 .ti -8
 .BR "ip route" " { " add " | " del " | " change " | " append " | "\
@@ -1045,6 +1051,18 @@ the firewall mark
 force the vrf device on which this packet will be routed.
 
 .TP
+.BI ipproto " PROTOCOL"
+ip protocol as seen by the route lookup
+
+.TP
+.BI sport " NUMBER"
+source port as seen by the route lookup
+
+.TP
+.BI dport " NUMBER"
+destination port as seen by the route lookup
+
+.TP
 .B connected
 if no source address
 .RB "(option " from ")"
-- 
2.1.4

^ permalink raw reply related

* Re: [PATCH bpf-next 05/11] bpf: avoid retpoline for lookup/update/delete calls on maps
From: Song Liu @ 2018-05-30 17:06 UTC (permalink / raw)
  To: Daniel Borkmann; +Cc: Alexei Starovoitov, Networking
In-Reply-To: <20180528004344.3606-6-daniel@iogearbox.net>

On Sun, May 27, 2018 at 5:43 PM, Daniel Borkmann <daniel@iogearbox.net> wrote:
> While some of the BPF map lookup helpers provide a ->map_gen_lookup()
> callback for inlining the map lookup altogether it is not available
> for every map, so the remaining ones have to call bpf_map_lookup_elem()
> helper which does a dispatch to map->ops->map_lookup_elem(). In
> times of retpolines, this will control and trap speculative execution
> rather than letting it do its work for the indirect call and will
> therefore cause a slowdown. Likewise, bpf_map_update_elem() and
> bpf_map_delete_elem() do not have an inlined version and need to call
> into their map->ops->map_update_elem() resp. map->ops->map_delete_elem()
> handlers.
>
> Before:
>
>   # bpftool p d x i 1
>     0: (bf) r2 = r10
>     1: (07) r2 += -8
>     2: (7a) *(u64 *)(r2 +0) = 0
>     3: (18) r1 = map[id:1]
>     5: (85) call __htab_map_lookup_elem#232656
>     6: (15) if r0 == 0x0 goto pc+4
>     7: (71) r1 = *(u8 *)(r0 +35)
>     8: (55) if r1 != 0x0 goto pc+1
>     9: (72) *(u8 *)(r0 +35) = 1
>    10: (07) r0 += 56
>    11: (15) if r0 == 0x0 goto pc+4
>    12: (bf) r2 = r0
>    13: (18) r1 = map[id:1]
>    15: (85) call bpf_map_delete_elem#215008  <-- indirect call via
>    16: (95) exit                                 helper
>
> After:
>
>   # bpftool p d x i 1
>     0: (bf) r2 = r10
>     1: (07) r2 += -8
>     2: (7a) *(u64 *)(r2 +0) = 0
>     3: (18) r1 = map[id:1]
>     5: (85) call __htab_map_lookup_elem#233328
>     6: (15) if r0 == 0x0 goto pc+4
>     7: (71) r1 = *(u8 *)(r0 +35)
>     8: (55) if r1 != 0x0 goto pc+1
>     9: (72) *(u8 *)(r0 +35) = 1
>    10: (07) r0 += 56
>    11: (15) if r0 == 0x0 goto pc+4
>    12: (bf) r2 = r0
>    13: (18) r1 = map[id:1]
>    15: (85) call htab_lru_map_delete_elem#238240  <-- direct call
>    16: (95) exit
>
> In all three lookup/update/delete cases however we can use the actual
> address of the map callback directly if we find that there's only a
> single path with a map pointer leading to the helper call, meaning
> when the map pointer has not been poisoned from verifier side.
> Example code can be seen above for the delete case.
>
> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
> Acked-by: Alexei Starovoitov <ast@kernel.org>

Acked-by: Song Liu <songliubraving@fb.com>

> ---
>  include/linux/filter.h |  3 +++
>  kernel/bpf/hashtab.c   | 12 ++++++---
>  kernel/bpf/verifier.c  | 67 +++++++++++++++++++++++++++++++++++++-------------
>  3 files changed, 62 insertions(+), 20 deletions(-)
>
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index b443f70..d407ede 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -301,6 +301,9 @@ struct xdp_buff;
>
>  /* Function call */
>
> +#define BPF_CAST_CALL(x)                                       \
> +               ((u64 (*)(u64, u64, u64, u64, u64))(x))
> +
>  #define BPF_EMIT_CALL(FUNC)                                    \
>         ((struct bpf_insn) {                                    \
>                 .code  = BPF_JMP | BPF_CALL,                    \
> diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
> index b76828f..3ca2198 100644
> --- a/kernel/bpf/hashtab.c
> +++ b/kernel/bpf/hashtab.c
> @@ -503,7 +503,9 @@ static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
>         struct bpf_insn *insn = insn_buf;
>         const int ret = BPF_REG_0;
>
> -       *insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem);
> +       BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
> +                    (void *(*)(struct bpf_map *map, void *key))NULL));
> +       *insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
>         *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
>         *insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
>                                 offsetof(struct htab_elem, key) +
> @@ -530,7 +532,9 @@ static u32 htab_lru_map_gen_lookup(struct bpf_map *map,
>         const int ret = BPF_REG_0;
>         const int ref_reg = BPF_REG_1;
>
> -       *insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem);
> +       BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
> +                    (void *(*)(struct bpf_map *map, void *key))NULL));
> +       *insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
>         *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 4);
>         *insn++ = BPF_LDX_MEM(BPF_B, ref_reg, ret,
>                               offsetof(struct htab_elem, lru_node) +
> @@ -1369,7 +1373,9 @@ static u32 htab_of_map_gen_lookup(struct bpf_map *map,
>         struct bpf_insn *insn = insn_buf;
>         const int ret = BPF_REG_0;
>
> -       *insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem);
> +       BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
> +                    (void *(*)(struct bpf_map *map, void *key))NULL));
> +       *insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
>         *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2);
>         *insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
>                                 offsetof(struct htab_elem, key) +
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 4f4786e..5684b15 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -2421,8 +2421,11 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
>         struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx];
>
>         if (func_id != BPF_FUNC_tail_call &&
> -           func_id != BPF_FUNC_map_lookup_elem)
> +           func_id != BPF_FUNC_map_lookup_elem &&
> +           func_id != BPF_FUNC_map_update_elem &&
> +           func_id != BPF_FUNC_map_delete_elem)
>                 return 0;
> +
>         if (meta->map_ptr == NULL) {
>                 verbose(env, "kernel subsystem misconfigured verifier\n");
>                 return -EINVAL;
> @@ -5586,6 +5589,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
>         struct bpf_insn *insn = prog->insnsi;
>         const struct bpf_func_proto *fn;
>         const int insn_cnt = prog->len;
> +       const struct bpf_map_ops *ops;
>         struct bpf_insn_aux_data *aux;
>         struct bpf_insn insn_buf[16];
>         struct bpf_prog *new_prog;
> @@ -5715,10 +5719,13 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
>                 }
>
>                 /* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup
> -                * handlers are currently limited to 64 bit only.
> +                * and other inlining handlers are currently limited to 64 bit
> +                * only.
>                  */
>                 if (prog->jit_requested && BITS_PER_LONG == 64 &&
> -                   insn->imm == BPF_FUNC_map_lookup_elem) {
> +                   (insn->imm == BPF_FUNC_map_lookup_elem ||
> +                    insn->imm == BPF_FUNC_map_update_elem ||
> +                    insn->imm == BPF_FUNC_map_delete_elem)) {
>                         aux = &env->insn_aux_data[i + delta];
>                         if (bpf_map_ptr_poisoned(aux))
>                                 goto patch_call_imm;
> @@ -5727,23 +5734,49 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
>                         if (!map_ptr->ops->map_gen_lookup)
>                                 goto patch_call_imm;
>
> -                       cnt = map_ptr->ops->map_gen_lookup(map_ptr, insn_buf);
> -                       if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
> -                               verbose(env, "bpf verifier is misconfigured\n");
> -                               return -EINVAL;
> -                       }
> +                       ops = map_ptr->ops;
> +                       if (insn->imm == BPF_FUNC_map_lookup_elem &&
> +                           ops->map_gen_lookup) {
> +                               cnt = ops->map_gen_lookup(map_ptr, insn_buf);
> +                               if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
> +                                       verbose(env, "bpf verifier is misconfigured\n");
> +                                       return -EINVAL;
> +                               }
>
> -                       new_prog = bpf_patch_insn_data(env, i + delta, insn_buf,
> -                                                      cnt);
> -                       if (!new_prog)
> -                               return -ENOMEM;
> +                               new_prog = bpf_patch_insn_data(env, i + delta,
> +                                                              insn_buf, cnt);
> +                               if (!new_prog)
> +                                       return -ENOMEM;
>
> -                       delta += cnt - 1;
> +                               delta    += cnt - 1;
> +                               env->prog = prog = new_prog;
> +                               insn      = new_prog->insnsi + i + delta;
> +                               continue;
> +                       }
>
> -                       /* keep walking new program and skip insns we just inserted */
> -                       env->prog = prog = new_prog;
> -                       insn      = new_prog->insnsi + i + delta;
> -                       continue;
> +                       BUILD_BUG_ON(!__same_type(ops->map_lookup_elem,
> +                                    (void *(*)(struct bpf_map *map, void *key))NULL));
> +                       BUILD_BUG_ON(!__same_type(ops->map_delete_elem,
> +                                    (int (*)(struct bpf_map *map, void *key))NULL));
> +                       BUILD_BUG_ON(!__same_type(ops->map_update_elem,
> +                                    (int (*)(struct bpf_map *map, void *key, void *value,
> +                                             u64 flags))NULL));
> +                       switch (insn->imm) {
> +                       case BPF_FUNC_map_lookup_elem:
> +                               insn->imm = BPF_CAST_CALL(ops->map_lookup_elem) -
> +                                           __bpf_call_base;
> +                               continue;
> +                       case BPF_FUNC_map_update_elem:
> +                               insn->imm = BPF_CAST_CALL(ops->map_update_elem) -
> +                                           __bpf_call_base;
> +                               continue;
> +                       case BPF_FUNC_map_delete_elem:
> +                               insn->imm = BPF_CAST_CALL(ops->map_delete_elem) -
> +                                           __bpf_call_base;
> +                               continue;
> +                       }
> +
> +                       goto patch_call_imm;
>                 }
>
>                 if (insn->imm == BPF_FUNC_redirect_map) {
> --
> 2.9.5
>

^ permalink raw reply

* Re: [PATCH bpf-next 07/11] bpf: make sure to clear unused fields in tunnel/xfrm state fetch
From: Song Liu @ 2018-05-30 17:15 UTC (permalink / raw)
  To: Daniel Borkmann; +Cc: Alexei Starovoitov, Networking
In-Reply-To: <20180528004344.3606-8-daniel@iogearbox.net>

On Sun, May 27, 2018 at 5:43 PM, Daniel Borkmann <daniel@iogearbox.net> wrote:
> Since the remaining bits are not filled in struct bpf_tunnel_key
> resp. struct bpf_xfrm_state and originate from uninitialized stack
> space, we should make sure to clear them before handing control
> back to the program.
>
> Also add a padding element to struct bpf_xfrm_state for future use
> similar as we have in struct bpf_tunnel_key and clear it as well.
>
>   struct bpf_xfrm_state {
>       __u32                      reqid;            /*     0     4 */
>       __u32                      spi;              /*     4     4 */
>       __u16                      family;           /*     8     2 */
>
>       /* XXX 2 bytes hole, try to pack */
>
>       union {
>           __u32              remote_ipv4;          /*           4 */
>           __u32              remote_ipv6[4];       /*          16 */
>       };                                           /*    12    16 */
>
>       /* size: 28, cachelines: 1, members: 4 */
>       /* sum members: 26, holes: 1, sum holes: 2 */
>       /* last cacheline: 28 bytes */
>   };
>
> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
> Acked-by: Alexei Starovoitov <ast@kernel.org>

Acked-by: Song Liu <songliubraving@fb.com>

> ---
>  include/uapi/linux/bpf.h | 3 ++-
>  net/core/filter.c        | 6 ++++++
>  2 files changed, 8 insertions(+), 1 deletion(-)
>
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index e2853aa..7108711 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -2214,7 +2214,7 @@ struct bpf_tunnel_key {
>         };
>         __u8 tunnel_tos;
>         __u8 tunnel_ttl;
> -       __u16 tunnel_ext;
> +       __u16 tunnel_ext;       /* Padding, future use. */
>         __u32 tunnel_label;
>  };
>
> @@ -2225,6 +2225,7 @@ struct bpf_xfrm_state {
>         __u32 reqid;
>         __u32 spi;      /* Stored in network byte order */
>         __u16 family;
> +       __u16 ext;      /* Padding, future use. */
>         union {
>                 __u32 remote_ipv4;      /* Stored in network byte order */
>                 __u32 remote_ipv6[4];   /* Stored in network byte order */
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 717c740..5ceb5e6 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -3445,6 +3445,7 @@ BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, skb, struct bpf_tunnel_key
>         to->tunnel_id = be64_to_cpu(info->key.tun_id);
>         to->tunnel_tos = info->key.tos;
>         to->tunnel_ttl = info->key.ttl;
> +       to->tunnel_ext = 0;
>
>         if (flags & BPF_F_TUNINFO_IPV6) {
>                 memcpy(to->remote_ipv6, &info->key.u.ipv6.src,
> @@ -3452,6 +3453,8 @@ BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, skb, struct bpf_tunnel_key
>                 to->tunnel_label = be32_to_cpu(info->key.label);
>         } else {
>                 to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src);
> +               memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3);
> +               to->tunnel_label = 0;
>         }
>
>         if (unlikely(size != sizeof(struct bpf_tunnel_key)))
> @@ -4047,11 +4050,14 @@ BPF_CALL_5(bpf_skb_get_xfrm_state, struct sk_buff *, skb, u32, index,
>         to->reqid = x->props.reqid;
>         to->spi = x->id.spi;
>         to->family = x->props.family;
> +       to->ext = 0;
> +
>         if (to->family == AF_INET6) {
>                 memcpy(to->remote_ipv6, x->props.saddr.a6,
>                        sizeof(to->remote_ipv6));
>         } else {
>                 to->remote_ipv4 = x->props.saddr.a4;
> +               memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3);
>         }
>
>         return 0;
> --
> 2.9.5
>

^ permalink raw reply

* Re: [PATCH bpf-next 04/11] bpf: show prog and map id in fdinfo
From: Jesper Dangaard Brouer @ 2018-05-30 17:15 UTC (permalink / raw)
  To: Song Liu; +Cc: Daniel Borkmann, Networking, brouer
In-Reply-To: <CAPhsuW4BL_y74F3k2DXn3QQJiiEDtdj_p5dw=G29gADgy-p-9A@mail.gmail.com>

On Wed, 30 May 2018 09:15:25 -0700
Song Liu <liu.song.a23@gmail.com> wrote:

> On Tue, May 29, 2018 at 12:55 PM, Daniel Borkmann <daniel@iogearbox.net> wrote:
> > On 05/29/2018 07:27 PM, Jesper Dangaard Brouer wrote:  
> >> On Mon, 28 May 2018 02:43:37 +0200
> >> Daniel Borkmann <daniel@iogearbox.net> wrote:
> >>  
> >>> Its trivial and straight forward to expose it for scripts that can
> >>> then use it along with bpftool in order to inspect an individual
> >>> application's used maps and progs. Right now we dump some basic
> >>> information in the fdinfo file but with the help of the map/prog
> >>> id full introspection becomes possible now.
> >>>
> >>> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
> >>> Acked-by: Alexei Starovoitov <ast@kernel.org>  
> 
> Acked-by: Song Liu <songliubraving@fb.com>
> 
> >>
> >> AFAICR iproute uses this proc fdinfo, for pinned maps.  Have you tested
> >> if this change is handled gracefully by tc ?  
> >
> > Yep, it works just fine, I also tested it before submission.  

Sounds good.

Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* Re: [PATCH bpf-next 08/11] bpf: fix cbpf parser bug for octal numbers
From: Song Liu @ 2018-05-30 17:16 UTC (permalink / raw)
  To: Daniel Borkmann; +Cc: Alexei Starovoitov, Networking
In-Reply-To: <20180528004344.3606-9-daniel@iogearbox.net>

On Sun, May 27, 2018 at 5:43 PM, Daniel Borkmann <daniel@iogearbox.net> wrote:
> Range is 0-7, not 0-9, otherwise parser silently excludes it from the
> strtol() rather than throwing an error.
>
> Reported-by: Marc Boschma <marc@boschma.cx>
> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
> Acked-by: Alexei Starovoitov <ast@kernel.org>

Acked-by: Song Liu <songliubraving@fb.com>

> ---
>  tools/bpf/bpf_exp.l | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/tools/bpf/bpf_exp.l b/tools/bpf/bpf_exp.l
> index bd83149..4da8d05 100644
> --- a/tools/bpf/bpf_exp.l
> +++ b/tools/bpf/bpf_exp.l
> @@ -175,7 +175,7 @@ extern void yyerror(const char *str);
>                         yylval.number = strtol(yytext, NULL, 10);
>                         return number;
>                 }
> -([0][0-9]+)    {
> +([0][0-7]+)    {
>                         yylval.number = strtol(yytext + 1, NULL, 8);
>                         return number;
>                 }
> --
> 2.9.5
>

^ permalink raw reply

* Re: [PATCH][next] bpf: devmap: remove redundant assignment of dev = dev
From: Song Liu @ 2018-05-30 17:18 UTC (permalink / raw)
  To: Colin King
  Cc: Alexei Starovoitov, Daniel Borkmann, Networking, kernel-janitors,
	open list
In-Reply-To: <20180530150916.16808-1-colin.king@canonical.com>

On Wed, May 30, 2018 at 8:09 AM, Colin King <colin.king@canonical.com> wrote:
> From: Colin Ian King <colin.king@canonical.com>
>
> The assignment dev = dev is redundant and should be removed.
>
> Detected by CoverityScan, CID#1469486 ("Evaluation order violation")
>
> Signed-off-by: Colin Ian King <colin.king@canonical.com>

Acked-by: Song Liu <songliubraving@fb.com>

> ---
>  kernel/bpf/devmap.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
> index ae16d0c373ef..1fe3fe60508a 100644
> --- a/kernel/bpf/devmap.c
> +++ b/kernel/bpf/devmap.c
> @@ -352,7 +352,7 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
>  static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
>  {
>         struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key);
> -       struct net_device *dev = dev = obj ? obj->dev : NULL;
> +       struct net_device *dev = obj ? obj->dev : NULL;
>
>         return dev ? &dev->ifindex : NULL;
>  }
> --
> 2.17.0
>

^ permalink raw reply

* Re: [PATCH, net-next] net: ethernet: freescale: fix false-positive string overflow warning
From: David Miller @ 2018-05-30 17:18 UTC (permalink / raw)
  To: arnd
  Cc: fugang.duan, fabio.estevam, andrew, troy.kisky, f.fainelli,
	netdev, linux-kernel
In-Reply-To: <20180528154958.2684086-1-arnd@arndb.de>

From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 28 May 2018 17:49:46 +0200

> While compile-testing on arm64 with gcc-8.1, I ran into a build diagnostic:
 ...
> It appears this has never shown on ppc32 or arm32 for an unknown reason, but
> now gcc fails to identify that the 'irq_cnt' loop index has an upper bound
> of 3, and instead uses a bogus range.
> 
> To work around the warning, this changes the sprintf to snprintf with the
> correct buffer length.
> 
> Fixes: 78cc6e7ef957 ("net: ethernet: freescale: Allow FEC with COMPILE_TEST")
> Signed-off-by: Arnd Bergmann <arnd@arndb.de>

Applied, thanks Arnd.

^ permalink raw reply

* Re: [PATCH] net: davinci: fix building davinci mdio code without CONFIG_OF
From: David Miller @ 2018-05-30 17:22 UTC (permalink / raw)
  To: arnd
  Cc: grygorii.strashko, nsekhar, f.fainelli, linux-omap, netdev,
	linux-kernel
In-Reply-To: <20180528155059.2736080-1-arnd@arndb.de>

From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 28 May 2018 17:50:20 +0200

> Test-building this driver on targets without CONFIG_OF revealed a build
> failure:
> 
> drivers/net/ethernet/ti/davinci_mdio.c: In function 'davinci_mdio_probe':
> drivers/net/ethernet/ti/davinci_mdio.c:380:9: error: implicit declaration of function 'davinci_mdio_probe_dt'; did you mean 'davinci_mdio_probe'? [-Werror=implicit-function-declaration]
> 
> This adjusts the #ifdef logic in the driver to make it build in
> all configurations.
> 
> Fixes: 2652113ff043 ("net: ethernet: ti: Allow most drivers with COMPILE_TEST")
> Signed-off-by: Arnd Bergmann <arnd@arndb.de>

Applied to net-next.

^ permalink raw reply

* Re: [PATCH] drivers/net: Fix various unnecessary characters after logging newlines
From: David Miller @ 2018-05-30 17:24 UTC (permalink / raw)
  To: joe; +Cc: netdev, brcm80211-dev-list
In-Reply-To: <1d1c9f2ac9bd51bcf2505f5ed898dd48540e92b2.camel@perches.com>

From: Joe Perches <joe@perches.com>
Date: Mon, 28 May 2018 19:51:57 -0700

> Remove and coalesce formats when there is an unnecessary
> character after a logging newline.  These extra characters
> cause logging defects.
> 
> Miscellanea:
> 
> o Coalesce formats
> 
> Signed-off-by: Joe Perches <joe@perches.com>

Applied to net-next, thanks Joe.

^ permalink raw reply

* Re: [v2] MAINTAINERS: add myself as maintainer for QorIQ PTP clock driver
From: David Miller @ 2018-05-30 17:27 UTC (permalink / raw)
  To: yangbo.lu
  Cc: netdev, devicetree, linux-kernel, richardcochran, claudiu.manoil,
	robh+dt
In-Reply-To: <20180529034744.44590-1-yangbo.lu@nxp.com>

From: Yangbo Lu <yangbo.lu@nxp.com>
Date: Tue, 29 May 2018 11:47:44 +0800

> Added myself as maintainer for QorIQ PTP clock driver.
> Since gianfar_ptp.c was renamed to ptp_qoriq.c, let's
> maintain it under QorIQ PTP clock driver.
> 
> Signed-off-by: Yangbo Lu <yangbo.lu@nxp.com>
> ---
> Changes for v2:
> 	- Dropped dpaa2/rtc part.

Applied to net-next, thanks.

^ permalink raw reply

* Re: [PATCH mlx5-next 2/2] net/mlx5: Add FPGA QP error event
From: Saeed Mahameed @ 2018-05-30 17:28 UTC (permalink / raw)
  To: andrew@lunn.ch
  Cc: Jason Gunthorpe, netdev@vger.kernel.org, Ilan Tayari,
	linux-rdma@vger.kernel.org, Leon Romanovsky, Adi Nissim
In-Reply-To: <20180530162105.GF27537@lunn.ch>

On Wed, 2018-05-30 at 18:21 +0200, Andrew Lunn wrote:
> On Wed, May 30, 2018 at 03:14:20PM +0000, Saeed Mahameed wrote:
> > On Wed, 2018-05-30 at 03:07 +0200, Andrew Lunn wrote:
> > > On Tue, May 29, 2018 at 05:19:54PM -0700, Saeed Mahameed wrote:
> > > > From: Ilan Tayari <ilant@mellanox.com>
> > > > 
> > > > The FPGA QP event fires whenever a QP on the FPGA trasitions
> > > > to the error state.
> > > 
> > > FPGA i know, field programmable gate array. Could you offer some
> > > clue
> > > as to what QP means?
> > > 
> > 
> > Hi Andre, QP "Queue Pair" is well known rdma concept, and widely
> > used
> > in mlx drivers, it is used as in the driver as a ring buffer to
> > communicate with the FPGA device.
> 
> O.K. Thanks.
> 
> It is hard to get the right balance between assuming people know
> everything, and know nothing. But you can help teach people these
> terms i commit messages:
> 
>       The FPGA queue pair event fires whenever a QP on the FPGA
>       transitions to the error state.
> 

Sure will fix the commit message in V2,

Thanks a lot for the feedback.

>    Andrew

^ permalink raw reply

* Re: [PATCH net] vhost_net: flush batched heads before trying to busy polling
From: David Miller @ 2018-05-30 17:31 UTC (permalink / raw)
  To: jasowang; +Cc: mst, kvm, virtualization, netdev, linux-kernel
In-Reply-To: <1527574699-13047-1-git-send-email-jasowang@redhat.com>

From: Jason Wang <jasowang@redhat.com>
Date: Tue, 29 May 2018 14:18:19 +0800

> After commit e2b3b35eb989 ("vhost_net: batch used ring update in rx"),
> we tend to batch updating used heads. But it doesn't flush batched
> heads before trying to do busy polling, this will cause vhost to wait
> for guest TX which waits for the used RX. Fixing by flush batched
> heads before busy loop.
> 
> 1 byte TCP_RR performance recovers from 13107.83 to 50402.65.
> 
> Fixes: e2b3b35eb989 ("vhost_net: batch used ring update in rx")
> Signed-off-by: Jason Wang <jasowang@redhat.com>

Applied and queued up for -stable.

^ permalink raw reply

* Re: [PATCH mlx5-next 1/2] net/mlx5: Add temperature warning event to log
From: Saeed Mahameed @ 2018-05-30 17:31 UTC (permalink / raw)
  To: andrew@lunn.ch
  Cc: Jason Gunthorpe, netdev@vger.kernel.org, Ilan Tayari,
	linux-rdma@vger.kernel.org, Leon Romanovsky, Adi Nissim
In-Reply-To: <20180530161720.GE27537@lunn.ch>

On Wed, 2018-05-30 at 18:17 +0200, Andrew Lunn wrote:
> > Hi Andrew, yes the temperature is available by other means, this
> > patch
> > is needed for alert information reasons in order to know which
> > internal
> > sensors triggered the alarm.
> > We are working in parallel to expose temperature sensor to hwmon,
> > but
> > this is still WIP.
> > 
> > 
> > Is it ok to have both ?
> 
> Hi Saeed
> 
> Ideally no. hwmon has mechanisms for setting alarm thresholds, and
> indicating the thresholds have been exceeded. There are also ways to
> tie this to thermal zones, so the system can react on overheating,
> slow down the CPU, drop voltages, ramp up fans, etc. hwmon should be
> your primary interface, not dmesg.
> 

Yes we are working on this, but it is not something that can happen
soon since we need to define the correct Firmware APIs which are still
WIP.

> But if you are stuck doing things in the wrong order, i guess it is
> O.K. I don't think dmesg is a Binary API, so you can remove it later.
> 

Yes this is the plan, once the hwmon is supported we will remove the
extra dmesg warnings.

>      Andrew

^ permalink raw reply

* Re: [RFC PATCH ghak32 V2 00/13] audit: implement container id
From: Richard Guy Briggs @ 2018-05-30 17:33 UTC (permalink / raw)
  To: Steve Grubb
  Cc: simo, jlayton, carlos, linux-api, containers, LKML, eparis,
	dhowells, linux-audit, ebiederm, luto, netdev, linux-fsdevel,
	cgroups, serge, viro
In-Reply-To: <23151436.iBN3rkXKiY@x2>

On 2018-05-30 09:20, Steve Grubb wrote:
> On Friday, March 16, 2018 5:00:27 AM EDT Richard Guy Briggs wrote:
> > Implement audit kernel container ID.
> > 
> > This patchset is a second RFC based on the proposal document (V3)
> > posted:
> > 	https://www.redhat.com/archives/linux-audit/2018-January/msg00014.html
> 
> So, if you work on a container orchestrator, how exactly is this set of 
> interfaces to be used and in what order?

It was designed keeping in mind the Virtuallization Manager Guest
Lifecycle Events document.
	https://github.com/linux-audit/audit-documentation/wiki/SPEC-Virtualization-Manager-Guest-Lifecycle-Events

The orchestrator would start setting things up and when it knows the PID
of the conainer task but before that task has had a chance to thread or
spawn children it registers the audit container ID via the /proc
interface.  After that, it consults audit for any events maching that
ID.

> Thanks,
> -Steve
> 
> > The first patch implements the proc fs write to set the audit container
> > ID of a process, emitting an AUDIT_CONTAINER record to announce the
> > registration of that container ID on that process.  This patch requires
> > userspace support for record acceptance and proper type display.
> > 
> > The second checks for children or co-threads and refuses to set the
> > container ID if either are present.  (This policy could be changed to
> > set both with the same container ID provided they meet the rest of the
> > requirements.)
> > 
> > The third implements the auxiliary record AUDIT_CONTAINER_INFO if a
> > container ID is identifiable with an event.  This patch requires
> > userspace support for proper type display.
> > 
> > The fourth adds container ID filtering to the exit, exclude and user
> > lists.  This patch requires auditctil userspace support for the
> > --containerid option.
> > 
> > The 5th adds signal and ptrace support.
> > 
> > The 6th creates a local audit context to be able to bind a standalone
> > record with a locally created auxiliary record.
> > 
> > The 7th, 8th, 9th, 10th patches add container ID records to standalone
> > records.  Some of these may end up being syscall auxiliary records and
> > won't need this specific support since they'll be supported via
> > syscalls.
> > 
> > The 11th adds network namespace container ID labelling based on member
> > tasks' container ID labels.
> > 
> > The 12th adds container ID support to standalone netfilter records that
> > don't have a task context and lists each container to which that net
> > namespace belongs.
> > 
> > The 13th implements reading the container ID from the proc filesystem
> > for debugging.  This patch isn't planned for upstream inclusion.
> > 
> > Feedback please!
> > 
> > Example: Set a container ID of 123456 to the "sleep" task:
> > 	sleep 2&
> > 	child=$!
> > 	echo 123456 > /proc/$child/containerid; echo $?
> > 	ausearch -ts recent -m container
> > 	echo child:$child contid:$( cat /proc/$child/containerid)
> > This should produce a record such as:
> > 	type=CONTAINER msg=audit(1521122590.315:222): op=set pid=689 uid=0
> > subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 auid=0 tty=pts0
> > ses=3 opid=707 old-contid=18446744073709551615 contid=123456 res=1
> > 
> > Example: Set a filter on a container ID 123459 on /tmp/tmpcontainerid:
> > 	containerid=123459
> > 	key=tmpcontainerid
> > 	auditctl -a exit,always -F dir=/tmp -F perm=wa -F containerid=$containerid
> > -F key=$key perl -e "sleep 1; open(my \$tmpfile, '>', \"/tmp/$key\");
> > close(\$tmpfile);" & child=$!
> > 	echo $containerid > /proc/$child/containerid
> > 	sleep 2
> > 	ausearch -i -ts recent -k $key
> > 	auditctl -d exit,always -F dir=/tmp -F perm=wa -F containerid=$containerid
> > -F key=$key rm -f /tmp/$key
> > This should produce an event such as:
> > 	type=CONTAINER_INFO msg=audit(1521122591.614:227): op=task contid=123459
> > 	type=PROCTITLE msg=audit(1521122591.614:227):
> > proctitle=7065726C002D6500736C65657020313B206F70656E286D792024746D7066696C
> > 652C20273E272C20222F746D702F746D70636F6E7461696E6572696422293B20636C6F73652
> > 824746D7066696C65293B type=PATH msg=audit(1521122591.614:227): item=1
> > name="/tmp/tmpcontainerid" inode=18427 dev=00:26 mode=0100644 ouid=0
> > ogid=0 rdev=00:00 obj=unconfined_u:object_r:user_tmp_t:s0 nametype=CREATE
> > cap_fp=0000000000000000 cap_fi=0000000000000000 cap_fe=0 cap_fver=0
> > type=PATH msg=audit(1521122591.614:227): item=0 name="/tmp/" inode=13513
> > dev=00:26 mode=041777 ouid=0 ogid=0 rdev=00:00
> > obj=system_u:object_r:tmp_t:s0 nametype=PARENT cap_fp=0000000000000000
> > cap_fi=0000000000000000 cap_fe=0 cap_fver=0 type=CWD
> > msg=audit(1521122591.614:227): cwd="/root"
> > 	type=SYSCALL msg=audit(1521122591.614:227): arch=c000003e syscall=257
> > success=yes exit=3 a0=ffffffffffffff9c a1=55db90a28900 a2=241 a3=1b6
> > items=2 ppid=689 pid=724 auid=0 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0
> > sgid=0 fsgid=0 tty=pts0 ses=3 comm="perl" exe="/usr/bin/perl"
> > subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023
> > key="tmpcontainerid"
> > 
> > See:
> > 	https://github.com/linux-audit/audit-kernel/issues/32
> > 	https://github.com/linux-audit/audit-userspace/issues/40
> > 	https://github.com/linux-audit/audit-testsuite/issues/64
> > 
> > Richard Guy Briggs (13):
> >   audit: add container id
> >   audit: check children and threading before allowing containerid
> >   audit: log container info of syscalls
> >   audit: add containerid filtering
> >   audit: add containerid support for ptrace and signals
> >   audit: add support for non-syscall auxiliary records
> >   audit: add container aux record to watch/tree/mark
> >   audit: add containerid support for tty_audit
> >   audit: add containerid support for config/feature/user records
> >   audit: add containerid support for seccomp and anom_abend records
> >   audit: add support for containerid to network namespaces
> >   audit: NETFILTER_PKT: record each container ID associated with a netNS
> >   debug audit: read container ID of a process
> > 
> >  drivers/tty/tty_audit.c     |   5 +-
> >  fs/proc/base.c              |  53 ++++++++++++++++
> >  include/linux/audit.h       |  43 +++++++++++++
> >  include/linux/init_task.h   |   4 +-
> >  include/linux/sched.h       |   1 +
> >  include/net/net_namespace.h |  12 ++++
> >  include/uapi/linux/audit.h  |   8 ++-
> >  kernel/audit.c              |  75 ++++++++++++++++++++---
> >  kernel/audit.h              |   3 +
> >  kernel/audit_fsnotify.c     |   5 +-
> >  kernel/audit_tree.c         |   5 +-
> >  kernel/audit_watch.c        |  33 +++++-----
> >  kernel/auditfilter.c        |  52 +++++++++++++++-
> >  kernel/auditsc.c            | 145
> > ++++++++++++++++++++++++++++++++++++++++++-- kernel/nsproxy.c            |
> >   6 ++
> >  net/core/net_namespace.c    |  45 ++++++++++++++
> >  net/netfilter/xt_AUDIT.c    |  15 ++++-
> >  17 files changed, 473 insertions(+), 37 deletions(-)
> 
> 
> 
> 

- RGB

--
Richard Guy Briggs <rgb@redhat.com>
Sr. S/W Engineer, Kernel Security, Base Operating Systems
Remote, Ottawa, Red Hat Canada
IRC: rgb, SunRaycer
Voice: +1.647.777.2635, Internal: (81) 32635

^ permalink raw reply

* Re: [PATCH V4] mlx4_core: allocate ICM memory in page size chunks
From: Qing Huang @ 2018-05-30 17:39 UTC (permalink / raw)
  To: Eric Dumazet, David Miller
  Cc: tariqt, haakon.bugge, yanjun.zhu, netdev, linux-rdma,
	linux-kernel, gi-oh.kim
In-Reply-To: <cdf0d1a9-378a-8b09-544e-a4918acb57fe@gmail.com>



On 5/29/2018 8:49 PM, Eric Dumazet wrote:
>
> On 05/29/2018 11:44 PM, Eric Dumazet wrote:
>
>> And I will add this simple fix, this really should address your initial concern much better.
>>
>> @@ -99,6 +100,8 @@ static int mlx4_alloc_icm_pages(struct scatterlist *mem, int order,
>>   {
>>          struct page *page;
>>   
>> +       if (order)
>> +               gfp_mask |= __GFP_NORETRY;
>      and also      gfp_mask &= ~__GFP_DIRECT_RECLAIM
>

Would this just fail the allocation without trying to reclaim memory 
under memory pressure? We've tried something similar but it didn't fix 
the original problem we were facing.


>>          page = alloc_pages_node(node, gfp_mask, order);
>>          if (!page) {
>>                  page = alloc_pages(gfp_mask, order);
>>

^ permalink raw reply

* Re: [PATCH] b53: Add brcm5389 support
From: Florian Fainelli @ 2018-05-30 17:46 UTC (permalink / raw)
  To: Damien Thébault, vivien.didelot@savoirfairelinux.com,
	andrew@lunn.ch
  Cc: davem@davemloft.net, netdev@vger.kernel.org
In-Reply-To: <279329c705e4fcdee889c3a8a9ecb1becd09ab88.camel@vitec.com>



On 05/30/2018 08:33 AM, Damien Thébault wrote:
> This patch adds support for the BCM5389 switch connected through MDIO.

This looks good, please do address the following couple of things:
- subject should be: net: dsa: b53: Add BCM5389 support
- you also need to update
Documentation/devicetree/bindings/net/dsa/b53.txt with the compatible string

Thank you!

> 
> Signed-off-by: Damien Thébault <damien.thebault@vitec.com>
> ---
>  drivers/net/dsa/b53/b53_common.c | 13 +++++++++++++
>  drivers/net/dsa/b53/b53_mdio.c   |  5 ++++-
>  drivers/net/dsa/b53/b53_priv.h   |  1 +
>  3 files changed, 18 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
> index 78616787f2a3..3da5fca77cbd 100644
> --- a/drivers/net/dsa/b53/b53_common.c
> +++ b/drivers/net/dsa/b53/b53_common.c
> @@ -1711,6 +1711,18 @@ static const struct b53_chip_data b53_switch_chips[] = {
>  		.cpu_port = B53_CPU_PORT_25,
>  		.duplex_reg = B53_DUPLEX_STAT_FE,
>  	},
> +	{
> +		.chip_id = BCM5389_DEVICE_ID,
> +		.dev_name = "BCM5389",
> +		.vlans = 4096,
> +		.enabled_ports = 0x1f,
> +		.arl_entries = 4,
> +		.cpu_port = B53_CPU_PORT,
> +		.vta_regs = B53_VTA_REGS,
> +		.duplex_reg = B53_DUPLEX_STAT_GE,
> +		.jumbo_pm_reg = B53_JUMBO_PORT_MASK,
> +		.jumbo_size_reg = B53_JUMBO_MAX_SIZE,
> +	},
>  	{
>  		.chip_id = BCM5395_DEVICE_ID,
>  		.dev_name = "BCM5395",
> @@ -2034,6 +2046,7 @@ int b53_switch_detect(struct b53_device *dev)
>  		else
>  			dev->chip_id = BCM5365_DEVICE_ID;
>  		break;
> +	case BCM5389_DEVICE_ID:
>  	case BCM5395_DEVICE_ID:
>  	case BCM5397_DEVICE_ID:
>  	case BCM5398_DEVICE_ID:
> diff --git a/drivers/net/dsa/b53/b53_mdio.c b/drivers/net/dsa/b53/b53_mdio.c
> index fa7556f5d4fb..a533a90e3904 100644
> --- a/drivers/net/dsa/b53/b53_mdio.c
> +++ b/drivers/net/dsa/b53/b53_mdio.c
> @@ -285,6 +285,7 @@ static const struct b53_io_ops b53_mdio_ops = {
>  #define B53_BRCM_OUI_1	0x0143bc00
>  #define B53_BRCM_OUI_2	0x03625c00
>  #define B53_BRCM_OUI_3	0x00406000
> +#define B53_BRCM_OUI_4	0x01410c00
>  
>  static int b53_mdio_probe(struct mdio_device *mdiodev)
>  {
> @@ -311,7 +312,8 @@ static int b53_mdio_probe(struct mdio_device *mdiodev)
>  	 */
>  	if ((phy_id & 0xfffffc00) != B53_BRCM_OUI_1 &&
>  	    (phy_id & 0xfffffc00) != B53_BRCM_OUI_2 &&
> -	    (phy_id & 0xfffffc00) != B53_BRCM_OUI_3) {
> +	    (phy_id & 0xfffffc00) != B53_BRCM_OUI_3 &&
> +	    (phy_id & 0xfffffc00) != B53_BRCM_OUI_4) {
>  		dev_err(&mdiodev->dev, "Unsupported device: 0x%08x\n", phy_id);
>  		return -ENODEV;
>  	}
> @@ -360,6 +362,7 @@ static const struct of_device_id b53_of_match[] = {
>  	{ .compatible = "brcm,bcm53125" },
>  	{ .compatible = "brcm,bcm53128" },
>  	{ .compatible = "brcm,bcm5365" },
> +	{ .compatible = "brcm,bcm5389" },
>  	{ .compatible = "brcm,bcm5395" },
>  	{ .compatible = "brcm,bcm5397" },
>  	{ .compatible = "brcm,bcm5398" },
> diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h
> index 1187ebd79287..3b57f47d0e79 100644
> --- a/drivers/net/dsa/b53/b53_priv.h
> +++ b/drivers/net/dsa/b53/b53_priv.h
> @@ -48,6 +48,7 @@ struct b53_io_ops {
>  enum {
>  	BCM5325_DEVICE_ID = 0x25,
>  	BCM5365_DEVICE_ID = 0x65,
> +	BCM5389_DEVICE_ID = 0x89,
>  	BCM5395_DEVICE_ID = 0x95,
>  	BCM5397_DEVICE_ID = 0x97,
>  	BCM5398_DEVICE_ID = 0x98,
> 

-- 
Florian

^ permalink raw reply

* Re: [PATCH V4] mlx4_core: allocate ICM memory in page size chunks
From: Qing Huang @ 2018-05-30 17:53 UTC (permalink / raw)
  To: Eric Dumazet, David Miller
  Cc: tariqt, haakon.bugge, yanjun.zhu, netdev, linux-rdma,
	linux-kernel, gi-oh.kim
In-Reply-To: <7a353b65-6b7f-1aee-1c48-e83c8e02f693@gmail.com>



On 5/29/2018 8:34 PM, Eric Dumazet wrote:
>
> On 05/25/2018 10:23 AM, David Miller wrote:
>> From: Qing Huang <qing.huang@oracle.com>
>> Date: Wed, 23 May 2018 16:22:46 -0700
>>
>>> When a system is under memory presure (high usage with fragments),
>>> the original 256KB ICM chunk allocations will likely trigger kernel
>>> memory management to enter slow path doing memory compact/migration
>>> ops in order to complete high order memory allocations.
>>>
>>> When that happens, user processes calling uverb APIs may get stuck
>>> for more than 120s easily even though there are a lot of free pages
>>> in smaller chunks available in the system.
>>>
>>> Syslog:
>>> ...
>>> Dec 10 09:04:51 slcc03db02 kernel: [397078.572732] INFO: task
>>> oracle_205573_e:205573 blocked for more than 120 seconds.
>>> ...
>>>
>>> With 4KB ICM chunk size on x86_64 arch, the above issue is fixed.
>>>
>>> However in order to support smaller ICM chunk size, we need to fix
>>> another issue in large size kcalloc allocations.
>>>
>>> E.g.
>>> Setting log_num_mtt=30 requires 1G mtt entries. With the 4KB ICM chunk
>>> size, each ICM chunk can only hold 512 mtt entries (8 bytes for each mtt
>>> entry). So we need a 16MB allocation for a table->icm pointer array to
>>> hold 2M pointers which can easily cause kcalloc to fail.
>>>
>>> The solution is to use kvzalloc to replace kcalloc which will fall back
>>> to vmalloc automatically if kmalloc fails.
>>>
>>> Signed-off-by: Qing Huang <qing.huang@oracle.com>
>>> Acked-by: Daniel Jurgens <danielj@mellanox.com>
>>> Reviewed-by: Zhu Yanjun <yanjun.zhu@oracle.com>
>> Applied, thanks.
>>
> I must say this patch causes regressions here.
>
> KASAN is not happy.
>
> It looks that you guys did not really looked at mlx4_alloc_icm()
>
> This function is properly handling high order allocations with fallbacks to order-0 pages
> under high memory pressure.
>
> BUG: KASAN: slab-out-of-bounds in to_rdma_ah_attr+0x808/0x9e0 [mlx4_ib]
> Read of size 4 at addr ffff8817df584f68 by task qp_listing_test/92585
>
> CPU: 38 PID: 92585 Comm: qp_listing_test Tainted: G           O
> Call Trace:
>   [<ffffffffba80d7bb>] dump_stack+0x4d/0x72
>   [<ffffffffb951dc5f>] print_address_description+0x6f/0x260
>   [<ffffffffb951e1c7>] kasan_report+0x257/0x370
>   [<ffffffffb951e339>] __asan_report_load4_noabort+0x19/0x20
>   [<ffffffffc0256d28>] to_rdma_ah_attr+0x808/0x9e0 [mlx4_ib]
>   [<ffffffffc02785b3>] mlx4_ib_query_qp+0x1213/0x1660 [mlx4_ib]
>   [<ffffffffc02dbfdb>] qpstat_print_qp+0x13b/0x500 [ib_uverbs]
>   [<ffffffffc02dc3ea>] qpstat_seq_show+0x4a/0xb0 [ib_uverbs]
>   [<ffffffffb95f125c>] seq_read+0xa9c/0x1230
>   [<ffffffffb96e0821>] proc_reg_read+0xc1/0x180
>   [<ffffffffb9577918>] __vfs_read+0xe8/0x730
>   [<ffffffffb9578057>] vfs_read+0xf7/0x300
>   [<ffffffffb95794d2>] SyS_read+0xd2/0x1b0
>   [<ffffffffb8e06b16>] do_syscall_64+0x186/0x420
>   [<ffffffffbaa00071>] entry_SYSCALL_64_after_hwframe+0x3d/0xa2
> RIP: 0033:0x7f851a7bb30d
> RSP: 002b:00007ffd09a758c0 EFLAGS: 00000293 ORIG_RAX: 0000000000000000
> RAX: ffffffffffffffda RBX: 00007f84ff959440 RCX: 00007f851a7bb30d
> RDX: 000000000003fc00 RSI: 00007f84ff60a000 RDI: 000000000000000b
> RBP: 00007ffd09a75900 R08: 00000000ffffffff R09: 0000000000000000
> R10: 0000000000000022 R11: 0000000000000293 R12: 0000000000000000
> R13: 000000000003ffff R14: 000000000003ffff R15: 00007f84ff60a000
>
> Allocated by task 4488:
>   save_stack+0x46/0xd0
>   kasan_kmalloc+0xad/0xe0
>   __kmalloc+0x101/0x5e0
>   ib_register_device+0xc03/0x1250 [ib_core]
>   mlx4_ib_add+0x27d6/0x4dd0 [mlx4_ib]
>   mlx4_add_device+0xa9/0x340 [mlx4_core]
>   mlx4_register_interface+0x16e/0x390 [mlx4_core]
>   xhci_pci_remove+0x7a/0x180 [xhci_pci]
>   do_one_initcall+0xa0/0x230
>   do_init_module+0x1b9/0x5a4
>   load_module+0x63e6/0x94c0
>   SYSC_init_module+0x1a4/0x1c0
>   SyS_init_module+0xe/0x10
>   do_syscall_64+0x186/0x420
>   entry_SYSCALL_64_after_hwframe+0x3d/0xa2
>
> Freed by task 0:
> (stack is not available)
>
> The buggy address belongs to the object at ffff8817df584f40
>   which belongs to the cache kmalloc-32 of size 32
> The buggy address is located 8 bytes to the right of
>   32-byte region [ffff8817df584f40, ffff8817df584f60)
> The buggy address belongs to the page:
> page:ffffea005f7d6100 count:1 mapcount:0 mapping:ffff8817df584000 index:0xffff8817df584fc1
> flags: 0x880000000000100(slab)
> raw: 0880000000000100 ffff8817df584000 ffff8817df584fc1 000000010000003f
> raw: ffffea005f3ac0a0 ffffea005c476760 ffff8817fec00900 ffff883ff78d26c0
> page dumped because: kasan: bad access detected
> page->mem_cgroup:ffff883ff78d26c0

What kind of test case did you run? It looks like a bug somewhere in the 
code.
Perhaps smaller chunks make it easier to occur, we should fix the bug 
though.


>
> Memory state around the buggy address:
>   ffff8817df584e00: 00 03 fc fc fc fc fc fc 00 03 fc fc fc fc fc fc
>   ffff8817df584e80: 00 00 00 04 fc fc fc fc 00 00 00 fc fc fc fc fc
>> ffff8817df584f00: fb fb fb fb fc fc fc fc 00 00 00 00 fc fc fc fc
>                                                            ^
>   ffff8817df584f80: fb fb fb fb fc fc fc fc fc fc fc fc fc fc fc fc
>   ffff8817df585000: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>
> I will test :
>
> diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.c b/drivers/net/ethernet/mellanox/mlx4/icm.c
> index 685337d58276fc91baeeb64387c52985e1bc6dda..4d2a71381acb739585d662175e86caef72338097 100644
> --- a/drivers/net/ethernet/mellanox/mlx4/icm.c
> +++ b/drivers/net/ethernet/mellanox/mlx4/icm.c
> @@ -43,12 +43,13 @@
>   #include "fw.h"
>   
>   /*
> - * We allocate in page size (default 4KB on many archs) chunks to avoid high
> - * order memory allocations in fragmented/high usage memory situation.
> + * We allocate in as big chunks as we can, up to a maximum of 256 KB
> + * per chunk. Note that the chunks are not necessarily in contiguous
> + * physical memory.
>    */
>   enum {
> -       MLX4_ICM_ALLOC_SIZE     = PAGE_SIZE,
> -       MLX4_TABLE_CHUNK_SIZE   = PAGE_SIZE,
> +       MLX4_ICM_ALLOC_SIZE     = 1 << 18,
> +       MLX4_TABLE_CHUNK_SIZE   = 1 << 18
>   };
>   
>   static void mlx4_free_icm_pages(struct mlx4_dev *dev, struct mlx4_icm_chunk *chunk)
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [net] ixgbe: fix parsing of TC actions for HW offload
From: Jeff Kirsher @ 2018-05-30 18:01 UTC (permalink / raw)
  To: davem
  Cc: Ondřej Hlavatý, netdev, nhorman, sassmann, jogreene,
	Jamal Hadi Salim, Jiri Pirko, Jeff Kirsher

From: Ondřej Hlavatý <ohlavaty@redhat.com>

The previous code was optimistic, accepting the offload of whole action
chain when there was a single known action (drop/redirect). This results
in offloading a rule which should not be offloaded, because its behavior
cannot be reproduced in the hardware.

For example:

$ tc filter add dev eno1 parent ffff: protocol ip \
    u32 ht 800: order 1 match tcp src 42 FFFF \
    action mirred egress mirror dev enp1s16 pipe \
    drop

The controller is unable to mirror the packet to a VF, but still
offloads the rule by dropping the packet.

Change the approach of the function to a pessimistic one, rejecting the
chain when an unknown action is found. This is better suited for future
extensions.

Note that both recognized actions always return TC_ACT_SHOT, therefore
it is safe to ignore actions behind them.

Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: Ondřej Hlavatý <ohlavaty@redhat.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index afadba99f7b8..d01e1f0280cf 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -9054,7 +9054,6 @@ static int parse_tc_actions(struct ixgbe_adapter *adapter,
 {
 	const struct tc_action *a;
 	LIST_HEAD(actions);
-	int err;
 
 	if (!tcf_exts_has_actions(exts))
 		return -EINVAL;
@@ -9075,14 +9074,14 @@ static int parse_tc_actions(struct ixgbe_adapter *adapter,
 
 			if (!dev)
 				return -EINVAL;
-			err = handle_redirect_action(adapter, dev->ifindex, queue,
-						     action);
-			if (err == 0)
-				return err;
+			return handle_redirect_action(adapter, dev->ifindex,
+						      queue, action);
 		}
+
+		return -EINVAL;
 	}
 
-	return -EINVAL;
+	return 0;
 }
 #else
 static int parse_tc_actions(struct ixgbe_adapter *adapter,
-- 
2.17.0

^ permalink raw reply related

* [bpf-next V1 PATCH 0/8] bpf/xdp: add flags argument to ndo_xdp_xmit and flag flush operation
From: Jesper Dangaard Brouer @ 2018-05-30 18:00 UTC (permalink / raw)
  To: netdev, Daniel Borkmann, Alexei Starovoitov,
	Jesper Dangaard Brouer
  Cc: John Fastabend, makita.toshiaki

As I mentioned in merge commit 10f678683e4 ("Merge branch 'xdp_xmit-bulking'")
I plan to change the API for ndo_xdp_xmit once more, by adding a flags
argument, which is done in this patchset.

I know it is late in the cycle (currently at rc7), but it would be
nice to avoid changing NDOs over several kernel releases, as it is
annoying to vendors and distro backporters, but it is not strictly
UAPI so it is allowed (according to Alexei).

The end-goal is getting rid of the ndo_xdp_flush operation, as it will
make it possible for drivers to implement a TXQ synchronization mechanism
that is not necessarily derived from the CPU id (smp_processor_id).

This patchset removes all callers of the ndo_xdp_flush operation, but
it doesn't take the last step of removing it from all drivers.  This
can be done later, or I can update the patchset on request.

Micro-benchmarks only show a very small performance improvement, for
map-redirect around ~2 ns, and for non-map redirect ~7 ns.  I've not
benchmarked this with CONFIG_RETPOLINE, but the performance benefit
should be more visible given we end-up removing an indirect call.

---

Jesper Dangaard Brouer (8):
      xdp: add flags argument to ndo_xdp_xmit API
      i40e: implement flush flag for ndo_xdp_xmit
      ixgbe: implement flush flag for ndo_xdp_xmit
      tun: implement flush flag for ndo_xdp_xmit
      virtio_net: implement flush flag for ndo_xdp_xmit
      xdp: done implementing ndo_xdp_xmit flush flag for all drivers
      bpf/xdp: non-map redirect can avoid calling ndo_xdp_flush
      bpf/xdp: devmap can avoid calling ndo_xdp_flush


 drivers/net/ethernet/intel/i40e/i40e_txrx.c   |    9 ++++++++-
 drivers/net/ethernet/intel/i40e/i40e_txrx.h   |    3 ++-
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |   23 +++++++++++++++++------
 drivers/net/tun.c                             |   25 ++++++++++++++++++-------
 drivers/net/virtio_net.c                      |    9 ++++++++-
 include/linux/netdevice.h                     |    7 ++++---
 include/net/xdp.h                             |    4 ++++
 kernel/bpf/devmap.c                           |   20 +++++++-------------
 net/core/filter.c                             |    3 +--
 9 files changed, 69 insertions(+), 34 deletions(-)

^ permalink raw reply

* [bpf-next V1 PATCH 1/8] xdp: add flags argument to ndo_xdp_xmit API
From: Jesper Dangaard Brouer @ 2018-05-30 18:00 UTC (permalink / raw)
  To: netdev, Daniel Borkmann, Alexei Starovoitov,
	Jesper Dangaard Brouer
  Cc: John Fastabend, makita.toshiaki
In-Reply-To: <152770312703.20510.5854417568847239931.stgit@firesoul>

This patch only change the API and reject any use of flags. This is an
intermediate step that allows us to implement the flush flag operation
later, for each individual driver in a separate patch.

The plan is to implement flush operation via XDP_XMIT_FLUSH flag
and then remove XDP_XMIT_FLAGS_NONE when done.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 drivers/net/ethernet/intel/i40e/i40e_txrx.c   |    6 +++++-
 drivers/net/ethernet/intel/i40e/i40e_txrx.h   |    3 ++-
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |    5 ++++-
 drivers/net/tun.c                             |    8 ++++++--
 drivers/net/virtio_net.c                      |    5 ++++-
 include/linux/netdevice.h                     |    7 ++++---
 include/net/xdp.h                             |    5 +++++
 kernel/bpf/devmap.c                           |    2 +-
 net/core/filter.c                             |    2 +-
 9 files changed, 32 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 9b698c5acd05..c0451d6e0790 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -3670,7 +3670,8 @@ netdev_tx_t i40e_lan_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
  * For error cases, a negative errno code is returned and no-frames
  * are transmitted (caller must handle freeing frames).
  **/
-int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames)
+int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
+		  u32 flags)
 {
 	struct i40e_netdev_priv *np = netdev_priv(dev);
 	unsigned int queue_index = smp_processor_id();
@@ -3684,6 +3685,9 @@ int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames)
 	if (!i40e_enabled_xdp_vsi(vsi) || queue_index >= vsi->num_queue_pairs)
 		return -ENXIO;
 
+	if (unlikely(flags & ~XDP_XMIT_FLAGS_NONE))
+		return -EINVAL;
+
 	for (i = 0; i < n; i++) {
 		struct xdp_frame *xdpf = frames[i];
 		int err;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
index eb8804b3d7b6..820f76db251b 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
@@ -487,7 +487,8 @@ u32 i40e_get_tx_pending(struct i40e_ring *ring, bool in_sw);
 void i40e_detect_recover_hung(struct i40e_vsi *vsi);
 int __i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size);
 bool __i40e_chk_linearize(struct sk_buff *skb);
-int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames);
+int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
+		  u32 flags);
 void i40e_xdp_flush(struct net_device *dev);
 
 /**
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 031d65c4178d..87f088f4af52 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -10023,7 +10023,7 @@ static int ixgbe_xdp(struct net_device *dev, struct netdev_bpf *xdp)
 }
 
 static int ixgbe_xdp_xmit(struct net_device *dev, int n,
-			  struct xdp_frame **frames)
+			  struct xdp_frame **frames, u32 flags)
 {
 	struct ixgbe_adapter *adapter = netdev_priv(dev);
 	struct ixgbe_ring *ring;
@@ -10033,6 +10033,9 @@ static int ixgbe_xdp_xmit(struct net_device *dev, int n,
 	if (unlikely(test_bit(__IXGBE_DOWN, &adapter->state)))
 		return -ENETDOWN;
 
+	if (unlikely(flags & ~XDP_XMIT_FLAGS_NONE))
+		return -EINVAL;
+
 	/* During program transitions its possible adapter->xdp_prog is assigned
 	 * but ring has not been configured yet. In this case simply abort xmit.
 	 */
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 2265d2ccea47..b182b8cdd219 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1285,7 +1285,8 @@ static const struct net_device_ops tun_netdev_ops = {
 	.ndo_get_stats64	= tun_net_get_stats64,
 };
 
-static int tun_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames)
+static int tun_xdp_xmit(struct net_device *dev, int n,
+			struct xdp_frame **frames, u32 flags)
 {
 	struct tun_struct *tun = netdev_priv(dev);
 	struct tun_file *tfile;
@@ -1294,6 +1295,9 @@ static int tun_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames
 	int cnt = n;
 	int i;
 
+	if (unlikely(flags & ~XDP_XMIT_FLAGS_NONE))
+		return -EINVAL;
+
 	rcu_read_lock();
 
 	numqueues = READ_ONCE(tun->numqueues);
@@ -1332,7 +1336,7 @@ static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
 	if (unlikely(!frame))
 		return -EOVERFLOW;
 
-	return tun_xdp_xmit(dev, 1, &frame);
+	return tun_xdp_xmit(dev, 1, &frame, 0);
 }
 
 static void tun_xdp_flush(struct net_device *dev)
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index b2647dd5d302..4ed823625953 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -468,7 +468,7 @@ static int __virtnet_xdp_tx_xmit(struct virtnet_info *vi,
 }
 
 static int virtnet_xdp_xmit(struct net_device *dev,
-			    int n, struct xdp_frame **frames)
+			    int n, struct xdp_frame **frames, u32 flags)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
 	struct receive_queue *rq = vi->rq;
@@ -481,6 +481,9 @@ static int virtnet_xdp_xmit(struct net_device *dev,
 	int err;
 	int i;
 
+	if (unlikely(flags & ~XDP_XMIT_FLAGS_NONE))
+		return -EINVAL;
+
 	qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
 	sq = &vi->sq[qp];
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 8452f72087ef..7f17785a59d7 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1185,13 +1185,13 @@ struct dev_ifalias {
  *	This function is used to set or query state related to XDP on the
  *	netdevice and manage BPF offload. See definition of
  *	enum bpf_netdev_command for details.
- * int (*ndo_xdp_xmit)(struct net_device *dev, int n, struct xdp_frame **xdp);
+ * int (*ndo_xdp_xmit)(struct net_device *dev, int n, struct xdp_frame **xdp,
+ *			u32 flags);
  *	This function is used to submit @n XDP packets for transmit on a
  *	netdevice. Returns number of frames successfully transmitted, frames
  *	that got dropped are freed/returned via xdp_return_frame().
  *	Returns negative number, means general error invoking ndo, meaning
  *	no frames were xmit'ed and core-caller will free all frames.
- *	TODO: Consider add flag to allow sending flush operation.
  * void (*ndo_xdp_flush)(struct net_device *dev);
  *	This function is used to inform the driver to flush a particular
  *	xdp tx queue. Must be called on same CPU as xdp_xmit.
@@ -1380,7 +1380,8 @@ struct net_device_ops {
 	int			(*ndo_bpf)(struct net_device *dev,
 					   struct netdev_bpf *bpf);
 	int			(*ndo_xdp_xmit)(struct net_device *dev, int n,
-						struct xdp_frame **xdp);
+						struct xdp_frame **xdp,
+						u32 flags);
 	void			(*ndo_xdp_flush)(struct net_device *dev);
 };
 
diff --git a/include/net/xdp.h b/include/net/xdp.h
index 7ad779237ae8..308a4b30b484 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -40,6 +40,11 @@ enum xdp_mem_type {
 	MEM_TYPE_MAX,
 };
 
+/* XDP flags for ndo_xdp_xmit */
+#define XDP_XMIT_FLAGS_NONE	0U
+#define XDP_XMIT_FLUSH		(1U << 0)
+#define XDP_XMIT_FLAGS_MASK	XDP_XMIT_FLUSH
+
 struct xdp_mem_info {
 	u32 type; /* enum xdp_mem_type, but known size type */
 	u32 id;
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index ae16d0c373ef..04fbd75a5274 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -232,7 +232,7 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj,
 		prefetch(xdpf);
 	}
 
-	sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q);
+	sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, 0);
 	if (sent < 0) {
 		err = sent;
 		sent = 0;
diff --git a/net/core/filter.c b/net/core/filter.c
index 81bd2e9fe8fc..6a21dbcad350 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3056,7 +3056,7 @@ static int __bpf_tx_xdp(struct net_device *dev,
 	if (unlikely(!xdpf))
 		return -EOVERFLOW;
 
-	sent = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf);
+	sent = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf, 0);
 	if (sent <= 0)
 		return sent;
 	dev->netdev_ops->ndo_xdp_flush(dev);

^ permalink raw reply related

* [bpf-next V1 PATCH 2/8] i40e: implement flush flag for ndo_xdp_xmit
From: Jesper Dangaard Brouer @ 2018-05-30 18:00 UTC (permalink / raw)
  To: netdev, Daniel Borkmann, Alexei Starovoitov,
	Jesper Dangaard Brouer
  Cc: John Fastabend, makita.toshiaki
In-Reply-To: <152770312703.20510.5854417568847239931.stgit@firesoul>

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 drivers/net/ethernet/intel/i40e/i40e_txrx.c |    5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index c0451d6e0790..03c1446f0465 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -3685,7 +3685,7 @@ int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
 	if (!i40e_enabled_xdp_vsi(vsi) || queue_index >= vsi->num_queue_pairs)
 		return -ENXIO;
 
-	if (unlikely(flags & ~XDP_XMIT_FLAGS_NONE))
+	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
 		return -EINVAL;
 
 	for (i = 0; i < n; i++) {
@@ -3699,6 +3699,9 @@ int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
 		}
 	}
 
+	if (unlikely(flags & XDP_XMIT_FLUSH))
+		i40e_xdp_ring_update_tail(vsi->xdp_rings[queue_index]);
+
 	return n - drops;
 }
 

^ permalink raw reply related

* [bpf-next V1 PATCH 3/8] ixgbe: implement flush flag for ndo_xdp_xmit
From: Jesper Dangaard Brouer @ 2018-05-30 18:00 UTC (permalink / raw)
  To: netdev, Daniel Borkmann, Alexei Starovoitov,
	Jesper Dangaard Brouer
  Cc: John Fastabend, makita.toshiaki
In-Reply-To: <152770312703.20510.5854417568847239931.stgit@firesoul>

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |   20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 87f088f4af52..4fd77c9067f2 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -10022,6 +10022,15 @@ static int ixgbe_xdp(struct net_device *dev, struct netdev_bpf *xdp)
 	}
 }
 
+static void ixgbe_xdp_ring_update_tail(struct ixgbe_ring *ring)
+{
+	/* Force memory writes to complete before letting h/w know there
+	 * are new descriptors to fetch.
+	 */
+	wmb();
+	writel(ring->next_to_use, ring->tail);
+}
+
 static int ixgbe_xdp_xmit(struct net_device *dev, int n,
 			  struct xdp_frame **frames, u32 flags)
 {
@@ -10033,7 +10042,7 @@ static int ixgbe_xdp_xmit(struct net_device *dev, int n,
 	if (unlikely(test_bit(__IXGBE_DOWN, &adapter->state)))
 		return -ENETDOWN;
 
-	if (unlikely(flags & ~XDP_XMIT_FLAGS_NONE))
+	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
 		return -EINVAL;
 
 	/* During program transitions its possible adapter->xdp_prog is assigned
@@ -10054,6 +10063,9 @@ static int ixgbe_xdp_xmit(struct net_device *dev, int n,
 		}
 	}
 
+	if (unlikely(flags & XDP_XMIT_FLUSH))
+		ixgbe_xdp_ring_update_tail(ring);
+
 	return n - drops;
 }
 
@@ -10072,11 +10084,7 @@ static void ixgbe_xdp_flush(struct net_device *dev)
 	if (unlikely(!ring))
 		return;
 
-	/* Force memory writes to complete before letting h/w know there
-	 * are new descriptors to fetch.
-	 */
-	wmb();
-	writel(ring->next_to_use, ring->tail);
+	ixgbe_xdp_ring_update_tail(ring);
 
 	return;
 }

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox