Netdev List
 help / color / mirror / Atom feed
* [PATCH bpf-next v5 5/7] samples/bpf: add a samples/bpf test for BPF_TASK_FD_QUERY
From: Yonghong Song @ 2018-05-24 18:21 UTC (permalink / raw)
  To: peterz, ast, daniel, netdev; +Cc: kernel-team

This is mostly to test kprobe/uprobe which needs kernel headers.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 samples/bpf/Makefile             |   4 +
 samples/bpf/task_fd_query_kern.c |  19 ++
 samples/bpf/task_fd_query_user.c | 382 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 405 insertions(+)
 create mode 100644 samples/bpf/task_fd_query_kern.c
 create mode 100644 samples/bpf/task_fd_query_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 62d1aa1..7dc85ed 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -51,6 +51,7 @@ hostprogs-y += cpustat
 hostprogs-y += xdp_adjust_tail
 hostprogs-y += xdpsock
 hostprogs-y += xdp_fwd
+hostprogs-y += task_fd_query
 
 # Libbpf dependencies
 LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a
@@ -105,6 +106,7 @@ cpustat-objs := bpf_load.o cpustat_user.o
 xdp_adjust_tail-objs := xdp_adjust_tail_user.o
 xdpsock-objs := bpf_load.o xdpsock_user.o
 xdp_fwd-objs := bpf_load.o xdp_fwd_user.o
+task_fd_query-objs := bpf_load.o task_fd_query_user.o $(TRACE_HELPERS)
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -160,6 +162,7 @@ always += cpustat_kern.o
 always += xdp_adjust_tail_kern.o
 always += xdpsock_kern.o
 always += xdp_fwd_kern.o
+always += task_fd_query_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 HOSTCFLAGS += -I$(srctree)/tools/lib/
@@ -175,6 +178,7 @@ HOSTCFLAGS_offwaketime_user.o += -I$(srctree)/tools/lib/bpf/
 HOSTCFLAGS_spintest_user.o += -I$(srctree)/tools/lib/bpf/
 HOSTCFLAGS_trace_event_user.o += -I$(srctree)/tools/lib/bpf/
 HOSTCFLAGS_sampleip_user.o += -I$(srctree)/tools/lib/bpf/
+HOSTCFLAGS_task_fd_query_user.o += -I$(srctree)/tools/lib/bpf/
 
 HOST_LOADLIBES		+= $(LIBBPF) -lelf
 HOSTLOADLIBES_tracex4		+= -lrt
diff --git a/samples/bpf/task_fd_query_kern.c b/samples/bpf/task_fd_query_kern.c
new file mode 100644
index 0000000..f4b0a9e
--- /dev/null
+++ b/samples/bpf/task_fd_query_kern.c
@@ -0,0 +1,19 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/version.h>
+#include <linux/ptrace.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+SEC("kprobe/blk_start_request")
+int bpf_prog1(struct pt_regs *ctx)
+{
+	return 0;
+}
+
+SEC("kretprobe/blk_account_io_completion")
+int bpf_prog2(struct pt_regs *ctx)
+{
+	return 0;
+}
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/task_fd_query_user.c b/samples/bpf/task_fd_query_user.c
new file mode 100644
index 0000000..8381d79
--- /dev/null
+++ b/samples/bpf/task_fd_query_user.c
@@ -0,0 +1,382 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <unistd.h>
+#include <stdbool.h>
+#include <string.h>
+#include <stdint.h>
+#include <fcntl.h>
+#include <linux/bpf.h>
+#include <sys/ioctl.h>
+#include <sys/resource.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include "libbpf.h"
+#include "bpf_load.h"
+#include "bpf_util.h"
+#include "perf-sys.h"
+#include "trace_helpers.h"
+
+#define CHECK_PERROR_RET(condition) ({			\
+	int __ret = !!(condition);			\
+	if (__ret) {					\
+		printf("FAIL: %s:\n", __func__);	\
+		perror("    ");			\
+		return -1;				\
+	}						\
+})
+
+#define CHECK_AND_RET(condition) ({			\
+	int __ret = !!(condition);			\
+	if (__ret)					\
+		return -1;				\
+})
+
+static __u64 ptr_to_u64(void *ptr)
+{
+	return (__u64) (unsigned long) ptr;
+}
+
+#define PMU_TYPE_FILE "/sys/bus/event_source/devices/%s/type"
+static int bpf_find_probe_type(const char *event_type)
+{
+	char buf[256];
+	int fd, ret;
+
+	ret = snprintf(buf, sizeof(buf), PMU_TYPE_FILE, event_type);
+	CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf));
+
+	fd = open(buf, O_RDONLY);
+	CHECK_PERROR_RET(fd < 0);
+
+	ret = read(fd, buf, sizeof(buf));
+	close(fd);
+	CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf));
+
+	errno = 0;
+	ret = (int)strtol(buf, NULL, 10);
+	CHECK_PERROR_RET(errno);
+	return ret;
+}
+
+#define PMU_RETPROBE_FILE "/sys/bus/event_source/devices/%s/format/retprobe"
+static int bpf_get_retprobe_bit(const char *event_type)
+{
+	char buf[256];
+	int fd, ret;
+
+	ret = snprintf(buf, sizeof(buf), PMU_RETPROBE_FILE, event_type);
+	CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf));
+
+	fd = open(buf, O_RDONLY);
+	CHECK_PERROR_RET(fd < 0);
+
+	ret = read(fd, buf, sizeof(buf));
+	close(fd);
+	CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf));
+	CHECK_PERROR_RET(strlen(buf) < strlen("config:"));
+
+	errno = 0;
+	ret = (int)strtol(buf + strlen("config:"), NULL, 10);
+	CHECK_PERROR_RET(errno);
+	return ret;
+}
+
+static int test_debug_fs_kprobe(int prog_fd_idx, const char *fn_name,
+				__u32 expected_fd_type)
+{
+	__u64 probe_offset, probe_addr;
+	__u32 len, prog_id, fd_type;
+	char buf[256];
+	int err;
+
+	len = sizeof(buf);
+	err = bpf_task_fd_query(getpid(), event_fd[prog_fd_idx], 0, buf, &len,
+				&prog_id, &fd_type, &probe_offset,
+				&probe_addr);
+	if (err < 0) {
+		printf("FAIL: %s, for event_fd idx %d, fn_name %s\n",
+		       __func__, prog_fd_idx, fn_name);
+		perror("    :");
+		return -1;
+	}
+	if (strcmp(buf, fn_name) != 0 ||
+	    fd_type != expected_fd_type ||
+	    probe_offset != 0x0 || probe_addr != 0x0) {
+		printf("FAIL: bpf_trace_event_query(event_fd[%d]):\n",
+		       prog_fd_idx);
+		printf("buf: %s, fd_type: %u, probe_offset: 0x%llx,"
+		       " probe_addr: 0x%llx\n",
+		       buf, fd_type, probe_offset, probe_addr);
+		return -1;
+	}
+	return 0;
+}
+
+static int test_nondebug_fs_kuprobe_common(const char *event_type,
+	const char *name, __u64 offset, __u64 addr, bool is_return,
+	char *buf, __u32 *buf_len, __u32 *prog_id, __u32 *fd_type,
+	__u64 *probe_offset, __u64 *probe_addr)
+{
+	int is_return_bit = bpf_get_retprobe_bit(event_type);
+	int type = bpf_find_probe_type(event_type);
+	struct perf_event_attr attr = {};
+	int fd;
+
+	if (type < 0 || is_return_bit < 0) {
+		printf("FAIL: %s incorrect type (%d) or is_return_bit (%d)\n",
+			__func__, type, is_return_bit);
+		return -1;
+	}
+
+	attr.sample_period = 1;
+	attr.wakeup_events = 1;
+	if (is_return)
+		attr.config |= 1 << is_return_bit;
+
+	if (name) {
+		attr.config1 = ptr_to_u64((void *)name);
+		attr.config2 = offset;
+	} else {
+		attr.config1 = 0;
+		attr.config2 = addr;
+	}
+	attr.size = sizeof(attr);
+	attr.type = type;
+
+	fd = sys_perf_event_open(&attr, -1, 0, -1, 0);
+	CHECK_PERROR_RET(fd < 0);
+
+	CHECK_PERROR_RET(ioctl(fd, PERF_EVENT_IOC_ENABLE, 0) < 0);
+	CHECK_PERROR_RET(ioctl(fd, PERF_EVENT_IOC_SET_BPF, prog_fd[0]) < 0);
+	CHECK_PERROR_RET(bpf_task_fd_query(getpid(), fd, 0, buf, buf_len,
+			 prog_id, fd_type, probe_offset, probe_addr) < 0);
+
+	return 0;
+}
+
+static int test_nondebug_fs_probe(const char *event_type, const char *name,
+				  __u64 offset, __u64 addr, bool is_return,
+				  __u32 expected_fd_type,
+				  __u32 expected_ret_fd_type,
+				  char *buf, __u32 buf_len)
+{
+	__u64 probe_offset, probe_addr;
+	__u32 prog_id, fd_type;
+	int err;
+
+	err = test_nondebug_fs_kuprobe_common(event_type, name,
+					      offset, addr, is_return,
+					      buf, &buf_len, &prog_id,
+					      &fd_type, &probe_offset,
+					      &probe_addr);
+	if (err < 0) {
+		printf("FAIL: %s, "
+		       "for name %s, offset 0x%llx, addr 0x%llx, is_return %d\n",
+		       __func__, name ? name : "", offset, addr, is_return);
+		perror("    :");
+		return -1;
+	}
+	if ((is_return && fd_type != expected_ret_fd_type) ||
+	    (!is_return && fd_type != expected_fd_type)) {
+		printf("FAIL: %s, incorrect fd_type %u\n",
+		       __func__, fd_type);
+		return -1;
+	}
+	if (name) {
+		if (strcmp(name, buf) != 0) {
+			printf("FAIL: %s, incorrect buf %s\n", __func__, buf);
+			return -1;
+		}
+		if (probe_offset != offset) {
+			printf("FAIL: %s, incorrect probe_offset 0x%llx\n",
+			       __func__, probe_offset);
+			return -1;
+		}
+	} else {
+		if (buf_len != 0) {
+			printf("FAIL: %s, incorrect buf %p\n",
+			       __func__, buf);
+			return -1;
+		}
+
+		if (probe_addr != addr) {
+			printf("FAIL: %s, incorrect probe_addr 0x%llx\n",
+			       __func__, probe_addr);
+			return -1;
+		}
+	}
+	return 0;
+}
+
+static int test_debug_fs_uprobe(char *binary_path, long offset, bool is_return)
+{
+	const char *event_type = "uprobe";
+	struct perf_event_attr attr = {};
+	char buf[256], event_alias[256];
+	__u64 probe_offset, probe_addr;
+	__u32 len, prog_id, fd_type;
+	int err, res, kfd, efd;
+	ssize_t bytes;
+
+	snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/%s_events",
+		 event_type);
+	kfd = open(buf, O_WRONLY | O_APPEND, 0);
+	CHECK_PERROR_RET(kfd < 0);
+
+	res = snprintf(event_alias, sizeof(event_alias), "test_%d", getpid());
+	CHECK_PERROR_RET(res < 0 || res >= sizeof(event_alias));
+
+	res = snprintf(buf, sizeof(buf), "%c:%ss/%s %s:0x%lx",
+		       is_return ? 'r' : 'p', event_type, event_alias,
+		       binary_path, offset);
+	CHECK_PERROR_RET(res < 0 || res >= sizeof(buf));
+	CHECK_PERROR_RET(write(kfd, buf, strlen(buf)) < 0);
+
+	close(kfd);
+	kfd = -1;
+
+	snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/events/%ss/%s/id",
+		 event_type, event_alias);
+	efd = open(buf, O_RDONLY, 0);
+	CHECK_PERROR_RET(efd < 0);
+
+	bytes = read(efd, buf, sizeof(buf));
+	CHECK_PERROR_RET(bytes <= 0 || bytes >= sizeof(buf));
+	close(efd);
+	buf[bytes] = '\0';
+
+	attr.config = strtol(buf, NULL, 0);
+	attr.type = PERF_TYPE_TRACEPOINT;
+	attr.sample_period = 1;
+	attr.wakeup_events = 1;
+	kfd = sys_perf_event_open(&attr, -1, 0, -1, PERF_FLAG_FD_CLOEXEC);
+	CHECK_PERROR_RET(kfd < 0);
+	CHECK_PERROR_RET(ioctl(kfd, PERF_EVENT_IOC_SET_BPF, prog_fd[0]) < 0);
+	CHECK_PERROR_RET(ioctl(kfd, PERF_EVENT_IOC_ENABLE, 0) < 0);
+
+	len = sizeof(buf);
+	err = bpf_task_fd_query(getpid(), kfd, 0, buf, &len,
+				&prog_id, &fd_type, &probe_offset,
+				&probe_addr);
+	if (err < 0) {
+		printf("FAIL: %s, binary_path %s\n", __func__, binary_path);
+		perror("    :");
+		return -1;
+	}
+	if ((is_return && fd_type != BPF_FD_TYPE_URETPROBE) ||
+	    (!is_return && fd_type != BPF_FD_TYPE_UPROBE)) {
+		printf("FAIL: %s, incorrect fd_type %u\n", __func__,
+		       fd_type);
+		return -1;
+	}
+	if (strcmp(binary_path, buf) != 0) {
+		printf("FAIL: %s, incorrect buf %s\n", __func__, buf);
+		return -1;
+	}
+	if (probe_offset != offset) {
+		printf("FAIL: %s, incorrect probe_offset 0x%llx\n", __func__,
+		       probe_offset);
+		return -1;
+	}
+
+	close(kfd);
+	return 0;
+}
+
+int main(int argc, char **argv)
+{
+	struct rlimit r = {1024*1024, RLIM_INFINITY};
+	extern char __executable_start;
+	char filename[256], buf[256];
+	__u64 uprobe_file_offset;
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+	if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+		perror("setrlimit(RLIMIT_MEMLOCK)");
+		return 1;
+	}
+
+	if (load_kallsyms()) {
+		printf("failed to process /proc/kallsyms\n");
+		return 1;
+	}
+
+	if (load_bpf_file(filename)) {
+		printf("%s", bpf_log_buf);
+		return 1;
+	}
+
+	/* test two functions in the corresponding *_kern.c file */
+	CHECK_AND_RET(test_debug_fs_kprobe(0, "blk_start_request",
+					   BPF_FD_TYPE_KPROBE));
+	CHECK_AND_RET(test_debug_fs_kprobe(1, "blk_account_io_completion",
+					   BPF_FD_TYPE_KRETPROBE));
+
+	/* test nondebug fs kprobe */
+	CHECK_AND_RET(test_nondebug_fs_probe("kprobe", "bpf_check", 0x0, 0x0,
+					     false, BPF_FD_TYPE_KPROBE,
+					     BPF_FD_TYPE_KRETPROBE,
+					     buf, sizeof(buf)));
+#ifdef __x86_64__
+	/* set a kprobe on "bpf_check + 0x5", which is x64 specific */
+	CHECK_AND_RET(test_nondebug_fs_probe("kprobe", "bpf_check", 0x5, 0x0,
+					     false, BPF_FD_TYPE_KPROBE,
+					     BPF_FD_TYPE_KRETPROBE,
+					     buf, sizeof(buf)));
+#endif
+	CHECK_AND_RET(test_nondebug_fs_probe("kprobe", "bpf_check", 0x0, 0x0,
+					     true, BPF_FD_TYPE_KPROBE,
+					     BPF_FD_TYPE_KRETPROBE,
+					     buf, sizeof(buf)));
+	CHECK_AND_RET(test_nondebug_fs_probe("kprobe", NULL, 0x0,
+					     ksym_get_addr("bpf_check"), false,
+					     BPF_FD_TYPE_KPROBE,
+					     BPF_FD_TYPE_KRETPROBE,
+					     buf, sizeof(buf)));
+	CHECK_AND_RET(test_nondebug_fs_probe("kprobe", NULL, 0x0,
+					     ksym_get_addr("bpf_check"), false,
+					     BPF_FD_TYPE_KPROBE,
+					     BPF_FD_TYPE_KRETPROBE,
+					     NULL, 0));
+	CHECK_AND_RET(test_nondebug_fs_probe("kprobe", NULL, 0x0,
+					     ksym_get_addr("bpf_check"), true,
+					     BPF_FD_TYPE_KPROBE,
+					     BPF_FD_TYPE_KRETPROBE,
+					     buf, sizeof(buf)));
+	CHECK_AND_RET(test_nondebug_fs_probe("kprobe", NULL, 0x0,
+					     ksym_get_addr("bpf_check"), true,
+					     BPF_FD_TYPE_KPROBE,
+					     BPF_FD_TYPE_KRETPROBE,
+					     0, 0));
+
+	/* test nondebug fs uprobe */
+	/* the calculation of uprobe file offset is based on gcc 7.3.1 on x64
+	 * and the default linker script, which defines __executable_start as
+	 * the start of the .text section. The calculation could be different
+	 * on different systems with different compilers. The right way is
+	 * to parse the ELF file. We took a shortcut here.
+	 */
+	uprobe_file_offset = (__u64)main - (__u64)&__executable_start;
+	CHECK_AND_RET(test_nondebug_fs_probe("uprobe", (char *)argv[0],
+					     uprobe_file_offset, 0x0, false,
+					     BPF_FD_TYPE_UPROBE,
+					     BPF_FD_TYPE_URETPROBE,
+					     buf, sizeof(buf)));
+	CHECK_AND_RET(test_nondebug_fs_probe("uprobe", (char *)argv[0],
+					     uprobe_file_offset, 0x0, true,
+					     BPF_FD_TYPE_UPROBE,
+					     BPF_FD_TYPE_URETPROBE,
+					     buf, sizeof(buf)));
+
+	/* test debug fs uprobe */
+	CHECK_AND_RET(test_debug_fs_uprobe((char *)argv[0], uprobe_file_offset,
+					   false));
+	CHECK_AND_RET(test_debug_fs_uprobe((char *)argv[0], uprobe_file_offset,
+					   true));
+
+	return 0;
+}
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v5 6/7] tools/bpf: add two BPF_TASK_FD_QUERY tests in test_progs
From: Yonghong Song @ 2018-05-24 18:21 UTC (permalink / raw)
  To: peterz, ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180524182158.456462-1-yhs@fb.com>

The new tests are added to query perf_event information
for raw_tracepoint and tracepoint attachment. For tracepoint,
both syscalls and non-syscalls tracepoints are queries as
they are treated slightly differently inside the kernel.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 tools/testing/selftests/bpf/test_progs.c | 158 +++++++++++++++++++++++++++++++
 1 file changed, 158 insertions(+)

diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index 3ecf733..0ef6820 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -1542,6 +1542,162 @@ static void test_get_stack_raw_tp(void)
 	bpf_object__close(obj);
 }
 
+static void test_task_fd_query_rawtp(void)
+{
+	const char *file = "./test_get_stack_rawtp.o";
+	__u64 probe_offset, probe_addr;
+	__u32 len, prog_id, fd_type;
+	struct bpf_object *obj;
+	int efd, err, prog_fd;
+	__u32 duration = 0;
+	char buf[256];
+
+	err = bpf_prog_load(file, BPF_PROG_TYPE_RAW_TRACEPOINT, &obj, &prog_fd);
+	if (CHECK(err, "prog_load raw tp", "err %d errno %d\n", err, errno))
+		return;
+
+	efd = bpf_raw_tracepoint_open("sys_enter", prog_fd);
+	if (CHECK(efd < 0, "raw_tp_open", "err %d errno %d\n", efd, errno))
+		goto close_prog;
+
+	/* query (getpid(), efd) */
+	len = sizeof(buf);
+	err = bpf_task_fd_query(getpid(), efd, 0, buf, &len, &prog_id,
+				&fd_type, &probe_offset, &probe_addr);
+	if (CHECK(err < 0, "bpf_task_fd_query", "err %d errno %d\n", err,
+		  errno))
+		goto close_prog;
+
+	err = fd_type == BPF_FD_TYPE_RAW_TRACEPOINT &&
+	      strcmp(buf, "sys_enter") == 0;
+	if (CHECK(!err, "check_results", "fd_type %d tp_name %s\n",
+		  fd_type, buf))
+		goto close_prog;
+
+	/* test zero len */
+	len = 0;
+	err = bpf_task_fd_query(getpid(), efd, 0, buf, &len, &prog_id,
+				&fd_type, &probe_offset, &probe_addr);
+	if (CHECK(err < 0, "bpf_task_fd_query (len = 0)", "err %d errno %d\n",
+		  err, errno))
+		goto close_prog;
+	err = fd_type == BPF_FD_TYPE_RAW_TRACEPOINT &&
+	      len == strlen("sys_enter");
+	if (CHECK(!err, "check_results", "fd_type %d len %u\n", fd_type, len))
+		goto close_prog;
+
+	/* test empty buffer */
+	len = sizeof(buf);
+	err = bpf_task_fd_query(getpid(), efd, 0, 0, &len, &prog_id,
+				&fd_type, &probe_offset, &probe_addr);
+	if (CHECK(err < 0, "bpf_task_fd_query (buf = 0)", "err %d errno %d\n",
+		  err, errno))
+		goto close_prog;
+	err = fd_type == BPF_FD_TYPE_RAW_TRACEPOINT &&
+	      len == strlen("sys_enter");
+	if (CHECK(!err, "check_results", "fd_type %d len %u\n", fd_type, len))
+		goto close_prog;
+
+	/* test smaller buffer */
+	len = 3;
+	err = bpf_task_fd_query(getpid(), efd, 0, buf, &len, &prog_id,
+				&fd_type, &probe_offset, &probe_addr);
+	if (CHECK(err >= 0 || errno != ENOSPC, "bpf_task_fd_query (len = 3)",
+		  "err %d errno %d\n", err, errno))
+		goto close_prog;
+	err = fd_type == BPF_FD_TYPE_RAW_TRACEPOINT &&
+	      len == strlen("sys_enter") &&
+	      strcmp(buf, "sy") == 0;
+	if (CHECK(!err, "check_results", "fd_type %d len %u\n", fd_type, len))
+		goto close_prog;
+
+	goto close_prog_noerr;
+close_prog:
+	error_cnt++;
+close_prog_noerr:
+	bpf_object__close(obj);
+}
+
+static void test_task_fd_query_tp_core(const char *probe_name,
+				       const char *tp_name)
+{
+	const char *file = "./test_tracepoint.o";
+	int err, bytes, efd, prog_fd, pmu_fd;
+	struct perf_event_attr attr = {};
+	__u64 probe_offset, probe_addr;
+	__u32 len, prog_id, fd_type;
+	struct bpf_object *obj;
+	__u32 duration = 0;
+	char buf[256];
+
+	err = bpf_prog_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj, &prog_fd);
+	if (CHECK(err, "bpf_prog_load", "err %d errno %d\n", err, errno))
+		goto close_prog;
+
+	snprintf(buf, sizeof(buf),
+		 "/sys/kernel/debug/tracing/events/%s/id", probe_name);
+	efd = open(buf, O_RDONLY, 0);
+	if (CHECK(efd < 0, "open", "err %d errno %d\n", efd, errno))
+		goto close_prog;
+	bytes = read(efd, buf, sizeof(buf));
+	close(efd);
+	if (CHECK(bytes <= 0 || bytes >= sizeof(buf), "read",
+		  "bytes %d errno %d\n", bytes, errno))
+		goto close_prog;
+
+	attr.config = strtol(buf, NULL, 0);
+	attr.type = PERF_TYPE_TRACEPOINT;
+	attr.sample_type = PERF_SAMPLE_RAW;
+	attr.sample_period = 1;
+	attr.wakeup_events = 1;
+	pmu_fd = syscall(__NR_perf_event_open, &attr, -1 /* pid */,
+			 0 /* cpu 0 */, -1 /* group id */,
+			 0 /* flags */);
+	if (CHECK(err, "perf_event_open", "err %d errno %d\n", err, errno))
+		goto close_pmu;
+
+	err = ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0);
+	if (CHECK(err, "perf_event_ioc_enable", "err %d errno %d\n", err,
+		  errno))
+		goto close_pmu;
+
+	err = ioctl(pmu_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
+	if (CHECK(err, "perf_event_ioc_set_bpf", "err %d errno %d\n", err,
+		  errno))
+		goto close_pmu;
+
+	/* query (getpid(), pmu_fd) */
+	len = sizeof(buf);
+	err = bpf_task_fd_query(getpid(), pmu_fd, 0, buf, &len, &prog_id,
+				&fd_type, &probe_offset, &probe_addr);
+	if (CHECK(err < 0, "bpf_task_fd_query", "err %d errno %d\n", err,
+		  errno))
+		goto close_pmu;
+
+	err = (fd_type == BPF_FD_TYPE_TRACEPOINT) && !strcmp(buf, tp_name);
+	if (CHECK(!err, "check_results", "fd_type %d tp_name %s\n",
+		  fd_type, buf))
+		goto close_pmu;
+
+	close(pmu_fd);
+	goto close_prog_noerr;
+
+close_pmu:
+	close(pmu_fd);
+close_prog:
+	error_cnt++;
+close_prog_noerr:
+	bpf_object__close(obj);
+}
+
+static void test_task_fd_query_tp(void)
+{
+	test_task_fd_query_tp_core("sched/sched_switch",
+				   "sched_switch");
+	test_task_fd_query_tp_core("syscalls/sys_enter_read",
+				   "sys_enter_read");
+}
+
 int main(void)
 {
 	jit_enabled = is_jit_enabled();
@@ -1561,6 +1717,8 @@ int main(void)
 	test_stacktrace_build_id_nmi();
 	test_stacktrace_map_raw_tp();
 	test_get_stack_raw_tp();
+	test_task_fd_query_rawtp();
+	test_task_fd_query_tp();
 
 	printf("Summary: %d PASSED, %d FAILED\n", pass_cnt, error_cnt);
 	return error_cnt ? EXIT_FAILURE : EXIT_SUCCESS;
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v5 7/7] tools/bpftool: add perf subcommand
From: Yonghong Song @ 2018-05-24 18:21 UTC (permalink / raw)
  To: peterz, ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180524182158.456462-1-yhs@fb.com>

The new command "bpftool perf [show | list]" will traverse
all processes under /proc, and if any fd is associated
with a perf event, it will print out related perf event
information. Documentation is also added.

Below is an example to show the results using bcc commands.
Running the following 4 bcc commands:
  kprobe:     trace.py '__x64_sys_nanosleep'
  kretprobe:  trace.py 'r::__x64_sys_nanosleep'
  tracepoint: trace.py 't:syscalls:sys_enter_nanosleep'
  uprobe:     trace.py 'p:/home/yhs/a.out:main'

The bpftool command line and result:

  $ bpftool perf
  pid 21711  fd 5: prog_id 5  kprobe  func __x64_sys_write  offset 0
  pid 21765  fd 5: prog_id 7  kretprobe  func __x64_sys_nanosleep  offset 0
  pid 21767  fd 5: prog_id 8  tracepoint  sys_enter_nanosleep
  pid 21800  fd 5: prog_id 9  uprobe  filename /home/yhs/a.out  offset 1159

  $ bpftool -j perf
  [{"pid":21711,"fd":5,"prog_id":5,"fd_type":"kprobe","func":"__x64_sys_write","offset":0}, \
   {"pid":21765,"fd":5,"prog_id":7,"fd_type":"kretprobe","func":"__x64_sys_nanosleep","offset":0}, \
   {"pid":21767,"fd":5,"prog_id":8,"fd_type":"tracepoint","tracepoint":"sys_enter_nanosleep"}, \
   {"pid":21800,"fd":5,"prog_id":9,"fd_type":"uprobe","filename":"/home/yhs/a.out","offset":1159}]

  $ bpftool prog
  5: kprobe  name probe___x64_sys  tag e495a0c82f2c7a8d  gpl
	  loaded_at 2018-05-15T04:46:37-0700  uid 0
	  xlated 200B  not jited  memlock 4096B  map_ids 4
  7: kprobe  name probe___x64_sys  tag f2fdee479a503abf  gpl
	  loaded_at 2018-05-15T04:48:32-0700  uid 0
	  xlated 200B  not jited  memlock 4096B  map_ids 7
  8: tracepoint  name tracepoint__sys  tag 5390badef2395fcf  gpl
	  loaded_at 2018-05-15T04:48:48-0700  uid 0
	  xlated 200B  not jited  memlock 4096B  map_ids 8
  9: kprobe  name probe_main_1  tag 0a87bdc2e2953b6d  gpl
	  loaded_at 2018-05-15T04:49:52-0700  uid 0
	  xlated 200B  not jited  memlock 4096B  map_ids 9

  $ ps ax | grep "python ./trace.py"
  21711 pts/0    T      0:03 python ./trace.py __x64_sys_write
  21765 pts/0    S+     0:00 python ./trace.py r::__x64_sys_nanosleep
  21767 pts/2    S+     0:00 python ./trace.py t:syscalls:sys_enter_nanosleep
  21800 pts/3    S+     0:00 python ./trace.py p:/home/yhs/a.out:main
  22374 pts/1    S+     0:00 grep --color=auto python ./trace.py

Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
---
 tools/bpf/bpftool/Documentation/bpftool-perf.rst |  81 ++++++++
 tools/bpf/bpftool/Documentation/bpftool.rst      |   5 +-
 tools/bpf/bpftool/bash-completion/bpftool        |   9 +
 tools/bpf/bpftool/main.c                         |   3 +-
 tools/bpf/bpftool/main.h                         |   1 +
 tools/bpf/bpftool/perf.c                         | 246 +++++++++++++++++++++++
 6 files changed, 343 insertions(+), 2 deletions(-)
 create mode 100644 tools/bpf/bpftool/Documentation/bpftool-perf.rst
 create mode 100644 tools/bpf/bpftool/perf.c

diff --git a/tools/bpf/bpftool/Documentation/bpftool-perf.rst b/tools/bpf/bpftool/Documentation/bpftool-perf.rst
new file mode 100644
index 0000000..e3eb0ea
--- /dev/null
+++ b/tools/bpf/bpftool/Documentation/bpftool-perf.rst
@@ -0,0 +1,81 @@
+================
+bpftool-perf
+================
+-------------------------------------------------------------------------------
+tool for inspection of perf related bpf prog attachments
+-------------------------------------------------------------------------------
+
+:Manual section: 8
+
+SYNOPSIS
+========
+
+	**bpftool** [*OPTIONS*] **perf** *COMMAND*
+
+	*OPTIONS* := { [{ **-j** | **--json** }] [{ **-p** | **--pretty** }] }
+
+	*COMMANDS* :=
+	{ **show** | **list** | **help** }
+
+PERF COMMANDS
+=============
+
+|	**bpftool** **perf { show | list }**
+|	**bpftool** **perf help**
+
+DESCRIPTION
+===========
+	**bpftool perf { show | list }**
+		  List all raw_tracepoint, tracepoint, kprobe attachment in the system.
+
+		  Output will start with process id and file descriptor in that process,
+		  followed by bpf program id, attachment information, and attachment point.
+		  The attachment point for raw_tracepoint/tracepoint is the trace probe name.
+		  The attachment point for k[ret]probe is either symbol name and offset,
+		  or a kernel virtual address.
+		  The attachment point for u[ret]probe is the file name and the file offset.
+
+	**bpftool perf help**
+		  Print short help message.
+
+OPTIONS
+=======
+	-h, --help
+		  Print short generic help message (similar to **bpftool help**).
+
+	-v, --version
+		  Print version number (similar to **bpftool version**).
+
+	-j, --json
+		  Generate JSON output. For commands that cannot produce JSON, this
+		  option has no effect.
+
+	-p, --pretty
+		  Generate human-readable JSON output. Implies **-j**.
+
+EXAMPLES
+========
+
+| **# bpftool perf**
+
+::
+
+      pid 21711  fd 5: prog_id 5  kprobe  func __x64_sys_write  offset 0
+      pid 21765  fd 5: prog_id 7  kretprobe  func __x64_sys_nanosleep  offset 0
+      pid 21767  fd 5: prog_id 8  tracepoint  sys_enter_nanosleep
+      pid 21800  fd 5: prog_id 9  uprobe  filename /home/yhs/a.out  offset 1159
+
+|
+| **# bpftool -j perf**
+
+::
+
+    [{"pid":21711,"fd":5,"prog_id":5,"fd_type":"kprobe","func":"__x64_sys_write","offset":0}, \
+     {"pid":21765,"fd":5,"prog_id":7,"fd_type":"kretprobe","func":"__x64_sys_nanosleep","offset":0}, \
+     {"pid":21767,"fd":5,"prog_id":8,"fd_type":"tracepoint","tracepoint":"sys_enter_nanosleep"}, \
+     {"pid":21800,"fd":5,"prog_id":9,"fd_type":"uprobe","filename":"/home/yhs/a.out","offset":1159}]
+
+
+SEE ALSO
+========
+	**bpftool**\ (8), **bpftool-prog**\ (8), **bpftool-map**\ (8)
diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst
index 564cb0d..b6f5d56 100644
--- a/tools/bpf/bpftool/Documentation/bpftool.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool.rst
@@ -16,7 +16,7 @@ SYNOPSIS
 
 	**bpftool** **version**
 
-	*OBJECT* := { **map** | **program** | **cgroup** }
+	*OBJECT* := { **map** | **program** | **cgroup** | **perf** }
 
 	*OPTIONS* := { { **-V** | **--version** } | { **-h** | **--help** }
 	| { **-j** | **--json** } [{ **-p** | **--pretty** }] }
@@ -30,6 +30,8 @@ SYNOPSIS
 
 	*CGROUP-COMMANDS* := { **show** | **list** | **attach** | **detach** | **help** }
 
+	*PERF-COMMANDS* := { **show** | **list** | **help** }
+
 DESCRIPTION
 ===========
 	*bpftool* allows for inspection and simple modification of BPF objects
@@ -56,3 +58,4 @@ OPTIONS
 SEE ALSO
 ========
 	**bpftool-map**\ (8), **bpftool-prog**\ (8), **bpftool-cgroup**\ (8)
+        **bpftool-perf**\ (8)
diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool
index b301c9b..7bc198d 100644
--- a/tools/bpf/bpftool/bash-completion/bpftool
+++ b/tools/bpf/bpftool/bash-completion/bpftool
@@ -448,6 +448,15 @@ _bpftool()
                     ;;
             esac
             ;;
+        perf)
+            case $command in
+                *)
+                    [[ $prev == $object ]] && \
+                        COMPREPLY=( $( compgen -W 'help \
+                            show list' -- "$cur" ) )
+                    ;;
+            esac
+            ;;
     esac
 } &&
 complete -F _bpftool bpftool
diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c
index 1ec852d..eea7f14 100644
--- a/tools/bpf/bpftool/main.c
+++ b/tools/bpf/bpftool/main.c
@@ -87,7 +87,7 @@ static int do_help(int argc, char **argv)
 		"       %s batch file FILE\n"
 		"       %s version\n"
 		"\n"
-		"       OBJECT := { prog | map | cgroup }\n"
+		"       OBJECT := { prog | map | cgroup | perf }\n"
 		"       " HELP_SPEC_OPTIONS "\n"
 		"",
 		bin_name, bin_name, bin_name);
@@ -216,6 +216,7 @@ static const struct cmd cmds[] = {
 	{ "prog",	do_prog },
 	{ "map",	do_map },
 	{ "cgroup",	do_cgroup },
+	{ "perf",	do_perf },
 	{ "version",	do_version },
 	{ 0 }
 };
diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h
index 6173cd9..63fdb31 100644
--- a/tools/bpf/bpftool/main.h
+++ b/tools/bpf/bpftool/main.h
@@ -119,6 +119,7 @@ int do_prog(int argc, char **arg);
 int do_map(int argc, char **arg);
 int do_event_pipe(int argc, char **argv);
 int do_cgroup(int argc, char **arg);
+int do_perf(int argc, char **arg);
 
 int prog_parse_fd(int *argc, char ***argv);
 int map_parse_fd_and_info(int *argc, char ***argv, void *info, __u32 *info_len);
diff --git a/tools/bpf/bpftool/perf.c b/tools/bpf/bpftool/perf.c
new file mode 100644
index 0000000..ac6b1a1
--- /dev/null
+++ b/tools/bpf/bpftool/perf.c
@@ -0,0 +1,246 @@
+// SPDX-License-Identifier: GPL-2.0+
+// Copyright (C) 2018 Facebook
+// Author: Yonghong Song <yhs@fb.com>
+
+#define _GNU_SOURCE
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <ftw.h>
+
+#include <bpf.h>
+
+#include "main.h"
+
+/* 0: undecided, 1: supported, 2: not supported */
+static int perf_query_supported;
+static bool has_perf_query_support(void)
+{
+	__u64 probe_offset, probe_addr;
+	__u32 len, prog_id, fd_type;
+	char buf[256];
+	int fd;
+
+	if (perf_query_supported)
+		goto out;
+
+	fd = open(bin_name, O_RDONLY);
+	if (fd < 0) {
+		p_err("perf_query_support: %s", strerror(errno));
+		goto out;
+	}
+
+	/* the following query will fail as no bpf attachment,
+	 * the expected errno is ENOTSUPP
+	 */
+	errno = 0;
+	len = sizeof(buf);
+	bpf_task_fd_query(getpid(), fd, 0, buf, &len, &prog_id,
+			  &fd_type, &probe_offset, &probe_addr);
+
+	if (errno == 524 /* ENOTSUPP */) {
+		perf_query_supported = 1;
+		goto close_fd;
+	}
+
+	perf_query_supported = 2;
+	p_err("perf_query_support: %s", strerror(errno));
+	fprintf(stderr,
+		"HINT: non root or kernel doesn't support TASK_FD_QUERY\n");
+
+close_fd:
+	close(fd);
+out:
+	return perf_query_supported == 1;
+}
+
+static void print_perf_json(int pid, int fd, __u32 prog_id, __u32 fd_type,
+			    char *buf, __u64 probe_offset, __u64 probe_addr)
+{
+	jsonw_start_object(json_wtr);
+	jsonw_int_field(json_wtr, "pid", pid);
+	jsonw_int_field(json_wtr, "fd", fd);
+	jsonw_uint_field(json_wtr, "prog_id", prog_id);
+	switch (fd_type) {
+	case BPF_FD_TYPE_RAW_TRACEPOINT:
+		jsonw_string_field(json_wtr, "fd_type", "raw_tracepoint");
+		jsonw_string_field(json_wtr, "tracepoint", buf);
+		break;
+	case BPF_FD_TYPE_TRACEPOINT:
+		jsonw_string_field(json_wtr, "fd_type", "tracepoint");
+		jsonw_string_field(json_wtr, "tracepoint", buf);
+		break;
+	case BPF_FD_TYPE_KPROBE:
+		jsonw_string_field(json_wtr, "fd_type", "kprobe");
+		if (buf[0] != '\0') {
+			jsonw_string_field(json_wtr, "func", buf);
+			jsonw_lluint_field(json_wtr, "offset", probe_offset);
+		} else {
+			jsonw_lluint_field(json_wtr, "addr", probe_addr);
+		}
+		break;
+	case BPF_FD_TYPE_KRETPROBE:
+		jsonw_string_field(json_wtr, "fd_type", "kretprobe");
+		if (buf[0] != '\0') {
+			jsonw_string_field(json_wtr, "func", buf);
+			jsonw_lluint_field(json_wtr, "offset", probe_offset);
+		} else {
+			jsonw_lluint_field(json_wtr, "addr", probe_addr);
+		}
+		break;
+	case BPF_FD_TYPE_UPROBE:
+		jsonw_string_field(json_wtr, "fd_type", "uprobe");
+		jsonw_string_field(json_wtr, "filename", buf);
+		jsonw_lluint_field(json_wtr, "offset", probe_offset);
+		break;
+	case BPF_FD_TYPE_URETPROBE:
+		jsonw_string_field(json_wtr, "fd_type", "uretprobe");
+		jsonw_string_field(json_wtr, "filename", buf);
+		jsonw_lluint_field(json_wtr, "offset", probe_offset);
+		break;
+	}
+	jsonw_end_object(json_wtr);
+}
+
+static void print_perf_plain(int pid, int fd, __u32 prog_id, __u32 fd_type,
+			     char *buf, __u64 probe_offset, __u64 probe_addr)
+{
+	printf("pid %d  fd %d: prog_id %u  ", pid, fd, prog_id);
+	switch (fd_type) {
+	case BPF_FD_TYPE_RAW_TRACEPOINT:
+		printf("raw_tracepoint  %s\n", buf);
+		break;
+	case BPF_FD_TYPE_TRACEPOINT:
+		printf("tracepoint  %s\n", buf);
+		break;
+	case BPF_FD_TYPE_KPROBE:
+		if (buf[0] != '\0')
+			printf("kprobe  func %s  offset %llu\n", buf,
+			       probe_offset);
+		else
+			printf("kprobe  addr %llu\n", probe_addr);
+		break;
+	case BPF_FD_TYPE_KRETPROBE:
+		if (buf[0] != '\0')
+			printf("kretprobe  func %s  offset %llu\n", buf,
+			       probe_offset);
+		else
+			printf("kretprobe  addr %llu\n", probe_addr);
+		break;
+	case BPF_FD_TYPE_UPROBE:
+		printf("uprobe  filename %s  offset %llu\n", buf, probe_offset);
+		break;
+	case BPF_FD_TYPE_URETPROBE:
+		printf("uretprobe  filename %s  offset %llu\n", buf,
+		       probe_offset);
+		break;
+	}
+}
+
+static int show_proc(const char *fpath, const struct stat *sb,
+		     int tflag, struct FTW *ftwbuf)
+{
+	__u64 probe_offset, probe_addr;
+	__u32 len, prog_id, fd_type;
+	int err, pid = 0, fd = 0;
+	const char *pch;
+	char buf[4096];
+
+	/* prefix always /proc */
+	pch = fpath + 5;
+	if (*pch == '\0')
+		return 0;
+
+	/* pid should be all numbers */
+	pch++;
+	while (isdigit(*pch)) {
+		pid = pid * 10 + *pch - '0';
+		pch++;
+	}
+	if (*pch == '\0')
+		return 0;
+	if (*pch != '/')
+		return FTW_SKIP_SUBTREE;
+
+	/* check /proc/<pid>/fd directory */
+	pch++;
+	if (strncmp(pch, "fd", 2))
+		return FTW_SKIP_SUBTREE;
+	pch += 2;
+	if (*pch == '\0')
+		return 0;
+	if (*pch != '/')
+		return FTW_SKIP_SUBTREE;
+
+	/* check /proc/<pid>/fd/<fd_num> */
+	pch++;
+	while (isdigit(*pch)) {
+		fd = fd * 10 + *pch - '0';
+		pch++;
+	}
+	if (*pch != '\0')
+		return FTW_SKIP_SUBTREE;
+
+	/* query (pid, fd) for potential perf events */
+	len = sizeof(buf);
+	err = bpf_task_fd_query(pid, fd, 0, buf, &len, &prog_id, &fd_type,
+				&probe_offset, &probe_addr);
+	if (err < 0)
+		return 0;
+
+	if (json_output)
+		print_perf_json(pid, fd, prog_id, fd_type, buf, probe_offset,
+				probe_addr);
+	else
+		print_perf_plain(pid, fd, prog_id, fd_type, buf, probe_offset,
+				 probe_addr);
+
+	return 0;
+}
+
+static int do_show(int argc, char **argv)
+{
+	int flags = FTW_ACTIONRETVAL | FTW_PHYS;
+	int err = 0, nopenfd = 16;
+
+	if (!has_perf_query_support())
+		return -1;
+
+	if (json_output)
+		jsonw_start_array(json_wtr);
+	if (nftw("/proc", show_proc, nopenfd, flags) == -1) {
+		p_err("%s", strerror(errno));
+		err = -1;
+	}
+	if (json_output)
+		jsonw_end_array(json_wtr);
+
+	return err;
+}
+
+static int do_help(int argc, char **argv)
+{
+	fprintf(stderr,
+		"Usage: %s %s { show | list | help }\n"
+		"",
+		bin_name, argv[-2]);
+
+	return 0;
+}
+
+static const struct cmd cmds[] = {
+	{ "show",	do_show },
+	{ "list",	do_show },
+	{ "help",	do_help },
+	{ 0 }
+};
+
+int do_perf(int argc, char **argv)
+{
+	return cmd_select(cmds, argc, argv, do_help);
+}
-- 
2.9.5

^ permalink raw reply related

* Re: [PATCH net-next 0/8] nfp: offload LAG for tc flower egress
From: Samudrala, Sridhar @ 2018-05-24 18:23 UTC (permalink / raw)
  To: Or Gerlitz, Jakub Kicinski
  Cc: David Miller, Linux Netdev List, oss-drivers, Jiri Pirko,
	Jay Vosburgh, Veaceslav Falico, Andy Gospodarek
In-Reply-To: <CAJ3xEMj48Gvox-hCyrGEXNtcr7g_9+drxAN6jbaOSGLEHaappA@mail.gmail.com>


On 5/24/2018 10:04 AM, Or Gerlitz wrote:
> On Thu, May 24, 2018 at 5:22 AM, Jakub Kicinski
> <jakub.kicinski@netronome.com> wrote:
>> Hi!
>>
>> This series from John adds bond offload to the nfp driver.  Patch 5
>> exposes the hash type for NETDEV_LAG_TX_TYPE_HASH to make sure nfp
>> hashing matches that of the software LAG.  This may be unnecessarily
>> conservative, let's see what LAG maintainers think :)
>>
>> John says:
>>
>> This patchset sets up the infrastructure and offloads output actions for
>> when a TC flower rule attempts to egress a packet to a LAG port.
>>
>> Firstly it adds some of the infrastructure required to the flower app and
>> to the nfp core. This includes the ability to change the MAC address of a
>> repr, a function for combining lookup and write to a FW symbol, and the
>> addition of private data to a repr on a per app basis.
>>
>> Patch 6 continues by implementing notifiers that track Linux bonds and
>> communicates to the FW those which enslave reprs, along with the current
>> state of reprs within the bond.
>>
>> Patch 7 ensures bonds are synchronised with FW by receiving and acting
>> upon cmsgs sent to the kernel. These may request that a bond message is
>> retransmitted when FW can process it, or may request a full sync of the
>> bonds defined in the kernel.
>>
>> Patch 8 offloads a flower action when that action requires egressing to a
>> pre-defined Linux bond.
> Does this apply also to non-uplink representors? if yes, what is the use case?
>
> We are looking on supporting uplink lag in sriov switchdev scheme - we refer to
> it as "vf lag" -- b/c the netdev and rdma devices seen by the VF are actually
> subject to HA and/or LAG - I wasn't sure if/how you limit this series
> to uplink reprs

Also, does this patchset support offloading LAG when using vxlan based tunnels?

When using OVS offloading with vxlan,  the encap rule that gets offloaded via tc-flower
has egress port as vxlan device and the decap rule has the in-port as vxlan device, not
the actual egress port.  How are you addressing this issue?

^ permalink raw reply

* Re: [PATCH net-next 0/8] nfp: offload LAG for tc flower egress
From: Jakub Kicinski @ 2018-05-24 18:49 UTC (permalink / raw)
  To: Or Gerlitz
  Cc: David Miller, Linux Netdev List, oss-drivers, Jiri Pirko,
	Jay Vosburgh, Veaceslav Falico, Andy Gospodarek
In-Reply-To: <CAJ3xEMj48Gvox-hCyrGEXNtcr7g_9+drxAN6jbaOSGLEHaappA@mail.gmail.com>

On Thu, 24 May 2018 20:04:56 +0300, Or Gerlitz wrote:
> On Thu, May 24, 2018 at 5:22 AM, Jakub Kicinski wrote:
> > Hi!
> >
> > This series from John adds bond offload to the nfp driver.  Patch 5
> > exposes the hash type for NETDEV_LAG_TX_TYPE_HASH to make sure nfp
> > hashing matches that of the software LAG.  This may be unnecessarily
> > conservative, let's see what LAG maintainers think :)
> >
> > John says:
> >
> > This patchset sets up the infrastructure and offloads output actions for
> > when a TC flower rule attempts to egress a packet to a LAG port.
> >
> > Firstly it adds some of the infrastructure required to the flower app and
> > to the nfp core. This includes the ability to change the MAC address of a
> > repr, a function for combining lookup and write to a FW symbol, and the
> > addition of private data to a repr on a per app basis.
> >
> > Patch 6 continues by implementing notifiers that track Linux bonds and
> > communicates to the FW those which enslave reprs, along with the current
> > state of reprs within the bond.
> >
> > Patch 7 ensures bonds are synchronised with FW by receiving and acting
> > upon cmsgs sent to the kernel. These may request that a bond message is
> > retransmitted when FW can process it, or may request a full sync of the
> > bonds defined in the kernel.
> >
> > Patch 8 offloads a flower action when that action requires egressing to a
> > pre-defined Linux bond.  
> 
> Does this apply also to non-uplink representors? if yes, what is the use case?
> 
> We are looking on supporting uplink lag in sriov switchdev scheme - we refer to
> it as "vf lag" -- b/c the netdev and rdma devices seen by the VF are actually
> subject to HA and/or LAG - I wasn't sure if/how you limit this series
> to uplink reprs

I don't think we have a limitation on the output port within the LAG.
But keep in mind in our devices all ports belong to the same eswitch/PF
so bonding uplink ports is generally sufficient, I'm not sure VF
bonding adds much HA.  IOW AFAIK we support VF bonding because HW can do
it easily, not because we have a strong use case for it.

^ permalink raw reply

* Re: [PATCH net-next 0/8] nfp: offload LAG for tc flower egress
From: Jakub Kicinski @ 2018-05-24 18:53 UTC (permalink / raw)
  To: Samudrala, Sridhar
  Cc: Or Gerlitz, David Miller, Linux Netdev List, oss-drivers,
	Jiri Pirko, Jay Vosburgh, Veaceslav Falico, Andy Gospodarek
In-Reply-To: <185db5ee-4a86-6479-46e6-4c48f9516a90@intel.com>

On Thu, 24 May 2018 11:23:00 -0700, Samudrala, Sridhar wrote:
> On 5/24/2018 10:04 AM, Or Gerlitz wrote:
> > On Thu, May 24, 2018 at 5:22 AM, Jakub Kicinski
> > <jakub.kicinski@netronome.com> wrote:  
> >> Hi!
> >>
> >> This series from John adds bond offload to the nfp driver.  Patch 5
> >> exposes the hash type for NETDEV_LAG_TX_TYPE_HASH to make sure nfp
> >> hashing matches that of the software LAG.  This may be unnecessarily
> >> conservative, let's see what LAG maintainers think :)
> >>
> >> John says:
> >>
> >> This patchset sets up the infrastructure and offloads output actions for
> >> when a TC flower rule attempts to egress a packet to a LAG port.
> >>
> >> Firstly it adds some of the infrastructure required to the flower app and
> >> to the nfp core. This includes the ability to change the MAC address of a
> >> repr, a function for combining lookup and write to a FW symbol, and the
> >> addition of private data to a repr on a per app basis.
> >>
> >> Patch 6 continues by implementing notifiers that track Linux bonds and
> >> communicates to the FW those which enslave reprs, along with the current
> >> state of reprs within the bond.
> >>
> >> Patch 7 ensures bonds are synchronised with FW by receiving and acting
> >> upon cmsgs sent to the kernel. These may request that a bond message is
> >> retransmitted when FW can process it, or may request a full sync of the
> >> bonds defined in the kernel.
> >>
> >> Patch 8 offloads a flower action when that action requires egressing to a
> >> pre-defined Linux bond.  
> > Does this apply also to non-uplink representors? if yes, what is the use case?
> >
> > We are looking on supporting uplink lag in sriov switchdev scheme - we refer to
> > it as "vf lag" -- b/c the netdev and rdma devices seen by the VF are actually
> > subject to HA and/or LAG - I wasn't sure if/how you limit this series
> > to uplink reprs  
> 
> Also, does this patchset support offloading LAG when using vxlan based tunnels?
> 
> When using OVS offloading with vxlan,  the encap rule that gets offloaded via tc-flower
> has egress port as vxlan device and the decap rule has the in-port as vxlan device, not
> the actual egress port.  How are you addressing this issue?

It is very much on our radar, I think we will send out a related RFC
later today :)

But to be honest I think you can just install an egress callback on the
bond and that will pretty much work today.  You don't have to "own" the
egress device to install a egdev callback on it.

^ permalink raw reply

* [PATCH] rtlwifi: remove duplicate code
From: Gustavo A. R. Silva @ 2018-05-24 18:54 UTC (permalink / raw)
  To: Ping-Ke Shih, Kalle Valo, David S. Miller
  Cc: linux-wireless, netdev, linux-kernel, Gustavo A. R. Silva

Remove and refactor some code in order to avoid having identical code
for different branches.

Notice that the logic has been there since 2014.

Addresses-Coverity-ID: 1426199 ("Identical code for different branches")
Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
---
 .../realtek/rtlwifi/btcoexist/halbtc8723b2ant.c    | 23 ++++------------------
 1 file changed, 4 insertions(+), 19 deletions(-)

diff --git a/drivers/net/wireless/realtek/rtlwifi/btcoexist/halbtc8723b2ant.c b/drivers/net/wireless/realtek/rtlwifi/btcoexist/halbtc8723b2ant.c
index 279fe01..df3facc 100644
--- a/drivers/net/wireless/realtek/rtlwifi/btcoexist/halbtc8723b2ant.c
+++ b/drivers/net/wireless/realtek/rtlwifi/btcoexist/halbtc8723b2ant.c
@@ -2876,25 +2876,10 @@ static void btc8723b2ant_action_hid(struct btc_coexist *btcoexist)
 		btc8723b2ant_ps_tdma(btcoexist, NORMAL_EXEC, true, 13);
 
 	/* sw mechanism */
-	if (BTC_WIFI_BW_HT40 == wifi_bw) {
-		if ((wifi_rssi_state == BTC_RSSI_STATE_HIGH) ||
-		    (wifi_rssi_state == BTC_RSSI_STATE_STAY_HIGH)) {
-			btc8723b2ant_sw_mechanism(btcoexist, true, true,
-						  false, false);
-		} else {
-			btc8723b2ant_sw_mechanism(btcoexist, true, true,
-						  false, false);
-		}
-	} else {
-		if ((wifi_rssi_state == BTC_RSSI_STATE_HIGH) ||
-		    (wifi_rssi_state == BTC_RSSI_STATE_STAY_HIGH)) {
-			btc8723b2ant_sw_mechanism(btcoexist, false, true,
-						  false, false);
-		} else {
-			btc8723b2ant_sw_mechanism(btcoexist, false, true,
-						  false, false);
-		}
-	}
+	if (wifi_bw == BTC_WIFI_BW_HT40)
+		btc8723b2ant_sw_mechanism(btcoexist, true, true, false, false);
+	else
+		btc8723b2ant_sw_mechanism(btcoexist, false, true, false, false);
 }
 
 /* A2DP only / PAN(EDR) only/ A2DP+PAN(HS) */
-- 
2.7.4

^ permalink raw reply related

* Re: Poor TCP performance with XPS enabled after scrubbing skb
From: Flavio Leitner @ 2018-05-24 19:17 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev, Paolo Abeni
In-Reply-To: <c8f6c2f8-c590-2912-c7af-8bce717480b6@gmail.com>

On Tue, May 15, 2018 at 02:08:09PM -0700, Eric Dumazet wrote:
> 
> 
> On 05/15/2018 12:31 PM, Flavio Leitner wrote:
> > Hi,
> > 
> > There is a significant throughput issue (~50% drop) for a single TCP
> > stream when the skb is scrubbed and XPS is enabled.
> > 
> > If I turn CONFIG_XPS off, then the issue never happens and the test
> > reaches line rate.  The same happens if I echo 0 to tx-*/xps_cpus.
> > 
> > It looks like that when the skb is scrubbed, there is no more reference
> > to the struct sock, 
> 
> And this is really the problem here, since it breaks back pressure (and TCP Small queues)
> 
> I am not sure why skb_orphan() is used in this scrubbing really.
> 

veth originally called skb_orphan() on veth_xmit() most probably
because there was no TX completion. Then the code got generalized to
dev_forward_skb() and later on moved to skb_scrub_packet().

The issue is that we call skb_scrub_packet() on TX and RX paths and
that is done while crossing netns.  It doesn't look correct to keep
the ->sk because I suspect that iptables/selinux/bpf, or some code
path that I am probably missing could expose/use the wrong ->sk, for
example.

However, netdev_pick_tx() can't store the queue mapping without ->sk.

The hack in the first email relies on the headers (skb_tx_hash) to
always selected the same TX queue, which solves the original problem
but not the TCP small queues you mentioned.

-- 
Flavio

^ permalink raw reply

* Re: [PATCH] rtlwifi: remove duplicate code
From: Joe Perches @ 2018-05-24 19:24 UTC (permalink / raw)
  To: Gustavo A. R. Silva, Ping-Ke Shih, Kalle Valo, David S. Miller
  Cc: linux-wireless, netdev, linux-kernel
In-Reply-To: <20180524185450.GA2875@embeddedor.com>

On Thu, 2018-05-24 at 13:54 -0500, Gustavo A. R. Silva wrote:
> Remove and refactor some code in order to avoid having identical code
> for different branches.

True and nice tool and patch submittal thanks.

> Notice that the logic has been there since 2014.

But perhaps the original logic is a defective copy/paste
and it should be corrected instead.

Can anyone from realtek verify this?

> Addresses-Coverity-ID: 1426199 ("Identical code for different branches")
> Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
> ---
>  .../realtek/rtlwifi/btcoexist/halbtc8723b2ant.c    | 23 ++++------------------
>  1 file changed, 4 insertions(+), 19 deletions(-)
> 
> diff --git a/drivers/net/wireless/realtek/rtlwifi/btcoexist/halbtc8723b2ant.c b/drivers/net/wireless/realtek/rtlwifi/btcoexist/halbtc8723b2ant.c
> index 279fe01..df3facc 100644
> --- a/drivers/net/wireless/realtek/rtlwifi/btcoexist/halbtc8723b2ant.c
> +++ b/drivers/net/wireless/realtek/rtlwifi/btcoexist/halbtc8723b2ant.c
> @@ -2876,25 +2876,10 @@ static void btc8723b2ant_action_hid(struct btc_coexist *btcoexist)
>  		btc8723b2ant_ps_tdma(btcoexist, NORMAL_EXEC, true, 13);
>  
>  	/* sw mechanism */
> -	if (BTC_WIFI_BW_HT40 == wifi_bw) {
> -		if ((wifi_rssi_state == BTC_RSSI_STATE_HIGH) ||
> -		    (wifi_rssi_state == BTC_RSSI_STATE_STAY_HIGH)) {
> -			btc8723b2ant_sw_mechanism(btcoexist, true, true,
> -						  false, false);
> -		} else {
> -			btc8723b2ant_sw_mechanism(btcoexist, true, true,
> -						  false, false);
> -		}
> -	} else {
> -		if ((wifi_rssi_state == BTC_RSSI_STATE_HIGH) ||
> -		    (wifi_rssi_state == BTC_RSSI_STATE_STAY_HIGH)) {
> -			btc8723b2ant_sw_mechanism(btcoexist, false, true,
> -						  false, false);
> -		} else {
> -			btc8723b2ant_sw_mechanism(btcoexist, false, true,
> -						  false, false);
> -		}
> -	}
> +	if (wifi_bw == BTC_WIFI_BW_HT40)
> +		btc8723b2ant_sw_mechanism(btcoexist, true, true, false, false);
> +	else
> +		btc8723b2ant_sw_mechanism(btcoexist, false, true, false, false);
>  }
>  
>  /* A2DP only / PAN(EDR) only/ A2DP+PAN(HS) */

^ permalink raw reply

* Re: [PATCH net-next 0/8] nfp: offload LAG for tc flower egress
From: Or Gerlitz @ 2018-05-24 19:26 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: David Miller, Linux Netdev List, oss-drivers, Jiri Pirko,
	Jay Vosburgh, Veaceslav Falico, Andy Gospodarek
In-Reply-To: <20180524114929.0fb4e38f@cakuba>

On Thu, May 24, 2018 at 9:49 PM, Jakub Kicinski
<jakub.kicinski@netronome.com> wrote:
> On Thu, 24 May 2018 20:04:56 +0300, Or Gerlitz wrote:

>> Does this apply also to non-uplink representors? if yes, what is the use case?
>>
>> We are looking on supporting uplink lag in sriov switchdev scheme - we refer to
>> it as "vf lag" -- b/c the netdev and rdma devices seen by the VF are actually
>> subject to HA and/or LAG - I wasn't sure if/how you limit this series
>> to uplink reprs
>
> I don't think we have a limitation on the output port within the LAG.
> But keep in mind in our devices all ports belong to the same eswitch/PF
> so bonding uplink ports is generally sufficient, I'm not sure VF
> bonding adds much HA.  IOW AFAIK we support VF bonding because HW can do
> it easily, not because we have a strong use case for it.


To make it clear, vf lag is code name for uplink lag, I think we want
to say that
we provide the VM a lagged VF, anyway, again, the lag is done on the uplink reps
not on the vf reps. Unlike the uplink port which is physical one, the
vf vport is virtual
one, what could be the benefit to bond two vports?

^ permalink raw reply

* Re: [PATCH net] vhost: synchronize IOTLB message with dev cleanup
From: David Miller @ 2018-05-24 19:35 UTC (permalink / raw)
  To: jasowang; +Cc: mst, kvm, virtualization, netdev, linux-kernel
In-Reply-To: <1526990337-24892-1-git-send-email-jasowang@redhat.com>

From: Jason Wang <jasowang@redhat.com>
Date: Tue, 22 May 2018 19:58:57 +0800

> DaeRyong Jeong reports a race between vhost_dev_cleanup() and
> vhost_process_iotlb_msg():
> 
> Thread interleaving:
> CPU0 (vhost_process_iotlb_msg)			CPU1 (vhost_dev_cleanup)
> (In the case of both VHOST_IOTLB_UPDATE and
> VHOST_IOTLB_INVALIDATE)
> =====						=====
> 						vhost_umem_clean(dev->iotlb);
> if (!dev->iotlb) {
> 	        ret = -EFAULT;
> 		        break;
> }
> 						dev->iotlb = NULL;
> 
> The reason is we don't synchronize between them, fixing by protecting
> vhost_process_iotlb_msg() with dev mutex.
> 
> Reported-by: DaeRyong Jeong <threeearcat@gmail.com>
> Fixes: 6b1e6cc7855b0 ("vhost: new device IOTLB API")
> Signed-off-by: Jason Wang <jasowang@redhat.com>

Michael, please review.

^ permalink raw reply

* Re: [PATCH] net: phy: replace bool members in struct phy_device with bit-fields
From: David Miller @ 2018-05-24 19:36 UTC (permalink / raw)
  To: hkallweit1; +Cc: f.fainelli, andrew, netdev
In-Reply-To: <3c59ea3d-f707-b991-1f88-8540891488b9@gmail.com>

From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Wed, 23 May 2018 08:05:20 +0200

> In struct phy_device we have a number of flags being defined as type
> bool. Similar to e.g. struct pci_dev we can save some space by using
> bit-fields.
> 
> Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>

Applied to net-next, thanks.

^ permalink raw reply

* [PATCH net] ibmvnic: Fix partial success login retries
From: Thomas Falcon @ 2018-05-24 19:37 UTC (permalink / raw)
  To: netdev; +Cc: nfont, jallen, Thomas Falcon

In its current state, the driver will handle backing device
login in a loop for a certain number of retries while the
device returns a partial success, indicating that the driver
may need to try again using a smaller number of resources.

The variable it checks to continue retrying may change
over the course of operations, resulting in reallocation
of resources but exits without sending the login attempt.
Guard against this by introducing a boolean variable that
will retain the state indicating that the driver needs to
reattempt login with backing device firmware.

Signed-off-by: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 4bb4646..4382bff 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -796,9 +796,11 @@ static int ibmvnic_login(struct net_device *netdev)
 	struct ibmvnic_adapter *adapter = netdev_priv(netdev);
 	unsigned long timeout = msecs_to_jiffies(30000);
 	int retry_count = 0;
+	bool retry;
 	int rc;
 
 	do {
+		retry = false;
 		if (retry_count > IBMVNIC_MAX_QUEUES) {
 			netdev_warn(netdev, "Login attempts exceeded\n");
 			return -1;
@@ -822,6 +824,9 @@ static int ibmvnic_login(struct net_device *netdev)
 			retry_count++;
 			release_sub_crqs(adapter, 1);
 
+			retry = true;
+			netdev_dbg(netdev,
+				   "Received partial success, retrying...\n");
 			adapter->init_done_rc = 0;
 			reinit_completion(&adapter->init_done);
 			send_cap_queries(adapter);
@@ -849,7 +854,7 @@ static int ibmvnic_login(struct net_device *netdev)
 			netdev_warn(netdev, "Adapter login failed\n");
 			return -1;
 		}
-	} while (adapter->init_done_rc == PARTIALSUCCESS);
+	} while (retry);
 
 	/* handle pending MAC address changes after successful login */
 	if (adapter->mac_change_pending) {
-- 
1.8.3.1

^ permalink raw reply related

* Re: [PATCH v3] powerpc: Implement csum_ipv6_magic in assembly
From: Segher Boessenkool @ 2018-05-24 19:42 UTC (permalink / raw)
  To: Christophe LEROY
  Cc: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
	linux-kernel, linuxppc-dev, netdev
In-Reply-To: <3848a4ad-2c0e-691f-e98f-347cfe3484e8@c-s.fr>

On Thu, May 24, 2018 at 08:20:16AM +0200, Christophe LEROY wrote:
> Le 23/05/2018 à 20:34, Segher Boessenkool a écrit :
> >On Tue, May 22, 2018 at 08:57:01AM +0200, Christophe Leroy wrote:
> >>+_GLOBAL(csum_ipv6_magic)
> >>+	lwz	r8, 0(r3)
> >>+	lwz	r9, 4(r3)
> >>+	lwz	r10, 8(r3)
> >>+	lwz	r11, 12(r3)
> >>+	addc	r0, r5, r6
> >>+	adde	r0, r0, r7
> >>+	adde	r0, r0, r8
> >>+	adde	r0, r0, r9
> >>+	adde	r0, r0, r10
> >>+	adde	r0, r0, r11
> >>+	lwz	r8, 0(r4)
> >>+	lwz	r9, 4(r4)
> >>+	lwz	r10, 8(r4)
> >>+	lwz	r11, 12(r4)
> >>+	adde	r0, r0, r8
> >>+	adde	r0, r0, r9
> >>+	adde	r0, r0, r10
> >>+	adde	r0, r0, r11
> >>+	addze	r0, r0
> >>+	rotlwi	r3, r0, 16
> >>+	add	r3, r0, r3
> >>+	not	r3, r3
> >>+	rlwinm	r3, r3, 16, 16, 31
> >>+	blr
> >>+EXPORT_SYMBOL(csum_ipv6_magic)
> >
> >Clustering the loads and carry insns together is pretty much the worst you
> >can do on most 32-bit CPUs.
> 
> Oh, really ? __csum_partial is written that way too.

I thought I told you about this before?  Maybe not.

> Right, now I tried interleaving the lwz and adde. I get no improvment at 
> all on a 885, but I get a 15% improvment on a 8321.

It won't likely help on single-issue cores (like the one 885 has), yes.


Segher

^ permalink raw reply

* Re: [PATCH] rtlwifi: remove duplicate code
From: Gustavo A. R. Silva @ 2018-05-24 19:47 UTC (permalink / raw)
  To: Joe Perches, Ping-Ke Shih, Kalle Valo, David S. Miller
  Cc: linux-wireless, netdev, linux-kernel
In-Reply-To: <57f2881d031de041a6ce95cf8adc5ce80551ce19.camel@perches.com>

Hi Joe,

On 05/24/2018 02:24 PM, Joe Perches wrote:
> On Thu, 2018-05-24 at 13:54 -0500, Gustavo A. R. Silva wrote:
>> Remove and refactor some code in order to avoid having identical code
>> for different branches.
> 
> True and nice tool and patch submittal thanks.
> 
>> Notice that the logic has been there since 2014.
> 
> But perhaps the original logic is a defective copy/paste
> and it should be corrected instead.
> 
> Can anyone from realtek verify this?
> 

I actually used gitk to track down the last changes made to this code 
and, it doesn't look like a copy/paste issue:

commit: c6821613e653aae4f54c75689e229e3f063b7f69
commit: 27a31a60a4de4c1b45e371152bb6e701e1a8cc40

Thanks
--
Gustavo

>> Addresses-Coverity-ID: 1426199 ("Identical code for different branches")
>> Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
>> ---
>>   .../realtek/rtlwifi/btcoexist/halbtc8723b2ant.c    | 23 ++++------------------
>>   1 file changed, 4 insertions(+), 19 deletions(-)
>>
>> diff --git a/drivers/net/wireless/realtek/rtlwifi/btcoexist/halbtc8723b2ant.c b/drivers/net/wireless/realtek/rtlwifi/btcoexist/halbtc8723b2ant.c
>> index 279fe01..df3facc 100644
>> --- a/drivers/net/wireless/realtek/rtlwifi/btcoexist/halbtc8723b2ant.c
>> +++ b/drivers/net/wireless/realtek/rtlwifi/btcoexist/halbtc8723b2ant.c
>> @@ -2876,25 +2876,10 @@ static void btc8723b2ant_action_hid(struct btc_coexist *btcoexist)
>>   		btc8723b2ant_ps_tdma(btcoexist, NORMAL_EXEC, true, 13);
>>   
>>   	/* sw mechanism */
>> -	if (BTC_WIFI_BW_HT40 == wifi_bw) {
>> -		if ((wifi_rssi_state == BTC_RSSI_STATE_HIGH) ||
>> -		    (wifi_rssi_state == BTC_RSSI_STATE_STAY_HIGH)) {
>> -			btc8723b2ant_sw_mechanism(btcoexist, true, true,
>> -						  false, false);
>> -		} else {
>> -			btc8723b2ant_sw_mechanism(btcoexist, true, true,
>> -						  false, false);
>> -		}
>> -	} else {
>> -		if ((wifi_rssi_state == BTC_RSSI_STATE_HIGH) ||
>> -		    (wifi_rssi_state == BTC_RSSI_STATE_STAY_HIGH)) {
>> -			btc8723b2ant_sw_mechanism(btcoexist, false, true,
>> -						  false, false);
>> -		} else {
>> -			btc8723b2ant_sw_mechanism(btcoexist, false, true,
>> -						  false, false);
>> -		}
>> -	}
>> +	if (wifi_bw == BTC_WIFI_BW_HT40)
>> +		btc8723b2ant_sw_mechanism(btcoexist, true, true, false, false);
>> +	else
>> +		btc8723b2ant_sw_mechanism(btcoexist, false, true, false, false);
>>   }
>>   
>>   /* A2DP only / PAN(EDR) only/ A2DP+PAN(HS) */

^ permalink raw reply

* [PATCH] 8139too: Remove unnecessary netif_napi_del()
From: Bo Chen @ 2018-05-24 19:48 UTC (permalink / raw)
  To: davem; +Cc: netdev, linux-kernel, Bo Chen

The call to free_netdev() in __rtl8139_cleanup_dev() clears the network device
napi list, and explicit calls to netif_napi_del() are unnecessary.

Signed-off-by: Bo Chen <chenbo@pdx.edu>
---
 drivers/net/ethernet/realtek/8139too.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/net/ethernet/realtek/8139too.c b/drivers/net/ethernet/realtek/8139too.c
index d118da5a10a2..ffd68a7bc9e1 100644
--- a/drivers/net/ethernet/realtek/8139too.c
+++ b/drivers/net/ethernet/realtek/8139too.c
@@ -1104,7 +1104,6 @@ static int rtl8139_init_one(struct pci_dev *pdev,
 	return 0;
 
 err_out:
-	netif_napi_del(&tp->napi);
 	__rtl8139_cleanup_dev (dev);
 	pci_disable_device (pdev);
 	return i;
@@ -1119,7 +1118,6 @@ static void rtl8139_remove_one(struct pci_dev *pdev)
 	assert (dev != NULL);
 
 	cancel_delayed_work_sync(&tp->thread);
-	netif_napi_del(&tp->napi);
 
 	unregister_netdev (dev);
 
-- 
2.17.0

^ permalink raw reply related

* Re: [PATCH net-next] vrf: add CRC32c offload to device features
From: Marcelo Ricardo Leitner @ 2018-05-24 19:54 UTC (permalink / raw)
  To: Davide Caratti; +Cc: David Ahern, Vlad Yasevich, linux-sctp, netdev
In-Reply-To: <bb3aa69eaef613f033f8f52674740286ba67dc31.1527175921.git.dcaratti@redhat.com>

On Thu, May 24, 2018 at 05:49:35PM +0200, Davide Caratti wrote:
> SCTP sockets originated in a VRF can improve their performance if CRC32c
> computation is delegated to underlying devices: update device features,
> setting NETIF_F_SCTP_CRC. Iterating the following command in the topology
> proposed with [1],
> 
>  # ip vrf exec vrf-h2 netperf -H 192.0.2.1 -t SCTP_STREAM -- -m 10K
> 
> the measured throughput in Mbit/s improved from 2395 ± 1% to 2720 ± 1%.
> 
> [1] https://www.spinics.net/lists/netdev/msg486007.html
> 
> Signed-off-by: Davide Caratti <dcaratti@redhat.com>

Reviewed-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>

> ---
>  drivers/net/vrf.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
> index 90b5f3900c22..f93547f257fb 100644
> --- a/drivers/net/vrf.c
> +++ b/drivers/net/vrf.c
> @@ -1254,7 +1254,7 @@ static void vrf_setup(struct net_device *dev)
>  
>  	/* enable offload features */
>  	dev->features   |= NETIF_F_GSO_SOFTWARE;
> -	dev->features   |= NETIF_F_RXCSUM | NETIF_F_HW_CSUM;
> +	dev->features   |= NETIF_F_RXCSUM | NETIF_F_HW_CSUM | NETIF_F_SCTP_CRC;
>  	dev->features   |= NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA;
>  
>  	dev->hw_features = dev->features;
> -- 
> 2.17.0
> 

^ permalink raw reply

* Re: [PATCH v3] powerpc: Implement csum_ipv6_magic in assembly
From: Segher Boessenkool @ 2018-05-24 19:55 UTC (permalink / raw)
  To: Christophe Leroy; +Cc: linux-kernel, Paul Mackerras, netdev, linuxppc-dev
In-Reply-To: <1dac2356-5d8a-2892-109e-6e1b26c2bd8c@c-s.fr>

On Thu, May 24, 2018 at 10:18:44AM +0000, Christophe Leroy wrote:
> On 05/24/2018 06:20 AM, Christophe LEROY wrote:
> >Le 23/05/2018 à 20:34, Segher Boessenkool a écrit :
> >>On Tue, May 22, 2018 at 08:57:01AM +0200, Christophe Leroy wrote:
> >>>The generic csum_ipv6_magic() generates a pretty bad result
> >>
> >><snip>
> >>
> >>Please try with a more recent compiler, what you used is pretty ancient.
> >>It's not like recent compilers do great on this either, but it's not
> >>*that* bad anymore ;-)
> 
> Here is what I get with GCC 8.1
> It doesn't look much better, does it ?

There are no more mfocrf, which is a big speedup.  Other than that it is
pretty lousy still, I totally agree.  This improvement happened quite a
while ago, it's fixed in GCC 6.


Segher

^ permalink raw reply

* [PATCH net-next] net: phy: convert further flags in struct phy_device to bit-field
From: Heiner Kallweit @ 2018-05-24 20:15 UTC (permalink / raw)
  To: Andrew Lunn, Florian Fainelli, David Miller; +Cc: netdev@vger.kernel.org

This patch is a follow-up to 87e5808d52b6 ("net: phy: replace bool
members in struct phy_device with bit-fields") and converts further
flags to bit-fields.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
---
 include/linux/phy.h | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/include/linux/phy.h b/include/linux/phy.h
index 6cd090984..cc66f2834 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -418,21 +418,20 @@ struct phy_device {
 	/* The most recently read link state */
 	unsigned link:1;
 
+	/* forced speed & duplex (no autoneg)
+	 * partner speed & duplex & pause (autoneg)
+	 */
+	unsigned pause:1;
+	unsigned asym_pause:1;
+	int speed;
+	int duplex;
+
 	enum phy_state state;
 
 	u32 dev_flags;
 
 	phy_interface_t interface;
 
-	/*
-	 * forced speed & duplex (no autoneg)
-	 * partner speed & duplex & pause (autoneg)
-	 */
-	int speed;
-	int duplex;
-	int pause;
-	int asym_pause;
-
 	/* Enabled Interrupts */
 	u32 interrupts;
 
-- 
2.17.0

^ permalink raw reply related

* [PATCH net-next] net: phy: realtek: add suspend/resume callbacks for RTL8211B
From: Heiner Kallweit @ 2018-05-24 20:40 UTC (permalink / raw)
  To: David Miller, Realtek linux nic maintainers, Hau,
	Florian Fainelli, Andrew Lunn
  Cc: netdev@vger.kernel.org, Kevin Hao

Add RTL8211B suspend / resume callbacks.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
---
This patch is based on my knowledge of the r8169 driver, and on some
guessing. Therefore I'd appreciate a confirmation from Realtek.

The integrated PHY in some chips supported by the r8169 driver uses
a special sequence for power-down/-up. I have a board with a RTL8168D
network chip (one of the chips using the special sequence) and there
the PHY identifies as RTL8211B. So my guess is that this applies also
to external RTL8211B PHY's.

A hint for RTL8211B requiring a special sequence is that no suspend/
resume callbacks are defined yet in the Realtek PHY driver.
Last but not least the non-standard usage of register MII_MMD_DATA
is in line with the description of patch 0231b1a074c6.
("net: phy: realtek: Use the dummy stubs for MMD register access for rtl8211b")
---
 drivers/net/phy/realtek.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/drivers/net/phy/realtek.c b/drivers/net/phy/realtek.c
index 9f48ecf9c..082fb40c6 100644
--- a/drivers/net/phy/realtek.c
+++ b/drivers/net/phy/realtek.c
@@ -145,6 +145,20 @@ static int rtl8211f_config_init(struct phy_device *phydev)
 	return phy_modify_paged(phydev, 0xd08, 0x11, RTL8211F_TX_DELAY, val);
 }
 
+static int rtl8211b_suspend(struct phy_device *phydev)
+{
+	phy_write(phydev, MII_MMD_DATA, BIT(9));
+
+	return genphy_suspend(phydev);
+}
+
+static int rtl8211b_resume(struct phy_device *phydev)
+{
+	phy_write(phydev, MII_MMD_DATA, 0);
+
+	return genphy_resume(phydev);
+}
+
 static struct phy_driver realtek_drvs[] = {
 	{
 		.phy_id         = 0x00008201,
@@ -174,6 +188,8 @@ static struct phy_driver realtek_drvs[] = {
 		.config_intr	= &rtl8211b_config_intr,
 		.read_mmd	= &genphy_read_mmd_unsupported,
 		.write_mmd	= &genphy_write_mmd_unsupported,
+		.suspend	= rtl8211b_suspend,
+		.resume		= rtl8211b_resume,
 	}, {
 		.phy_id		= 0x001cc914,
 		.name		= "RTL8211DN Gigabit Ethernet",
-- 
2.17.0

^ permalink raw reply related

* Re: [PATCH net-next] net: phy: realtek: add suspend/resume callbacks for RTL8211B
From: Andrew Lunn @ 2018-05-24 20:53 UTC (permalink / raw)
  To: Heiner Kallweit
  Cc: David Miller, Realtek linux nic maintainers, Hau,
	Florian Fainelli, netdev@vger.kernel.org, Kevin Hao
In-Reply-To: <c6bcb091-1602-0ae9-8112-c4a6284b9c35@gmail.com>

On Thu, May 24, 2018 at 10:40:12PM +0200, Heiner Kallweit wrote:
> Add RTL8211B suspend / resume callbacks.
> 
> Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
> ---
> This patch is based on my knowledge of the r8169 driver, and on some
> guessing. Therefore I'd appreciate a confirmation from Realtek.
> 
> The integrated PHY in some chips supported by the r8169 driver uses
> a special sequence for power-down/-up. I have a board with a RTL8168D
> network chip (one of the chips using the special sequence) and there
> the PHY identifies as RTL8211B. So my guess is that this applies also
> to external RTL8211B PHY's.
> 
> A hint for RTL8211B requiring a special sequence is that no suspend/
> resume callbacks are defined yet in the Realtek PHY driver.
> Last but not least the non-standard usage of register MII_MMD_DATA
> is in line with the description of patch 0231b1a074c6.
> ("net: phy: realtek: Use the dummy stubs for MMD register access for rtl8211b")
> ---
>  drivers/net/phy/realtek.c | 16 ++++++++++++++++
>  1 file changed, 16 insertions(+)
> 
> diff --git a/drivers/net/phy/realtek.c b/drivers/net/phy/realtek.c
> index 9f48ecf9c..082fb40c6 100644
> --- a/drivers/net/phy/realtek.c
> +++ b/drivers/net/phy/realtek.c
> @@ -145,6 +145,20 @@ static int rtl8211f_config_init(struct phy_device *phydev)
>  	return phy_modify_paged(phydev, 0xd08, 0x11, RTL8211F_TX_DELAY, val);
>  }
>  
> +static int rtl8211b_suspend(struct phy_device *phydev)
> +{
> +	phy_write(phydev, MII_MMD_DATA, BIT(9));

Hi Heiner

Using it like this suggests it is not actually MMD_DATA, it is
something else which just happens to use the same address as the
optional MMD_DATA. To make this clearer, it would be good to add
#defines for both the register address and this BIT(9). Is there any
vendor code you know of which might give you a clue for appropriate
names?

I guess this device also does not support EEE? Does phy_init_eee()
correctly figure this out? Is there a chance calling phy_init_eee()
might trigger a suspend?

       Andrew

^ permalink raw reply

* Re: [net-next 2/6] net/mlx5: Add pbmc and pptb in the port_access_reg_cap_mask
From: Or Gerlitz @ 2018-05-24 21:21 UTC (permalink / raw)
  To: Saeed Mahameed, Huy Nguyen
  Cc: David S. Miller, Linux Netdev List, Saeed Mahameed
In-Reply-To: <CALzJLG8Azhqnik7ZMXD7bM31uRN4o=wnBW1qGaEf4JimJykKkw@mail.gmail.com>

On Tue, May 22, 2018 at 7:01 PM, Saeed Mahameed
<saeedm@dev.mellanox.co.il> wrote:
> On Tue, May 22, 2018 at 3:21 AM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
>> On Tue, May 22, 2018 at 1:19 PM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
>>> On Tue, May 22, 2018 at 12:04 AM, Saeed Mahameed <saeedm@mellanox.com> wrote:
>>>> From: Huy Nguyen <huyn@mellanox.com>
>>>>
>>>> Add pbmc and pptb in the port_access_reg_cap_mask. These two
>>>> bits determine if device supports receive buffer configuration.
>>>>
>>>> Signed-off-by: Huy Nguyen <huyn@mellanox.com>
>>>
>>> Huy, Parav reviewed your code to death (but he's still alive and kicking!),
>>> go a head and add his R.Bs note to the entire series.

Just wanted to make sure you didn't miss this one, ack?



>> when you fix that, also address checkpatch's scream on
>>
>> WARNING: Missing or malformed SPDX-License-Identifier tag in line 1
>>
>> in four cases along the series
>>
>
> We are going to do this once for all mlx5 files soon, i don't want to
> have two types of license headers in the meanwhile.
> let's keep this as is until then.
>
>>
>>>> Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>

^ permalink raw reply

* Re: [net-next 2/6] net/mlx5: Add pbmc and pptb in the port_access_reg_cap_mask
From: Saeed Mahameed @ 2018-05-24 21:28 UTC (permalink / raw)
  To: Or Gerlitz; +Cc: Huy Nguyen, David S. Miller, Linux Netdev List, Saeed Mahameed
In-Reply-To: <CAJ3xEMjRMnQX-4+5j0Go5MSmCu9=vVT7Th-RTK4ox+0VN_4HhQ@mail.gmail.com>

On Thu, May 24, 2018 at 2:21 PM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
> On Tue, May 22, 2018 at 7:01 PM, Saeed Mahameed
> <saeedm@dev.mellanox.co.il> wrote:
>> On Tue, May 22, 2018 at 3:21 AM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
>>> On Tue, May 22, 2018 at 1:19 PM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
>>>> On Tue, May 22, 2018 at 12:04 AM, Saeed Mahameed <saeedm@mellanox.com> wrote:
>>>>> From: Huy Nguyen <huyn@mellanox.com>
>>>>>
>>>>> Add pbmc and pptb in the port_access_reg_cap_mask. These two
>>>>> bits determine if device supports receive buffer configuration.
>>>>>
>>>>> Signed-off-by: Huy Nguyen <huyn@mellanox.com>
>>>>
>>>> Huy, Parav reviewed your code to death (but he's still alive and kicking!),
>>>> go a head and add his R.Bs note to the entire series.
>
> Just wanted to make sure you didn't miss this one, ack?
>
ack

^ permalink raw reply

* [pull request][net-next V2 0/6] Mellanox, mlx5e updates 2018-05-19
From: Saeed Mahameed @ 2018-05-24 21:38 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Saeed Mahameed

Hi Dave,

This is a mlx5e only pull request, for more information please see tag
log below.

Please pull and let me know if there's any problem.

v1->v2:
1) patch #1 commit message: lldptool usage example and explanation on why 
   dcbnl is selected over devlink interface as was agreed on mailing list.

2) patches #1 and #6: Add total_size in dcbnl_buffer to report the total
   available buffer size of the netdev, as suggested by John.

3) Added Reviewed-by tag to all the patches.

Thanks,
Saeed.

---

The following changes since commit 87e5808d52b65fc5b0bfda209ba8864cc2f933e5:

  net: phy: replace bool members in struct phy_device with bit-fields (2018-05-24 15:35:58 -0400)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux.git tags/mlx5e-updates-2018-05-19

for you to fetch changes up to ecdf2dadee8e8c5015771b802a9851ff332d3fc4:

  net/mlx5e: Receive buffer support for DCBX (2018-05-24 14:23:33 -0700)

----------------------------------------------------------------
mlx5e-updates-2018-05-19

This series contains updates for mlx5e netdevice driver with one subject,
DSCP to priority mapping, in the first patch Huy adds the needed API in
dcbnl, the second patch adds the needed mlx5 core capability bits for the
feature, and all other patches are mlx5e (netdev) only changes to add
support for the feature.

From: Huy Nguyen

Dscp to priority mapping for Ethernet packet:

These patches enable differentiated services code point (dscp) to
priority mapping for Ethernet packet. Once this feature is
enabled, the packet is routed to the corresponding priority based on its
dscp. User can combine this feature with priority flow control (pfc)
feature to have priority flow control based on the dscp.

Firmware interface:
Mellanox firmware provides two control knobs for this feature:
  QPTS register allow changing the trust state between dscp and
  pcp mode. The default is pcp mode. Once in dscp mode, firmware will
  route the packet based on its dscp value if the dscp field exists.

  QPDPM register allow mapping a specific dscp (0 to 63) to a
  specific priority (0 to 7). By default, all the dscps are mapped to
  priority zero.

Software interface:
This feature is controlled via application priority TLV. IEEE
specification P802.1Qcd/D2.1 defines priority selector id 5 for
application priority TLV. This APP TLV selector defines DSCP to priority
map. This APP TLV can be sent by the switch or can be set locally using
software such as lldptool. In mlx5 drivers, we add the support for net
dcb's getapp and setapp call back. Mlx5 driver only handles the selector
id 5 application entry (dscp application priority application entry).
If user sends multiple dscp to priority APP TLV entries on the same
dscp, the last sent one will take effect. All the previous sent will be
deleted.

This attribute combined with pfc attribute allows advanced user to
fine tune the qos setting for specific priority queue. For example,
user can give dedicated buffer for one or more priorities or user
can give large buffer to certain priorities.

The dcb buffer configuration will be controlled by lldptool.
>> lldptool -T -i eth2 -V BUFFER prio 0,2,5,7,1,2,3,6
      maps priorities 0,1,2,3,4,5,6,7 to receive buffer 0,2,5,7,1,2,3,6
>> lldptool -T -i eth2 -V BUFFER size 87296,87296,0,87296,0,0,0,0
      sets receive buffer size for buffer 0,1,2,3,4,5,6,7 respectively

After discussion on mailing list with Jakub, Jiri, Ido and John, we agreed to
choose dcbnl over devlink interface since this feature is intended to set
port attributes which are governed by the netdev instance of that port, where
devlink API is more suitable for global ASIC configurations.

The firmware trust state (in QPTS register) is changed based on the
number of dscp to priority application entries. When the first dscp to
priority application entry is added by the user, the trust state is
changed to dscp. When the last dscp to priority application entry is
deleted by the user, the trust state is changed to pcp.

When the port is in DSCP trust state, the transmit queue is selected
based on the dscp of the skb.

When the port is in DSCP trust state and vport inline mode is not NONE,
firmware requires mlx5 driver to copy the IP header to the
wqe ethernet segment inline header if the skb has it.
This is done by changing the transmit queue sq's min inline mode to L3.
Note that the min inline mode of sqs that belong to other features
such as xdpsq, icosq are not modified.

----------------------------------------------------------------
Huy Nguyen (6):
      net/dcb: Add dcbnl buffer attribute
      net/mlx5e: Move port speed code from en_ethtool.c to en/port.c
      net/mlx5: Add pbmc and pptb in the port_access_reg_cap_mask
      net/mlx5: PPTB and PBMC register firmware command support
      net/mlx5e: Receive buffer configuration
      net/mlx5e: Receive buffer support for DCBX

 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   4 +-
 drivers/net/ethernet/mellanox/mlx5/core/en.h       |   8 +-
 .../net/ethernet/mellanox/mlx5/core/en/Makefile    |   1 +
 drivers/net/ethernet/mellanox/mlx5/core/en/port.c  | 237 +++++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/en/port.h  |  48 +++
 .../ethernet/mellanox/mlx5/core/en/port_buffer.c   | 327 +++++++++++++++++++++
 .../ethernet/mellanox/mlx5/core/en/port_buffer.h   |  75 +++++
 drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c | 132 ++++++++-
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   | 102 +++----
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |   3 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c    |   3 +-
 include/linux/mlx5/device.h                        |   3 +
 include/linux/mlx5/driver.h                        |   2 +
 include/linux/mlx5/mlx5_ifc.h                      |  47 +++
 include/net/dcbnl.h                                |   4 +
 include/uapi/linux/dcbnl.h                         |  11 +
 net/dcb/dcbnl.c                                    |  20 ++
 17 files changed, 947 insertions(+), 80 deletions(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/Makefile
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/port.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/port.h
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.h

^ permalink raw reply

* [PATCH net-next] ifb: fix packets checksum
From: Jon Maxwell @ 2018-05-24 21:38 UTC (permalink / raw)
  To: davem
  Cc: dsahern, mschiffer, zhangshengju, ktkhai, netdev, linux-kernel,
	jmaxwell

Fixup the checksum for CHECKSUM_COMPLETE when pulling skbs on RX path. 
Otherwise we get splats when tc mirred is used to redirect packets to ifb.

Before fix:

nic: hw csum failure

Signed-off-by: Jon Maxwell <jmaxwell37@gmail.com>
---
 drivers/net/ifb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c
index 5f2897ec0edc..d345c61d476c 100644
--- a/drivers/net/ifb.c
+++ b/drivers/net/ifb.c
@@ -102,7 +102,7 @@ static void ifb_ri_tasklet(unsigned long _txp)
 		if (!skb->tc_from_ingress) {
 			dev_queue_xmit(skb);
 		} else {
-			skb_pull(skb, skb->mac_len);
+			skb_pull_rcsum(skb, skb->mac_len);
 			netif_receive_skb(skb);
 		}
 	}
-- 
2.13.6

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox