Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next 3/8] bpf: register BPF_PROG_TYPE_TRACEPOINT program type
From: Alexei Starovoitov @ 2016-04-05  4:52 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Peter Zijlstra, David S . Miller, Ingo Molnar, Daniel Borkmann,
	Arnaldo Carvalho de Melo, Wang Nan, Josef Bacik, Brendan Gregg,
	netdev, linux-kernel, kernel-team
In-Reply-To: <1459831974-2891931-1-git-send-email-ast@fb.com>

register tracepoint bpf program type and let it call the same set
of helper functions as BPF_PROG_TYPE_KPROBE

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/trace/bpf_trace.c | 45 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 43 insertions(+), 2 deletions(-)

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 3e4ffb3ace5f..3e5ebe3254d2 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -268,7 +268,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = {
 	.arg5_type	= ARG_CONST_STACK_SIZE,
 };
 
-static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
+static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
 {
 	switch (func_id) {
 	case BPF_FUNC_map_lookup_elem:
@@ -295,12 +295,20 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
 		return &bpf_get_smp_processor_id_proto;
 	case BPF_FUNC_perf_event_read:
 		return &bpf_perf_event_read_proto;
+	default:
+		return NULL;
+	}
+}
+
+static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
+{
+	switch (func_id) {
 	case BPF_FUNC_perf_event_output:
 		return &bpf_perf_event_output_proto;
 	case BPF_FUNC_get_stackid:
 		return &bpf_get_stackid_proto;
 	default:
-		return NULL;
+		return tracing_func_proto(func_id);
 	}
 }
 
@@ -332,9 +340,42 @@ static struct bpf_prog_type_list kprobe_tl = {
 	.type	= BPF_PROG_TYPE_KPROBE,
 };
 
+static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
+{
+	switch (func_id) {
+	case BPF_FUNC_perf_event_output:
+	case BPF_FUNC_get_stackid:
+		return NULL;
+	default:
+		return tracing_func_proto(func_id);
+	}
+}
+
+static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type)
+{
+	if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE)
+		return false;
+	if (type != BPF_READ)
+		return false;
+	if (off % size != 0)
+		return false;
+	return true;
+}
+
+static const struct bpf_verifier_ops tracepoint_prog_ops = {
+	.get_func_proto  = tp_prog_func_proto,
+	.is_valid_access = tp_prog_is_valid_access,
+};
+
+static struct bpf_prog_type_list tracepoint_tl = {
+	.ops	= &tracepoint_prog_ops,
+	.type	= BPF_PROG_TYPE_TRACEPOINT,
+};
+
 static int __init register_kprobe_prog_ops(void)
 {
 	bpf_register_prog_type(&kprobe_tl);
+	bpf_register_prog_type(&tracepoint_tl);
 	return 0;
 }
 late_initcall(register_kprobe_prog_ops);
-- 
2.8.0

^ permalink raw reply related

* [PATCH net-next 0/8] allow bpf attach to tracepoints
From: Alexei Starovoitov @ 2016-04-05  4:52 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Peter Zijlstra, David S . Miller, Ingo Molnar, Daniel Borkmann,
	Arnaldo Carvalho de Melo, Wang Nan, Josef Bacik, Brendan Gregg,
	netdev, linux-kernel, kernel-team

Hi Steven, Peter,

last time we discussed bpf+tracepoints it was a year ago [1] and the reason
we didn't proceed with that approach was that bpf would make arguments
arg1, arg2 to trace_xx(arg1, arg2) call to be exposed to bpf program
and that was considered unnecessary extension of abi. Back then I wanted
to avoid the cost of buffer alloc and field assign part in all
of the tracepoints, but looks like when optimized the cost is acceptable.
So this new apporach doesn't expose any new abi to bpf program.
The program is looking at tracepoint fields after they were copied
by perf_trace_xx() and described in /sys/kernel/debug/tracing/events/xxx/format
We made a tool [2] that takes arguments from /sys/.../format and works as:
$ tplist.py -v random:urandom_read
    int got_bits;
    int pool_left;
    int input_left;
Then these fields can be copy-pasted into bpf program like:
struct urandom_read {
    __u64 hidden_pad;
    int got_bits;
    int pool_left;
    int input_left;
};
and the program can use it:
SEC("tracepoint/random/urandom_read")
int bpf_prog(struct urandom_read *ctx)
{
    return ctx->pool_left > 0 ? 1 : 0;
}
This way the program can access tracepoint fields faster than
equivalent bpf+kprobe program, which is the main goal of these patches.

Patch 1 and 2 are simple changes in perf core side, please review.
I'd like to take the whole set via net-next tree, since the rest of
the patches might conflict with other bpf work going on in net-next
and we want to avoid cross-tree merge conflicts.
Patch 7 is an example of access to tracepoint fields from bpf prog.
Patch 8 is a micro benchmark for bpf+kprobe vs bpf+tracepoint.

Note that for actual tracing tools the user doesn't need to
run tplist.py and copy-paste fields manually. The tools do it
automatically. Like argdist tool [3] can be used as:
$ argdist -H 't:block:block_rq_complete():u32:nr_sector'
where 'nr_sector' is name of tracepoint field taken from
/sys/kernel/debug/tracing/events/block/block_rq_complete/format
and appropriate bpf program is generated on the fly.

[1] http://thread.gmane.org/gmane.linux.kernel.api/8127/focus=8165
[2] https://github.com/iovisor/bcc/blob/master/tools/tplist.py
[3] https://github.com/iovisor/bcc/blob/master/tools/argdist.py

Alexei Starovoitov (8):
  perf: optimize perf_fetch_caller_regs
  perf, bpf: allow bpf programs attach to tracepoints
  bpf: register BPF_PROG_TYPE_TRACEPOINT program type
  bpf: support bpf_get_stackid() and bpf_perf_event_output() in
    tracepoint programs
  bpf: sanitize bpf tracepoint access
  samples/bpf: add tracepoint support to bpf loader
  samples/bpf: tracepoint example
  samples/bpf: add tracepoint vs kprobe performance tests

 include/linux/bpf.h                     |   2 +
 include/linux/perf_event.h              |   2 -
 include/linux/trace_events.h            |   1 +
 include/trace/perf.h                    |  18 +++-
 include/uapi/linux/bpf.h                |   1 +
 kernel/bpf/stackmap.c                   |   2 +-
 kernel/bpf/verifier.c                   |   6 +-
 kernel/events/core.c                    |  21 ++++-
 kernel/trace/bpf_trace.c                |  85 ++++++++++++++++-
 kernel/trace/trace_event_perf.c         |   4 +
 kernel/trace/trace_events.c             |  18 ++++
 samples/bpf/Makefile                    |   5 +
 samples/bpf/bpf_load.c                  |  26 +++++-
 samples/bpf/offwaketime_kern.c          |  26 +++++-
 samples/bpf/test_overhead_kprobe_kern.c |  41 ++++++++
 samples/bpf/test_overhead_tp_kern.c     |  36 +++++++
 samples/bpf/test_overhead_user.c        | 161 ++++++++++++++++++++++++++++++++
 17 files changed, 432 insertions(+), 23 deletions(-)
 create mode 100644 samples/bpf/test_overhead_kprobe_kern.c
 create mode 100644 samples/bpf/test_overhead_tp_kern.c
 create mode 100644 samples/bpf/test_overhead_user.c

-- 
2.8.0

^ permalink raw reply

* [PATCH net-next 5/8] bpf: sanitize bpf tracepoint access
From: Alexei Starovoitov @ 2016-04-05  4:52 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Peter Zijlstra, David S . Miller, Ingo Molnar, Daniel Borkmann,
	Arnaldo Carvalho de Melo, Wang Nan, Josef Bacik, Brendan Gregg,
	netdev, linux-kernel, kernel-team
In-Reply-To: <1459831974-2891931-1-git-send-email-ast@fb.com>

during bpf program loading remember the last byte of ctx access
and at the time of attaching the program to tracepoint check that
the program doesn't access bytes beyond defined in tracepoint fields

This also disallows access to __dynamic_array fields, but can be
relaxed in the future.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h          |  1 +
 include/linux/trace_events.h |  1 +
 kernel/bpf/verifier.c        |  6 +++++-
 kernel/events/core.c         |  8 ++++++++
 kernel/trace/trace_events.c  | 18 ++++++++++++++++++
 5 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 198f6ace70ec..b2365a6eba3d 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -131,6 +131,7 @@ struct bpf_prog_type_list {
 struct bpf_prog_aux {
 	atomic_t refcnt;
 	u32 used_map_cnt;
+	u32 max_ctx_offset;
 	const struct bpf_verifier_ops *ops;
 	struct bpf_map **used_maps;
 	struct bpf_prog *prog;
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 0810f81b6db2..97bd7da98567 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -569,6 +569,7 @@ extern int trace_define_field(struct trace_event_call *call, const char *type,
 			      int is_signed, int filter_type);
 extern int trace_add_event_call(struct trace_event_call *call);
 extern int trace_remove_event_call(struct trace_event_call *call);
+extern int trace_event_get_offsets(struct trace_event_call *call);
 
 #define is_signed_type(type)	(((type)(-1)) < (type)1)
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2e08f8e9b771..58792fed5678 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -652,8 +652,12 @@ static int check_ctx_access(struct verifier_env *env, int off, int size,
 			    enum bpf_access_type t)
 {
 	if (env->prog->aux->ops->is_valid_access &&
-	    env->prog->aux->ops->is_valid_access(off, size, t))
+	    env->prog->aux->ops->is_valid_access(off, size, t)) {
+		/* remember the offset of last byte accessed in ctx */
+		if (env->prog->aux->max_ctx_offset < off + size)
+			env->prog->aux->max_ctx_offset = off + size;
 		return 0;
+	}
 
 	verbose("invalid bpf_context access off=%d size=%d\n", off, size);
 	return -EACCES;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 58fc9a7d1562..e7e3c2057582 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7131,6 +7131,14 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
 		return -EINVAL;
 	}
 
+	if (is_tracepoint) {
+		int off = trace_event_get_offsets(event->tp_event);
+
+		if (prog->aux->max_ctx_offset > off) {
+			bpf_prog_put(prog);
+			return -EACCES;
+		}
+	}
 	event->tp_event->prog = prog;
 
 	return 0;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 05ddc0820771..ced963049e0a 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -204,6 +204,24 @@ static void trace_destroy_fields(struct trace_event_call *call)
 	}
 }
 
+/*
+ * run-time version of trace_event_get_offsets_<call>() that returns the last
+ * accessible offset of trace fields excluding __dynamic_array bytes
+ */
+int trace_event_get_offsets(struct trace_event_call *call)
+{
+	struct ftrace_event_field *tail;
+	struct list_head *head;
+
+	head = trace_get_fields(call);
+	/*
+	 * head->next points to the last field with the largest offset,
+	 * since it was added last by trace_define_field()
+	 */
+	tail = list_first_entry(head, struct ftrace_event_field, link);
+	return tail->offset + tail->size;
+}
+
 int trace_event_raw_init(struct trace_event_call *call)
 {
 	int id;
-- 
2.8.0

^ permalink raw reply related

* [PATCH net-next 8/8] samples/bpf: add tracepoint vs kprobe performance tests
From: Alexei Starovoitov @ 2016-04-05  4:52 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Peter Zijlstra, David S . Miller, Ingo Molnar, Daniel Borkmann,
	Arnaldo Carvalho de Melo, Wang Nan, Josef Bacik, Brendan Gregg,
	netdev, linux-kernel, kernel-team
In-Reply-To: <1459831974-2891931-1-git-send-email-ast@fb.com>

the first microbenchmark does
fd=open("/proc/self/comm");
for() {
  write(fd, "test");
}
and on 4 cpus in parallel:
                                      writes per sec
base (no tracepoints, no kprobes)         930k
with kprobe at __set_task_comm()          420k
with tracepoint at task:task_rename       730k

For kprobe + full bpf program manully fetches oldcomm, newcomm via bpf_probe_read.
For tracepint bpf program does nothing, since arguments are copied by tracepoint.

2nd microbenchmark does:
fd=open("/dev/urandom");
for() {
  read(fd, buf);
}
and on 4 cpus in parallel:
                                       reads per sec
base (no tracepoints, no kprobes)         300k
with kprobe at urandom_read()             279k
with tracepoint at random:urandom_read    290k

bpf progs attached to kprobe and tracepoint are noop.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 samples/bpf/Makefile                    |   5 +
 samples/bpf/test_overhead_kprobe_kern.c |  41 ++++++++
 samples/bpf/test_overhead_tp_kern.c     |  36 +++++++
 samples/bpf/test_overhead_user.c        | 161 ++++++++++++++++++++++++++++++++
 4 files changed, 243 insertions(+)
 create mode 100644 samples/bpf/test_overhead_kprobe_kern.c
 create mode 100644 samples/bpf/test_overhead_tp_kern.c
 create mode 100644 samples/bpf/test_overhead_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 502c9fc8db85..9959771bf808 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -19,6 +19,7 @@ hostprogs-y += lathist
 hostprogs-y += offwaketime
 hostprogs-y += spintest
 hostprogs-y += map_perf_test
+hostprogs-y += test_overhead
 
 test_verifier-objs := test_verifier.o libbpf.o
 test_maps-objs := test_maps.o libbpf.o
@@ -38,6 +39,7 @@ lathist-objs := bpf_load.o libbpf.o lathist_user.o
 offwaketime-objs := bpf_load.o libbpf.o offwaketime_user.o
 spintest-objs := bpf_load.o libbpf.o spintest_user.o
 map_perf_test-objs := bpf_load.o libbpf.o map_perf_test_user.o
+test_overhead-objs := bpf_load.o libbpf.o test_overhead_user.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -56,6 +58,8 @@ always += lathist_kern.o
 always += offwaketime_kern.o
 always += spintest_kern.o
 always += map_perf_test_kern.o
+always += test_overhead_tp_kern.o
+always += test_overhead_kprobe_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 
@@ -75,6 +79,7 @@ HOSTLOADLIBES_lathist += -lelf
 HOSTLOADLIBES_offwaketime += -lelf
 HOSTLOADLIBES_spintest += -lelf
 HOSTLOADLIBES_map_perf_test += -lelf -lrt
+HOSTLOADLIBES_test_overhead += -lelf -lrt
 
 # point this to your LLVM backend with bpf support
 LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
diff --git a/samples/bpf/test_overhead_kprobe_kern.c b/samples/bpf/test_overhead_kprobe_kern.c
new file mode 100644
index 000000000000..468a66a92ef9
--- /dev/null
+++ b/samples/bpf/test_overhead_kprobe_kern.c
@@ -0,0 +1,41 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/version.h>
+#include <linux/ptrace.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+#define _(P) ({typeof(P) val = 0; bpf_probe_read(&val, sizeof(val), &P); val;})
+
+SEC("kprobe/__set_task_comm")
+int prog(struct pt_regs *ctx)
+{
+	struct signal_struct *signal;
+	struct task_struct *tsk;
+	char oldcomm[16] = {};
+	char newcomm[16] = {};
+	u16 oom_score_adj;
+	u32 pid;
+
+	tsk = (void *)PT_REGS_PARM1(ctx);
+
+	pid = _(tsk->pid);
+	bpf_probe_read(oldcomm, sizeof(oldcomm), &tsk->comm);
+	bpf_probe_read(newcomm, sizeof(newcomm), (void *)PT_REGS_PARM2(ctx));
+	signal = _(tsk->signal);
+	oom_score_adj = _(signal->oom_score_adj);
+	return 0;
+}
+
+SEC("kprobe/urandom_read")
+int prog2(struct pt_regs *ctx)
+{
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/test_overhead_tp_kern.c b/samples/bpf/test_overhead_tp_kern.c
new file mode 100644
index 000000000000..38f5c0b9da9f
--- /dev/null
+++ b/samples/bpf/test_overhead_tp_kern.c
@@ -0,0 +1,36 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+/* from /sys/kernel/debug/tracing/events/task/task_rename/format */
+struct task_rename {
+	__u64 pad;
+	__u32 pid;
+	char oldcomm[16];
+	char newcomm[16];
+	__u16 oom_score_adj;
+};
+SEC("tracepoint/task/task_rename")
+int prog(struct task_rename *ctx)
+{
+	return 0;
+}
+
+/* from /sys/kernel/debug/tracing/events/random/urandom_read/format */
+struct urandom_read {
+	__u64 pad;
+	int got_bits;
+	int pool_left;
+	int input_left;
+};
+SEC("tracepoint/random/urandom_read")
+int prog2(struct urandom_read *ctx)
+{
+	return 0;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/test_overhead_user.c b/samples/bpf/test_overhead_user.c
new file mode 100644
index 000000000000..6872d67e151d
--- /dev/null
+++ b/samples/bpf/test_overhead_user.c
@@ -0,0 +1,161 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#define _GNU_SOURCE
+#include <sched.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <asm/unistd.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <assert.h>
+#include <sys/wait.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <linux/bpf.h>
+#include <string.h>
+#include <time.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+#define MAX_CNT 1000000
+
+static __u64 time_get_ns(void)
+{
+	struct timespec ts;
+
+	clock_gettime(CLOCK_MONOTONIC, &ts);
+	return ts.tv_sec * 1000000000ull + ts.tv_nsec;
+}
+
+static void test_task_rename(int cpu)
+{
+	__u64 start_time;
+	char buf[] = "test\n";
+	int i, fd;
+
+	fd = open("/proc/self/comm", O_WRONLY|O_TRUNC);
+	if (fd < 0) {
+		printf("couldn't open /proc\n");
+		exit(1);
+	}
+	start_time = time_get_ns();
+	for (i = 0; i < MAX_CNT; i++)
+		write(fd, buf, sizeof(buf));
+	printf("task_rename:%d: %lld events per sec\n",
+	       cpu, MAX_CNT * 1000000000ll / (time_get_ns() - start_time));
+	close(fd);
+}
+
+static void test_urandom_read(int cpu)
+{
+	__u64 start_time;
+	char buf[4];
+	int i, fd;
+
+	fd = open("/dev/urandom", O_RDONLY);
+	if (fd < 0) {
+		printf("couldn't open /dev/urandom\n");
+		exit(1);
+	}
+	start_time = time_get_ns();
+	for (i = 0; i < MAX_CNT; i++)
+		read(fd, buf, sizeof(buf));
+	printf("urandom_read:%d: %lld events per sec\n",
+	       cpu, MAX_CNT * 1000000000ll / (time_get_ns() - start_time));
+	close(fd);
+}
+
+static void loop(int cpu, int flags)
+{
+	cpu_set_t cpuset;
+
+	CPU_ZERO(&cpuset);
+	CPU_SET(cpu, &cpuset);
+	sched_setaffinity(0, sizeof(cpuset), &cpuset);
+
+	if (flags & 1)
+		test_task_rename(cpu);
+	if (flags & 2)
+		test_urandom_read(cpu);
+}
+
+static void run_perf_test(int tasks, int flags)
+{
+	pid_t pid[tasks];
+	int i;
+
+	for (i = 0; i < tasks; i++) {
+		pid[i] = fork();
+		if (pid[i] == 0) {
+			loop(i, flags);
+			exit(0);
+		} else if (pid[i] == -1) {
+			printf("couldn't spawn #%d process\n", i);
+			exit(1);
+		}
+	}
+	for (i = 0; i < tasks; i++) {
+		int status;
+
+		assert(waitpid(pid[i], &status, 0) == pid[i]);
+		assert(status == 0);
+	}
+}
+
+static void unload_progs(void)
+{
+	close(prog_fd[0]);
+	close(prog_fd[1]);
+	close(event_fd[0]);
+	close(event_fd[1]);
+}
+
+int main(int argc, char **argv)
+{
+	struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+	char filename[256];
+	int num_cpu = 8;
+	int test_flags = ~0;
+
+	setrlimit(RLIMIT_MEMLOCK, &r);
+
+	if (argc > 1)
+		test_flags = atoi(argv[1]) ? : test_flags;
+	if (argc > 2)
+		num_cpu = atoi(argv[2]) ? : num_cpu;
+
+	if (test_flags & 0x3) {
+		printf("BASE\n");
+		run_perf_test(num_cpu, test_flags);
+	}
+
+	if (test_flags & 0xC) {
+		snprintf(filename, sizeof(filename),
+			 "%s_kprobe_kern.o", argv[0]);
+		if (load_bpf_file(filename)) {
+			printf("%s", bpf_log_buf);
+			return 1;
+		}
+		printf("w/KPROBE\n");
+		run_perf_test(num_cpu, test_flags >> 2);
+		unload_progs();
+	}
+
+	if (test_flags & 0x30) {
+		snprintf(filename, sizeof(filename),
+			 "%s_tp_kern.o", argv[0]);
+		if (load_bpf_file(filename)) {
+			printf("%s", bpf_log_buf);
+			return 1;
+		}
+		printf("w/TRACEPOINT\n");
+		run_perf_test(num_cpu, test_flags >> 4);
+		unload_progs();
+	}
+
+	return 0;
+}
-- 
2.8.0

^ permalink raw reply related

* [PATCH net-next 4/8] bpf: support bpf_get_stackid() and bpf_perf_event_output() in tracepoint programs
From: Alexei Starovoitov @ 2016-04-05  4:52 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Peter Zijlstra, David S . Miller, Ingo Molnar, Daniel Borkmann,
	Arnaldo Carvalho de Melo, Wang Nan, Josef Bacik, Brendan Gregg,
	netdev, linux-kernel, kernel-team
In-Reply-To: <1459831974-2891931-1-git-send-email-ast@fb.com>

needs two wrapper functions to fetch 'struct pt_regs *' to convert
tracepoint bpf context into kprobe bpf context to reuse existing
helper functions

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h      |  1 +
 kernel/bpf/stackmap.c    |  2 +-
 kernel/trace/bpf_trace.c | 42 +++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 43 insertions(+), 2 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 21ee41b92e8a..198f6ace70ec 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -160,6 +160,7 @@ struct bpf_array {
 #define MAX_TAIL_CALL_CNT 32
 
 u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5);
+u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 void bpf_fd_array_map_clear(struct bpf_map *map);
 bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp);
 const struct bpf_func_proto *bpf_get_trace_printk_proto(void);
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 499d9e933f8e..35114725cf30 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -116,7 +116,7 @@ free_smap:
 	return ERR_PTR(err);
 }
 
-static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
+u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
 {
 	struct pt_regs *regs = (struct pt_regs *) (long) r1;
 	struct bpf_map *map = (struct bpf_map *) (long) r2;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 3e5ebe3254d2..413ec5614180 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -340,12 +340,52 @@ static struct bpf_prog_type_list kprobe_tl = {
 	.type	= BPF_PROG_TYPE_KPROBE,
 };
 
+static u64 bpf_perf_event_output_tp(u64 r1, u64 r2, u64 index, u64 r4, u64 size)
+{
+	/*
+	 * r1 points to perf tracepoint buffer where first 8 bytes are hidden
+	 * from bpf program and contain a pointer to 'struct pt_regs'. Fetch it
+	 * from there and call the same bpf_perf_event_output() helper
+	 */
+	u64 ctx = *(long *)r1;
+
+	return bpf_perf_event_output(ctx, r2, index, r4, size);
+}
+
+static const struct bpf_func_proto bpf_perf_event_output_proto_tp = {
+	.func		= bpf_perf_event_output_tp,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_CONST_MAP_PTR,
+	.arg3_type	= ARG_ANYTHING,
+	.arg4_type	= ARG_PTR_TO_STACK,
+	.arg5_type	= ARG_CONST_STACK_SIZE,
+};
+
+static u64 bpf_get_stackid_tp(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	u64 ctx = *(long *)r1;
+
+	return bpf_get_stackid(ctx, r2, r3, r4, r5);
+}
+
+static const struct bpf_func_proto bpf_get_stackid_proto_tp = {
+	.func		= bpf_get_stackid_tp,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_CONST_MAP_PTR,
+	.arg3_type	= ARG_ANYTHING,
+};
+
 static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
 {
 	switch (func_id) {
 	case BPF_FUNC_perf_event_output:
+		return &bpf_perf_event_output_proto_tp;
 	case BPF_FUNC_get_stackid:
-		return NULL;
+		return &bpf_get_stackid_proto_tp;
 	default:
 		return tracing_func_proto(func_id);
 	}
-- 
2.8.0

^ permalink raw reply related

* [PATCH net-next 2/8] perf, bpf: allow bpf programs attach to tracepoints
From: Alexei Starovoitov @ 2016-04-05  4:52 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Peter Zijlstra, David S . Miller, Ingo Molnar, Daniel Borkmann,
	Arnaldo Carvalho de Melo, Wang Nan, Josef Bacik, Brendan Gregg,
	netdev, linux-kernel, kernel-team
In-Reply-To: <1459831974-2891931-1-git-send-email-ast@fb.com>

introduce BPF_PROG_TYPE_TRACEPOINT program type and allow it to be
attached to tracepoints.
The tracepoint will copy the arguments in the per-cpu buffer and pass
it to the bpf program as its first argument.
The layout of the fields can be discovered by doing
'cat /sys/kernel/debug/tracing/events/sched/sched_switch/format'
prior to the compilation of the program with exception that first 8 bytes
are reserved and not accessible to the program. This area is used to store
the pointer to 'struct pt_regs' which some of the bpf helpers will use:
+---------+
| 8 bytes | hidden 'struct pt_regs *' (inaccessible to bpf program)
+---------+
| N bytes | static tracepoint fields defined in tracepoint/format (bpf readonly)
+---------+
| dynamic | __dynamic_array bytes of tracepoint (inaccessible to bpf yet)
+---------+

Not that all of the fields are already dumped to user space via perf ring buffer
and some application access it directly without consulting tracepoint/format.
Same rule applies here: static tracepoint fields should only be accessed
in a format defined in tracepoint/format. The order of fields and
field sizes are not an ABI.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/trace/perf.h            | 18 ++++++++++++++----
 include/uapi/linux/bpf.h        |  1 +
 kernel/events/core.c            | 13 +++++++++----
 kernel/trace/trace_event_perf.c |  3 +++
 4 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/include/trace/perf.h b/include/trace/perf.h
index 26486fcd74ce..55feb69c873f 100644
--- a/include/trace/perf.h
+++ b/include/trace/perf.h
@@ -37,18 +37,19 @@ perf_trace_##call(void *__data, proto)					\
 	struct trace_event_call *event_call = __data;			\
 	struct trace_event_data_offsets_##call __maybe_unused __data_offsets;\
 	struct trace_event_raw_##call *entry;				\
+	struct bpf_prog *prog = event_call->prog;			\
 	struct pt_regs *__regs;						\
 	u64 __addr = 0, __count = 1;					\
 	struct task_struct *__task = NULL;				\
 	struct hlist_head *head;					\
 	int __entry_size;						\
 	int __data_size;						\
-	int rctx;							\
+	int rctx, event_type;						\
 									\
 	__data_size = trace_event_get_offsets_##call(&__data_offsets, args); \
 									\
 	head = this_cpu_ptr(event_call->perf_events);			\
-	if (__builtin_constant_p(!__task) && !__task &&			\
+	if (!prog && __builtin_constant_p(!__task) && !__task &&	\
 				hlist_empty(head))			\
 		return;							\
 									\
@@ -56,8 +57,9 @@ perf_trace_##call(void *__data, proto)					\
 			     sizeof(u64));				\
 	__entry_size -= sizeof(u32);					\
 									\
-	entry = perf_trace_buf_prepare(__entry_size,			\
-			event_call->event.type, &__regs, &rctx);	\
+	event_type = prog ? TRACE_EVENT_TYPE_MAX : event_call->event.type; \
+	entry = perf_trace_buf_prepare(__entry_size, event_type,	\
+				       &__regs, &rctx);			\
 	if (!entry)							\
 		return;							\
 									\
@@ -67,6 +69,14 @@ perf_trace_##call(void *__data, proto)					\
 									\
 	{ assign; }							\
 									\
+	if (prog) {							\
+		*(struct pt_regs **)entry = __regs;			\
+		if (!trace_call_bpf(prog, entry) || hlist_empty(head)) { \
+			perf_swevent_put_recursion_context(rctx);	\
+			return;						\
+		}							\
+		memset(&entry->ent, 0, sizeof(entry->ent));		\
+	}								\
 	perf_trace_buf_submit(entry, __entry_size, rctx, __addr,	\
 		__count, __regs, head, __task);				\
 }
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 23917bb47bf3..70eda5aeb304 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -92,6 +92,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_KPROBE,
 	BPF_PROG_TYPE_SCHED_CLS,
 	BPF_PROG_TYPE_SCHED_ACT,
+	BPF_PROG_TYPE_TRACEPOINT,
 };
 
 #define BPF_PSEUDO_MAP_FD	1
diff --git a/kernel/events/core.c b/kernel/events/core.c
index de24fbce5277..58fc9a7d1562 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6725,12 +6725,13 @@ int perf_swevent_get_recursion_context(void)
 }
 EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
 
-inline void perf_swevent_put_recursion_context(int rctx)
+void perf_swevent_put_recursion_context(int rctx)
 {
 	struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
 
 	put_recursion_context(swhash->recursion, rctx);
 }
+EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
 
 void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
 {
@@ -7104,6 +7105,7 @@ static void perf_event_free_filter(struct perf_event *event)
 
 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
 {
+	bool is_kprobe, is_tracepoint;
 	struct bpf_prog *prog;
 
 	if (event->attr.type != PERF_TYPE_TRACEPOINT)
@@ -7112,15 +7114,18 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
 	if (event->tp_event->prog)
 		return -EEXIST;
 
-	if (!(event->tp_event->flags & TRACE_EVENT_FL_UKPROBE))
-		/* bpf programs can only be attached to u/kprobes */
+	is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
+	is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
+	if (!is_kprobe && !is_tracepoint)
+		/* bpf programs can only be attached to u/kprobe or tracepoint */
 		return -EINVAL;
 
 	prog = bpf_prog_get(prog_fd);
 	if (IS_ERR(prog))
 		return PTR_ERR(prog);
 
-	if (prog->type != BPF_PROG_TYPE_KPROBE) {
+	if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
+	    (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
 		/* valid fd, but invalid bpf program type */
 		bpf_prog_put(prog);
 		return -EINVAL;
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 7a68afca8249..7ada829029d3 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -284,6 +284,9 @@ void *perf_trace_buf_prepare(int size, unsigned short type,
 		*regs = this_cpu_ptr(&__perf_regs[*rctxp]);
 	raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]);
 
+	if (type == TRACE_EVENT_TYPE_MAX)
+		return raw_data;
+
 	/* zero the dead bytes from align to not leak stack to user */
 	memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64));
 
-- 
2.8.0

^ permalink raw reply related

* [PATCH net-next 1/8] perf: optimize perf_fetch_caller_regs
From: Alexei Starovoitov @ 2016-04-05  4:52 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Peter Zijlstra, David S . Miller, Ingo Molnar, Daniel Borkmann,
	Arnaldo Carvalho de Melo, Wang Nan, Josef Bacik, Brendan Gregg,
	netdev, linux-kernel, kernel-team
In-Reply-To: <1459831974-2891931-1-git-send-email-ast@fb.com>

avoid memset in perf_fetch_caller_regs, since it's the critical path of all tracepoints.
It's called from perf_sw_event_sched, perf_event_task_sched_in and all of perf_trace_##call
with this_cpu_ptr(&__perf_regs[..]) which are zero initialized by perpcu_alloc and
subsequent call to perf_arch_fetch_caller_regs initializes the same fields on all archs,
so we can safely drop memset from all of the above cases and move it into
perf_ftrace_function_call that calls it with stack allocated pt_regs.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/perf_event.h      | 2 --
 kernel/trace/trace_event_perf.c | 1 +
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index f291275ffd71..e89f7199c223 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -882,8 +882,6 @@ static inline void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned lo
  */
 static inline void perf_fetch_caller_regs(struct pt_regs *regs)
 {
-	memset(regs, 0, sizeof(*regs));
-
 	perf_arch_fetch_caller_regs(regs, CALLER_ADDR0);
 }
 
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 00df25fd86ef..7a68afca8249 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -316,6 +316,7 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
 
 	BUILD_BUG_ON(ENTRY_SIZE > PERF_MAX_TRACE_SIZE);
 
+	memset(&regs, 0, sizeof(regs));
 	perf_fetch_caller_regs(&regs);
 
 	entry = perf_trace_buf_prepare(ENTRY_SIZE, TRACE_FN, NULL, &rctx);
-- 
2.8.0

^ permalink raw reply related

* [net-next:master 58/68] net/ipv4/udp.c:1693:9: sparse: incompatible types in comparison expression (different address spaces)
From: kbuild test robot @ 2016-04-05  4:38 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: kbuild-all, netdev

tree:   https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git master
head:   15f41e2ba13a6726632e44b1180e805a61e470ad
commit: ca065d0cf80fa547724440a8bf37f1e674d917c0 [58/68] udp: no longer use SLAB_DESTROY_BY_RCU
reproduce:
        # apt-get install sparse
        git checkout ca065d0cf80fa547724440a8bf37f1e674d917c0
        make ARCH=x86_64 allmodconfig
        make C=1 CF=-D__CHECK_ENDIAN__


sparse warnings: (new ones prefixed by >>)

   include/linux/compiler.h:232:8: sparse: attribute 'no_sanitize_address': unknown attribute
>> net/ipv4/udp.c:1693:9: sparse: incompatible types in comparison expression (different address spaces)
>> net/ipv4/udp.c:1693:9: sparse: incompatible types in comparison expression (different address spaces)
--
   include/linux/compiler.h:232:8: sparse: attribute 'no_sanitize_address': unknown attribute
>> net/ipv6/udp.c:743:9: sparse: incompatible types in comparison expression (different address spaces)
>> net/ipv6/udp.c:743:9: sparse: incompatible types in comparison expression (different address spaces)

vim +1693 net/ipv4/udp.c

  1677		struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
  1678		unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
  1679		unsigned int offset = offsetof(typeof(*sk), sk_node);
  1680		int dif = skb->dev->ifindex;
  1681		struct hlist_node *node;
  1682		struct sk_buff *nskb;
  1683	
  1684		if (use_hash2) {
  1685			hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) &
  1686				    udp_table.mask;
  1687			hash2 = udp4_portaddr_hash(net, daddr, hnum) & udp_table.mask;
  1688	start_lookup:
  1689			hslot = &udp_table.hash2[hash2];
  1690			offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
  1691		}
  1692	
> 1693		sk_for_each_entry_offset_rcu(sk, node, &hslot->head, offset) {
  1694			if (!__udp_is_mcast_sock(net, sk, uh->dest, daddr,
  1695						 uh->source, saddr, dif, hnum))
  1696				continue;
  1697	
  1698			if (!first) {
  1699				first = sk;
  1700				continue;
  1701			}

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

^ permalink raw reply

* Re: [net PATCH v2 2/2] ipv4/GRO: Make GRO conform to RFC 6864
From: Herbert Xu @ 2016-04-05  4:32 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: Alexander Duyck, Tom Herbert, Jesse Gross, Eric Dumazet, Netdev,
	David Miller
In-Reply-To: <CAKgT0UfcqBUTY1SxawwKdnzS7qW7XggjzpKNcfT3cTSZ9DHMmA@mail.gmail.com>

On Mon, Apr 04, 2016 at 09:26:55PM -0700, Alexander Duyck wrote:
> 
> The problem is right now we are mangling the IP ID for outer headers
> on tunnels.  We end up totally ignoring the delta between the values
> so if you have two flows that get interleaved over the same tunnel GRO
> will currently mash the IP IDs for the two tunnels so that they end up
> overlapping.

Then it should be fixed.  I never reviewed those patches or I would
have objected at the time.

> The reason why I keep referencing RFC 6864 is because it specifies
> that the IP ID field must not be read if the DF bit is set, and that
> if we are manipulating headers we can handle the IP ID as though we
> are the transmitting station.  What this means is that if DF is not
> set we have to have unique values per packet, otherwise we can ignore
> the values if DF is set.

As I said GRO itself should not be visible.  The fact that it is
for tunnels is a bug.
 
> The question I would have is what are you really losing with increment
> from 0 versus fixed 0?  From what I see it is essentially just garbage
> in/garbage out.

GRO is meant to be lossless, that is, you should not be able to
detect its presence from the outside.  If you lose information then
you're breaking this rule and people will soon start asking for it
to be disabled in various situations.

I'm not against doing this per se but it should not be part of the
default configuration.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* [net-next:master 57/68] DockBook: include/net/sock.h:442: warning: No description found for parameter 'sk_rcu'
From: kbuild test robot @ 2016-04-05  4:29 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: kbuild-all, netdev

[-- Attachment #1: Type: text/plain, Size: 2900 bytes --]

tree:   https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git master
head:   15f41e2ba13a6726632e44b1180e805a61e470ad
commit: a4298e4522d687a79af8f8fbb7eca68399ab2d81 [57/68] net: add SOCK_RCU_FREE socket flag
reproduce: make htmldocs

All warnings (new ones prefixed by >>):

   include/linux/skbuff.h:923: warning: No description found for parameter 'sk'
>> include/net/sock.h:442: warning: No description found for parameter 'sk_rcu'
   net/core/filter.c:1251: warning: No description found for parameter 'locked'

vim +/sk_rcu +442 include/net/sock.h

ef64a54f6 Pavel Emelyanov 2012-02-21  426  	__s32			sk_peek_off;
^1da177e4 Linus Torvalds  2005-04-16  427  	int			sk_write_pending;
d5f642384 Alexey Dobriyan 2008-11-04  428  #ifdef CONFIG_SECURITY
^1da177e4 Linus Torvalds  2005-04-16  429  	void			*sk_security;
d5f642384 Alexey Dobriyan 2008-11-04  430  #endif
2a56a1fec Tejun Heo       2015-12-07  431  	struct sock_cgroup_data	sk_cgrp_data;
baac50bbc Johannes Weiner 2016-01-14  432  	struct mem_cgroup	*sk_memcg;
^1da177e4 Linus Torvalds  2005-04-16  433  	void			(*sk_state_change)(struct sock *sk);
676d23690 David S. Miller 2014-04-11  434  	void			(*sk_data_ready)(struct sock *sk);
^1da177e4 Linus Torvalds  2005-04-16  435  	void			(*sk_write_space)(struct sock *sk);
^1da177e4 Linus Torvalds  2005-04-16  436  	void			(*sk_error_report)(struct sock *sk);
^1da177e4 Linus Torvalds  2005-04-16  437  	int			(*sk_backlog_rcv)(struct sock *sk,
^1da177e4 Linus Torvalds  2005-04-16  438  						  struct sk_buff *skb);
^1da177e4 Linus Torvalds  2005-04-16  439  	void                    (*sk_destruct)(struct sock *sk);
ef456144d Craig Gallek    2016-01-04  440  	struct sock_reuseport __rcu	*sk_reuseport_cb;
a4298e452 Eric Dumazet    2016-04-01  441  	struct rcu_head		sk_rcu;
^1da177e4 Linus Torvalds  2005-04-16 @442  };
^1da177e4 Linus Torvalds  2005-04-16  443  
559835ea7 Pravin B Shelar 2013-09-24  444  #define __sk_user_data(sk) ((*((void __rcu **)&(sk)->sk_user_data)))
559835ea7 Pravin B Shelar 2013-09-24  445  
559835ea7 Pravin B Shelar 2013-09-24  446  #define rcu_dereference_sk_user_data(sk)	rcu_dereference(__sk_user_data((sk)))
559835ea7 Pravin B Shelar 2013-09-24  447  #define rcu_assign_sk_user_data(sk, ptr)	rcu_assign_pointer(__sk_user_data((sk)), ptr)
559835ea7 Pravin B Shelar 2013-09-24  448  
4a17fd522 Pavel Emelyanov 2012-04-19  449  /*
4a17fd522 Pavel Emelyanov 2012-04-19  450   * SK_CAN_REUSE and SK_NO_REUSE on a socket mean that the socket is OK

:::::: The code at line 442 was first introduced by commit
:::::: 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 Linux-2.6.12-rc2

:::::: TO: Linus Torvalds <torvalds@ppc970.osdl.org>
:::::: CC: Linus Torvalds <torvalds@ppc970.osdl.org>

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/octet-stream, Size: 6302 bytes --]

^ permalink raw reply

* Re: [net PATCH v2 2/2] ipv4/GRO: Make GRO conform to RFC 6864
From: Alexander Duyck @ 2016-04-05  4:26 UTC (permalink / raw)
  To: Herbert Xu
  Cc: Alexander Duyck, Tom Herbert, Jesse Gross, Eric Dumazet, Netdev,
	David Miller
In-Reply-To: <20160405034437.GA9322@gondor.apana.org.au>

On Mon, Apr 4, 2016 at 8:44 PM, Herbert Xu <herbert@gondor.apana.org.au> wrote:
> On Mon, Apr 04, 2016 at 09:31:21AM -0700, Alexander Duyck wrote:
>> RFC 6864 states that the IPv4 ID field MUST NOT be used for purposes other
>> than fragmentation and reassembly.  Currently we are looking at this field
>> as a way of identifying what frames can be aggregated and  which cannot for
>> GRO.  While this is valid for frames that do not have DF set, it is invalid
>> to do so if the bit is set.
>
> This justification is bogus.  GRO is a completely local optimisation
> that should have zero visibility to third parties.  So it makes no
> sense to talk about RFC compliance of GRO.  The Linux network stack
> as a whole is subject to RFC compliance, but not GRO per se.

The problem is right now we are mangling the IP ID for outer headers
on tunnels.  We end up totally ignoring the delta between the values
so if you have two flows that get interleaved over the same tunnel GRO
will currently mash the IP IDs for the two tunnels so that they end up
overlapping.

The reason why I keep referencing RFC 6864 is because it specifies
that the IP ID field must not be read if the DF bit is set, and that
if we are manipulating headers we can handle the IP ID as though we
are the transmitting station.  What this means is that if DF is not
set we have to have unique values per packet, otherwise we can ignore
the values if DF is set.

>> In the case of the non-incrementing IP ID we will end up losing the data
>> that the IP ID is fixed.  However as per RFC 6864 we should be able to
>> write any value into the IP ID when the DF bit is set so this should cause
>> minimal harm.
>
> No we should not do that, at least not by default.  GRO was designed
> to be completely lossless, that is its main advantage of the various
> forms of LRO which preceded it.

Well the problem is it isn't right now.  Instead in the case of
tunnels it allows you to generate overlapping sequences of IP IDs.

> If you lose that people will start asking it to be disabled for
> routers/bridges and we'll be back in the same old mess that we
> had with LRO.

The question I would have is what are you really losing with increment
from 0 versus fixed 0?  From what I see it is essentially just garbage
in/garbage out.

> So please do this properly and preserve the information in the packet.
> As I said earlier all it takes is one single bit, like we do with ECN.
> If you put it in the feature bit you'll also allow us to distinguish
> between TSO drivers that produce fixed IDs vs. incrementing IDs.

Actually it will take at least 2.  One for the outer headers and one
for the inner headers.  I'll also have to add tracking for each to the
GRO code so that we don't merge frames that go from a fixed ID to
incrementing one or visa-versa since I suspect that is probably
floating around out there too as my GSO partial code was going to end
up doing some of that.

- Alex

^ permalink raw reply

* [PATCHv2 net-next] cxgb4/cxgb4vf:  Deprecate module parameter dflt_msg_enable
From: Hariprasad Shenai @ 2016-04-05  4:22 UTC (permalink / raw)
  To: davem; +Cc: netdev, leedom, nirranjan, Hariprasad Shenai

Message level can be set through ethtool, so deprecate module parameter
which is used to set the same.

Signed-off-by: Hariprasad Shenai <hariprasad@chelsio.com>
---

V2: Fix grammar in module param description, based on review comment by
    Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>

 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c     | 3 ++-
 drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index d1e3f0997d6b..a1e329ec24cd 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -168,7 +168,8 @@ MODULE_PARM_DESC(force_init, "Forcibly become Master PF and initialize adapter,"
 static int dflt_msg_enable = DFLT_MSG_ENABLE;
 
 module_param(dflt_msg_enable, int, 0644);
-MODULE_PARM_DESC(dflt_msg_enable, "Chelsio T4 default message enable bitmap");
+MODULE_PARM_DESC(dflt_msg_enable, "Chelsio T4 default message enable bitmap, "
+		 "deprecated parameter");
 
 /*
  * The driver uses the best interrupt scheme available on a platform in the
diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c
index 1cc8a7a69457..730fec73d5a6 100644
--- a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c
@@ -74,7 +74,8 @@ static int dflt_msg_enable = DFLT_MSG_ENABLE;
 
 module_param(dflt_msg_enable, int, 0644);
 MODULE_PARM_DESC(dflt_msg_enable,
-		 "default adapter ethtool message level bitmap");
+		 "default adapter ethtool message level bitmap, "
+		 "deprecated parameter");
 
 /*
  * The driver uses the best interrupt scheme available on a platform in the
-- 
2.3.4

^ permalink raw reply related

* [PATCH net-next 7/7] sctp: fix some rhashtable functions using in sctp proc/diag
From: Xin Long @ 2016-04-05  4:06 UTC (permalink / raw)
  To: network dev, linux-sctp
  Cc: Marcelo Ricardo Leitner, Vlad Yasevich, daniel, davem
In-Reply-To: <cover.1459829123.git.lucien.xin@gmail.com>

When rhashtable_walk_init return err, no release function should be
called, and when rhashtable_walk_start return err, we should only invoke
rhashtable_walk_exit to release the source.

But now when sctp_transport_walk_start return err, we just call
rhashtable_walk_stop/exit, and never care about if rhashtable_walk_init
or start return err, which is so bad.

We will fix it by calling rhashtable_walk_exit if rhashtable_walk_start
return err in sctp_transport_walk_start, and if sctp_transport_walk_start
return err, we do not need to call sctp_transport_walk_stop any more.

For sctp proc, we will use 'iter->start_fail' to decide if we will call
rhashtable_walk_stop/exit.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
---
 net/sctp/proc.c   |  7 ++++++-
 net/sctp/socket.c | 15 ++++++++++-----
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/net/sctp/proc.c b/net/sctp/proc.c
index 9fe1393..4cb5aed 100644
--- a/net/sctp/proc.c
+++ b/net/sctp/proc.c
@@ -280,6 +280,7 @@ void sctp_eps_proc_exit(struct net *net)
 struct sctp_ht_iter {
 	struct seq_net_private p;
 	struct rhashtable_iter hti;
+	int start_fail;
 };
 
 static void *sctp_transport_seq_start(struct seq_file *seq, loff_t *pos)
@@ -287,8 +288,10 @@ static void *sctp_transport_seq_start(struct seq_file *seq, loff_t *pos)
 	struct sctp_ht_iter *iter = seq->private;
 	int err = sctp_transport_walk_start(&iter->hti);
 
-	if (err)
+	if (err) {
+		iter->start_fail = 1;
 		return ERR_PTR(err);
+	}
 
 	return sctp_transport_get_idx(seq_file_net(seq), &iter->hti, *pos);
 }
@@ -297,6 +300,8 @@ static void sctp_transport_seq_stop(struct seq_file *seq, void *v)
 {
 	struct sctp_ht_iter *iter = seq->private;
 
+	if (iter->start_fail)
+		return;
 	sctp_transport_walk_stop(&iter->hti);
 }
 
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index b0bf6c7..473a40c 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4298,8 +4298,12 @@ int sctp_transport_walk_start(struct rhashtable_iter *iter)
 		return err;
 
 	err = rhashtable_walk_start(iter);
+	if (err && err != -EAGAIN) {
+		rhashtable_walk_exit(iter);
+		return err;
+	}
 
-	return err == -EAGAIN ? 0 : err;
+	return 0;
 }
 
 void sctp_transport_walk_stop(struct rhashtable_iter *iter)
@@ -4388,11 +4392,12 @@ EXPORT_SYMBOL_GPL(sctp_transport_lookup_process);
 int sctp_for_each_transport(int (*cb)(struct sctp_transport *, void *),
 			    struct net *net, int pos, void *p) {
 	struct rhashtable_iter hti;
-	int err = 0;
 	void *obj;
+	int err;
 
-	if (sctp_transport_walk_start(&hti))
-		goto out;
+	err = sctp_transport_walk_start(&hti);
+	if (err)
+		return err;
 
 	sctp_transport_get_idx(net, &hti, pos);
 	obj = sctp_transport_get_next(net, &hti);
@@ -4406,8 +4411,8 @@ int sctp_for_each_transport(int (*cb)(struct sctp_transport *, void *),
 		if (err)
 			break;
 	}
-out:
 	sctp_transport_walk_stop(&hti);
+
 	return err;
 }
 EXPORT_SYMBOL_GPL(sctp_for_each_transport);
-- 
2.1.0

^ permalink raw reply related

* [PATCH net-next 6/7] sctp: merge the seq_start/next/exits in remaddrs and assocs
From: Xin Long @ 2016-04-05  4:06 UTC (permalink / raw)
  To: network dev, linux-sctp
  Cc: Marcelo Ricardo Leitner, Vlad Yasevich, daniel, davem
In-Reply-To: <cover.1459829123.git.lucien.xin@gmail.com>

In sctp proc, these three functions in remaddrs and assocs are the
same. we should merge them into one.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
---
 net/sctp/proc.c | 45 +++++++++------------------------------------
 1 file changed, 9 insertions(+), 36 deletions(-)

diff --git a/net/sctp/proc.c b/net/sctp/proc.c
index dd8492f..9fe1393 100644
--- a/net/sctp/proc.c
+++ b/net/sctp/proc.c
@@ -282,7 +282,7 @@ struct sctp_ht_iter {
 	struct rhashtable_iter hti;
 };
 
-static void *sctp_assocs_seq_start(struct seq_file *seq, loff_t *pos)
+static void *sctp_transport_seq_start(struct seq_file *seq, loff_t *pos)
 {
 	struct sctp_ht_iter *iter = seq->private;
 	int err = sctp_transport_walk_start(&iter->hti);
@@ -293,14 +293,14 @@ static void *sctp_assocs_seq_start(struct seq_file *seq, loff_t *pos)
 	return sctp_transport_get_idx(seq_file_net(seq), &iter->hti, *pos);
 }
 
-static void sctp_assocs_seq_stop(struct seq_file *seq, void *v)
+static void sctp_transport_seq_stop(struct seq_file *seq, void *v)
 {
 	struct sctp_ht_iter *iter = seq->private;
 
 	sctp_transport_walk_stop(&iter->hti);
 }
 
-static void *sctp_assocs_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+static void *sctp_transport_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
 	struct sctp_ht_iter *iter = seq->private;
 
@@ -367,9 +367,9 @@ static int sctp_assocs_seq_show(struct seq_file *seq, void *v)
 }
 
 static const struct seq_operations sctp_assoc_ops = {
-	.start = sctp_assocs_seq_start,
-	.next  = sctp_assocs_seq_next,
-	.stop  = sctp_assocs_seq_stop,
+	.start = sctp_transport_seq_start,
+	.next  = sctp_transport_seq_next,
+	.stop  = sctp_transport_seq_stop,
 	.show  = sctp_assocs_seq_show,
 };
 
@@ -406,33 +406,6 @@ void sctp_assocs_proc_exit(struct net *net)
 	remove_proc_entry("assocs", net->sctp.proc_net_sctp);
 }
 
-static void *sctp_remaddr_seq_start(struct seq_file *seq, loff_t *pos)
-{
-	struct sctp_ht_iter *iter = seq->private;
-	int err = sctp_transport_walk_start(&iter->hti);
-
-	if (err)
-		return ERR_PTR(err);
-
-	return sctp_transport_get_idx(seq_file_net(seq), &iter->hti, *pos);
-}
-
-static void *sctp_remaddr_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
-	struct sctp_ht_iter *iter = seq->private;
-
-	++*pos;
-
-	return sctp_transport_get_next(seq_file_net(seq), &iter->hti);
-}
-
-static void sctp_remaddr_seq_stop(struct seq_file *seq, void *v)
-{
-	struct sctp_ht_iter *iter = seq->private;
-
-	sctp_transport_walk_stop(&iter->hti);
-}
-
 static int sctp_remaddr_seq_show(struct seq_file *seq, void *v)
 {
 	struct sctp_association *assoc;
@@ -506,9 +479,9 @@ static int sctp_remaddr_seq_show(struct seq_file *seq, void *v)
 }
 
 static const struct seq_operations sctp_remaddr_ops = {
-	.start = sctp_remaddr_seq_start,
-	.next  = sctp_remaddr_seq_next,
-	.stop  = sctp_remaddr_seq_stop,
+	.start = sctp_transport_seq_start,
+	.next  = sctp_transport_seq_next,
+	.stop  = sctp_transport_seq_stop,
 	.show  = sctp_remaddr_seq_show,
 };
 
-- 
2.1.0

^ permalink raw reply related

* [PATCH net-next 5/7] sctp: reuse the some transport traversal functions in proc
From: Xin Long @ 2016-04-05  4:06 UTC (permalink / raw)
  To: network dev, linux-sctp
  Cc: Marcelo Ricardo Leitner, Vlad Yasevich, daniel, davem
In-Reply-To: <cover.1459829123.git.lucien.xin@gmail.com>

There are some transport traversal functions for sctp_diag, we can also
use it for sctp_proc. cause they have the similar situation to traversal
transport.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
---
 net/sctp/proc.c | 80 +++++++++++++--------------------------------------------
 1 file changed, 18 insertions(+), 62 deletions(-)

diff --git a/net/sctp/proc.c b/net/sctp/proc.c
index 5cfac8d..dd8492f 100644
--- a/net/sctp/proc.c
+++ b/net/sctp/proc.c
@@ -282,80 +282,31 @@ struct sctp_ht_iter {
 	struct rhashtable_iter hti;
 };
 
-static struct sctp_transport *sctp_transport_get_next(struct seq_file *seq)
-{
-	struct sctp_ht_iter *iter = seq->private;
-	struct sctp_transport *t;
-
-	t = rhashtable_walk_next(&iter->hti);
-	for (; t; t = rhashtable_walk_next(&iter->hti)) {
-		if (IS_ERR(t)) {
-			if (PTR_ERR(t) == -EAGAIN)
-				continue;
-			break;
-		}
-
-		if (net_eq(sock_net(t->asoc->base.sk), seq_file_net(seq)) &&
-		    t->asoc->peer.primary_path == t)
-			break;
-	}
-
-	return t;
-}
-
-static struct sctp_transport *sctp_transport_get_idx(struct seq_file *seq,
-						     loff_t pos)
-{
-	void *obj = SEQ_START_TOKEN;
-
-	while (pos && (obj = sctp_transport_get_next(seq)) && !IS_ERR(obj))
-		pos--;
-
-	return obj;
-}
-
-static int sctp_transport_walk_start(struct seq_file *seq)
-{
-	struct sctp_ht_iter *iter = seq->private;
-	int err;
-
-	err = rhashtable_walk_init(&sctp_transport_hashtable, &iter->hti);
-	if (err)
-		return err;
-
-	err = rhashtable_walk_start(&iter->hti);
-
-	return err == -EAGAIN ? 0 : err;
-}
-
-static void sctp_transport_walk_stop(struct seq_file *seq)
-{
-	struct sctp_ht_iter *iter = seq->private;
-
-	rhashtable_walk_stop(&iter->hti);
-	rhashtable_walk_exit(&iter->hti);
-}
-
 static void *sctp_assocs_seq_start(struct seq_file *seq, loff_t *pos)
 {
-	int err = sctp_transport_walk_start(seq);
+	struct sctp_ht_iter *iter = seq->private;
+	int err = sctp_transport_walk_start(&iter->hti);
 
 	if (err)
 		return ERR_PTR(err);
 
-	return sctp_transport_get_idx(seq, *pos);
+	return sctp_transport_get_idx(seq_file_net(seq), &iter->hti, *pos);
 }
 
 static void sctp_assocs_seq_stop(struct seq_file *seq, void *v)
 {
-	sctp_transport_walk_stop(seq);
+	struct sctp_ht_iter *iter = seq->private;
+
+	sctp_transport_walk_stop(&iter->hti);
 }
 
 static void *sctp_assocs_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
+	struct sctp_ht_iter *iter = seq->private;
+
 	++*pos;
 
-	return sctp_transport_get_next(seq);
+	return sctp_transport_get_next(seq_file_net(seq), &iter->hti);
 }
 
 /* Display sctp associations (/proc/net/sctp/assocs). */
@@ -457,24 +408,29 @@ void sctp_assocs_proc_exit(struct net *net)
 
 static void *sctp_remaddr_seq_start(struct seq_file *seq, loff_t *pos)
 {
-	int err = sctp_transport_walk_start(seq);
+	struct sctp_ht_iter *iter = seq->private;
+	int err = sctp_transport_walk_start(&iter->hti);
 
 	if (err)
 		return ERR_PTR(err);
 
-	return sctp_transport_get_idx(seq, *pos);
+	return sctp_transport_get_idx(seq_file_net(seq), &iter->hti, *pos);
 }
 
 static void *sctp_remaddr_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
+	struct sctp_ht_iter *iter = seq->private;
+
 	++*pos;
 
-	return sctp_transport_get_next(seq);
+	return sctp_transport_get_next(seq_file_net(seq), &iter->hti);
 }
 
 static void sctp_remaddr_seq_stop(struct seq_file *seq, void *v)
 {
-	sctp_transport_walk_stop(seq);
+	struct sctp_ht_iter *iter = seq->private;
+
+	sctp_transport_walk_stop(&iter->hti);
 }
 
 static int sctp_remaddr_seq_show(struct seq_file *seq, void *v)
-- 
2.1.0

^ permalink raw reply related

* [PATCH net-next 4/7] sctp: add the sctp_diag.c file
From: Xin Long @ 2016-04-05  4:06 UTC (permalink / raw)
  To: network dev, linux-sctp
  Cc: Marcelo Ricardo Leitner, Vlad Yasevich, daniel, davem
In-Reply-To: <cover.1459829123.git.lucien.xin@gmail.com>

This one will implement all the interface of inet_diag, inet_diag_handler.
which includes sctp_diag_dump, sctp_diag_dump_one and sctp_diag_get_info.

It will work as a modules, and register inet_diag_handler when loading.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
---
 include/uapi/linux/inet_diag.h |   2 +
 net/sctp/Kconfig               |   4 +
 net/sctp/Makefile              |   1 +
 net/sctp/sctp_diag.c           | 581 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 588 insertions(+)
 create mode 100644 net/sctp/sctp_diag.c

diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h
index 68a1f71..f5f3629 100644
--- a/include/uapi/linux/inet_diag.h
+++ b/include/uapi/linux/inet_diag.h
@@ -113,6 +113,8 @@ enum {
 	INET_DIAG_DCTCPINFO,
 	INET_DIAG_PROTOCOL,  /* response attribute only */
 	INET_DIAG_SKV6ONLY,
+	INET_DIAG_LOCALS,
+	INET_DIAG_PEERS,
 };
 
 #define INET_DIAG_MAX INET_DIAG_SKV6ONLY
diff --git a/net/sctp/Kconfig b/net/sctp/Kconfig
index 71c1a59..d9c04dc 100644
--- a/net/sctp/Kconfig
+++ b/net/sctp/Kconfig
@@ -99,5 +99,9 @@ config SCTP_COOKIE_HMAC_SHA1
 	select CRYPTO_HMAC if SCTP_COOKIE_HMAC_SHA1
 	select CRYPTO_SHA1 if SCTP_COOKIE_HMAC_SHA1
 
+config INET_SCTP_DIAG
+	depends on INET_DIAG
+	def_tristate INET_DIAG
+
 
 endif # IP_SCTP
diff --git a/net/sctp/Makefile b/net/sctp/Makefile
index 3b4ffb0..0fca582 100644
--- a/net/sctp/Makefile
+++ b/net/sctp/Makefile
@@ -4,6 +4,7 @@
 
 obj-$(CONFIG_IP_SCTP) += sctp.o
 obj-$(CONFIG_NET_SCTPPROBE) += sctp_probe.o
+obj-$(CONFIG_INET_SCTP_DIAG) += sctp_diag.o
 
 sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \
 	  protocol.o endpointola.o associola.o \
diff --git a/net/sctp/sctp_diag.c b/net/sctp/sctp_diag.c
new file mode 100644
index 0000000..c32bad6
--- /dev/null
+++ b/net/sctp/sctp_diag.c
@@ -0,0 +1,581 @@
+#include <linux/module.h>
+#include <linux/inet_diag.h>
+#include <linux/sock_diag.h>
+#include <net/sctp/sctp.h>
+
+extern struct inet_diag_handler *inet_diag_get_handler(int proto);
+extern void inet_diag_msg_common_fill(struct inet_diag_msg *r,
+				      struct sock *sk);
+
+static int inet_sctp_fill_laddrs(struct sk_buff *skb,
+				 struct list_head *address_list)
+{
+	struct sctp_sockaddr_entry *laddr;
+	int addrlen = sizeof(struct sockaddr_storage);
+	int addrcnt = 0;
+	struct nlattr *attr;
+	void *info = NULL;
+
+	list_for_each_entry_rcu(laddr, address_list, list)
+		addrcnt++;
+
+	attr = nla_reserve(skb, INET_DIAG_LOCALS, addrlen * addrcnt);
+	if (!attr)
+		return -EMSGSIZE;
+
+	info = nla_data(attr);
+	list_for_each_entry_rcu(laddr, address_list, list) {
+		memcpy(info, &laddr->a, addrlen);
+		info += addrlen;
+	}
+
+	return 0;
+}
+
+static int inet_sctp_fill_paddrs(struct sk_buff *skb,
+				 struct sctp_association *asoc)
+{
+	int addrlen = sizeof(struct sockaddr_storage);
+	struct sctp_transport *from;
+	struct nlattr *attr;
+	void *info = NULL;
+
+	attr = nla_reserve(skb, INET_DIAG_PEERS,
+			   addrlen * asoc->peer.transport_count);
+	if (!attr)
+		return -EMSGSIZE;
+
+	info = nla_data(attr);
+	list_for_each_entry(from, &asoc->peer.transport_addr_list,
+			    transports) {
+		memcpy(info, &from->ipaddr, addrlen);
+		info += addrlen;
+	}
+
+	return 0;
+}
+
+static int inet_assoc_diag_fill(struct sock *sk,
+				struct sctp_association *asoc,
+				struct sk_buff *skb,
+				const struct inet_diag_req_v2 *req,
+				struct user_namespace *user_ns,
+				int portid, u32 seq, u16 nlmsg_flags,
+				const struct nlmsghdr *unlh)
+{
+	const struct inet_sock *inet = inet_sk(sk);
+	const struct inet_diag_handler *handler;
+	int ext = req->idiag_ext;
+	struct inet_diag_msg *r;
+	struct nlmsghdr  *nlh;
+	struct nlattr *attr;
+	void *info = NULL;
+	union sctp_addr laddr, paddr;
+	struct dst_entry *dst;
+	struct sctp_infox infox;
+
+	handler = inet_diag_get_handler(req->sdiag_protocol);
+	BUG_ON(!handler);
+
+	nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
+			nlmsg_flags);
+	if (!nlh)
+		return -EMSGSIZE;
+
+	r = nlmsg_data(nlh);
+	BUG_ON(!sk_fullsock(sk));
+
+	laddr = list_entry(asoc->base.bind_addr.address_list.next,
+			   struct sctp_sockaddr_entry, list)->a;
+	paddr = asoc->peer.primary_path->ipaddr;
+	dst = asoc->peer.primary_path->dst;
+
+	r->idiag_family = sk->sk_family;
+	r->id.idiag_sport = htons(asoc->base.bind_addr.port);
+	r->id.idiag_dport = htons(asoc->peer.port);
+	r->id.idiag_if = dst ? dst->dev->ifindex : 0;
+	sock_diag_save_cookie(sk, r->id.idiag_cookie);
+
+#if IS_ENABLED(CONFIG_IPV6)
+	if (sk->sk_family == AF_INET6) {
+		*(struct in6_addr *)r->id.idiag_src = laddr.v6.sin6_addr;
+		*(struct in6_addr *)r->id.idiag_dst = paddr.v6.sin6_addr;
+	} else
+#endif
+	{
+		memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
+		memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
+
+		r->id.idiag_src[0] = laddr.v4.sin_addr.s_addr;
+		r->id.idiag_dst[0] = paddr.v4.sin_addr.s_addr;
+	}
+
+	r->idiag_state = asoc->state;
+	r->idiag_timer = SCTP_EVENT_TIMEOUT_T3_RTX;
+	r->idiag_retrans = asoc->rtx_data_chunks;
+#define EXPIRES_IN_MS(tmo)  DIV_ROUND_UP((tmo - jiffies) * 1000, HZ)
+	r->idiag_expires =
+		EXPIRES_IN_MS(asoc->timeouts[SCTP_EVENT_TIMEOUT_T3_RTX]);
+#undef EXPIRES_IN_MS
+
+	if (nla_put_u8(skb, INET_DIAG_SHUTDOWN, sk->sk_shutdown))
+		goto errout;
+
+	/* IPv6 dual-stack sockets use inet->tos for IPv4 connections,
+	 * hence this needs to be included regardless of socket family.
+	 */
+	if (ext & (1 << (INET_DIAG_TOS - 1)))
+		if (nla_put_u8(skb, INET_DIAG_TOS, inet->tos) < 0)
+			goto errout;
+
+#if IS_ENABLED(CONFIG_IPV6)
+	if (r->idiag_family == AF_INET6) {
+		if (ext & (1 << (INET_DIAG_TCLASS - 1)))
+			if (nla_put_u8(skb, INET_DIAG_TCLASS,
+				       inet6_sk(sk)->tclass) < 0)
+				goto errout;
+
+		if (((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) &&
+		    nla_put_u8(skb, INET_DIAG_SKV6ONLY, ipv6_only_sock(sk)))
+			goto errout;
+	}
+#endif
+
+	r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
+	r->idiag_inode = sock_i_ino(sk);
+
+	if (ext & (1 << (INET_DIAG_MEMINFO - 1))) {
+		struct inet_diag_meminfo minfo = {
+			.idiag_rmem = sk_rmem_alloc_get(sk),
+			.idiag_wmem = sk->sk_wmem_queued,
+			.idiag_fmem = sk->sk_forward_alloc,
+			.idiag_tmem = sk_wmem_alloc_get(sk),
+		};
+
+		if (nla_put(skb, INET_DIAG_MEMINFO, sizeof(minfo), &minfo) < 0)
+			goto errout;
+	}
+
+	if (ext & (1 << (INET_DIAG_SKMEMINFO - 1)))
+		if (sock_diag_put_meminfo(sk, skb, INET_DIAG_SKMEMINFO))
+			goto errout;
+
+	if ((ext & (1 << (INET_DIAG_INFO - 1))) && handler->idiag_info_size) {
+		attr = nla_reserve(skb, INET_DIAG_INFO,
+				   handler->idiag_info_size);
+		if (!attr)
+			goto errout;
+
+		info = nla_data(attr);
+	}
+	infox.sctpinfo = (struct sctp_info *)info;
+	infox.asoc = asoc;
+	handler->idiag_get_info(sk, r, &infox);
+
+	if (ext & (1 << (INET_DIAG_CONG - 1)))
+		if (nla_put_string(skb, INET_DIAG_CONG, "reno") < 0)
+			goto errout;
+
+	if (inet_sctp_fill_laddrs(skb, &asoc->base.bind_addr.address_list))
+		goto errout;
+
+	if (inet_sctp_fill_paddrs(skb, asoc))
+		goto errout;
+
+	nlmsg_end(skb, nlh);
+	return 0;
+
+errout:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+
+static int inet_ep_diag_fill(struct sock *sk, struct sctp_endpoint *ep,
+			     struct sk_buff *skb,
+			     const struct inet_diag_req_v2 *req,
+			     struct user_namespace *user_ns,
+			     u32 portid, u32 seq, u16 nlmsg_flags,
+			     const struct nlmsghdr *unlh)
+{
+	const struct inet_sock *inet = inet_sk(sk);
+	const struct inet_diag_handler *handler;
+	int ext = req->idiag_ext;
+	struct inet_diag_msg *r;
+	struct nlmsghdr  *nlh;
+	struct nlattr *attr;
+	void *info = NULL;
+	struct sctp_infox infox;
+
+	handler = inet_diag_get_handler(req->sdiag_protocol);
+	BUG_ON(!handler);
+
+	nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
+			nlmsg_flags);
+	if (!nlh)
+		return -EMSGSIZE;
+
+	r = nlmsg_data(nlh);
+	BUG_ON(!sk_fullsock(sk));
+
+	inet_diag_msg_common_fill(r, sk);
+	r->idiag_state = sk->sk_state;
+	r->idiag_timer = 0;
+	r->idiag_retrans = 0;
+
+	if (nla_put_u8(skb, INET_DIAG_SHUTDOWN, sk->sk_shutdown))
+		goto errout;
+
+	/* IPv6 dual-stack sockets use inet->tos for IPv4 connections,
+	 * hence this needs to be included regardless of socket family.
+	 */
+	if (ext & (1 << (INET_DIAG_TOS - 1)))
+		if (nla_put_u8(skb, INET_DIAG_TOS, inet->tos) < 0)
+			goto errout;
+
+#if IS_ENABLED(CONFIG_IPV6)
+	if (r->idiag_family == AF_INET6) {
+		if (ext & (1 << (INET_DIAG_TCLASS - 1)))
+			if (nla_put_u8(skb, INET_DIAG_TCLASS,
+				       inet6_sk(sk)->tclass) < 0)
+				goto errout;
+
+		if (((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) &&
+		    nla_put_u8(skb, INET_DIAG_SKV6ONLY, ipv6_only_sock(sk)))
+			goto errout;
+	}
+#endif
+
+	r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
+	r->idiag_inode = sock_i_ino(sk);
+
+	if (ext & (1 << (INET_DIAG_MEMINFO - 1))) {
+		struct inet_diag_meminfo minfo = {
+			.idiag_rmem = sk_rmem_alloc_get(sk),
+			.idiag_wmem = sk->sk_wmem_queued,
+			.idiag_fmem = sk->sk_forward_alloc,
+			.idiag_tmem = sk_wmem_alloc_get(sk),
+		};
+
+		if (nla_put(skb, INET_DIAG_MEMINFO, sizeof(minfo), &minfo) < 0)
+			goto errout;
+	}
+
+	if (ext & (1 << (INET_DIAG_SKMEMINFO - 1)))
+		if (sock_diag_put_meminfo(sk, skb, INET_DIAG_SKMEMINFO))
+			goto errout;
+
+	if ((ext & (1 << (INET_DIAG_INFO - 1))) && handler->idiag_info_size) {
+		attr = nla_reserve(skb, INET_DIAG_INFO,
+				   handler->idiag_info_size);
+		if (!attr)
+			goto errout;
+
+		info = nla_data(attr);
+	}
+	infox.sctpinfo = (struct sctp_info *)info;
+	infox.asoc = NULL;
+	handler->idiag_get_info(sk, r, &infox);
+
+	if (inet_sctp_fill_laddrs(skb, &ep->base.bind_addr.address_list))
+		goto errout;
+
+	nlmsg_end(skb, nlh);
+	return 0;
+
+errout:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+
+static size_t inet_assoc_attr_size(struct sctp_association *asoc)
+{
+	int addrlen = sizeof(struct sockaddr_storage);
+	int addrcnt = 0;
+	struct sctp_sockaddr_entry *laddr;
+
+	list_for_each_entry_rcu(laddr, &asoc->base.bind_addr.address_list,
+				list)
+		addrcnt++;
+
+	return	  nla_total_size(sizeof(struct tcp_info))
+		+ nla_total_size(1) /* INET_DIAG_SHUTDOWN */
+		+ nla_total_size(1) /* INET_DIAG_TOS */
+		+ nla_total_size(1) /* INET_DIAG_TCLASS */
+		+ nla_total_size(addrlen * asoc->peer.transport_count)
+		+ nla_total_size(addrlen * addrcnt)
+		+ nla_total_size(sizeof(struct inet_diag_meminfo))
+		+ nla_total_size(sizeof(struct inet_diag_msg))
+		+ nla_total_size(sizeof(struct sctp_info))
+		+ 64;
+}
+
+/* callback and param */
+struct sctp_comm_param {
+	struct sk_buff *skb;
+	struct netlink_callback *cb;
+	const struct inet_diag_req_v2 *r;
+	const struct nlmsghdr *nlh;
+};
+
+static int sctp_tsp_dump_one(struct sctp_transport *tsp, void *p)
+{
+	struct sctp_association *assoc = tsp->asoc;
+	struct sock *sk = tsp->asoc->base.sk;
+	struct sctp_comm_param *commp = p;
+	struct sk_buff *in_skb = commp->skb;
+	const struct inet_diag_req_v2 *req = commp->r;
+	const struct nlmsghdr *nlh = commp->nlh;
+	struct net *net = sock_net(in_skb->sk);
+	struct sk_buff *rep;
+	int err;
+
+	err = sock_diag_check_cookie(sk, req->id.idiag_cookie);
+	if (err)
+		goto out;
+
+	err = -ENOMEM;
+	rep = nlmsg_new(inet_assoc_attr_size(assoc), GFP_KERNEL);
+	if (!rep)
+		goto out;
+
+	err = inet_assoc_diag_fill(sk, assoc, rep, req,
+				   sk_user_ns(NETLINK_CB(in_skb).sk),
+				   NETLINK_CB(in_skb).portid,
+				   nlh->nlmsg_seq, 0, nlh);
+	if (err < 0) {
+		WARN_ON(err == -EMSGSIZE);
+		kfree_skb(rep);
+		goto out;
+	}
+
+	err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid,
+			      MSG_DONTWAIT);
+	if (err > 0)
+		err = 0;
+out:
+	return err;
+}
+
+static int sctp_tsp_dump(struct sctp_transport *tsp, void *p)
+{
+	struct sctp_endpoint *ep = tsp->asoc->ep;
+	struct sctp_comm_param *commp = p;
+	struct sock *sk = ep->base.sk;
+	struct sk_buff *skb = commp->skb;
+	struct netlink_callback *cb = commp->cb;
+	const struct inet_diag_req_v2 *r = commp->r;
+	struct sctp_association *assoc =
+		list_entry(ep->asocs.next, struct sctp_association, asocs);
+	int err = 0;
+
+	if (tsp->asoc != assoc)
+		goto out;
+
+	if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family)
+		goto out;
+
+	lock_sock(sk);
+	list_for_each_entry(assoc, &ep->asocs, asocs) {
+		if (cb->args[4] < cb->args[1])
+			goto next;
+
+		if (r->id.idiag_sport != htons(assoc->base.bind_addr.port) &&
+		    r->id.idiag_sport)
+			goto next;
+		if (r->id.idiag_dport != htons(assoc->peer.port) &&
+		    r->id.idiag_dport)
+			goto next;
+
+		if (!cb->args[3] &&
+		    inet_ep_diag_fill(sk, ep, skb, r,
+				      sk_user_ns(NETLINK_CB(cb->skb).sk),
+				      NETLINK_CB(cb->skb).portid,
+				      cb->nlh->nlmsg_seq,
+				      NLM_F_MULTI, cb->nlh) < 0) {
+			cb->args[3] = 1;
+			err = 2;
+			goto release;
+		}
+		cb->args[3] = 1;
+
+		if (inet_assoc_diag_fill(sk, assoc, skb, r,
+					 sk_user_ns(NETLINK_CB(cb->skb).sk),
+					 NETLINK_CB(cb->skb).portid,
+					 cb->nlh->nlmsg_seq, 0, cb->nlh) < 0) {
+			err = 2;
+			goto release;
+		}
+next:
+		cb->args[4]++;
+	}
+	cb->args[1] = 0;
+	cb->args[2]++;
+	cb->args[3] = 0;
+	cb->args[4] = 0;
+release:
+	release_sock(sk);
+	return err;
+out:
+	cb->args[2]++;
+	return err;
+}
+
+static int sctp_ep_dump(struct sctp_endpoint *ep, void *p)
+{
+	struct sctp_comm_param *commp = p;
+	struct sock *sk = ep->base.sk;
+	struct sk_buff *skb = commp->skb;
+	struct netlink_callback *cb = commp->cb;
+	const struct inet_diag_req_v2 *r = commp->r;
+	struct net *net = sock_net(skb->sk);
+	struct inet_sock *inet = inet_sk(sk);
+	int err = 0;
+
+	if (!net_eq(sock_net(sk), net))
+		goto out;
+
+	if (cb->args[4] < cb->args[1])
+		goto next;
+
+	if (r->sdiag_family != AF_UNSPEC &&
+	    sk->sk_family != r->sdiag_family)
+		goto next;
+
+	if (r->id.idiag_sport != inet->inet_sport &&
+	    r->id.idiag_sport)
+		goto next;
+
+	if (r->id.idiag_dport != inet->inet_dport &&
+	    r->id.idiag_dport)
+		goto next;
+
+	if (inet_ep_diag_fill(sk, ep, skb, r,
+			      sk_user_ns(NETLINK_CB(cb->skb).sk),
+			      NETLINK_CB(cb->skb).portid,
+			      cb->nlh->nlmsg_seq, NLM_F_MULTI,
+			      cb->nlh) < 0) {
+		err = 2;
+		goto out;
+	}
+next:
+	cb->args[4]++;
+out:
+	return err;
+}
+
+/* define the functions for sctp_diag_handler*/
+static void sctp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
+			       void *info)
+{
+	struct sctp_infox *infox = (struct sctp_infox *)info;
+
+	if (infox->asoc) {
+		r->idiag_rqueue = atomic_read(&infox->asoc->rmem_alloc);
+		r->idiag_wqueue = infox->asoc->sndbuf_used;
+	} else {
+		r->idiag_rqueue = sk->sk_ack_backlog;
+		r->idiag_wqueue = sk->sk_max_ack_backlog;
+	}
+	if (infox->sctpinfo)
+		sctp_get_sctp_info(sk, infox->asoc, infox->sctpinfo);
+}
+
+static int sctp_diag_dump_one(struct sk_buff *in_skb,
+			      const struct nlmsghdr *nlh,
+			      const struct inet_diag_req_v2 *req)
+{
+	struct net *net = sock_net(in_skb->sk);
+	union sctp_addr laddr, paddr;
+	struct sctp_comm_param commp = {
+		.skb = in_skb,
+		.r = req,
+		.nlh = nlh,
+	};
+
+	if (req->sdiag_family == AF_INET) {
+		laddr.v4.sin_port = req->id.idiag_sport;
+		laddr.v4.sin_addr.s_addr = req->id.idiag_src[0];
+		laddr.v4.sin_family = AF_INET;
+
+		paddr.v4.sin_port = req->id.idiag_dport;
+		paddr.v4.sin_addr.s_addr = req->id.idiag_dst[0];
+		paddr.v4.sin_family = AF_INET;
+	} else {
+		laddr.v6.sin6_port = req->id.idiag_sport;
+		memcpy(&laddr.v6.sin6_addr, req->id.idiag_src, 64);
+		laddr.v6.sin6_family = AF_INET6;
+
+		paddr.v6.sin6_port = req->id.idiag_dport;
+		memcpy(&paddr.v6.sin6_addr, req->id.idiag_dst, 64);
+		paddr.v6.sin6_family = AF_INET6;
+	}
+
+	return sctp_transport_lookup_process(sctp_tsp_dump_one,
+					     net, &laddr, &paddr, &commp);
+}
+
+static void sctp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
+			   const struct inet_diag_req_v2 *r, struct nlattr *bc)
+{
+	u32 idiag_states = r->idiag_states;
+	struct net *net = sock_net(skb->sk);
+	struct sctp_comm_param commp = {
+		.skb = skb,
+		.cb = cb,
+		.r = r,
+	};
+
+	/* eps hashtable dumps
+	 * args:
+	 * 0 : if it will traversal listen sock
+	 * 1 : to record the sock pos of this time's traversal
+	 * 4 : to work as a temporary variable to traversal list
+	 */
+	if (cb->args[0] == 0) {
+		if (!(idiag_states & TCPF_LISTEN))
+			goto skip;
+		if (sctp_for_each_endpoint(sctp_ep_dump, &commp))
+			goto done;
+skip:
+		cb->args[0] = 1;
+		cb->args[1] = 0;
+		cb->args[4] = 0;
+	}
+
+	/* asocs by transport hashtable dump
+	 * args:
+	 * 1 : to record the assoc pos of this time's traversal
+	 * 2 : to record the transport pos of this time's traversal
+	 * 3 : to mark if we have dumped the ep info of the current asoc
+	 * 4 : to work as a temporary variable to traversal list
+	 */
+	if (!(idiag_states & ~TCPF_LISTEN))
+		goto done;
+	sctp_for_each_transport(sctp_tsp_dump, net, cb->args[2], &commp);
+done:
+	cb->args[1] = cb->args[4];
+	cb->args[4] = 0;
+}
+
+static const struct inet_diag_handler sctp_diag_handler = {
+	.dump		 = sctp_diag_dump,
+	.dump_one	 = sctp_diag_dump_one,
+	.idiag_get_info  = sctp_diag_get_info,
+	.idiag_type	 = IPPROTO_SCTP,
+	.idiag_info_size = sizeof(struct sctp_info),
+};
+
+static int __init sctp_diag_init(void)
+{
+	return inet_diag_register(&sctp_diag_handler);
+}
+
+static void __exit sctp_diag_exit(void)
+{
+	inet_diag_unregister(&sctp_diag_handler);
+}
+
+module_init(sctp_diag_init);
+module_exit(sctp_diag_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-132);
-- 
2.1.0

^ permalink raw reply related

* [PATCH net-next 3/7] sctp: export some functions for sctp_diag in inet_diag
From: Xin Long @ 2016-04-05  4:06 UTC (permalink / raw)
  To: network dev, linux-sctp
  Cc: Marcelo Ricardo Leitner, Vlad Yasevich, daniel, davem
In-Reply-To: <cover.1459829123.git.lucien.xin@gmail.com>

inet_diag_msg_common_fill is used to fill the diag msg common info,
we need to use it in sctp_diag as well, so export it.

We also add inet_diag_get_handler() to access inet_diag_table in sctp
diag.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
---
 net/ipv4/inet_diag.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index bd591eb..29121a6 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -66,7 +66,13 @@ static void inet_diag_unlock_handler(const struct inet_diag_handler *handler)
 	mutex_unlock(&inet_diag_table_mutex);
 }
 
-static void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk)
+struct inet_diag_handler *inet_diag_get_handler(int proto)
+{
+	return inet_diag_table[proto];
+}
+EXPORT_SYMBOL_GPL(inet_diag_get_handler);
+
+void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk)
 {
 	r->idiag_family = sk->sk_family;
 
@@ -89,6 +95,7 @@ static void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk)
 	r->id.idiag_dst[0] = sk->sk_daddr;
 	}
 }
+EXPORT_SYMBOL_GPL(inet_diag_msg_common_fill);
 
 static size_t inet_sk_attr_size(void)
 {
-- 
2.1.0

^ permalink raw reply related

* [PATCH net-next 2/7] sctp: export some apis or variables for sctp_diag
From: Xin Long @ 2016-04-05  4:06 UTC (permalink / raw)
  To: network dev, linux-sctp
  Cc: Marcelo Ricardo Leitner, Vlad Yasevich, daniel, davem
In-Reply-To: <cover.1459829123.git.lucien.xin@gmail.com>

For some main variables in sctp.ko, we couldn't export it to other modules,
so we have to define some api to access them.

It will include sctp transport and endpoint's traversal.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
---
 include/net/sctp/sctp.h |  13 +++++
 net/sctp/socket.c       | 124 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 137 insertions(+)

diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index 36e1eae..c0c4deb 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -116,6 +116,19 @@ extern struct percpu_counter sctp_sockets_allocated;
 int sctp_asconf_mgmt(struct sctp_sock *, struct sctp_sockaddr_entry *);
 struct sk_buff *sctp_skb_recv_datagram(struct sock *, int, int, int *);
 
+int sctp_transport_walk_start(struct rhashtable_iter *iter);
+void sctp_transport_walk_stop(struct rhashtable_iter *iter);
+struct sctp_transport *sctp_transport_get_next(struct net *net,
+			struct rhashtable_iter *iter);
+struct sctp_transport *sctp_transport_get_idx(struct net *net,
+			struct rhashtable_iter *iter, int pos);
+int sctp_transport_lookup_process(int (*cb)(struct sctp_transport *, void *),
+				  struct net *net,
+				  const union sctp_addr *laddr,
+				  const union sctp_addr *paddr, void *p);
+int sctp_for_each_transport(int (*cb)(struct sctp_transport *, void *),
+			    struct net *net, int pos, void *p);
+int sctp_for_each_endpoint(int (*cb)(struct sctp_endpoint *, void *), void *p);
 int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc,
 		       struct sctp_info *info);
 
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 8f79f23..b0bf6c7 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4288,6 +4288,130 @@ int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc,
 }
 EXPORT_SYMBOL_GPL(sctp_get_sctp_info);
 
+/* use callback to avoid exporting the core structure */
+int sctp_transport_walk_start(struct rhashtable_iter *iter)
+{
+	int err;
+
+	err = rhashtable_walk_init(&sctp_transport_hashtable, iter);
+	if (err)
+		return err;
+
+	err = rhashtable_walk_start(iter);
+
+	return err == -EAGAIN ? 0 : err;
+}
+
+void sctp_transport_walk_stop(struct rhashtable_iter *iter)
+{
+	rhashtable_walk_stop(iter);
+	rhashtable_walk_exit(iter);
+}
+
+struct sctp_transport *sctp_transport_get_next(struct net *net,
+					       struct rhashtable_iter *iter)
+{
+	struct sctp_transport *t;
+
+	t = rhashtable_walk_next(iter);
+	for (; t; t = rhashtable_walk_next(iter)) {
+		if (IS_ERR(t)) {
+			if (PTR_ERR(t) == -EAGAIN)
+				continue;
+			break;
+		}
+
+		if (net_eq(sock_net(t->asoc->base.sk), net) &&
+		    t->asoc->peer.primary_path == t)
+			break;
+	}
+
+	return t;
+}
+
+struct sctp_transport *sctp_transport_get_idx(struct net *net,
+					      struct rhashtable_iter *iter,
+					      int pos)
+{
+	void *obj = SEQ_START_TOKEN;
+
+	while (pos && (obj = sctp_transport_get_next(net, iter)) &&
+	       !IS_ERR(obj))
+		pos--;
+
+	return obj;
+}
+
+int sctp_for_each_endpoint(int (*cb)(struct sctp_endpoint *, void *),
+			   void *p) {
+	int err = 0;
+	int hash = 0;
+	struct sctp_ep_common *epb;
+	struct sctp_hashbucket *head;
+
+	for (head = sctp_ep_hashtable; hash < sctp_ep_hashsize;
+	     hash++, head++) {
+		read_lock(&head->lock);
+		sctp_for_each_hentry(epb, &head->chain) {
+			err = cb(sctp_ep(epb), p);
+			if (err)
+				break;
+		}
+		read_unlock(&head->lock);
+	}
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(sctp_for_each_endpoint);
+
+int sctp_transport_lookup_process(int (*cb)(struct sctp_transport *, void *),
+				  struct net *net,
+				  const union sctp_addr *laddr,
+				  const union sctp_addr *paddr, void *p)
+{
+	struct sctp_transport *transport;
+	int err = 0;
+
+	rcu_read_lock();
+	transport = sctp_addrs_lookup_transport(net, laddr, paddr);
+	if (!transport || !sctp_transport_hold(transport))
+		goto out;
+	err = cb(transport, p);
+	sctp_transport_put(transport);
+
+out:
+	rcu_read_unlock();
+	return err;
+}
+EXPORT_SYMBOL_GPL(sctp_transport_lookup_process);
+
+int sctp_for_each_transport(int (*cb)(struct sctp_transport *, void *),
+			    struct net *net, int pos, void *p) {
+	struct rhashtable_iter hti;
+	int err = 0;
+	void *obj;
+
+	if (sctp_transport_walk_start(&hti))
+		goto out;
+
+	sctp_transport_get_idx(net, &hti, pos);
+	obj = sctp_transport_get_next(net, &hti);
+	for (; obj && !IS_ERR(obj); obj = sctp_transport_get_next(net, &hti)) {
+		struct sctp_transport *transport = obj;
+
+		if (!sctp_transport_hold(transport))
+			continue;
+		err = cb(transport, p);
+		sctp_transport_put(transport);
+		if (err)
+			break;
+	}
+out:
+	sctp_transport_walk_stop(&hti);
+	return err;
+}
+EXPORT_SYMBOL_GPL(sctp_for_each_transport);
+
 /* 7.2.1 Association Status (SCTP_STATUS)
 
  * Applications can retrieve current status information about an
-- 
2.1.0

^ permalink raw reply related

* [PATCH net-next 1/7] sctp: add sctp_info dump api for sctp_diag
From: Xin Long @ 2016-04-05  4:06 UTC (permalink / raw)
  To: network dev, linux-sctp
  Cc: Marcelo Ricardo Leitner, Vlad Yasevich, daniel, davem
In-Reply-To: <cover.1459829123.git.lucien.xin@gmail.com>

sctp_diag will dump some important details of sctp's assoc or ep, we use
sctp_info to describe them,  sctp_get_sctp_info to get them, and export
it to sctp_diag.ko.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
---
 include/linux/sctp.h    | 65 +++++++++++++++++++++++++++++++++++++
 include/net/sctp/sctp.h |  3 ++
 net/sctp/socket.c       | 86 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 154 insertions(+)

diff --git a/include/linux/sctp.h b/include/linux/sctp.h
index a9414fd..a448ebc 100644
--- a/include/linux/sctp.h
+++ b/include/linux/sctp.h
@@ -705,4 +705,69 @@ typedef struct sctp_auth_chunk {
 	sctp_authhdr_t auth_hdr;
 } __packed sctp_auth_chunk_t;
 
+struct sctp_info {
+	__u32	sctpi_tag;
+	__u32	sctpi_state;
+	__u32	sctpi_rwnd;
+	__u16	sctpi_unackdata;
+	__u16	sctpi_penddata;
+	__u16	sctpi_instrms;
+	__u16	sctpi_outstrms;
+	__u32	sctpi_fragmentation_point;
+	__u32	sctpi_inqueue;
+	__u32	sctpi_outqueue;
+	__u32	sctpi_overall_error;
+	__u32	sctpi_max_burst;
+	__u32	sctpi_maxseg;
+	__u32	sctpi_peer_rwnd;
+	__u32	sctpi_peer_tag;
+	__u8	sctpi_peer_capable;
+	__u8	sctpi_peer_sack;
+
+	/* assoc status info */
+	__u64	sctpi_isacks;
+	__u64	sctpi_osacks;
+	__u64	sctpi_opackets;
+	__u64	sctpi_ipackets;
+	__u64	sctpi_rtxchunks;
+	__u64	sctpi_outofseqtsns;
+	__u64	sctpi_idupchunks;
+	__u64	sctpi_gapcnt;
+	__u64	sctpi_ouodchunks;
+	__u64	sctpi_iuodchunks;
+	__u64	sctpi_oodchunks;
+	__u64	sctpi_iodchunks;
+	__u64	sctpi_octrlchunks;
+	__u64	sctpi_ictrlchunks;
+
+	/* primary transport info */
+	struct sockaddr_storage	sctpi_p_address;
+	__s32	sctpi_p_state;
+	__u32	sctpi_p_cwnd;
+	__u32	sctpi_p_srtt;
+	__u32	sctpi_p_rto;
+	__u32	sctpi_p_hbinterval;
+	__u32	sctpi_p_pathmaxrxt;
+	__u32	sctpi_p_sackdelay;
+	__u32	sctpi_p_sackfreq;
+	__u32	sctpi_p_ssthresh;
+	__u32	sctpi_p_partial_bytes_acked;
+	__u32	sctpi_p_flight_size;
+	__u16	sctpi_p_error;
+
+	/* sctp sock info */
+	__u32	sctpi_s_autoclose;
+	__u32	sctpi_s_adaptation_ind;
+	__u32	sctpi_s_pd_point;
+	__u8	sctpi_s_nodelay;
+	__u8	sctpi_s_disable_fragments;
+	__u8	sctpi_s_v4mapped;
+	__u8	sctpi_s_frag_interleave;
+};
+
+struct sctp_infox {
+	struct sctp_info *sctpinfo;
+	struct sctp_association *asoc;
+};
+
 #endif /* __LINUX_SCTP_H__ */
diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index 65521cf..36e1eae 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -116,6 +116,9 @@ extern struct percpu_counter sctp_sockets_allocated;
 int sctp_asconf_mgmt(struct sctp_sock *, struct sctp_sockaddr_entry *);
 struct sk_buff *sctp_skb_recv_datagram(struct sock *, int, int, int *);
 
+int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc,
+		       struct sctp_info *info);
+
 /*
  * sctp/primitive.c
  */
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 878d28e..8f79f23 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4202,6 +4202,92 @@ static void sctp_shutdown(struct sock *sk, int how)
 	}
 }
 
+int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc,
+		       struct sctp_info *info)
+{
+	struct sctp_transport *prim;
+	struct list_head *pos, *temp;
+	int mask;
+
+	memset(info, 0, sizeof(*info));
+	if (!asoc) {
+		struct sctp_sock *sp = sctp_sk(sk);
+
+		info->sctpi_s_autoclose = sp->autoclose;
+		info->sctpi_s_adaptation_ind = sp->adaptation_ind;
+		info->sctpi_s_pd_point = sp->pd_point;
+		info->sctpi_s_nodelay = sp->nodelay;
+		info->sctpi_s_disable_fragments = sp->disable_fragments;
+		info->sctpi_s_v4mapped = sp->v4mapped;
+		info->sctpi_s_frag_interleave = sp->frag_interleave;
+
+		return 0;
+	}
+
+	info->sctpi_tag = asoc->c.my_vtag;
+	info->sctpi_state = asoc->state;
+	info->sctpi_rwnd = asoc->a_rwnd;
+	info->sctpi_unackdata = asoc->unack_data;
+	info->sctpi_penddata = sctp_tsnmap_pending(&asoc->peer.tsn_map);
+	info->sctpi_instrms = asoc->c.sinit_max_instreams;
+	info->sctpi_outstrms = asoc->c.sinit_num_ostreams;
+	list_for_each_safe(pos, temp, &asoc->base.inqueue.in_chunk_list)
+		info->sctpi_inqueue++;
+	list_for_each_safe(pos, temp, &asoc->outqueue.out_chunk_list)
+		info->sctpi_outqueue++;
+	info->sctpi_overall_error = asoc->overall_error_count;
+	info->sctpi_max_burst = asoc->max_burst;
+	info->sctpi_maxseg = asoc->frag_point;
+	info->sctpi_peer_rwnd = asoc->peer.rwnd;
+	info->sctpi_peer_tag = asoc->c.peer_vtag;
+
+	mask = asoc->peer.ecn_capable << 1;
+	mask = (mask | asoc->peer.ipv4_address) << 1;
+	mask = (mask | asoc->peer.ipv6_address) << 1;
+	mask = (mask | asoc->peer.hostname_address) << 1;
+	mask = (mask | asoc->peer.asconf_capable) << 1;
+	mask = (mask | asoc->peer.prsctp_capable) << 1;
+	mask = (mask | asoc->peer.auth_capable);
+	info->sctpi_peer_capable = mask;
+	mask = asoc->peer.sack_needed << 1;
+	mask = (mask | asoc->peer.sack_generation) << 1;
+	mask = (mask | asoc->peer.zero_window_announced);
+	info->sctpi_peer_sack = mask;
+
+	info->sctpi_isacks = asoc->stats.isacks;
+	info->sctpi_osacks = asoc->stats.osacks;
+	info->sctpi_opackets = asoc->stats.opackets;
+	info->sctpi_ipackets = asoc->stats.ipackets;
+	info->sctpi_rtxchunks = asoc->stats.rtxchunks;
+	info->sctpi_outofseqtsns = asoc->stats.outofseqtsns;
+	info->sctpi_idupchunks = asoc->stats.idupchunks;
+	info->sctpi_gapcnt = asoc->stats.gapcnt;
+	info->sctpi_ouodchunks = asoc->stats.ouodchunks;
+	info->sctpi_iuodchunks = asoc->stats.iuodchunks;
+	info->sctpi_oodchunks = asoc->stats.oodchunks;
+	info->sctpi_iodchunks = asoc->stats.iodchunks;
+	info->sctpi_octrlchunks = asoc->stats.octrlchunks;
+	info->sctpi_ictrlchunks = asoc->stats.ictrlchunks;
+
+	prim = asoc->peer.primary_path;
+	memcpy(&info->sctpi_p_address, &prim->ipaddr,
+	       sizeof(struct sockaddr_storage));
+	info->sctpi_p_state = prim->state;
+	info->sctpi_p_cwnd = prim->cwnd;
+	info->sctpi_p_srtt = prim->srtt;
+	info->sctpi_p_rto = jiffies_to_msecs(prim->rto);
+	info->sctpi_p_hbinterval = prim->hbinterval;
+	info->sctpi_p_pathmaxrxt = prim->pathmaxrxt;
+	info->sctpi_p_sackdelay = jiffies_to_msecs(prim->sackdelay);
+	info->sctpi_p_ssthresh = prim->ssthresh;
+	info->sctpi_p_partial_bytes_acked = prim->partial_bytes_acked;
+	info->sctpi_p_flight_size = prim->flight_size;
+	info->sctpi_p_error = prim->error_count;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(sctp_get_sctp_info);
+
 /* 7.2.1 Association Status (SCTP_STATUS)
 
  * Applications can retrieve current status information about an
-- 
2.1.0

^ permalink raw reply related

* [PATCH net-next 0/7] sctp: support sctp_diag in kernel
From: Xin Long @ 2016-04-05  4:06 UTC (permalink / raw)
  To: network dev, linux-sctp
  Cc: Marcelo Ricardo Leitner, Vlad Yasevich, daniel, davem

This patchset will add sctp_diag module to implement diag interface on
sctp in kernel.

For a listening sctp endpoint, we will just dump it's ep info.
For a sctp connection, we will the assoc info and it's ep info.

The ss dump will looks like:

[iproute2]# ./misc/ss --sctp  -n -l
State      Recv-Q Send-Q   Local Address:Port       Peer Address:Port
LISTEN     0      128      172.16.254.254:8888      *:*
LISTEN     0      5        127.0.0.1:1234           *:*
LISTEN     0      5        127.0.0.1:1234           *:*
  - ESTAB  0      0        127.0.0.1%lo:1234        127.0.0.1:4321
LISTEN     0      128      172.16.254.254:8888      *:*
  - ESTAB  0      0        172.16.254.254%eth1:8888 172.16.253.253:8888
  - ESTAB  0      0        172.16.254.254%eth1:8888 172.16.1.1:8888
  - ESTAB  0      0        172.16.254.254%eth1:8888 172.16.1.2:8888
  - ESTAB  0      0        172.16.254.254%eth1:8888 172.16.2.1:8888
  - ESTAB  0      0        172.16.254.254%eth1:8888 172.16.2.2:8888
  - ESTAB  0      0        172.16.254.254%eth1:8888 172.16.3.1:8888
  - ESTAB  0      0        172.16.254.254%eth1:8888 172.16.3.2:8888
LISTEN     0      0        127.0.0.1:4321           *:*
  - ESTAB  0      0        127.0.0.1%lo:4321        127.0.0.1:1234

The entries with '- ESTAB' are the assocs, some of them may belong to
the same endpoint. So we will dump the parent endpoint first, like the
entry with 'LISTEN'. then dump the assocs. ep and assocs entries will
be dumped in right order so that ss can show them in tree format easily.

Besides, this patchset also simplifies sctp proc codes, cause it has
some similar codes with sctp diag in sctp transport traversal.

Xin Long (7):
  sctp: add sctp_info dump api for sctp_diag
  sctp: export some apis or variables for sctp_diag
  sctp: export some functions for sctp_diag in inet_diag
  sctp: add the sctp_diag.c file
  sctp: reuse the some transport traversal functions in proc
  sctp: merge the seq_start/next/exits in remaddrs and assocs
  sctp: fix some rhashtable functions using in sctp proc/diag

 include/linux/sctp.h           |  65 +++++
 include/net/sctp/sctp.h        |  16 ++
 include/uapi/linux/inet_diag.h |   2 +
 net/ipv4/inet_diag.c           |   9 +-
 net/sctp/Kconfig               |   4 +
 net/sctp/Makefile              |   1 +
 net/sctp/proc.c                | 104 ++------
 net/sctp/sctp_diag.c           | 581 +++++++++++++++++++++++++++++++++++++++++
 net/sctp/socket.c              | 215 +++++++++++++++
 9 files changed, 911 insertions(+), 86 deletions(-)
 create mode 100644 net/sctp/sctp_diag.c

-- 
2.1.0

^ permalink raw reply

* Re: [net PATCH v2 2/2] ipv4/GRO: Make GRO conform to RFC 6864
From: Herbert Xu @ 2016-04-05  3:44 UTC (permalink / raw)
  To: Alexander Duyck; +Cc: tom, jesse, alexander.duyck, edumazet, netdev, davem
In-Reply-To: <20160404162818.14332.1076.stgit@localhost.localdomain>

On Mon, Apr 04, 2016 at 09:31:21AM -0700, Alexander Duyck wrote:
> RFC 6864 states that the IPv4 ID field MUST NOT be used for purposes other
> than fragmentation and reassembly.  Currently we are looking at this field
> as a way of identifying what frames can be aggregated and  which cannot for
> GRO.  While this is valid for frames that do not have DF set, it is invalid
> to do so if the bit is set.

This justification is bogus.  GRO is a completely local optimisation
that should have zero visibility to third parties.  So it makes no
sense to talk about RFC compliance of GRO.  The Linux network stack
as a whole is subject to RFC compliance, but not GRO per se.

> In the case of the non-incrementing IP ID we will end up losing the data
> that the IP ID is fixed.  However as per RFC 6864 we should be able to
> write any value into the IP ID when the DF bit is set so this should cause
> minimal harm.

No we should not do that, at least not by default.  GRO was designed
to be completely lossless, that is its main advantage of the various
forms of LRO which preceded it.

If you lose that people will start asking it to be disabled for
routers/bridges and we'll be back in the same old mess that we
had with LRO.

So please do this properly and preserve the information in the packet.
As I said earlier all it takes is one single bit, like we do with ECN.
If you put it in the feature bit you'll also allow us to distinguish
between TSO drivers that produce fixed IDs vs. incrementing IDs.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* [PATCHv2 net-next 6/6] bridge: a netlink notification should be sent when those attributes are changed by ioctl
From: Xin Long @ 2016-04-05  3:32 UTC (permalink / raw)
  To: network dev, bridge
  Cc: davem, Stephen Hemminger, Hannes Frederic Sowa, nikolay
In-Reply-To: <cover.1459827115.git.lucien.xin@gmail.com>

Now when we change the attributes of bridge or br_port by netlink,
a relevant netlink notification will be sent, but if we change them
by ioctl or sysfs, no notification will be sent.

We should ensure that whenever those attributes change internally or from
sysfs/ioctl, that a netlink notification is sent out to listeners.

Also, NetworkManager will use this in the future to listen for out-of-band
bridge master attribute updates and incorporate them into the runtime
configuration.

This patch is used for ioctl.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
---
 net/bridge/br_ioctl.c | 40 ++++++++++++++++++++++++----------------
 1 file changed, 24 insertions(+), 16 deletions(-)

diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c
index 263b4de..f8fc624 100644
--- a/net/bridge/br_ioctl.c
+++ b/net/bridge/br_ioctl.c
@@ -112,7 +112,9 @@ static int add_del_if(struct net_bridge *br, int ifindex, int isadd)
 static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 {
 	struct net_bridge *br = netdev_priv(dev);
+	struct net_bridge_port *p = NULL;
 	unsigned long args[4];
+	int ret = -EOPNOTSUPP;
 
 	if (copy_from_user(args, rq->ifr_data, sizeof(args)))
 		return -EFAULT;
@@ -182,25 +184,29 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 		if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN))
 			return -EPERM;
 
-		return br_set_forward_delay(br, args[1]);
+		ret = br_set_forward_delay(br, args[1]);
+		break;
 
 	case BRCTL_SET_BRIDGE_HELLO_TIME:
 		if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN))
 			return -EPERM;
 
-		return br_set_hello_time(br, args[1]);
+		ret = br_set_hello_time(br, args[1]);
+		break;
 
 	case BRCTL_SET_BRIDGE_MAX_AGE:
 		if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN))
 			return -EPERM;
 
-		return br_set_max_age(br, args[1]);
+		ret = br_set_max_age(br, args[1]);
+		break;
 
 	case BRCTL_SET_AGEING_TIME:
 		if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN))
 			return -EPERM;
 
-		return br_set_ageing_time(br, args[1]);
+		ret = br_set_ageing_time(br, args[1]);
+		break;
 
 	case BRCTL_GET_PORT_INFO:
 	{
@@ -240,20 +246,19 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 			return -EPERM;
 
 		br_stp_set_enabled(br, args[1]);
-		return 0;
+		ret = 0;
+		break;
 
 	case BRCTL_SET_BRIDGE_PRIORITY:
 		if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN))
 			return -EPERM;
 
 		br_stp_set_bridge_priority(br, args[1]);
-		return 0;
+		ret = 0;
+		break;
 
 	case BRCTL_SET_PORT_PRIORITY:
 	{
-		struct net_bridge_port *p;
-		int ret;
-
 		if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN))
 			return -EPERM;
 
@@ -263,14 +268,11 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 		else
 			ret = br_stp_set_port_priority(p, args[2]);
 		spin_unlock_bh(&br->lock);
-		return ret;
+		break;
 	}
 
 	case BRCTL_SET_PATH_COST:
 	{
-		struct net_bridge_port *p;
-		int ret;
-
 		if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN))
 			return -EPERM;
 
@@ -280,8 +282,7 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 		else
 			ret = br_stp_set_path_cost(p, args[2]);
 		spin_unlock_bh(&br->lock);
-
-		return ret;
+		break;
 	}
 
 	case BRCTL_GET_FDB_ENTRIES:
@@ -289,7 +290,14 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 				       args[2], args[3]);
 	}
 
-	return -EOPNOTSUPP;
+	if (!ret) {
+		if (p)
+			br_ifinfo_notify(RTM_NEWLINK, p);
+		else
+			netdev_state_change(br->dev);
+	}
+
+	return ret;
 }
 
 static int old_deviceless(struct net *net, void __user *uarg)
-- 
2.1.0

^ permalink raw reply related

* [PATCHv2 net-next 5/6] bridge: a netlink notification should be sent when those attributes are changed by br_sysfs_if
From: Xin Long @ 2016-04-05  3:32 UTC (permalink / raw)
  To: network dev, bridge
  Cc: davem, Stephen Hemminger, Hannes Frederic Sowa, nikolay
In-Reply-To: <cover.1459827115.git.lucien.xin@gmail.com>

Now when we change the attributes of bridge or br_port by netlink,
a relevant netlink notification will be sent, but if we change them
by ioctl or sysfs, no notification will be sent.

We should ensure that whenever those attributes change internally or from
sysfs/ioctl, that a netlink notification is sent out to listeners.

Also, NetworkManager will use this in the future to listen for out-of-band
bridge master attribute updates and incorporate them into the runtime
configuration.

This patch is used for br_sysfs_if, and we also move br_ifinfo_notify out
of store_flag.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
---
 net/bridge/br_sysfs_if.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c
index efe415a..1e04d4d 100644
--- a/net/bridge/br_sysfs_if.c
+++ b/net/bridge/br_sysfs_if.c
@@ -61,7 +61,6 @@ static int store_flag(struct net_bridge_port *p, unsigned long v,
 	if (flags != p->flags) {
 		p->flags = flags;
 		br_port_flags_change(p, mask);
-		br_ifinfo_notify(RTM_NEWLINK, p);
 	}
 	return 0;
 }
@@ -253,8 +252,10 @@ static ssize_t brport_store(struct kobject *kobj,
 			spin_lock_bh(&p->br->lock);
 			ret = brport_attr->store(p, val);
 			spin_unlock_bh(&p->br->lock);
-			if (ret == 0)
+			if (!ret) {
+				br_ifinfo_notify(RTM_NEWLINK, p);
 				ret = count;
+			}
 		}
 		rtnl_unlock();
 	}
-- 
2.1.0

^ permalink raw reply related

* [PATCHv2 net-next 4/6] bridge: a netlink notification should be sent when those attributes are changed by br_sysfs_br
From: Xin Long @ 2016-04-05  3:32 UTC (permalink / raw)
  To: network dev, bridge
  Cc: davem, Stephen Hemminger, Hannes Frederic Sowa, nikolay
In-Reply-To: <cover.1459827115.git.lucien.xin@gmail.com>

Now when we change the attributes of bridge or br_port by netlink,
a relevant netlink notification will be sent, but if we change them
by ioctl or sysfs, no notification will be sent.

We should ensure that whenever those attributes change internally or from
sysfs/ioctl, that a netlink notification is sent out to listeners.

Also, NetworkManager will use this in the future to listen for out-of-band
bridge master attribute updates and incorporate them into the runtime
configuration.

This patch is used for br_sysfs_br. and we also need to remove some
rtnl_trylock in old functions so that we can call it in a common one.

For group_addr_store, we cannot make it use store_bridge_parm, because
it's not a string-to-long convert, we will add notification on it
individually.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
---
 net/bridge/br_sysfs_br.c | 18 +++++++++---------
 net/bridge/br_vlan.c     | 30 +++++-------------------------
 2 files changed, 14 insertions(+), 34 deletions(-)

diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c
index 9918763..74d56dfd 100644
--- a/net/bridge/br_sysfs_br.c
+++ b/net/bridge/br_sysfs_br.c
@@ -43,7 +43,14 @@ static ssize_t store_bridge_parm(struct device *d,
 	if (endp == buf)
 		return -EINVAL;
 
+	if (!rtnl_trylock())
+		return restart_syscall();
+
 	err = (*set)(br, val);
+	if (!err)
+		netdev_state_change(br->dev);
+	rtnl_unlock();
+
 	return err ? err : len;
 }
 
@@ -101,15 +108,7 @@ static ssize_t ageing_time_show(struct device *d,
 
 static int set_ageing_time(struct net_bridge *br, unsigned long val)
 {
-	int ret;
-
-	if (!rtnl_trylock())
-		return restart_syscall();
-
-	ret = br_set_ageing_time(br, val);
-	rtnl_unlock();
-
-	return ret;
+	return br_set_ageing_time(br, val);
 }
 
 static ssize_t ageing_time_store(struct device *d,
@@ -311,6 +310,7 @@ static ssize_t group_addr_store(struct device *d,
 
 	br->group_addr_set = true;
 	br_recalculate_fwd_mask(br);
+	netdev_state_change(br->dev);
 
 	rtnl_unlock();
 
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index 9309bb4..e001152 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -651,15 +651,7 @@ int __br_vlan_filter_toggle(struct net_bridge *br, unsigned long val)
 
 int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val)
 {
-	int err;
-
-	if (!rtnl_trylock())
-		return restart_syscall();
-
-	err = __br_vlan_filter_toggle(br, val);
-	rtnl_unlock();
-
-	return err;
+	return __br_vlan_filter_toggle(br, val);
 }
 
 int __br_vlan_set_proto(struct net_bridge *br, __be16 proto)
@@ -713,18 +705,10 @@ err_filt:
 
 int br_vlan_set_proto(struct net_bridge *br, unsigned long val)
 {
-	int err;
-
 	if (val != ETH_P_8021Q && val != ETH_P_8021AD)
 		return -EPROTONOSUPPORT;
 
-	if (!rtnl_trylock())
-		return restart_syscall();
-
-	err = __br_vlan_set_proto(br, htons(val));
-	rtnl_unlock();
-
-	return err;
+	return __br_vlan_set_proto(br, htons(val));
 }
 
 static bool vlan_default_pvid(struct net_bridge_vlan_group *vg, u16 vid)
@@ -855,21 +839,17 @@ int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val)
 	if (val >= VLAN_VID_MASK)
 		return -EINVAL;
 
-	if (!rtnl_trylock())
-		return restart_syscall();
-
 	if (pvid == br->default_pvid)
-		goto unlock;
+		goto out;
 
 	/* Only allow default pvid change when filtering is disabled */
 	if (br->vlan_enabled) {
 		pr_info_once("Please disable vlan filtering to change default_pvid\n");
 		err = -EPERM;
-		goto unlock;
+		goto out;
 	}
 	err = __br_vlan_set_default_pvid(br, pvid);
-unlock:
-	rtnl_unlock();
+out:
 	return err;
 }
 
-- 
2.1.0

^ permalink raw reply related

* [PATCHv2 net-next 3/6] bridge: simplify the stp_state_store by calling store_bridge_parm
From: Xin Long @ 2016-04-05  3:32 UTC (permalink / raw)
  To: network dev, bridge
  Cc: davem, Stephen Hemminger, Hannes Frederic Sowa, nikolay
In-Reply-To: <cover.1459827115.git.lucien.xin@gmail.com>

There are some repetitive codes in stp_state_store, we can remove
them by calling store_bridge_parm.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
---
 net/bridge/br_sysfs_br.c | 24 +++++++-----------------
 1 file changed, 7 insertions(+), 17 deletions(-)

diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c
index 137cd3b..9918763 100644
--- a/net/bridge/br_sysfs_br.c
+++ b/net/bridge/br_sysfs_br.c
@@ -128,27 +128,17 @@ static ssize_t stp_state_show(struct device *d,
 }
 
 
+static int set_stp_state(struct net_bridge *br, unsigned long val)
+{
+	br_stp_set_enabled(br, val);
+	return 0;
+}
+
 static ssize_t stp_state_store(struct device *d,
 			       struct device_attribute *attr, const char *buf,
 			       size_t len)
 {
-	struct net_bridge *br = to_bridge(d);
-	char *endp;
-	unsigned long val;
-
-	if (!ns_capable(dev_net(br->dev)->user_ns, CAP_NET_ADMIN))
-		return -EPERM;
-
-	val = simple_strtoul(buf, &endp, 0);
-	if (endp == buf)
-		return -EINVAL;
-
-	if (!rtnl_trylock())
-		return restart_syscall();
-	br_stp_set_enabled(br, val);
-	rtnl_unlock();
-
-	return len;
+	return store_bridge_parm(d, buf, len, set_stp_state);
 }
 static DEVICE_ATTR_RW(stp_state);
 
-- 
2.1.0

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox