* [PATCH tip 7/9] samples: bpf: IO latency analysis (iosnoop/heatmap)
From: Alexei Starovoitov @ 2015-01-16 4:16 UTC (permalink / raw)
To: Ingo Molnar
Cc: Steven Rostedt, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
David S. Miller, Daniel Borkmann, Hannes Frederic Sowa,
Brendan Gregg, linux-api, netdev, linux-kernel
In-Reply-To: <1421381770-4866-1-git-send-email-ast@plumgrid.com>
eBPF C program attaches to block_rq_issue/block_rq_complete events to calculate
IO latency. Then it waits for the first 100 events to compute average latency
and uses range [0 .. ave_lat * 2] to record histogram of events in this latency
range.
User space reads this histogram map every 2 seconds and prints it as a 'heatmap'
using gray shades of text terminal. Black spaces have many events and white
spaces have very few events. Left most space is the smallest latency, right most
space is the largest latency in the range.
If kernel sees too many events that fall out of histogram range, user space
adjusts the range up, so heatmap for next 2 seconds will be more accurate.
Usage:
$ sudo ./tracex3
and do 'sudo dd if=/dev/sda of=/dev/null' in other terminal.
Observe IO latencies and how different activity (like 'make kernel') affects it.
Similar experiments can be done for network transmit latencies, syscalls, etc
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
samples/bpf/Makefile | 4 ++
samples/bpf/tracex3_kern.c | 96 +++++++++++++++++++++++++++++
samples/bpf/tracex3_user.c | 146 ++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 246 insertions(+)
create mode 100644 samples/bpf/tracex3_kern.c
create mode 100644 samples/bpf/tracex3_user.c
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 416af24b01fd..da0efd8032ab 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -9,6 +9,7 @@ hostprogs-y += sockex2
hostprogs-y += dropmon
hostprogs-y += tracex1
hostprogs-y += tracex2
+hostprogs-y += tracex3
dropmon-objs := dropmon.o libbpf.o
test_verifier-objs := test_verifier.o libbpf.o
@@ -18,6 +19,7 @@ sockex1-objs := bpf_load.o libbpf.o sockex1_user.o
sockex2-objs := bpf_load.o libbpf.o sockex2_user.o
tracex1-objs := bpf_load.o libbpf.o tracex1_user.o
tracex2-objs := bpf_load.o libbpf.o tracex2_user.o
+tracex3-objs := bpf_load.o libbpf.o tracex3_user.o
# Tell kbuild to always build the programs
always := $(hostprogs-y)
@@ -25,6 +27,7 @@ always += sockex1_kern.o
always += sockex2_kern.o
always += tracex1_kern.o
always += tracex2_kern.o
+always += tracex3_kern.o
HOSTCFLAGS += -I$(objtree)/usr/include
@@ -33,6 +36,7 @@ HOSTLOADLIBES_sockex1 += -lelf
HOSTLOADLIBES_sockex2 += -lelf
HOSTLOADLIBES_tracex1 += -lelf
HOSTLOADLIBES_tracex2 += -lelf
+HOSTLOADLIBES_tracex3 += -lelf
# point this to your LLVM backend with bpf support
LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
diff --git a/samples/bpf/tracex3_kern.c b/samples/bpf/tracex3_kern.c
new file mode 100644
index 000000000000..fa04603b80b8
--- /dev/null
+++ b/samples/bpf/tracex3_kern.c
@@ -0,0 +1,96 @@
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <uapi/linux/bpf.h>
+#include <trace/bpf_trace.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") my_map = {
+ .type = BPF_MAP_TYPE_HASH,
+ .key_size = sizeof(long),
+ .value_size = sizeof(u64),
+ .max_entries = 4096,
+};
+
+SEC("events/block/block_rq_issue")
+int bpf_prog1(struct bpf_context *ctx)
+{
+ long rq = ctx->arg2;
+ u64 val = bpf_ktime_get_ns();
+
+ bpf_map_update_elem(&my_map, &rq, &val, BPF_ANY);
+ return 0;
+}
+
+struct globals {
+ u64 lat_ave;
+ u64 lat_sum;
+ u64 missed;
+ u64 max_lat;
+ int num_samples;
+};
+
+struct bpf_map_def SEC("maps") global_map = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(int),
+ .value_size = sizeof(struct globals),
+ .max_entries = 1,
+};
+
+#define MAX_SLOT 32
+
+struct bpf_map_def SEC("maps") lat_map = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(int),
+ .value_size = sizeof(u64),
+ .max_entries = MAX_SLOT,
+};
+
+SEC("events/block/block_rq_complete")
+int bpf_prog2(struct bpf_context *ctx)
+{
+ long rq = ctx->arg2;
+ void *value;
+
+ value = bpf_map_lookup_elem(&my_map, &rq);
+ if (!value)
+ return 0;
+
+ u64 cur_time = bpf_ktime_get_ns();
+ u64 delta = (cur_time - *(u64 *)value) / 1000;
+
+ bpf_map_delete_elem(&my_map, &rq);
+
+ int ind = 0;
+ struct globals *g = bpf_map_lookup_elem(&global_map, &ind);
+ if (!g)
+ return 0;
+ if (g->lat_ave == 0) {
+ g->num_samples++;
+ g->lat_sum += delta;
+ if (g->num_samples >= 100) {
+ g->lat_ave = g->lat_sum / g->num_samples;
+ if (0/* debug */) {
+ char fmt[] = "after %d samples average latency %ld usec\n";
+ bpf_printk(fmt, sizeof(fmt), g->num_samples,
+ g->lat_ave);
+ }
+ }
+ } else {
+ u64 max_lat = g->lat_ave * 2;
+ if (delta > max_lat) {
+ g->missed++;
+ if (delta > g->max_lat)
+ g->max_lat = delta;
+ return 0;
+ }
+
+ ind = delta * MAX_SLOT / max_lat;
+ value = bpf_map_lookup_elem(&lat_map, &ind);
+ if (!value)
+ return 0;
+ (*(u64 *)value) ++;
+ }
+
+ return 0;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/tracex3_user.c b/samples/bpf/tracex3_user.c
new file mode 100644
index 000000000000..1945147925b5
--- /dev/null
+++ b/samples/bpf/tracex3_user.c
@@ -0,0 +1,146 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <unistd.h>
+#include <linux/bpf.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
+
+struct globals {
+ __u64 lat_ave;
+ __u64 lat_sum;
+ __u64 missed;
+ __u64 max_lat;
+ int num_samples;
+};
+
+static void clear_stats(int fd)
+{
+ int key;
+ __u64 value = 0;
+ for (key = 0; key < 32; key++)
+ bpf_update_elem(fd, &key, &value, BPF_ANY);
+}
+
+const char *color[] = {
+ "\033[48;5;255m",
+ "\033[48;5;252m",
+ "\033[48;5;250m",
+ "\033[48;5;248m",
+ "\033[48;5;246m",
+ "\033[48;5;244m",
+ "\033[48;5;242m",
+ "\033[48;5;240m",
+ "\033[48;5;238m",
+ "\033[48;5;236m",
+ "\033[48;5;234m",
+ "\033[48;5;232m",
+};
+const int num_colors = ARRAY_SIZE(color);
+
+const char nocolor[] = "\033[00m";
+
+static void print_banner(__u64 max_lat)
+{
+ printf("0 usec ... %lld usec\n", max_lat);
+}
+
+static void print_hist(int fd)
+{
+ int key;
+ __u64 value;
+ __u64 cnt[32];
+ __u64 max_cnt = 0;
+ __u64 total_events = 0;
+ int max_bucket = 0;
+
+ for (key = 0; key < 32; key++) {
+ value = 0;
+ bpf_lookup_elem(fd, &key, &value);
+ if (value > 0)
+ max_bucket = key;
+ cnt[key] = value;
+ total_events += value;
+ if (value > max_cnt)
+ max_cnt = value;
+ }
+ clear_stats(fd);
+ for (key = 0; key < 32; key++) {
+ int c = num_colors * cnt[key] / (max_cnt + 1);
+ printf("%s %s", color[c], nocolor);
+ }
+ printf(" captured=%lld", total_events);
+
+ key = 0;
+ struct globals g = {};
+ bpf_lookup_elem(map_fd[1], &key, &g);
+
+ printf(" missed=%lld max_lat=%lld usec\n",
+ g.missed, g.max_lat);
+
+ if (g.missed > 10 && g.missed > total_events / 10) {
+ printf("adjusting range UP...\n");
+ g.lat_ave = g.max_lat / 2;
+ print_banner(g.lat_ave * 2);
+ } else if (max_bucket < 4 && total_events > 100) {
+ printf("adjusting range DOWN...\n");
+ g.lat_ave = g.lat_ave / 4;
+ print_banner(g.lat_ave * 2);
+ }
+ /* clear some globals */
+ g.missed = 0;
+ g.max_lat = 0;
+ bpf_update_elem(map_fd[1], &key, &g, BPF_ANY);
+}
+
+static void int_exit(int sig)
+{
+ print_hist(map_fd[2]);
+ exit(0);
+}
+
+int main(int ac, char **argv)
+{
+ char filename[256];
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+
+ clear_stats(map_fd[2]);
+
+ signal(SIGINT, int_exit);
+
+ if (fork() == 0) {
+ read_trace_pipe();
+ } else {
+ struct globals g;
+
+ printf("waiting for events to determine average latency...\n");
+ for (;;) {
+ int key = 0;
+ bpf_lookup_elem(map_fd[1], &key, &g);
+ if (g.lat_ave)
+ break;
+ sleep(1);
+ }
+
+ printf(" IO latency in usec\n"
+ " %s %s - many events with this latency\n"
+ " %s %s - few events\n",
+ color[num_colors - 1], nocolor,
+ color[0], nocolor);
+ print_banner(g.lat_ave * 2);
+ for (;;) {
+ print_hist(map_fd[2]);
+ sleep(2);
+ }
+ }
+
+ return 0;
+}
--
1.7.9.5
^ permalink raw reply related
* [PATCH tip 8/9] tracing: attach eBPF programs to kprobe/kretprobe
From: Alexei Starovoitov @ 2015-01-16 4:16 UTC (permalink / raw)
To: Ingo Molnar
Cc: Steven Rostedt, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
David S. Miller, Daniel Borkmann, Hannes Frederic Sowa,
Brendan Gregg, linux-api, netdev, linux-kernel
In-Reply-To: <1421381770-4866-1-git-send-email-ast@plumgrid.com>
introduce new type of eBPF programs BPF_PROG_TYPE_KPROBE_FILTER.
Such programs are allowed to call the same helper functions
as tracing filters, but bpf_context is different:
For tracing filters bpf_context is 6 arguments of tracepoints or syscalls
For kprobe filters bpf_context == pt_regs
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
include/linux/ftrace_event.h | 2 ++
include/uapi/linux/bpf.h | 1 +
kernel/trace/bpf_trace.c | 39 ++++++++++++++++++++++++++++++++++++
kernel/trace/trace_events_filter.c | 10 ++++++---
kernel/trace/trace_kprobe.c | 11 +++++++++-
5 files changed, 59 insertions(+), 4 deletions(-)
diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index a3897f5e43ca..0f1a0418bef7 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -249,6 +249,7 @@ enum {
TRACE_EVENT_FL_USE_CALL_FILTER_BIT,
TRACE_EVENT_FL_TRACEPOINT_BIT,
TRACE_EVENT_FL_BPF_BIT,
+ TRACE_EVENT_FL_KPROBE_BIT,
};
/*
@@ -272,6 +273,7 @@ enum {
TRACE_EVENT_FL_USE_CALL_FILTER = (1 << TRACE_EVENT_FL_USE_CALL_FILTER_BIT),
TRACE_EVENT_FL_TRACEPOINT = (1 << TRACE_EVENT_FL_TRACEPOINT_BIT),
TRACE_EVENT_FL_BPF = (1 << TRACE_EVENT_FL_BPF_BIT),
+ TRACE_EVENT_FL_KPROBE = (1 << TRACE_EVENT_FL_KPROBE_BIT),
};
struct ftrace_event_call {
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 6075c4f4b67e..79ca0c63ffaf 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -119,6 +119,7 @@ enum bpf_prog_type {
BPF_PROG_TYPE_UNSPEC,
BPF_PROG_TYPE_SOCKET_FILTER,
BPF_PROG_TYPE_TRACING_FILTER,
+ BPF_PROG_TYPE_KPROBE_FILTER,
};
/* flags for BPF_MAP_UPDATE_ELEM command */
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 14cfbbcec32e..c485c7cc8d57 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -209,3 +209,42 @@ static int __init register_tracing_filter_ops(void)
return 0;
}
late_initcall(register_tracing_filter_ops);
+
+/* check access to fields of 'struct pt_regs' from BPF program */
+static bool kprobe_filter_is_valid_access(int off, int size, enum bpf_access_type type)
+{
+ /* check bounds */
+ if (off < 0 || off >= sizeof(struct pt_regs))
+ return false;
+
+ /* only read is allowed */
+ if (type != BPF_READ)
+ return false;
+
+ /* disallow misaligned access */
+ if (off % size != 0)
+ return false;
+
+ return true;
+}
+/* kprobe filter programs are allowed to call the same helper functions
+ * as tracing filters, but bpf_context is different:
+ * For tracing filters bpf_context is 6 arguments of tracepoints or syscalls
+ * For kprobe filters bpf_context == pt_regs
+ */
+static struct bpf_verifier_ops kprobe_filter_ops = {
+ .get_func_proto = tracing_filter_func_proto,
+ .is_valid_access = kprobe_filter_is_valid_access,
+};
+
+static struct bpf_prog_type_list kprobe_tl = {
+ .ops = &kprobe_filter_ops,
+ .type = BPF_PROG_TYPE_KPROBE_FILTER,
+};
+
+static int __init register_kprobe_filter_ops(void)
+{
+ bpf_register_prog_type(&kprobe_tl);
+ return 0;
+}
+late_initcall(register_kprobe_filter_ops);
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index bb0140414238..75b7e93b2d28 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1891,7 +1891,8 @@ static int create_filter_start(char *filter_str, bool set_str,
return err;
}
-static int create_filter_bpf(char *filter_str, struct event_filter **filterp)
+static int create_filter_bpf(struct ftrace_event_call *call, char *filter_str,
+ struct event_filter **filterp)
{
struct event_filter *filter;
struct bpf_prog *prog;
@@ -1920,7 +1921,10 @@ static int create_filter_bpf(char *filter_str, struct event_filter **filterp)
filter->prog = prog;
- if (prog->aux->prog_type != BPF_PROG_TYPE_TRACING_FILTER) {
+ if (((call->flags & TRACE_EVENT_FL_KPROBE) &&
+ prog->aux->prog_type != BPF_PROG_TYPE_KPROBE_FILTER) ||
+ (!(call->flags & TRACE_EVENT_FL_KPROBE) &&
+ prog->aux->prog_type != BPF_PROG_TYPE_TRACING_FILTER)) {
/* valid fd, but invalid bpf program type */
err = -EINVAL;
goto free_filter;
@@ -2051,7 +2055,7 @@ int apply_event_filter(struct ftrace_event_file *file, char *filter_string)
*/
if (memcmp(filter_string, "bpf", 3) == 0 && filter_string[3] != 0 &&
filter_string[4] != 0) {
- err = create_filter_bpf(filter_string, &filter);
+ err = create_filter_bpf(call, filter_string, &filter);
if (!err)
file->flags |= TRACE_EVENT_FL_BPF;
} else {
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 296079ae6583..113d10973e39 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -19,6 +19,7 @@
#include <linux/module.h>
#include <linux/uaccess.h>
+#include <trace/bpf_trace.h>
#include "trace_probe.h"
@@ -930,6 +931,10 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
if (ftrace_trigger_soft_disabled(ftrace_file))
return;
+ if (ftrace_file->flags & TRACE_EVENT_FL_BPF)
+ if (trace_filter_call_bpf(ftrace_file->filter, regs) == 0)
+ return;
+
local_save_flags(irq_flags);
pc = preempt_count();
@@ -978,6 +983,10 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
if (ftrace_trigger_soft_disabled(ftrace_file))
return;
+ if (ftrace_file->flags & TRACE_EVENT_FL_BPF)
+ if (trace_filter_call_bpf(ftrace_file->filter, regs) == 0)
+ return;
+
local_save_flags(irq_flags);
pc = preempt_count();
@@ -1286,7 +1295,7 @@ static int register_kprobe_event(struct trace_kprobe *tk)
kfree(call->print_fmt);
return -ENODEV;
}
- call->flags = 0;
+ call->flags = TRACE_EVENT_FL_KPROBE;
call->class->reg = kprobe_register;
call->data = tk;
ret = trace_add_event_call(call);
--
1.7.9.5
^ permalink raw reply related
* [PATCH tip 9/9] samples: bpf: simple kprobe example
From: Alexei Starovoitov @ 2015-01-16 4:16 UTC (permalink / raw)
To: Ingo Molnar
Cc: Steven Rostedt, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
David S. Miller, Daniel Borkmann, Hannes Frederic Sowa,
Brendan Gregg, linux-api, netdev, linux-kernel
In-Reply-To: <1421381770-4866-1-git-send-email-ast@plumgrid.com>
the logic of the example is similar to tracex2, but syscall 'write' statistics
is capturead from kprobe placed at sys_write function instead of through
syscall instrumentation.
Also tracex4_kern.c has a different way of doing log2 in C.
Note, unlike tracepoint and syscall programs, kprobe programs receive
'struct pt_regs' as an input. It's responsibility of the program author
or higher level dynamic tracing tool to match registers to function arguments.
Since pt_regs are architecture dependent, programs are also arch dependent,
unlike tracepoint/syscalls programs which are universal.
Usage:
$ sudo tracex4
writing bpf-6 -> /sys/kernel/debug/tracing/events/kprobes/sys_write/filter
2216443+0 records in
2216442+0 records out
1134818304 bytes (1.1 GB) copied, 2.00746 s, 565 MB/s
kprobe sys_write() stats
byte_size : count distribution
1 -> 1 : 0 | |
2 -> 3 : 0 | |
4 -> 7 : 0 | |
8 -> 15 : 0 | |
16 -> 31 : 0 | |
32 -> 63 : 0 | |
64 -> 127 : 1 | |
128 -> 255 : 0 | |
256 -> 511 : 0 | |
512 -> 1023 : 2214734 |************************************* |
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
samples/bpf/Makefile | 4 +++
samples/bpf/bpf_load.c | 3 ++
samples/bpf/tracex4_kern.c | 36 +++++++++++++++++++
samples/bpf/tracex4_user.c | 83 ++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 126 insertions(+)
create mode 100644 samples/bpf/tracex4_kern.c
create mode 100644 samples/bpf/tracex4_user.c
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index da0efd8032ab..22c7a38f3f95 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -10,6 +10,7 @@ hostprogs-y += dropmon
hostprogs-y += tracex1
hostprogs-y += tracex2
hostprogs-y += tracex3
+hostprogs-y += tracex4
dropmon-objs := dropmon.o libbpf.o
test_verifier-objs := test_verifier.o libbpf.o
@@ -20,6 +21,7 @@ sockex2-objs := bpf_load.o libbpf.o sockex2_user.o
tracex1-objs := bpf_load.o libbpf.o tracex1_user.o
tracex2-objs := bpf_load.o libbpf.o tracex2_user.o
tracex3-objs := bpf_load.o libbpf.o tracex3_user.o
+tracex4-objs := bpf_load.o libbpf.o tracex4_user.o
# Tell kbuild to always build the programs
always := $(hostprogs-y)
@@ -28,6 +30,7 @@ always += sockex2_kern.o
always += tracex1_kern.o
always += tracex2_kern.o
always += tracex3_kern.o
+always += tracex4_kern.o
HOSTCFLAGS += -I$(objtree)/usr/include
@@ -37,6 +40,7 @@ HOSTLOADLIBES_sockex2 += -lelf
HOSTLOADLIBES_tracex1 += -lelf
HOSTLOADLIBES_tracex2 += -lelf
HOSTLOADLIBES_tracex3 += -lelf
+HOSTLOADLIBES_tracex4 += -lelf
# point this to your LLVM backend with bpf support
LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index 788ac51c1024..d8c5176f0564 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -25,6 +25,7 @@ int prog_cnt;
static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
{
bool is_socket = strncmp(event, "socket", 6) == 0;
+ bool is_kprobe = strncmp(event, "events/kprobes/", 15) == 0;
enum bpf_prog_type prog_type;
char path[256] = DEBUGFS;
char fmt[32];
@@ -32,6 +33,8 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
if (is_socket)
prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
+ else if (is_kprobe)
+ prog_type = BPF_PROG_TYPE_KPROBE_FILTER;
else
prog_type = BPF_PROG_TYPE_TRACING_FILTER;
diff --git a/samples/bpf/tracex4_kern.c b/samples/bpf/tracex4_kern.c
new file mode 100644
index 000000000000..9646f9e43417
--- /dev/null
+++ b/samples/bpf/tracex4_kern.c
@@ -0,0 +1,36 @@
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <uapi/linux/bpf.h>
+#include <trace/bpf_trace.h>
+#include "bpf_helpers.h"
+
+static unsigned int log2l(unsigned long long n)
+{
+#define S(k) if (n >= (1ull << k)) { i += k; n >>= k; }
+ int i = -(n == 0);
+ S(32); S(16); S(8); S(4); S(2); S(1);
+ return i;
+#undef S
+}
+
+struct bpf_map_def SEC("maps") my_hist_map = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(long),
+ .max_entries = 64,
+};
+
+SEC("events/kprobes/sys_write")
+int bpf_prog4(struct pt_regs *regs)
+{
+ long write_size = regs->dx; /* $rdx contains 3rd argument to a function */
+ long init_val = 1;
+ void *value;
+ u32 index = log2l(write_size);
+
+ value = bpf_map_lookup_elem(&my_hist_map, &index);
+ if (value)
+ __sync_fetch_and_add((long *)value, 1);
+ return 0;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/tracex4_user.c b/samples/bpf/tracex4_user.c
new file mode 100644
index 000000000000..47dde2791f9e
--- /dev/null
+++ b/samples/bpf/tracex4_user.c
@@ -0,0 +1,83 @@
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <linux/bpf.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+#define MAX_INDEX 64
+#define MAX_STARS 38
+
+static void stars(char *str, long val, long max, int width)
+{
+ int i;
+
+ for (i = 0; i < (width * val / max) - 1 && i < width - 1; i++)
+ str[i] = '*';
+ if (val > max)
+ str[i - 1] = '+';
+ str[i] = '\0';
+}
+
+static void print_hist(int fd)
+{
+ int key;
+ long value;
+ long data[MAX_INDEX] = {};
+ char starstr[MAX_STARS];
+ int i;
+ int max_ind = -1;
+ long max_value = 0;
+
+ for (key = 0; key < MAX_INDEX; key++) {
+ bpf_lookup_elem(fd, &key, &value);
+ data[key] = value;
+ if (value && key > max_ind)
+ max_ind = key;
+ if (value > max_value)
+ max_value = value;
+ }
+
+ printf("\n kprobe sys_write() stats\n");
+ printf(" byte_size : count distribution\n");
+ for (i = 1; i <= max_ind + 1; i++) {
+ stars(starstr, data[i - 1], max_value, MAX_STARS);
+ printf("%8ld -> %-8ld : %-8ld |%-*s|\n",
+ (1l << i) >> 1, (1l << i) - 1, data[i - 1],
+ MAX_STARS, starstr);
+ }
+}
+static void int_exit(int sig)
+{
+ print_hist(map_fd[0]);
+ exit(0);
+}
+
+int main(int ac, char **argv)
+{
+ char filename[256];
+ FILE *f;
+ int i;
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+ signal(SIGINT, int_exit);
+
+ i = system("echo 'p:sys_write sys_write' > /sys/kernel/debug/tracing/kprobe_events");
+ (void) i;
+
+ /* start 'dd' in the background to have plenty of 'write' syscalls */
+ f = popen("dd if=/dev/zero of=/dev/null", "r");
+ (void) f;
+
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+
+ sleep(2);
+ kill(0, SIGINT); /* send Ctrl-C to self and to 'dd' */
+
+ return 0;
+}
--
1.7.9.5
^ permalink raw reply related
* [PATCH tip 5/9] samples: bpf: simple tracing example in C
From: Alexei Starovoitov @ 2015-01-16 4:16 UTC (permalink / raw)
To: Ingo Molnar
Cc: Steven Rostedt, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
David S. Miller, Daniel Borkmann, Hannes Frederic Sowa,
Brendan Gregg, linux-api, netdev, linux-kernel
In-Reply-To: <1421381770-4866-1-git-send-email-ast@plumgrid.com>
tracex1_kern.c - C program which will be compiled into eBPF
to filter netif_receive_skb events on skb->dev->name == "lo"
The programs returns 1 to continue storing an event into trace buffer
and returns 0 - to discard an event.
tracex1_user.c - corresponding user space component that
forever reads /sys/.../trace_pipe
Usage:
$ sudo tracex1
should see:
writing bpf-4 -> /sys/kernel/debug/tracing/events/net/netif_receive_skb/filter
ping-364 [000] ..s2 8.089771: netif_receive_skb: dev=lo skbaddr=ffff88000dfcc100 len=84
ping-364 [000] ..s2 8.089889: netif_receive_skb: dev=lo skbaddr=ffff88000dfcc900 len=84
Ctrl-C at any time, kernel will auto cleanup
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
samples/bpf/Makefile | 4 +++
samples/bpf/bpf_helpers.h | 18 ++++++++++++++
samples/bpf/bpf_load.c | 59 +++++++++++++++++++++++++++++++++++++++-----
samples/bpf/bpf_load.h | 3 +++
samples/bpf/tracex1_kern.c | 28 +++++++++++++++++++++
samples/bpf/tracex1_user.c | 24 ++++++++++++++++++
6 files changed, 130 insertions(+), 6 deletions(-)
create mode 100644 samples/bpf/tracex1_kern.c
create mode 100644 samples/bpf/tracex1_user.c
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 789691374562..da28e1b6d3a6 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -7,6 +7,7 @@ hostprogs-y += sock_example
hostprogs-y += sockex1
hostprogs-y += sockex2
hostprogs-y += dropmon
+hostprogs-y += tracex1
dropmon-objs := dropmon.o libbpf.o
test_verifier-objs := test_verifier.o libbpf.o
@@ -14,17 +15,20 @@ test_maps-objs := test_maps.o libbpf.o
sock_example-objs := sock_example.o libbpf.o
sockex1-objs := bpf_load.o libbpf.o sockex1_user.o
sockex2-objs := bpf_load.o libbpf.o sockex2_user.o
+tracex1-objs := bpf_load.o libbpf.o tracex1_user.o
# Tell kbuild to always build the programs
always := $(hostprogs-y)
always += sockex1_kern.o
always += sockex2_kern.o
+always += tracex1_kern.o
HOSTCFLAGS += -I$(objtree)/usr/include
HOSTCFLAGS_bpf_load.o += -I$(objtree)/usr/include -Wno-unused-variable
HOSTLOADLIBES_sockex1 += -lelf
HOSTLOADLIBES_sockex2 += -lelf
+HOSTLOADLIBES_tracex1 += -lelf
# point this to your LLVM backend with bpf support
LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h
index ca0333146006..81388e821eb3 100644
--- a/samples/bpf/bpf_helpers.h
+++ b/samples/bpf/bpf_helpers.h
@@ -15,6 +15,24 @@ static int (*bpf_map_update_elem)(void *map, void *key, void *value,
(void *) BPF_FUNC_map_update_elem;
static int (*bpf_map_delete_elem)(void *map, void *key) =
(void *) BPF_FUNC_map_delete_elem;
+static void *(*bpf_fetch_ptr)(void *unsafe_ptr) =
+ (void *) BPF_FUNC_fetch_ptr;
+static unsigned long long (*bpf_fetch_u64)(void *unsafe_ptr) =
+ (void *) BPF_FUNC_fetch_u64;
+static unsigned int (*bpf_fetch_u32)(void *unsafe_ptr) =
+ (void *) BPF_FUNC_fetch_u32;
+static unsigned short (*bpf_fetch_u16)(void *unsafe_ptr) =
+ (void *) BPF_FUNC_fetch_u16;
+static unsigned char (*bpf_fetch_u8)(void *unsafe_ptr) =
+ (void *) BPF_FUNC_fetch_u8;
+static int (*bpf_printk)(const char *fmt, int fmt_size, ...) =
+ (void *) BPF_FUNC_printk;
+static int (*bpf_memcmp)(void *unsafe_ptr, void *safe_ptr, int size) =
+ (void *) BPF_FUNC_memcmp;
+static void (*bpf_dump_stack)(void) =
+ (void *) BPF_FUNC_dump_stack;
+static unsigned long long (*bpf_ktime_get_ns)(void) =
+ (void *) BPF_FUNC_ktime_get_ns;
/* llvm builtin functions that eBPF C program may use to
* emit BPF_LD_ABS and BPF_LD_IND instructions
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index 1831d236382b..788ac51c1024 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -14,6 +14,8 @@
#include "bpf_helpers.h"
#include "bpf_load.h"
+#define DEBUGFS "/sys/kernel/debug/tracing/"
+
static char license[128];
static bool processed_sec[128];
int map_fd[MAX_MAPS];
@@ -22,15 +24,18 @@ int prog_cnt;
static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
{
- int fd;
bool is_socket = strncmp(event, "socket", 6) == 0;
+ enum bpf_prog_type prog_type;
+ char path[256] = DEBUGFS;
+ char fmt[32];
+ int fd, event_fd, err;
- if (!is_socket)
- /* tracing events tbd */
- return -1;
+ if (is_socket)
+ prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
+ else
+ prog_type = BPF_PROG_TYPE_TRACING_FILTER;
- fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER,
- prog, size, license);
+ fd = bpf_prog_load(prog_type, prog, size, license);
if (fd < 0) {
printf("bpf_prog_load() err=%d\n%s", errno, bpf_log_buf);
@@ -39,6 +44,28 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
prog_fd[prog_cnt++] = fd;
+ if (is_socket)
+ return 0;
+
+ snprintf(fmt, sizeof(fmt), "bpf-%d", fd);
+
+ strcat(path, event);
+ strcat(path, "/filter");
+
+ printf("writing %s -> %s\n", fmt, path);
+
+ event_fd = open(path, O_WRONLY, 0);
+ if (event_fd < 0) {
+ printf("failed to open event %s\n", event);
+ return -1;
+ }
+
+ err = write(event_fd, fmt, strlen(fmt));
+ if (err < 0) {
+ printf("write to '%s' failed '%s'\n", event, strerror(errno));
+ return -1;
+ }
+
return 0;
}
@@ -201,3 +228,23 @@ int load_bpf_file(char *path)
close(fd);
return 0;
}
+
+void read_trace_pipe(void)
+{
+ int trace_fd;
+
+ trace_fd = open(DEBUGFS "trace_pipe", O_RDONLY, 0);
+ if (trace_fd < 0)
+ return;
+
+ while (1) {
+ static char buf[4096];
+ ssize_t sz;
+
+ sz = read(trace_fd, buf, sizeof(buf));
+ if (sz) {
+ buf[sz] = 0;
+ puts(buf);
+ }
+ }
+}
diff --git a/samples/bpf/bpf_load.h b/samples/bpf/bpf_load.h
index 27789a34f5e6..d154fc2b0535 100644
--- a/samples/bpf/bpf_load.h
+++ b/samples/bpf/bpf_load.h
@@ -21,4 +21,7 @@ extern int prog_fd[MAX_PROGS];
*/
int load_bpf_file(char *path);
+/* forever reads /sys/.../trace_pipe */
+void read_trace_pipe(void);
+
#endif
diff --git a/samples/bpf/tracex1_kern.c b/samples/bpf/tracex1_kern.c
new file mode 100644
index 000000000000..7849ceb4bce6
--- /dev/null
+++ b/samples/bpf/tracex1_kern.c
@@ -0,0 +1,28 @@
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <uapi/linux/bpf.h>
+#include <trace/bpf_trace.h>
+#include "bpf_helpers.h"
+
+SEC("events/net/netif_receive_skb")
+int bpf_prog1(struct bpf_context *ctx)
+{
+ /*
+ * attaches to /sys/kernel/debug/tracing/events/net/netif_receive_skb
+ * prints events for loobpack device only
+ */
+ char devname[] = "lo";
+ struct net_device *dev;
+ struct sk_buff *skb = 0;
+
+ skb = (struct sk_buff *) ctx->arg1;
+ dev = bpf_fetch_ptr(&skb->dev);
+ if (bpf_memcmp(dev->name, devname, 2) == 0)
+ /* print event using default tracepoint format */
+ return 1;
+
+ /* drop event */
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/tracex1_user.c b/samples/bpf/tracex1_user.c
new file mode 100644
index 000000000000..e85c1b483f57
--- /dev/null
+++ b/samples/bpf/tracex1_user.c
@@ -0,0 +1,24 @@
+#include <stdio.h>
+#include <linux/bpf.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+int main(int ac, char **argv)
+{
+ FILE *f;
+ char filename[256];
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+
+ f = popen("ping -c5 localhost", "r");
+ (void) f;
+
+ read_trace_pipe();
+
+ return 0;
+}
--
1.7.9.5
^ permalink raw reply related
* Re: [PATCH 0/2] Remove T4 FCoE support
From: Praveen Madhavan @ 2015-01-16 4:32 UTC (permalink / raw)
To: David Miller; +Cc: netdev, linux-scsi, JBottomley, hch, hariprasad, varun, hare
In-Reply-To: <20150115.140453.871607276763972458.davem@davemloft.net>
On Thu, Jan 15, 2015 at 02:04:53PM -0500, David Miller wrote:
> From: Praveen Madhavan <praveenm@chelsio.com>
> Date: Thu, 15 Jan 2015 19:15:50 +0530
>
> > These patches removes FCoE support for chelsio T4 adapter.
> > Please apply on net-next since depends on previous commits.
>
> Why is it being removed? You have to state this in the
> commit log messages at a minimum.
>
We found a subtle issue with FCoE on T4 very late in the game
and decided not to productize FCoE on T4 and therefore there
are no customers that will be impacted by this change. FCoE is
supported on T5 cards.
Sorry about that, not mentioning in commit log. Can i resend this
patch with updated commit log?
^ permalink raw reply
* Re: [patch-net-next v2 3/3] net: ethernet: cpsw: don't requests IRQs we don't use
From: David Miller @ 2015-01-16 4:36 UTC (permalink / raw)
To: balbi; +Cc: mugunthanvnm, tony, linux-omap, netdev
In-Reply-To: <20150116012852.GA3115@saruman>
From: Felipe Balbi <balbi@ti.com>
Date: Thu, 15 Jan 2015 19:28:52 -0600
> On Thu, Jan 15, 2015 at 06:16:15PM -0500, David Miller wrote:
>> Indeed, I agree that adding something as a placeholder that just gets
>> immediately removed should be avoided unless it is extremely difficult
>> to do so.
>
> what does this mean ? you prefer both patches to be combined ?
Yes, something like that.
^ permalink raw reply
* Re: [PATCH net-next] Driver: Vmxnet3: Fix ethtool -S to return correct rx queue stats
From: David Miller @ 2015-01-16 5:30 UTC (permalink / raw)
To: skhare; +Cc: sbhatewara, pv-drivers, netdev, linux-kernel, gzhenyu
In-Reply-To: <1421351670-18424-1-git-send-email-skhare@vmware.com>
From: Shrikrishna Khare <skhare@vmware.com>
Date: Thu, 15 Jan 2015 11:54:30 -0800
> Signed-off-by: Gao Zhenyu <gzhenyu@vmware.com>
> Signed-off-by: Shrikrishna Khare <skhare@vmware.com>
> Reviewed-by: Shreyas N Bhatewara <sbhatewara@vmware.com>
Applied, thank you.
^ permalink raw reply
* Re: [PATCH v3 1/3] net/macb: Fix comments to meet style guidelines
From: David Miller @ 2015-01-16 5:31 UTC (permalink / raw)
To: xander.huff
Cc: nicolas.ferre, david.light, netdev, jaeden.amero, rich.tollerton,
brad.mouring, linux-kernel, cyrille.pitchen
In-Reply-To: <1421358316-23660-1-git-send-email-xander.huff@ni.com>
From: Xander Huff <xander.huff@ni.com>
Date: Thu, 15 Jan 2015 15:45:14 -0600
> Change comments to not exceed 80 characters per line.
> Update block comments in macb.h to start on the line after /*.
>
> Signed-off-by: Xander Huff <xander.huff@ni.com>
Applied.
^ permalink raw reply
* Re: [PATCH v3 2/3] net/macb: Add whitespace around arithmetic operators
From: David Miller @ 2015-01-16 5:32 UTC (permalink / raw)
To: xander.huff
Cc: nicolas.ferre, david.light, netdev, jaeden.amero, rich.tollerton,
brad.mouring, linux-kernel, cyrille.pitchen
In-Reply-To: <1421358316-23660-2-git-send-email-xander.huff@ni.com>
From: Xander Huff <xander.huff@ni.com>
Date: Thu, 15 Jan 2015 15:45:15 -0600
> Spaces should surround add, multiply, and bitshift operators.
>
> Signed-off-by: Xander Huff <xander.huff@ni.com>
Applied.
^ permalink raw reply
* Re: [PATCH v3 3/3] net/macb: Create gem_ethtool_ops for new statistics functions
From: David Miller @ 2015-01-16 5:32 UTC (permalink / raw)
To: xander.huff
Cc: nicolas.ferre, david.light, netdev, jaeden.amero, rich.tollerton,
brad.mouring, linux-kernel, cyrille.pitchen
In-Reply-To: <1421358316-23660-3-git-send-email-xander.huff@ni.com>
From: Xander Huff <xander.huff@ni.com>
Date: Thu, 15 Jan 2015 15:45:16 -0600
> 10/100 MACB does not have the same statistics possibilities as GEM. Separate
> macb_ethtool_ops to make a new GEM-specific struct with the new statistics
> functions included.
>
> Signed-off-by: Xander Huff <xander.huff@ni.com>
Applied.
^ permalink raw reply
* Re: [PATCHv2 0/6] Fixes for davinci_emac
From: David Miller @ 2015-01-16 6:01 UTC (permalink / raw)
To: tony; +Cc: netdev, linux-omap
In-Reply-To: <1421361914-4612-1-git-send-email-tony@atomide.com>
From: Tony Lindgren <tony@atomide.com>
Date: Thu, 15 Jan 2015 14:45:08 -0800
> Here's a repost of the fixes for davinci_emac with patches
> updated for comments and acks collected.
Series applied, thanks Tony.
^ permalink raw reply
* Re: [PATCH v2 net] net: rps: fix cpu unplug
From: David Miller @ 2015-01-16 6:05 UTC (permalink / raw)
To: eric.dumazet; +Cc: subashab, psodagud, netdev, therbert
In-Reply-To: <1421370262.11734.111.camel@edumazet-glaptop2.roam.corp.google.com>
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 15 Jan 2015 17:04:22 -0800
> From: Eric Dumazet <edumazet@google.com>
>
> softnet_data.input_pkt_queue is protected by a spinlock that
> we must hold when transferring packets from victim queue to an active
> one. This is because other cpus could still be trying to enqueue packets
> into victim queue.
>
> A second problem is that when we transfert the NAPI poll_list from
> victim to current cpu, we absolutely need to special case the percpu
> backlog, because we do not want to add complex locking to protect
> process_queue : Only owner cpu is allowed to manipulate it, unless cpu
> is offline.
>
> Based on initial patch from Prasad Sodagudi & Subash Abhinov
> Kasiviswanathan.
>
> This version is better because we do not slow down packet processing,
> only make migration safer.
>
> Reported-by: Prasad Sodagudi <psodagud@codeaurora.org>
> Reported-by: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
Applied and queued up for -stable, thanks Eric.
^ permalink raw reply
* Re: [PATCH for-next 0/2] Refactor macros to conform to uniform standards
From: David Miller @ 2015-01-16 6:07 UTC (permalink / raw)
To: hariprasad-ut6Up61K2wZBDgjK7y7TUQ
Cc: netdev-u79uwXL29TY76Z2rM5mHXA, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
roland-BHEL68pLQRGGvPXPguhicg, leedom-ut6Up61K2wZBDgjK7y7TUQ,
anish-ut6Up61K2wZBDgjK7y7TUQ, nirranjan-ut6Up61K2wZBDgjK7y7TUQ,
praveenm-ut6Up61K2wZBDgjK7y7TUQ,
swise-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW
In-Reply-To: <1421380488-1111-1-git-send-email-hariprasad-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org>
From: Hariprasad Shenai <hariprasad-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org>
Date: Fri, 16 Jan 2015 09:24:46 +0530
> This patch series cleansup macros/register defines, defined in t4.h and
> t4fw_ri_api.h and all the affected files.
>
> This patch series is created against net-next tree and includes patches on
> iw_cxgb4 tree. Since the patches are dependent on previous cleanup patched we
> would line to get this series merged through net-next tree.
>
> We have included all the maintainers of respective drivers. Kindly review the
> change and let us know in case of any review comments.
Series applied, thanks.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
* Re: [PATCH net-next v5] rhashtable: Fix race in rhashtable_destroy() and use regular work_struct
From: David Miller @ 2015-01-16 6:19 UTC (permalink / raw)
To: ying.xue; +Cc: tgraf, sergei.shtylyov, netdev
In-Reply-To: <1421377989-7891-1-git-send-email-ying.xue@windriver.com>
From: Ying Xue <ying.xue@windriver.com>
Date: Fri, 16 Jan 2015 11:13:09 +0800
> When we put our declared work task in the global workqueue with
> schedule_delayed_work(), its delay parameter is always zero.
> Therefore, we should define a regular work in rhashtable structure
> instead of a delayed work.
>
> By the way, we add a condition to check whether resizing functions
> are NULL before cancelling the work, avoiding to cancel an
> uninitialized work.
>
> Lastly, while we wait for all work items we submitted before to run
> to completion with cancel_delayed_work(), ht->mutex has been taken in
> rhashtable_destroy(). Moreover, cancel_delayed_work() doesn't return
> until all work items are accomplished, and when work items are
> scheduled, the work's function - rht_deferred_worker() will be called.
> However, as rht_deferred_worker() also needs to acquire the lock,
> deadlock might happen at the moment as the lock is already held before.
> So if the cancel work function is moved out of the lock covered scope,
> this will avoid the deadlock.
>
> Fixes: 97defe1 ("rhashtable: Per bucket locks & deferred expansion/shrinking")
> Signed-off-by: Ying Xue <ying.xue@windriver.com>
> Cc: Thomas Graf <tgraf@suug.ch>
> Acked-by: Thomas Graf <tgraf@suug.ch>
Applied, thanks.
^ permalink raw reply
* netlink: Fix netlink_insert EADDRINUSE error
From: Herbert Xu @ 2015-01-16 6:23 UTC (permalink / raw)
To: netdev, Ying Xue
The patch c5adde9468b0714a051eac7f9666f23eb10b61f7 ("netlink:
eliminate nl_sk_hash_lock") introduced a bug where the EADDRINUSE
error has been replaced by ENOMEM. This patch rectifies that
problem.
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 01b702d..7a94185 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1050,7 +1050,7 @@ netlink_update_listeners(struct sock *sk)
static int netlink_insert(struct sock *sk, struct net *net, u32 portid)
{
struct netlink_table *table = &nl_table[sk->sk_protocol];
- int err = -EADDRINUSE;
+ int err;
lock_sock(sk);
@@ -1065,10 +1065,13 @@ static int netlink_insert(struct sock *sk, struct net *net, u32 portid)
nlk_sk(sk)->portid = portid;
sock_hold(sk);
- if (__netlink_insert(table, sk, net))
- err = 0;
- else
+
+ err = 0;
+ if (!__netlink_insert(table, sk, net)) {
+ err = -EADDRINUSE;
sock_put(sk);
+ }
+
err:
release_sock(sk);
return err;
--
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
^ permalink raw reply related
* Re: netlink: Fix netlink_insert EADDRINUSE error
From: Ying Xue @ 2015-01-16 6:30 UTC (permalink / raw)
To: Herbert Xu, netdev
In-Reply-To: <20150116062348.GA8588@gondor.apana.org.au>
On 01/16/2015 02:23 PM, Herbert Xu wrote:
> The patch c5adde9468b0714a051eac7f9666f23eb10b61f7 ("netlink:
> eliminate nl_sk_hash_lock") introduced a bug where the EADDRINUSE
> error has been replaced by ENOMEM. This patch rectifies that
> problem.
>
Nice catch!
Acked-by: Ying Xue <ying.xue@windriver.com>
> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
>
> diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
> index 01b702d..7a94185 100644
> --- a/net/netlink/af_netlink.c
> +++ b/net/netlink/af_netlink.c
> @@ -1050,7 +1050,7 @@ netlink_update_listeners(struct sock *sk)
> static int netlink_insert(struct sock *sk, struct net *net, u32 portid)
> {
> struct netlink_table *table = &nl_table[sk->sk_protocol];
> - int err = -EADDRINUSE;
> + int err;
>
> lock_sock(sk);
>
> @@ -1065,10 +1065,13 @@ static int netlink_insert(struct sock *sk, struct net *net, u32 portid)
>
> nlk_sk(sk)->portid = portid;
> sock_hold(sk);
> - if (__netlink_insert(table, sk, net))
> - err = 0;
> - else
> +
> + err = 0;
> + if (!__netlink_insert(table, sk, net)) {
> + err = -EADDRINUSE;
> sock_put(sk);
> + }
> +
> err:
> release_sock(sk);
> return err;
>
^ permalink raw reply
* Re: [PATCH for 3.19 2/3] rtlwifi: Fix handling of new style descriptors
From: Kalle Valo @ 2015-01-16 6:40 UTC (permalink / raw)
To: Larry Finger
Cc: 谭杭波, linux-wireless@vger.kernel.org,
netdev@vger.kernel.org
In-Reply-To: <54B81EAA.9040706-tQ5ms3gMjBLk1uMJSBkQmQ@public.gmane.org>
Larry Finger <Larry.Finger-tQ5ms3gMjBLk1uMJSBkQmQ@public.gmane.org> writes:
> Troy and I will try to prepare a patch that only fixes the bugs, and
> we will submit the cleanup for -next.
That's great, thank you.
--
Kalle Valo
--
To unsubscribe from this list: send the line "unsubscribe linux-wireless" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
* UDP checksum handling in UFO packets from raw sockets
From: Michal Kubecek @ 2015-01-16 6:52 UTC (permalink / raw)
To: netdev
Hello,
I'm working on an issue with sending over-MTU UDP datagrams from a raw
socket via a virtio_net interface. The problem is quite clear:
ip_ufo_append_data() sets skb->ip_summed to CHECKSUM_PARTIAL
unconditionally but skb->csum_start and skb->csum_offset are never set
properly as it is normally done in udp_send_skb() which these packets
never pass through.
There are few possible solutions but I realized I have no idea which
behaviour would be the correct one (documentation is either missing or
unclear).
1. Make sure that for UFO packets csum_start and csum_offset are always
set even if they come from a raw socket. Pro: consistent with UFO
packets from regular UDP sockets, easy. Con: if sender sets the checksum
field to zero or sets SO_NO_CHECK, we ignore his wish (one could even
argue that we shouldn't touch higher layer headers at all for raw
sockets). It would be also inconsistent between UFO and non-UFO packets.
2. Always preserve UDP checksum set by userspace and set ip_summed to
CHECKSUM_NONE. Pro: we preserve the UDP datagram as provided by
userspace application which seems to be the logic behind raw sockets.
Con: this would require an exception in skb_gso_segment() which
currently issues a WARN as it doesn't expect packets other than
CHECKSUM_PARTIAL.
3. Do (2) if checksum field is zero ("no checksum" according to RFC 768)
or socket has SO_NO_CHECK option, (1) otherwise. Both pros and cons are
combination of those of (1) and (2).
4. Don't allow UFO for UDP packets from raw sockets. Pro: very easy,
consistency between "short" and "long" datagrams. Con: inefficient (but
using raw sockets to generate UDP traffic is unusual and rare).
I suppose the key question is: what are we supposed to do with the UDP
checksum? Should we always preserve it (user expects us to send the
datagram they created), always recalculate (we do so for IPv4 checksum
for raw sockets with IP_HDRINCL option) or something between
(recalculate unless it's zero or SO_NO_CHECK is set)? I would be
thankful for any ideas or references to documents saying what it should
work like.
Michal Kubecek
^ permalink raw reply
* [PATCH net-next] iproute2: bridge: support vlan range
From: roopa @ 2015-01-16 6:52 UTC (permalink / raw)
To: netdev, shemminger, vyasevic; +Cc: wkok
From: Roopa Prabhu <roopa@cumulusnetworks.com>
This patch adds vlan range support to bridge command
using the newly added vinfo flags BRIDGE_VLAN_INFO_RANGE_BEGIN and
BRIDGE_VLAN_INFO_RANGE_END.
$bridge vlan show
port vlan ids
br0 1 PVID Egress Untagged
dummy0 1 PVID Egress Untagged
$bridge vlan add vid 10-15 dev dummy0
port vlan ids
br0 1 PVID Egress Untagged
dummy0 1 PVID Egress Untagged
10
11
12
13
14
15
$bridge vlan del vid 14 dev dummy0
$bridge vlan show
port vlan ids
br0 1 PVID Egress Untagged
dummy0 1 PVID Egress Untagged
10
11
12
13
15
$bridge vlan del vid 10-15 dev dummy0
$bridge vlan show
port vlan ids
br0 1 PVID Egress Untagged
dummy0 1 PVID Egress Untagged
Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: Wilson Kok <wkok@cumulusnetworks.com>
---
bridge/vlan.c | 46 ++++++++++++++++++++++++++++++++++++++-------
include/linux/if_bridge.h | 2 ++
2 files changed, 41 insertions(+), 7 deletions(-)
diff --git a/bridge/vlan.c b/bridge/vlan.c
index 3bd7b0d..90b3b6b 100644
--- a/bridge/vlan.c
+++ b/bridge/vlan.c
@@ -32,6 +32,7 @@ static int vlan_modify(int cmd, int argc, char **argv)
} req;
char *d = NULL;
short vid = -1;
+ short vid_end = -1;
struct rtattr *afspec;
struct bridge_vlan_info vinfo;
unsigned short flags = 0;
@@ -49,8 +50,18 @@ static int vlan_modify(int cmd, int argc, char **argv)
NEXT_ARG();
d = *argv;
} else if (strcmp(*argv, "vid") == 0) {
+ char *p;
NEXT_ARG();
- vid = atoi(*argv);
+ p = strchr(*argv, '-');
+ if (p) {
+ *p = '\0';
+ p++;
+ vinfo.vid = atoi(*argv);
+ vid_end = atoi(p);
+ vinfo.flags |= BRIDGE_VLAN_INFO_RANGE_BEGIN;
+ } else {
+ vinfo.vid = atoi(*argv);
+ }
} else if (strcmp(*argv, "self") == 0) {
flags |= BRIDGE_FLAGS_SELF;
} else if (strcmp(*argv, "master") == 0) {
@@ -67,7 +78,7 @@ static int vlan_modify(int cmd, int argc, char **argv)
argc--; argv++;
}
- if (d == NULL || vid == -1) {
+ if (d == NULL || vinfo.vid == -1) {
fprintf(stderr, "Device and VLAN ID are required arguments.\n");
exit(-1);
}
@@ -78,20 +89,41 @@ static int vlan_modify(int cmd, int argc, char **argv)
return -1;
}
- if (vid >= 4096) {
- fprintf(stderr, "Invalid VLAN ID \"%hu\"\n", vid);
+ if (vinfo.vid >= 4096) {
+ fprintf(stderr, "Invalid VLAN ID \"%hu\"\n", vinfo.vid);
return -1;
}
- vinfo.vid = vid;
+ if (vinfo.flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) {
+ if (vid_end == -1 || vid_end >= 4096 || vinfo.vid >= vid_end) {
+ fprintf(stderr, "Invalid VLAN range \"%hu-%hu\"\n",
+ vinfo.vid, vid_end);
+ return -1;
+ }
+ if (vinfo.flags & BRIDGE_VLAN_INFO_PVID) {
+ fprintf(stderr,
+ "pvid cannot be configured for a vlan range\n");
+ return -1;
+ }
+ }
afspec = addattr_nest(&req.n, sizeof(req), IFLA_AF_SPEC);
if (flags)
addattr16(&req.n, sizeof(req), IFLA_BRIDGE_FLAGS, flags);
- addattr_l(&req.n, sizeof(req), IFLA_BRIDGE_VLAN_INFO, &vinfo,
- sizeof(vinfo));
+ if (vid_end != -1) {
+ addattr_l(&req.n, sizeof(req), IFLA_BRIDGE_VLAN_INFO, &vinfo,
+ sizeof(vinfo));
+ vinfo.flags &= ~BRIDGE_VLAN_INFO_RANGE_BEGIN;
+ vinfo.flags |= BRIDGE_VLAN_INFO_RANGE_END;
+ vinfo.vid = vid_end;
+ addattr_l(&req.n, sizeof(req), IFLA_BRIDGE_VLAN_INFO, &vinfo,
+ sizeof(vinfo));
+ } else {
+ addattr_l(&req.n, sizeof(req), IFLA_BRIDGE_VLAN_INFO, &vinfo,
+ sizeof(vinfo));
+ }
addattr_nest_end(&req.n, afspec);
diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index ed6868e..e21a649 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -124,6 +124,8 @@ enum {
#define BRIDGE_VLAN_INFO_MASTER (1<<0) /* Operate on Bridge device as well */
#define BRIDGE_VLAN_INFO_PVID (1<<1) /* VLAN is PVID, ingress untagged */
#define BRIDGE_VLAN_INFO_UNTAGGED (1<<2) /* VLAN egresses untagged */
+#define BRIDGE_VLAN_INFO_RANGE_BEGIN (1<<3) /* VLAN is start of vlan range */
+#define BRIDGE_VLAN_INFO_RANGE_END (1<<4) /* VLAN is end of vlan range */
struct bridge_vlan_info {
__u16 flags;
--
1.7.10.4
^ permalink raw reply related
* Re: [PATCH] [PATCH] net: sxgbe: Fix waring for double kfree()
From: Dan Carpenter @ 2015-01-16 7:12 UTC (permalink / raw)
To: David Miller; +Cc: kgene, netdev, davem, bh74.an
In-Reply-To: <20150115.191416.1303150560556013280.davem@davemloft.net>
On Thu, Jan 15, 2015 at 07:14:16PM -0500, David Miller wrote:
> From: Kukjin Kim <kgene@kernel.org>
> Date: Thu, 15 Jan 2015 10:43:11 +0900
>
> > From: Byungho An <bh74.an@samsung.com>
> >
> > This patch fixes double kfree() calls at init_rx_ring() because
> > it causes static checker warning.
> >
> > Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
> > Signed-off-by: Byungho An <bh74.an@samsung.com>
> > Signed-off-by: Kukjin Kim <kgene@kernel.org>
>
> Applied.
It does silence the warning but it doesn't fix the bug.
Kukjin, let me know if you have any questions. I can write the fix if
you need me to.
regards,
dan carpenter
^ permalink raw reply
* Re: [PATCH 0/2] Remove T4 FCoE support
From: Hannes Reinecke @ 2015-01-16 7:19 UTC (permalink / raw)
To: Praveen Madhavan, David Miller
Cc: netdev, linux-scsi, JBottomley, hch, hariprasad, varun
In-Reply-To: <20150116042616.GA1264@fcoe-test11>
On 01/16/2015 05:32 AM, Praveen Madhavan wrote:
> On Thu, Jan 15, 2015 at 02:04:53PM -0500, David Miller wrote:
>> From: Praveen Madhavan <praveenm@chelsio.com>
>> Date: Thu, 15 Jan 2015 19:15:50 +0530
>>
>>> These patches removes FCoE support for chelsio T4 adapter.
>>> Please apply on net-next since depends on previous commits.
>>
>> Why is it being removed? You have to state this in the
>> commit log messages at a minimum.
>>
> We found a subtle issue with FCoE on T4 very late in the game
> and decided not to productize FCoE on T4 and therefore there
> are no customers that will be impacted by this change. FCoE is
> supported on T5 cards.
> Sorry about that, not mentioning in commit log. Can i resend this
> patch with updated commit log?
>
Yes, please do.
Cheers,
Hannes
--
Dr. Hannes Reinecke zSeries & Storage
hare@suse.de +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: F. Imendörffer, J. Smithard, J. Guild, D. Upmanyu, G. Norton
HRB 21284 (AG Nürnberg)
--
To unsubscribe from this list: send the line "unsubscribe linux-scsi" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
* Re: pull-request: mac80211-next 2015-01-15
From: Johannes Berg @ 2015-01-16 7:36 UTC (permalink / raw)
To: David Miller
Cc: netdev-u79uwXL29TY76Z2rM5mHXA,
linux-wireless-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20150115.192820.2095761147762961625.davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>
On Thu, 2015-01-15 at 19:28 -0500, David Miller wrote:
> I had to resolve a minor merge conflict, please take a look to make sure
> I got it right.
Ah, yes, sorry about that - I knew about the conflict (had resolved it
numerous times) but forgot to give you a heads-up.
What you did is good, thanks.
johannes
--
To unsubscribe from this list: send the line "unsubscribe linux-wireless" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
* Re: netlink: Fix netlink_insert EADDRINUSE error
From: David Miller @ 2015-01-16 7:38 UTC (permalink / raw)
To: herbert; +Cc: netdev, ying.xue
In-Reply-To: <20150116062348.GA8588@gondor.apana.org.au>
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Fri, 16 Jan 2015 17:23:48 +1100
> The patch c5adde9468b0714a051eac7f9666f23eb10b61f7 ("netlink:
> eliminate nl_sk_hash_lock") introduced a bug where the EADDRINUSE
> error has been replaced by ENOMEM. This patch rectifies that
> problem.
>
> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Applied, thanks Herbert.
^ permalink raw reply
* [PATCH] net: ipv4: Fix incorrect free in ICMP receive
From: subashab @ 2015-01-16 7:48 UTC (permalink / raw)
To: netdev
An exception is seen in ICMP ping receive path where the skb
destructor sock_rfree() tries to access a freed socket. This happens
because ping_rcv() releases socket reference with sock_put() and this
internally frees up the socket. Later icmp_rcv() will try to free the
skb and as part of this, skb destructor is called and panics as the
socket is freed already in ping_rcv().
WARN stack trace @ WARN_ON(atomic_read(&sk->sk_rmem_alloc));
dump_backtrace+0x0/0x248
show_stack+0x10/0x1c
dump_stack+0x1c/0x28
warn_slowpath_common+0x74/0x9c
warn_slowpath_null+0x14/0x20
inet_sock_destruct+0x130/0x1a0
__sk_free+0x1c/0x168
sk_free+0x24/0x30
ping_rcv+0xf4/0x124
icmp_rcv+0x224/0x2c4
ip_local_deliver_finish+0x108/0x214
ip_local_deliver+0x88/0xa0
ip_rcv_finish+0x234/0x284
ip_rcv+0x258/0x2e8
__netif_receive_skb_core+0x640/0x6b4
<snip>
-->|exception
-007|sk_mem_uncharge
-007|sock_rfree
-008|skb_release_head_state
-009|skb_release_all
-009|__kfree_skb
-010|kfree_skb
-011|icmp_rcv
-012|ip_local_deliver_finish
Fix this by orphaning the skb's before freeing the socket
Signed-off-by: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
---
net/ipv4/af_inet.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index b507a47..0c58f0e5 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -147,6 +147,12 @@ EXPORT_SYMBOL(ipv4_config);
void inet_sock_destruct(struct sock *sk)
{
struct inet_sock *inet = inet_sk(sk);
+ struct sk_buff *skb;
+
+ skb_queue_walk(&sk->sk_receive_queue, skb)
+ skb_orphan(skb);
+ skb_queue_walk(&sk->sk_error_queue, skb)
+ skb_orphan(skb);
__skb_queue_purge(&sk->sk_receive_queue);
__skb_queue_purge(&sk->sk_error_queue);
--
The Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
a Linux Foundation Collaborative Project
^ permalink raw reply related
* Re: [PATCH net-next v12 5/5] openvswitch: Add support for unique flow IDs.
From: Pravin Shelar @ 2015-01-16 8:07 UTC (permalink / raw)
To: Joe Stringer; +Cc: netdev, LKML, dev@openvswitch.org
In-Reply-To: <1421358507-5992-6-git-send-email-joestringer@nicira.com>
On Thu, Jan 15, 2015 at 1:48 PM, Joe Stringer <joestringer@nicira.com> wrote:
> Previously, flows were manipulated by userspace specifying a full,
> unmasked flow key. This adds significant burden onto flow
> serialization/deserialization, particularly when dumping flows.
>
> This patch adds an alternative way to refer to flows using a
> variable-length "unique flow identifier" (UFID). At flow setup time,
> userspace may specify a UFID for a flow, which is stored with the flow
> and inserted into a separate table for lookup, in addition to the
> standard flow table. Flows created using a UFID must be fetched or
> deleted using the UFID.
>
> All flow dump operations may now be made more terse with OVS_UFID_F_*
> flags. For example, the OVS_UFID_F_OMIT_KEY flag allows responses to
> omit the flow key from a datapath operation if the flow has a
> corresponding UFID. This significantly reduces the time spent assembling
> and transacting netlink messages. With all OVS_UFID_F_OMIT_* flags
> enabled, the datapath only returns the UFID and statistics for each flow
> during flow dump, increasing ovs-vswitchd revalidator performance by 40%
> or more.
>
> Signed-off-by: Joe Stringer <joestringer@nicira.com>
Patch looks pretty good now. I have one comment below.
> +#define MAX_UFID_LENGTH 16 /* 128 bits */
> +
> +struct sw_flow_id {
> + u32 ufid_len;
> + union {
> + u32 ufid[MAX_UFID_LENGTH / 4];
> + struct sw_flow_key flow_key;
> + };
> +};
> +
> struct sw_flow_actions {
> struct rcu_head rcu;
> u32 actions_len;
> @@ -213,13 +223,15 @@ struct flow_stats {
>
> struct sw_flow {
> struct rcu_head rcu;
> - struct hlist_node hash_node[2];
> - u32 hash;
> + struct {
> + struct hlist_node node[2];
> + u32 hash;
> + } flow_table, ufid_table;
> int stats_last_writer; /* NUMA-node id of the last writer on
> * 'stats[0]'.
> */
> struct sw_flow_key key;
> - struct sw_flow_key unmasked_key;
> + struct sw_flow_id *id;
> struct sw_flow_mask *mask;
> struct sw_flow_actions __rcu *sf_acts;
> struct flow_stats __rcu *stats[]; /* One for each NUMA node. First one
> @@ -243,6 +255,16 @@ struct arp_eth_header {
> unsigned char ar_tip[4]; /* target IP address */
> } __packed;
>
In last round we agreed on following struct flow-id which saves around
four hundred bytes per flow and kmalloc per flow add operation for
common case. Is there any reason for not doing it?
struct {
u32 ufid_len;
union {
u32 ufid[MAX_UFID_LENGTH / 4];
struct sw_flow_key *unmasked_key;
}
} id;
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox