Linux userland API discussions
 help / color / mirror / Atom feed
* [PATCH v3 linux-trace 1/8] tracing: attach eBPF programs to tracepoints and syscalls
From: Alexei Starovoitov @ 2015-02-10  3:45 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Ingo Molnar, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, linux-api-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1423539961-21792-1-git-send-email-ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>

User interface:
struct perf_event_attr attr = {.type = PERF_TYPE_TRACEPOINT, .config = event_id, ...};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);

prog_fd is a file descriptor associated with eBPF program previously loaded.
event_id is an ID of static tracepoint event or syscall.
(kprobe support is in next patch)

close(event_fd) - automatically detaches eBPF program from it

eBPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- fetch_ptr/u64/u32/u16/u8 values from unsafe address via probe_kernel_read(),
  so that eBPF program can walk any kernel data structures
- probe_memcmp - combination of probe_kernel_read() and memcmp()

Signed-off-by: Alexei Starovoitov <ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>
---
 include/linux/bpf.h             |    6 +-
 include/linux/ftrace_event.h    |   11 +++
 include/trace/bpf_trace.h       |   25 +++++++
 include/trace/ftrace.h          |   31 +++++++++
 include/uapi/linux/bpf.h        |    7 ++
 include/uapi/linux/perf_event.h |    1 +
 kernel/events/core.c            |   55 +++++++++++++++
 kernel/trace/Makefile           |    1 +
 kernel/trace/bpf_trace.c        |  145 +++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace_syscalls.c   |   35 ++++++++++
 10 files changed, 316 insertions(+), 1 deletion(-)
 create mode 100644 include/trace/bpf_trace.h
 create mode 100644 kernel/trace/bpf_trace.c

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index bbfceb756452..a0f6f636ced0 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -130,10 +130,14 @@ struct bpf_prog_aux {
 
 #ifdef CONFIG_BPF_SYSCALL
 void bpf_prog_put(struct bpf_prog *prog);
+struct bpf_prog *bpf_prog_get(u32 ufd);
 #else
 static inline void bpf_prog_put(struct bpf_prog *prog) {}
+static inline struct bpf_prog *bpf_prog_get(u32 ufd)
+{
+	return ERR_PTR(-ENOENT);
+}
 #endif
-struct bpf_prog *bpf_prog_get(u32 ufd);
 /* verify correctness of eBPF program */
 int bpf_check(struct bpf_prog *fp, union bpf_attr *attr);
 
diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 0bebb5c348b8..479d0a4a42b3 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -13,6 +13,7 @@ struct trace_array;
 struct trace_buffer;
 struct tracer;
 struct dentry;
+struct bpf_prog;
 
 struct trace_print_flags {
 	unsigned long		mask;
@@ -299,6 +300,7 @@ struct ftrace_event_call {
 #ifdef CONFIG_PERF_EVENTS
 	int				perf_refcount;
 	struct hlist_head __percpu	*perf_events;
+	struct bpf_prog			*prog;
 
 	int	(*perf_perm)(struct ftrace_event_call *,
 			     struct perf_event *);
@@ -544,6 +546,15 @@ event_trigger_unlock_commit_regs(struct ftrace_event_file *file,
 		event_triggers_post_call(file, tt);
 }
 
+#ifdef CONFIG_BPF_SYSCALL
+unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx);
+#else
+static inline unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
+{
+	return 1;
+}
+#endif
+
 enum {
 	FILTER_OTHER = 0,
 	FILTER_STATIC_STRING,
diff --git a/include/trace/bpf_trace.h b/include/trace/bpf_trace.h
new file mode 100644
index 000000000000..4e64f61f484d
--- /dev/null
+++ b/include/trace/bpf_trace.h
@@ -0,0 +1,25 @@
+/* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#ifndef _LINUX_KERNEL_BPF_TRACE_H
+#define _LINUX_KERNEL_BPF_TRACE_H
+
+/* For tracepoint filters argN fields match one to one to arguments
+ * passed to tracepoint events
+ *
+ * For syscall entry filters argN fields match syscall arguments
+ * For syscall exit filters arg1 is a return value
+ */
+struct bpf_context {
+	u64 arg1;
+	u64 arg2;
+	u64 arg3;
+	u64 arg4;
+	u64 arg5;
+	u64 arg6;
+};
+
+#endif /* _LINUX_KERNEL_BPF_TRACE_H */
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 139b5067345b..4c275ce2dcf0 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -17,6 +17,7 @@
  */
 
 #include <linux/ftrace_event.h>
+#include <trace/bpf_trace.h>
 
 /*
  * DECLARE_EVENT_CLASS can be used to add a generic function
@@ -755,12 +756,32 @@ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call
 #undef __perf_task
 #define __perf_task(t)	(__task = (t))
 
+/* zero extend integer, pointer or aggregate type to u64 without warnings */
+#define __CAST_TO_U64(EXPR) ({ \
+	u64 ret = 0; \
+	typeof(EXPR) expr = EXPR; \
+	switch (sizeof(expr)) { \
+	case 8: ret = *(u64 *) &expr; break; \
+	case 4: ret = *(u32 *) &expr; break; \
+	case 2: ret = *(u16 *) &expr; break; \
+	case 1: ret = *(u8 *) &expr; break; \
+	} \
+	ret; })
+
+#define __BPF_CAST1(a,...) __CAST_TO_U64(a)
+#define __BPF_CAST2(a,...) __CAST_TO_U64(a), __BPF_CAST1(__VA_ARGS__)
+#define __BPF_CAST3(a,...) __CAST_TO_U64(a), __BPF_CAST2(__VA_ARGS__)
+#define __BPF_CAST4(a,...) __CAST_TO_U64(a), __BPF_CAST3(__VA_ARGS__)
+#define __BPF_CAST5(a,...) __CAST_TO_U64(a), __BPF_CAST4(__VA_ARGS__)
+#define __BPF_CAST6(a,...) __CAST_TO_U64(a), __BPF_CAST5(__VA_ARGS__)
+
 #undef DECLARE_EVENT_CLASS
 #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print)	\
 static notrace void							\
 perf_trace_##call(void *__data, proto)					\
 {									\
 	struct ftrace_event_call *event_call = __data;			\
+	struct bpf_prog *prog = event_call->prog;			\
 	struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\
 	struct ftrace_raw_##call *entry;				\
 	struct pt_regs __regs;						\
@@ -771,6 +792,16 @@ perf_trace_##call(void *__data, proto)					\
 	int __data_size;						\
 	int rctx;							\
 									\
+	if (prog) {							\
+		__maybe_unused const u64 z = 0;				\
+		struct bpf_context __ctx = ((struct bpf_context) {	\
+				__BPF_CAST6(args, z, z, z, z, z)	\
+			});						\
+									\
+		if (!trace_call_bpf(prog, &__ctx))			\
+			return;						\
+	}								\
+									\
 	__data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
 									\
 	head = this_cpu_ptr(event_call->perf_events);			\
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 45da7ec7d274..d73d7d0abe6e 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -118,6 +118,7 @@ enum bpf_map_type {
 enum bpf_prog_type {
 	BPF_PROG_TYPE_UNSPEC,
 	BPF_PROG_TYPE_SOCKET_FILTER,
+	BPF_PROG_TYPE_TRACEPOINT,
 };
 
 /* flags for BPF_MAP_UPDATE_ELEM command */
@@ -162,6 +163,12 @@ enum bpf_func_id {
 	BPF_FUNC_map_lookup_elem, /* void *map_lookup_elem(&map, &key) */
 	BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, flags) */
 	BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */
+	BPF_FUNC_fetch_ptr,       /* void *bpf_fetch_ptr(void *unsafe_ptr) */
+	BPF_FUNC_fetch_u64,       /* u64 bpf_fetch_u64(void *unsafe_ptr) */
+	BPF_FUNC_fetch_u32,       /* u32 bpf_fetch_u32(void *unsafe_ptr) */
+	BPF_FUNC_fetch_u16,       /* u16 bpf_fetch_u16(void *unsafe_ptr) */
+	BPF_FUNC_fetch_u8,        /* u8 bpf_fetch_u8(void *unsafe_ptr) */
+	BPF_FUNC_probe_memcmp,    /* int bpf_probe_memcmp(unsafe_ptr, safe_ptr, size) */
 	__BPF_FUNC_MAX_ID,
 };
 
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 9b79abbd1ab8..d7ba67234761 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -360,6 +360,7 @@ struct perf_event_attr {
 #define PERF_EVENT_IOC_SET_OUTPUT	_IO ('$', 5)
 #define PERF_EVENT_IOC_SET_FILTER	_IOW('$', 6, char *)
 #define PERF_EVENT_IOC_ID		_IOR('$', 7, __u64 *)
+#define PERF_EVENT_IOC_SET_BPF		_IOW('$', 8, __u32)
 
 enum perf_event_ioc_flags {
 	PERF_IOC_FLAG_GROUP		= 1U << 0,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 882f835a0d85..674a8ca17190 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -42,6 +42,8 @@
 #include <linux/module.h>
 #include <linux/mman.h>
 #include <linux/compat.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
 
 #include "internal.h"
 
@@ -3283,6 +3285,7 @@ errout:
 }
 
 static void perf_event_free_filter(struct perf_event *event);
+static void perf_event_free_bpf_prog(struct perf_event *event);
 
 static void free_event_rcu(struct rcu_head *head)
 {
@@ -3292,6 +3295,7 @@ static void free_event_rcu(struct rcu_head *head)
 	if (event->ns)
 		put_pid_ns(event->ns);
 	perf_event_free_filter(event);
+	perf_event_free_bpf_prog(event);
 	kfree(event);
 }
 
@@ -3795,6 +3799,7 @@ static inline int perf_fget_light(int fd, struct fd *p)
 static int perf_event_set_output(struct perf_event *event,
 				 struct perf_event *output_event);
 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
+static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
 
 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
@@ -3849,6 +3854,9 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case PERF_EVENT_IOC_SET_FILTER:
 		return perf_event_set_filter(event, (void __user *)arg);
 
+	case PERF_EVENT_IOC_SET_BPF:
+		return perf_event_set_bpf_prog(event, arg);
+
 	default:
 		return -ENOTTY;
 	}
@@ -6266,6 +6274,45 @@ static void perf_event_free_filter(struct perf_event *event)
 	ftrace_profile_free_filter(event);
 }
 
+static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
+{
+	struct bpf_prog *prog;
+
+	if (event->attr.type != PERF_TYPE_TRACEPOINT)
+		return -EINVAL;
+
+	if (event->tp_event->prog)
+		return -EEXIST;
+
+	prog = bpf_prog_get(prog_fd);
+	if (IS_ERR(prog))
+		return PTR_ERR(prog);
+
+	if (prog->aux->prog_type != BPF_PROG_TYPE_TRACEPOINT) {
+		/* valid fd, but invalid bpf program type */
+		bpf_prog_put(prog);
+		return -EINVAL;
+	}
+
+	event->tp_event->prog = prog;
+
+	return 0;
+}
+
+static void perf_event_free_bpf_prog(struct perf_event *event)
+{
+	struct bpf_prog *prog;
+
+	if (!event->tp_event)
+		return;
+
+	prog = event->tp_event->prog;
+	if (prog) {
+		event->tp_event->prog = NULL;
+		bpf_prog_put(prog);
+	}
+}
+
 #else
 
 static inline void perf_tp_register(void)
@@ -6281,6 +6328,14 @@ static void perf_event_free_filter(struct perf_event *event)
 {
 }
 
+static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
+{
+	return -ENOENT;
+}
+
+static void perf_event_free_bpf_prog(struct perf_event *event)
+{
+}
 #endif /* CONFIG_EVENT_TRACING */
 
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 979ccde26720..54ae225e5fc6 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -53,6 +53,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
 endif
 obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
 obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o
+obj-$(CONFIG_BPF_SYSCALL) += bpf_trace.o
 obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
 obj-$(CONFIG_TRACEPOINTS) += power-traces.o
 ifeq ($(CONFIG_PM),y)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
new file mode 100644
index 000000000000..ec065e0a364e
--- /dev/null
+++ b/kernel/trace/bpf_trace.c
@@ -0,0 +1,145 @@
+/* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/uaccess.h>
+#include <trace/bpf_trace.h>
+#include "trace.h"
+
+unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
+{
+	unsigned int ret;
+
+	if (in_nmi()) /* not supported yet */
+		return 1;
+
+	rcu_read_lock();
+	ret = BPF_PROG_RUN(prog, ctx);
+	rcu_read_unlock();
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(trace_call_bpf);
+
+static u64 bpf_fetch_ptr(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	void *unsafe_ptr = (void *) (long) r1;
+	void *ptr = NULL;
+
+	probe_kernel_read(&ptr, unsafe_ptr, sizeof(ptr));
+	return (u64) (unsigned long) ptr;
+}
+
+#define FETCH(SIZE) \
+static u64 bpf_fetch_##SIZE(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)	\
+{									\
+	void *unsafe_ptr = (void *) (long) r1;				\
+	SIZE val = 0;							\
+									\
+	probe_kernel_read(&val, unsafe_ptr, sizeof(val));		\
+	return (u64) (SIZE) val;					\
+}
+FETCH(u64)
+FETCH(u32)
+FETCH(u16)
+FETCH(u8)
+#undef FETCH
+
+static u64 bpf_probe_memcmp(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	void *unsafe_ptr = (void *) (long) r1;
+	void *safe_ptr = (void *) (long) r2;
+	u32 size = (u32) r3;
+	char buf[64];
+	int err;
+
+	if (size < 64) {
+		err = probe_kernel_read(buf, unsafe_ptr, size);
+		if (err)
+			return err;
+		return memcmp(buf, safe_ptr, size);
+	}
+	return -1;
+}
+
+static struct bpf_func_proto tp_prog_funcs[] = {
+#define FETCH(SIZE)				\
+	[BPF_FUNC_fetch_##SIZE] = {		\
+		.func = bpf_fetch_##SIZE,	\
+		.gpl_only = true,		\
+		.ret_type = RET_INTEGER,	\
+	},
+	FETCH(ptr)
+	FETCH(u64)
+	FETCH(u32)
+	FETCH(u16)
+	FETCH(u8)
+#undef FETCH
+	[BPF_FUNC_probe_memcmp] = {
+		.func = bpf_probe_memcmp,
+		.gpl_only = false,
+		.ret_type = RET_INTEGER,
+		.arg1_type = ARG_ANYTHING,
+		.arg2_type = ARG_PTR_TO_STACK,
+		.arg3_type = ARG_CONST_STACK_SIZE,
+	},
+};
+
+static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
+{
+	switch (func_id) {
+	case BPF_FUNC_map_lookup_elem:
+		return &bpf_map_lookup_elem_proto;
+	case BPF_FUNC_map_update_elem:
+		return &bpf_map_update_elem_proto;
+	case BPF_FUNC_map_delete_elem:
+		return &bpf_map_delete_elem_proto;
+	default:
+		if (func_id < 0 || func_id >= ARRAY_SIZE(tp_prog_funcs))
+			return NULL;
+		return &tp_prog_funcs[func_id];
+	}
+}
+
+/* check access to argN fields of 'struct bpf_context' from program */
+static bool tp_prog_is_valid_access(int off, int size,
+				    enum bpf_access_type type)
+{
+	/* check bounds */
+	if (off < 0 || off >= sizeof(struct bpf_context))
+		return false;
+
+	/* only read is allowed */
+	if (type != BPF_READ)
+		return false;
+
+	/* disallow misaligned access */
+	if (off % size != 0)
+		return false;
+
+	return true;
+}
+
+static struct bpf_verifier_ops tp_prog_ops = {
+	.get_func_proto = tp_prog_func_proto,
+	.is_valid_access = tp_prog_is_valid_access,
+};
+
+static struct bpf_prog_type_list tl = {
+	.ops = &tp_prog_ops,
+	.type = BPF_PROG_TYPE_TRACEPOINT,
+};
+
+static int __init register_tp_prog_ops(void)
+{
+	bpf_register_prog_type(&tl);
+	return 0;
+}
+late_initcall(register_tp_prog_ops);
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index c6ee36fcbf90..3487c41f4c0e 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -7,6 +7,7 @@
 #include <linux/ftrace.h>
 #include <linux/perf_event.h>
 #include <asm/syscall.h>
+#include <trace/bpf_trace.h>
 
 #include "trace_output.h"
 #include "trace.h"
@@ -545,11 +546,26 @@ static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
 static int sys_perf_refcount_enter;
 static int sys_perf_refcount_exit;
 
+static void populate_bpf_ctx(struct bpf_context *ctx, struct pt_regs *regs)
+{
+	struct task_struct *task = current;
+	unsigned long args[6];
+
+	syscall_get_arguments(task, regs, 0, 6, args);
+	ctx->arg1 = args[0];
+	ctx->arg2 = args[1];
+	ctx->arg3 = args[2];
+	ctx->arg4 = args[3];
+	ctx->arg5 = args[4];
+	ctx->arg6 = args[5];
+}
+
 static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 {
 	struct syscall_metadata *sys_data;
 	struct syscall_trace_enter *rec;
 	struct hlist_head *head;
+	struct bpf_prog *prog;
 	int syscall_nr;
 	int rctx;
 	int size;
@@ -564,6 +580,15 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 	if (!sys_data)
 		return;
 
+	prog = sys_data->enter_event->prog;
+	if (prog) {
+		struct bpf_context ctx;
+
+		populate_bpf_ctx(&ctx, regs);
+		if (!trace_call_bpf(prog, &ctx))
+			return;
+	}
+
 	head = this_cpu_ptr(sys_data->enter_event->perf_events);
 	if (hlist_empty(head))
 		return;
@@ -624,6 +649,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 	struct syscall_metadata *sys_data;
 	struct syscall_trace_exit *rec;
 	struct hlist_head *head;
+	struct bpf_prog *prog;
 	int syscall_nr;
 	int rctx;
 	int size;
@@ -638,6 +664,15 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 	if (!sys_data)
 		return;
 
+	prog = sys_data->exit_event->prog;
+	if (prog) {
+		struct bpf_context ctx = {};
+
+		ctx.arg1 = syscall_get_return_value(current, regs);
+		if (!trace_call_bpf(prog, &ctx))
+			return;
+	}
+
 	head = this_cpu_ptr(sys_data->exit_event->perf_events);
 	if (hlist_empty(head))
 		return;
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v3 linux-trace 2/8] tracing: allow eBPF programs to call ktime_get_ns()
From: Alexei Starovoitov @ 2015-02-10  3:45 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Ingo Molnar, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, linux-api, netdev, linux-kernel
In-Reply-To: <1423539961-21792-1-git-send-email-ast@plumgrid.com>

bpf_ktime_get_ns() is used by programs to compue time delta between events
or as a timestamp

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
 include/uapi/linux/bpf.h |    1 +
 kernel/trace/bpf_trace.c |   10 ++++++++++
 2 files changed, 11 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index d73d7d0abe6e..ecae21e58ba3 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -169,6 +169,7 @@ enum bpf_func_id {
 	BPF_FUNC_fetch_u16,       /* u16 bpf_fetch_u16(void *unsafe_ptr) */
 	BPF_FUNC_fetch_u8,        /* u8 bpf_fetch_u8(void *unsafe_ptr) */
 	BPF_FUNC_probe_memcmp,    /* int bpf_probe_memcmp(unsafe_ptr, safe_ptr, size) */
+	BPF_FUNC_ktime_get_ns,    /* u64 bpf_ktime_get_ns(void) */
 	__BPF_FUNC_MAX_ID,
 };
 
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index ec065e0a364e..e3196266b72f 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -69,6 +69,11 @@ static u64 bpf_probe_memcmp(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 	return -1;
 }
 
+static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	return ktime_get_ns();
+}
+
 static struct bpf_func_proto tp_prog_funcs[] = {
 #define FETCH(SIZE)				\
 	[BPF_FUNC_fetch_##SIZE] = {		\
@@ -90,6 +95,11 @@ static struct bpf_func_proto tp_prog_funcs[] = {
 		.arg2_type = ARG_PTR_TO_STACK,
 		.arg3_type = ARG_CONST_STACK_SIZE,
 	},
+	[BPF_FUNC_ktime_get_ns] = {
+		.func = bpf_ktime_get_ns,
+		.gpl_only = true,
+		.ret_type = RET_INTEGER,
+	},
 };
 
 static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v3 linux-trace 3/8] samples: bpf: simple tracing example in eBPF assembler
From: Alexei Starovoitov @ 2015-02-10  3:45 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Ingo Molnar, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, linux-api, netdev, linux-kernel
In-Reply-To: <1423539961-21792-1-git-send-email-ast@plumgrid.com>

simple packet drop monitor:
- in-kernel eBPF program attaches to skb:kfree_skb event and records number
  of packet drops at given location
- userspace iterates over the map every second and prints stats

Usage:
$ sudo dropmon
location 0xffffffff81695995 count 1
location 0xffffffff816d0da9 count 2

location 0xffffffff81695995 count 2
location 0xffffffff816d0da9 count 2

location 0xffffffff81695995 count 3
location 0xffffffff816d0da9 count 2

$ addr2line -ape ./bld_x64/vmlinux 0xffffffff81695995 0xffffffff816d0da9
0xffffffff81695995: ./bld_x64/../net/ipv4/icmp.c:1038
0xffffffff816d0da9: ./bld_x64/../net/unix/af_unix.c:1231

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
 samples/bpf/Makefile  |    2 +
 samples/bpf/dropmon.c |  143 +++++++++++++++++++++++++++++++++++++++++++++++++
 samples/bpf/libbpf.c  |    7 +++
 samples/bpf/libbpf.h  |    4 ++
 4 files changed, 156 insertions(+)
 create mode 100644 samples/bpf/dropmon.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index b5b3600dcdf5..789691374562 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -6,7 +6,9 @@ hostprogs-y := test_verifier test_maps
 hostprogs-y += sock_example
 hostprogs-y += sockex1
 hostprogs-y += sockex2
+hostprogs-y += dropmon
 
+dropmon-objs := dropmon.o libbpf.o
 test_verifier-objs := test_verifier.o libbpf.o
 test_maps-objs := test_maps.o libbpf.o
 sock_example-objs := sock_example.o libbpf.o
diff --git a/samples/bpf/dropmon.c b/samples/bpf/dropmon.c
new file mode 100644
index 000000000000..515504f68506
--- /dev/null
+++ b/samples/bpf/dropmon.c
@@ -0,0 +1,143 @@
+/* simple packet drop monitor:
+ * - in-kernel eBPF program attaches to kfree_skb() event and records number
+ *   of packet drops at given location
+ * - userspace iterates over the map every second and prints stats
+ */
+#include <stdio.h>
+#include <unistd.h>
+#include <linux/bpf.h>
+#include <errno.h>
+#include <linux/unistd.h>
+#include <string.h>
+#include <linux/filter.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdbool.h>
+#include <linux/perf_event.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include "libbpf.h"
+
+#define TRACEPOINT "/sys/kernel/debug/tracing/events/skb/kfree_skb/id"
+
+static int dropmon(void)
+{
+	long long key, next_key, value = 0;
+	int prog_fd, map_fd, i, event_fd, efd, err;
+	char buf[32];
+
+	map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value), 1024);
+	if (map_fd < 0) {
+		printf("failed to create map '%s'\n", strerror(errno));
+		goto cleanup;
+	}
+
+	/* the following eBPF program is equivalent to C:
+	 * int filter(struct bpf_context *ctx)
+	 * {
+	 *   long loc = ctx->arg2;
+	 *   long init_val = 1;
+	 *   long *value;
+	 *
+	 *   value = bpf_map_lookup_elem(MAP_ID, &loc);
+	 *   if (value) {
+	 *      __sync_fetch_and_add(value, 1);
+	 *   } else {
+	 *      bpf_map_update_elem(MAP_ID, &loc, &init_val, BPF_ANY);
+	 *   }
+	 *   return 0;
+	 * }
+	 */
+	struct bpf_insn prog[] = {
+		BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, 8), /* r2 = *(u64 *)(r1 + 8) */
+		BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -8), /* *(u64 *)(fp - 8) = r2 */
+		BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+		BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = fp - 8 */
+		BPF_LD_MAP_FD(BPF_REG_1, map_fd),
+		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+		BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4),
+		BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */
+		BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */
+		BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+		BPF_EXIT_INSN(),
+		BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, 1), /* *(u64 *)(fp - 16) = 1 */
+		BPF_MOV64_IMM(BPF_REG_4, BPF_ANY),
+		BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
+		BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -16), /* r3 = fp - 16 */
+		BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+		BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = fp - 8 */
+		BPF_LD_MAP_FD(BPF_REG_1, map_fd),
+		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_update_elem),
+		BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+		BPF_EXIT_INSN(),
+	};
+
+	prog_fd = bpf_prog_load(BPF_PROG_TYPE_TRACEPOINT, prog,
+				sizeof(prog), "GPL");
+	if (prog_fd < 0) {
+		printf("failed to load prog '%s'\n%s",
+		       strerror(errno), bpf_log_buf);
+		return -1;
+	}
+
+
+	event_fd = open(TRACEPOINT, O_RDONLY, 0);
+	if (event_fd < 0) {
+		printf("failed to open event %s\n", TRACEPOINT);
+		return -1;
+	}
+
+	err = read(event_fd, buf, sizeof(buf));
+	if (err < 0 || err >= sizeof(buf)) {
+		printf("read from '%s' failed '%s'\n",
+		       TRACEPOINT, strerror(errno));
+		return -1;
+	}
+
+	close(event_fd);
+
+	buf[err] = 0;
+
+	struct perf_event_attr attr = {.type = PERF_TYPE_TRACEPOINT};
+	attr.config = atoi(buf);
+
+	efd = perf_event_open(&attr, -1/*pid*/, 0/*cpu*/, -1/*group_fd*/, 0);
+	if (efd < 0) {
+		printf("event %lld fd %d err %s\n",
+		       attr.config, efd, strerror(errno));
+		return -1;
+	}
+	ioctl(efd, PERF_EVENT_IOC_ENABLE, 0);
+	ioctl(efd, PERF_EVENT_IOC_SET_BPF, prog_fd);
+
+	for (i = 0; i < 10; i++) {
+		key = 0;
+		while (bpf_get_next_key(map_fd, &key, &next_key) == 0) {
+			bpf_lookup_elem(map_fd, &next_key, &value);
+			printf("location 0x%llx count %lld\n", next_key, value);
+			key = next_key;
+		}
+		if (key)
+			printf("\n");
+		sleep(1);
+	}
+
+cleanup:
+	/* maps, programs, tracepoint filters will auto cleanup on process exit */
+
+	return 0;
+}
+
+int main(void)
+{
+	FILE *f;
+
+	/* start ping in the background to get some kfree_skb events */
+	f = popen("ping -c5 localhost", "r");
+	(void) f;
+
+	dropmon();
+	return 0;
+}
diff --git a/samples/bpf/libbpf.c b/samples/bpf/libbpf.c
index 46d50b7ddf79..f4f428149a7d 100644
--- a/samples/bpf/libbpf.c
+++ b/samples/bpf/libbpf.c
@@ -121,3 +121,10 @@ int open_raw_sock(const char *name)
 
 	return sock;
 }
+
+int perf_event_open(struct perf_event_attr *attr, int pid, int cpu,
+		    int group_fd, unsigned long flags)
+{
+	return syscall(__NR_perf_event_open, attr, pid, cpu,
+		       group_fd, flags);
+}
diff --git a/samples/bpf/libbpf.h b/samples/bpf/libbpf.h
index 58c5fe1bdba1..92ff824eaed5 100644
--- a/samples/bpf/libbpf.h
+++ b/samples/bpf/libbpf.h
@@ -182,4 +182,8 @@ extern char bpf_log_buf[LOG_BUF_SIZE];
 /* create RAW socket and bind to interface 'name' */
 int open_raw_sock(const char *name);
 
+struct perf_event_attr;
+int perf_event_open(struct perf_event_attr *attr, int pid, int cpu,
+		    int group_fd, unsigned long flags);
+
 #endif
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v3 linux-trace 4/8] samples: bpf: simple tracing example in C
From: Alexei Starovoitov @ 2015-02-10  3:45 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Ingo Molnar, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, linux-api, netdev, linux-kernel
In-Reply-To: <1423539961-21792-1-git-send-email-ast@plumgrid.com>

tracex1_kern.c - C program which will be compiled into eBPF
to filter netif_receive_skb events on skb->dev->name == "lo"
The programs returns 1 to store an event into ring_buffer
and returns 0 - to discard an event.

tracex1_user.c - corresponding user space component that:
- loads bpf program via bpf() syscall
- opens net:netif_receive_skb event via perf_event_open() syscall
- attaches the program to event via ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
- mmaps event_fd
- polls event_fd, walks ring_buffer and prints events

Usage:
$ sudo tracex1
pid 29241 skb 0xffff88045e58c500 len 84 dev lo
pid 29241 skb 0xffff88045e58cd00 len 84 dev lo
pid 29241 skb 0xffff880074c35000 len 84 dev lo
pid 29241 skb 0xffff880074c35200 len 84 dev lo

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
 samples/bpf/Makefile       |    4 ++
 samples/bpf/bpf_helpers.h  |   14 +++++
 samples/bpf/bpf_load.c     |  129 +++++++++++++++++++++++++++++++++++++++++---
 samples/bpf/bpf_load.h     |   12 +++++
 samples/bpf/tracex1_kern.c |   28 ++++++++++
 samples/bpf/tracex1_user.c |   50 +++++++++++++++++
 6 files changed, 231 insertions(+), 6 deletions(-)
 create mode 100644 samples/bpf/tracex1_kern.c
 create mode 100644 samples/bpf/tracex1_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 789691374562..da28e1b6d3a6 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -7,6 +7,7 @@ hostprogs-y += sock_example
 hostprogs-y += sockex1
 hostprogs-y += sockex2
 hostprogs-y += dropmon
+hostprogs-y += tracex1
 
 dropmon-objs := dropmon.o libbpf.o
 test_verifier-objs := test_verifier.o libbpf.o
@@ -14,17 +15,20 @@ test_maps-objs := test_maps.o libbpf.o
 sock_example-objs := sock_example.o libbpf.o
 sockex1-objs := bpf_load.o libbpf.o sockex1_user.o
 sockex2-objs := bpf_load.o libbpf.o sockex2_user.o
+tracex1-objs := bpf_load.o libbpf.o tracex1_user.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
 always += sockex1_kern.o
 always += sockex2_kern.o
+always += tracex1_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 
 HOSTCFLAGS_bpf_load.o += -I$(objtree)/usr/include -Wno-unused-variable
 HOSTLOADLIBES_sockex1 += -lelf
 HOSTLOADLIBES_sockex2 += -lelf
+HOSTLOADLIBES_tracex1 += -lelf
 
 # point this to your LLVM backend with bpf support
 LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h
index ca0333146006..406e9705d99e 100644
--- a/samples/bpf/bpf_helpers.h
+++ b/samples/bpf/bpf_helpers.h
@@ -15,6 +15,20 @@ static int (*bpf_map_update_elem)(void *map, void *key, void *value,
 	(void *) BPF_FUNC_map_update_elem;
 static int (*bpf_map_delete_elem)(void *map, void *key) =
 	(void *) BPF_FUNC_map_delete_elem;
+static void *(*bpf_fetch_ptr)(void *unsafe_ptr) =
+	(void *) BPF_FUNC_fetch_ptr;
+static unsigned long long (*bpf_fetch_u64)(void *unsafe_ptr) =
+	(void *) BPF_FUNC_fetch_u64;
+static unsigned int (*bpf_fetch_u32)(void *unsafe_ptr) =
+	(void *) BPF_FUNC_fetch_u32;
+static unsigned short (*bpf_fetch_u16)(void *unsafe_ptr) =
+	(void *) BPF_FUNC_fetch_u16;
+static unsigned char (*bpf_fetch_u8)(void *unsafe_ptr) =
+	(void *) BPF_FUNC_fetch_u8;
+static int (*bpf_probe_memcmp)(void *unsafe_ptr, void *safe_ptr, int size) =
+	(void *) BPF_FUNC_probe_memcmp;
+static unsigned long long (*bpf_ktime_get_ns)(void) =
+	(void *) BPF_FUNC_ktime_get_ns;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index 1831d236382b..2aece65963e4 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -8,29 +8,47 @@
 #include <unistd.h>
 #include <string.h>
 #include <stdbool.h>
+#include <stdlib.h>
 #include <linux/bpf.h>
 #include <linux/filter.h>
+#include <linux/perf_event.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <poll.h>
 #include "libbpf.h"
 #include "bpf_helpers.h"
 #include "bpf_load.h"
 
+#define DEBUGFS "/sys/kernel/debug/tracing/"
+
 static char license[128];
 static bool processed_sec[128];
 int map_fd[MAX_MAPS];
 int prog_fd[MAX_PROGS];
+int event_fd[MAX_PROGS];
 int prog_cnt;
 
 static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 {
-	int fd;
 	bool is_socket = strncmp(event, "socket", 6) == 0;
+	enum bpf_prog_type prog_type;
+	char path[256] = DEBUGFS;
+	char buf[32];
+	int fd, efd, err, id;
+	struct perf_event_attr attr;
 
-	if (!is_socket)
-		/* tracing events tbd */
-		return -1;
+	attr.type = PERF_TYPE_TRACEPOINT;
+	attr.sample_type = PERF_SAMPLE_RAW;
+	attr.sample_period = 1;
+	attr.wakeup_events = 1;
+
+	if (is_socket)
+		prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
+	else
+		prog_type = BPF_PROG_TYPE_TRACEPOINT;
 
-	fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER,
-			   prog, size, license);
+	fd = bpf_prog_load(prog_type, prog, size, license);
 
 	if (fd < 0) {
 		printf("bpf_prog_load() err=%d\n%s", errno, bpf_log_buf);
@@ -39,6 +57,39 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 
 	prog_fd[prog_cnt++] = fd;
 
+	if (is_socket)
+		return 0;
+
+	strcat(path, event);
+	strcat(path, "/id");
+
+	efd = open(path, O_RDONLY, 0);
+	if (efd < 0) {
+		printf("failed to open event %s\n", event);
+		return -1;
+	}
+
+	err = read(efd, buf, sizeof(buf));
+	if (err < 0 || err >= sizeof(buf)) {
+		printf("read from '%s' failed '%s'\n", event, strerror(errno));
+		return -1;
+	}
+
+	close(efd);
+
+	buf[err] = 0;
+	id = atoi(buf);
+	attr.config = id;
+
+	efd = perf_event_open(&attr, -1/*pid*/, 0/*cpu*/, -1/*group_fd*/, 0);
+	if (efd < 0) {
+		printf("event %d fd %d err %s\n", id, efd, strerror(errno));
+		return -1;
+	}
+	event_fd[prog_cnt - 1] = efd;
+	ioctl(efd, PERF_EVENT_IOC_ENABLE, 0);
+	ioctl(efd, PERF_EVENT_IOC_SET_BPF, fd);
+
 	return 0;
 }
 
@@ -201,3 +252,69 @@ int load_bpf_file(char *path)
 	close(fd);
 	return 0;
 }
+
+int page_size;
+int page_cnt = 8;
+volatile struct perf_event_mmap_page *header;
+
+int perf_event_mmap(int fd)
+{
+	void *base;
+	int mmap_size;
+
+	page_size = getpagesize();
+	mmap_size = page_size * (page_cnt + 1);
+
+	base = mmap(NULL, mmap_size, PROT_READ, MAP_SHARED, fd, 0);
+	if (base == MAP_FAILED) {
+		printf("mmap err\n");
+		return -1;
+	}
+
+	header = base;
+	return 0;
+}
+
+int perf_event_poll(int fd)
+{
+	struct pollfd pfd = {.fd = fd, .events = POLLIN};
+	return poll(&pfd, 1, -1);
+}
+
+struct perf_event_sample {
+	  struct perf_event_header header;
+	  __u32 size;
+	  char data[];
+};
+
+void perf_event_read(print_fn fn)
+{
+	static __u64 old_data_head = 0;
+	__u64 data_head = header->data_head;
+	__u64 buffer_size = page_cnt * page_size;
+	void *base, *begin, *end;
+
+	if (data_head == old_data_head)
+		return;
+
+	base = ((char *)header) + page_size;
+
+	begin = base + old_data_head % buffer_size;
+	end = base + data_head % buffer_size;
+
+	while (begin < end) {
+		struct perf_event_sample *e;
+
+		e = begin;
+
+		if (e->header.type != PERF_RECORD_SAMPLE) {
+			printf("event is not a sample type %d\n", e->header.type);
+		}
+
+		begin += sizeof(*e) + e->size;
+		fn(e->data, e->size);
+	}
+	/* else when end > begin - the events had wrapped. ignored for now */
+
+	old_data_head = data_head;
+}
diff --git a/samples/bpf/bpf_load.h b/samples/bpf/bpf_load.h
index 27789a34f5e6..cf55663405da 100644
--- a/samples/bpf/bpf_load.h
+++ b/samples/bpf/bpf_load.h
@@ -6,6 +6,7 @@
 
 extern int map_fd[MAX_MAPS];
 extern int prog_fd[MAX_PROGS];
+extern int event_fd[MAX_PROGS];
 
 /* parses elf file compiled by llvm .c->.o
  * . parses 'maps' section and creates maps via BPF syscall
@@ -21,4 +22,15 @@ extern int prog_fd[MAX_PROGS];
  */
 int load_bpf_file(char *path);
 
+int perf_event_mmap(int fd);
+int perf_event_poll(int fd);
+typedef void (*print_fn)(void *data, int size);
+void perf_event_read(print_fn fn);
+struct trace_entry {
+	unsigned short          type;
+	unsigned char           flags;
+	unsigned char           preempt_count;
+	int                     pid;
+};
+
 #endif
diff --git a/samples/bpf/tracex1_kern.c b/samples/bpf/tracex1_kern.c
new file mode 100644
index 000000000000..449db961c642
--- /dev/null
+++ b/samples/bpf/tracex1_kern.c
@@ -0,0 +1,28 @@
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <uapi/linux/bpf.h>
+#include <trace/bpf_trace.h>
+#include "bpf_helpers.h"
+
+SEC("events/net/netif_receive_skb")
+int bpf_prog1(struct bpf_context *ctx)
+{
+	/*
+	 * attaches to net:netif_receive_skb
+	 * and filters events for loobpack device only
+	 */
+	char devname[] = "lo";
+	struct net_device *dev;
+	struct sk_buff *skb = 0;
+
+	skb = (struct sk_buff *) ctx->arg1;
+	dev = bpf_fetch_ptr(&skb->dev);
+	if (bpf_probe_memcmp(dev->name, devname, 2) == 0)
+		/* pass event to userspace via perf ring_buffer */
+		return 1;
+
+	/* drop event */
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/tracex1_user.c b/samples/bpf/tracex1_user.c
new file mode 100644
index 000000000000..a49de23eb30b
--- /dev/null
+++ b/samples/bpf/tracex1_user.c
@@ -0,0 +1,50 @@
+#include <stdio.h>
+#include <linux/bpf.h>
+#include <unistd.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+static char *get_str(void *entry, __u32 arrloc)
+{
+	int off = arrloc & 0xffff;
+	return entry + off;
+}
+
+static void print_netif_receive_skb(void *data, int size)
+{
+	struct ftrace_raw_netif_receive_skb {
+		struct trace_entry t;
+		void *skb;
+		__u32 len;
+		__u32 name;
+	} *e = data;
+
+	printf("pid %d skb %p len %d dev %s\n",
+	       e->t.pid, e->skb, e->len, get_str(e, e->name));
+}
+
+int main(int ac, char **argv)
+{
+	FILE *f;
+	char filename[256];
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	if (load_bpf_file(filename)) {
+		printf("%s", bpf_log_buf);
+		return 1;
+	}
+
+	if (perf_event_mmap(event_fd[0]) < 0)
+		return 1;
+
+	f = popen("taskset 1 ping -c5 localhost", "r");
+	(void) f;
+
+	for (;;) {
+		perf_event_poll(event_fd[0]);
+		perf_event_read(print_netif_receive_skb);
+	}
+
+	return 0;
+}
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v3 linux-trace 5/8] samples: bpf: counting example for kfree_skb tracepoint and write syscall
From: Alexei Starovoitov @ 2015-02-10  3:45 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Ingo Molnar, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, linux-api-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1423539961-21792-1-git-send-email-ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>

this example has two probes in one C file that attach to different tracepoints
and use two different maps.

1st probe is the similar to dropmon.c. It attaches to kfree_skb tracepoint and
count number of packet drops at different locations

2nd probe attaches to syscalls/sys_enter_write and computes a histogram of different
write sizes

Usage:
$ sudo tracex2
location 0xffffffff816959a5 count 1

location 0xffffffff816959a5 count 2

557145+0 records in
557145+0 records out
285258240 bytes (285 MB) copied, 1.02379 s, 279 MB/s
           syscall write() stats
     byte_size       : count     distribution
       1 -> 1        : 3        |                                      |
       2 -> 3        : 0        |                                      |
       4 -> 7        : 0        |                                      |
       8 -> 15       : 0        |                                      |
      16 -> 31       : 2        |                                      |
      32 -> 63       : 3        |                                      |
      64 -> 127      : 1        |                                      |
     128 -> 255      : 1        |                                      |
     256 -> 511      : 0        |                                      |
     512 -> 1023     : 1118968  |************************************* |

Ctrl-C at any time. Kernel will auto cleanup maps and programs

Signed-off-by: Alexei Starovoitov <ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>
---
 samples/bpf/Makefile       |    4 ++
 samples/bpf/tracex2_kern.c |   71 +++++++++++++++++++++++++++++++++
 samples/bpf/tracex2_user.c |   95 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 170 insertions(+)
 create mode 100644 samples/bpf/tracex2_kern.c
 create mode 100644 samples/bpf/tracex2_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index da28e1b6d3a6..416af24b01fd 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -8,6 +8,7 @@ hostprogs-y += sockex1
 hostprogs-y += sockex2
 hostprogs-y += dropmon
 hostprogs-y += tracex1
+hostprogs-y += tracex2
 
 dropmon-objs := dropmon.o libbpf.o
 test_verifier-objs := test_verifier.o libbpf.o
@@ -16,12 +17,14 @@ sock_example-objs := sock_example.o libbpf.o
 sockex1-objs := bpf_load.o libbpf.o sockex1_user.o
 sockex2-objs := bpf_load.o libbpf.o sockex2_user.o
 tracex1-objs := bpf_load.o libbpf.o tracex1_user.o
+tracex2-objs := bpf_load.o libbpf.o tracex2_user.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
 always += sockex1_kern.o
 always += sockex2_kern.o
 always += tracex1_kern.o
+always += tracex2_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 
@@ -29,6 +32,7 @@ HOSTCFLAGS_bpf_load.o += -I$(objtree)/usr/include -Wno-unused-variable
 HOSTLOADLIBES_sockex1 += -lelf
 HOSTLOADLIBES_sockex2 += -lelf
 HOSTLOADLIBES_tracex1 += -lelf
+HOSTLOADLIBES_tracex2 += -lelf
 
 # point this to your LLVM backend with bpf support
 LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
diff --git a/samples/bpf/tracex2_kern.c b/samples/bpf/tracex2_kern.c
new file mode 100644
index 000000000000..a789c456c1b4
--- /dev/null
+++ b/samples/bpf/tracex2_kern.c
@@ -0,0 +1,71 @@
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <uapi/linux/bpf.h>
+#include <trace/bpf_trace.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") my_map = {
+	.type = BPF_MAP_TYPE_HASH,
+	.key_size = sizeof(long),
+	.value_size = sizeof(long),
+	.max_entries = 1024,
+};
+
+SEC("events/skb/kfree_skb")
+int bpf_prog2(struct bpf_context *ctx)
+{
+	long loc = ctx->arg2;
+	long init_val = 1;
+	long *value;
+
+	value = bpf_map_lookup_elem(&my_map, &loc);
+	if (value)
+		*value += 1;
+	else
+		bpf_map_update_elem(&my_map, &loc, &init_val, BPF_ANY);
+	return 0;
+}
+
+static unsigned int log2(unsigned int v)
+{
+	unsigned int r;
+	unsigned int shift;
+
+	r = (v > 0xFFFF) << 4; v >>= r;
+	shift = (v > 0xFF) << 3; v >>= shift; r |= shift;
+	shift = (v > 0xF) << 2; v >>= shift; r |= shift;
+	shift = (v > 0x3) << 1; v >>= shift; r |= shift;
+	r |= (v >> 1);
+	return r;
+}
+
+static unsigned int log2l(unsigned long v)
+{
+	unsigned int hi = v >> 32;
+	if (hi)
+		return log2(hi) + 32;
+	else
+		return log2(v);
+}
+
+struct bpf_map_def SEC("maps") my_hist_map = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(u32),
+	.value_size = sizeof(long),
+	.max_entries = 64,
+};
+
+SEC("events/syscalls/sys_enter_write")
+int bpf_prog3(struct bpf_context *ctx)
+{
+	long write_size = ctx->arg3;
+	long init_val = 1;
+	long *value;
+	u32 index = log2l(write_size);
+
+	value = bpf_map_lookup_elem(&my_hist_map, &index);
+	if (value)
+		__sync_fetch_and_add(value, 1);
+	return 0;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/tracex2_user.c b/samples/bpf/tracex2_user.c
new file mode 100644
index 000000000000..91b8d0896fbb
--- /dev/null
+++ b/samples/bpf/tracex2_user.c
@@ -0,0 +1,95 @@
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <linux/bpf.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+#define MAX_INDEX	64
+#define MAX_STARS	38
+
+static void stars(char *str, long val, long max, int width)
+{
+	int i;
+
+	for (i = 0; i < (width * val / max) - 1 && i < width - 1; i++)
+		str[i] = '*';
+	if (val > max)
+		str[i - 1] = '+';
+	str[i] = '\0';
+}
+
+static void print_hist(int fd)
+{
+	int key;
+	long value;
+	long data[MAX_INDEX] = {};
+	char starstr[MAX_STARS];
+	int i;
+	int max_ind = -1;
+	long max_value = 0;
+
+	for (key = 0; key < MAX_INDEX; key++) {
+		bpf_lookup_elem(fd, &key, &value);
+		data[key] = value;
+		if (value && key > max_ind)
+			max_ind = key;
+		if (value > max_value)
+			max_value = value;
+	}
+
+	printf("           syscall write() stats\n");
+	printf("     byte_size       : count     distribution\n");
+	for (i = 1; i <= max_ind + 1; i++) {
+		stars(starstr, data[i - 1], max_value, MAX_STARS);
+		printf("%8ld -> %-8ld : %-8ld |%-*s|\n",
+		       (1l << i) >> 1, (1l << i) - 1, data[i - 1],
+		       MAX_STARS, starstr);
+	}
+}
+static void int_exit(int sig)
+{
+	print_hist(map_fd[1]);
+	exit(0);
+}
+
+int main(int ac, char **argv)
+{
+	char filename[256];
+	long key, next_key, value;
+	FILE *f;
+	int i;
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	signal(SIGINT, int_exit);
+
+	/* start 'ping' in the background to have some kfree_skb events */
+	f = popen("ping -c5 localhost", "r");
+	(void) f;
+
+	/* start 'dd' in the background to have plenty of 'write' syscalls */
+	f = popen("dd if=/dev/zero of=/dev/null count=5000000", "r");
+	(void) f;
+
+	if (load_bpf_file(filename)) {
+		printf("%s", bpf_log_buf);
+		return 1;
+	}
+
+	for (i = 0; i < 5; i++) {
+		key = 0;
+		while (bpf_get_next_key(map_fd[0], &key, &next_key) == 0) {
+			bpf_lookup_elem(map_fd[0], &next_key, &value);
+			printf("location 0x%lx count %ld\n", next_key, value);
+			key = next_key;
+		}
+		if (key)
+			printf("\n");
+		sleep(1);
+	}
+	print_hist(map_fd[1]);
+
+	return 0;
+}
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v3 linux-trace 6/8] samples: bpf: IO latency analysis (iosnoop/heatmap)
From: Alexei Starovoitov @ 2015-02-10  3:45 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Ingo Molnar, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, linux-api, netdev, linux-kernel
In-Reply-To: <1423539961-21792-1-git-send-email-ast@plumgrid.com>

eBPF C program attaches to block_rq_issue/block_rq_complete events to calculate
IO latency. Then it waits for the first 100 events to compute average latency
and uses range [0 .. ave_lat * 2] to record histogram of events in this latency
range.
User space reads this histogram map every 2 seconds and prints it as a 'heatmap'
using gray shades of text terminal. Black spaces have many events and white
spaces have very few events. Left most space is the smallest latency, right most
space is the largest latency in the range.
If kernel sees too many events that fall out of histogram range, user space
adjusts the range up, so heatmap for next 2 seconds will be more accurate.

Usage:
$ sudo ./tracex3
and do 'sudo dd if=/dev/sda of=/dev/null' in other terminal.
Observe IO latencies and how different activity (like 'make kernel') affects it.

Similar experiments can be done for network transmit latencies, syscalls, etc

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
 samples/bpf/Makefile       |    4 ++
 samples/bpf/tracex3_kern.c |   98 ++++++++++++++++++++++++++++
 samples/bpf/tracex3_user.c |  152 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 254 insertions(+)
 create mode 100644 samples/bpf/tracex3_kern.c
 create mode 100644 samples/bpf/tracex3_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 416af24b01fd..da0efd8032ab 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -9,6 +9,7 @@ hostprogs-y += sockex2
 hostprogs-y += dropmon
 hostprogs-y += tracex1
 hostprogs-y += tracex2
+hostprogs-y += tracex3
 
 dropmon-objs := dropmon.o libbpf.o
 test_verifier-objs := test_verifier.o libbpf.o
@@ -18,6 +19,7 @@ sockex1-objs := bpf_load.o libbpf.o sockex1_user.o
 sockex2-objs := bpf_load.o libbpf.o sockex2_user.o
 tracex1-objs := bpf_load.o libbpf.o tracex1_user.o
 tracex2-objs := bpf_load.o libbpf.o tracex2_user.o
+tracex3-objs := bpf_load.o libbpf.o tracex3_user.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -25,6 +27,7 @@ always += sockex1_kern.o
 always += sockex2_kern.o
 always += tracex1_kern.o
 always += tracex2_kern.o
+always += tracex3_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 
@@ -33,6 +36,7 @@ HOSTLOADLIBES_sockex1 += -lelf
 HOSTLOADLIBES_sockex2 += -lelf
 HOSTLOADLIBES_tracex1 += -lelf
 HOSTLOADLIBES_tracex2 += -lelf
+HOSTLOADLIBES_tracex3 += -lelf
 
 # point this to your LLVM backend with bpf support
 LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
diff --git a/samples/bpf/tracex3_kern.c b/samples/bpf/tracex3_kern.c
new file mode 100644
index 000000000000..961f3f373270
--- /dev/null
+++ b/samples/bpf/tracex3_kern.c
@@ -0,0 +1,98 @@
+/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <uapi/linux/bpf.h>
+#include <trace/bpf_trace.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") my_map = {
+	.type = BPF_MAP_TYPE_HASH,
+	.key_size = sizeof(long),
+	.value_size = sizeof(u64),
+	.max_entries = 4096,
+};
+
+SEC("events/block/block_rq_issue")
+int bpf_prog1(struct bpf_context *ctx)
+{
+	long rq = ctx->arg2;
+	u64 val = bpf_ktime_get_ns();
+
+	bpf_map_update_elem(&my_map, &rq, &val, BPF_ANY);
+	return 0;
+}
+
+struct globals {
+	u64 lat_ave;
+	u64 lat_sum;
+	u64 missed;
+	u64 max_lat;
+	int num_samples;
+};
+
+struct bpf_map_def SEC("maps") global_map = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(struct globals),
+	.max_entries = 1,
+};
+
+#define MAX_SLOT 32
+
+struct bpf_map_def SEC("maps") lat_map = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(u64),
+	.max_entries = MAX_SLOT,
+};
+
+SEC("events/block/block_rq_complete")
+int bpf_prog2(struct bpf_context *ctx)
+{
+	long rq = ctx->arg2;
+	void *value;
+
+	value = bpf_map_lookup_elem(&my_map, &rq);
+	if (!value)
+		return 0;
+
+	u64 cur_time = bpf_ktime_get_ns();
+	u64 delta = (cur_time - *(u64 *)value) / 1000;
+
+	bpf_map_delete_elem(&my_map, &rq);
+
+	int ind = 0;
+	struct globals *g = bpf_map_lookup_elem(&global_map, &ind);
+
+	if (!g)
+		return 0;
+	if (g->lat_ave == 0) {
+		g->num_samples++;
+		g->lat_sum += delta;
+		if (g->num_samples >= 100)
+			g->lat_ave = g->lat_sum / g->num_samples;
+	} else {
+		u64 max_lat = g->lat_ave * 2;
+
+		if (delta > max_lat) {
+			g->missed++;
+			if (delta > g->max_lat)
+				g->max_lat = delta;
+			return 0;
+		}
+
+		ind = delta * MAX_SLOT / max_lat;
+		value = bpf_map_lookup_elem(&lat_map, &ind);
+		if (!value)
+			return 0;
+		(*(u64 *)value)++;
+	}
+
+	return 0;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/tracex3_user.c b/samples/bpf/tracex3_user.c
new file mode 100644
index 000000000000..c49f41f28cba
--- /dev/null
+++ b/samples/bpf/tracex3_user.c
@@ -0,0 +1,152 @@
+/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <unistd.h>
+#include <linux/bpf.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
+
+struct globals {
+	__u64 lat_ave;
+	__u64 lat_sum;
+	__u64 missed;
+	__u64 max_lat;
+	int num_samples;
+};
+
+static void clear_stats(int fd)
+{
+	int key;
+	__u64 value = 0;
+
+	for (key = 0; key < 32; key++)
+		bpf_update_elem(fd, &key, &value, BPF_ANY);
+}
+
+const char *color[] = {
+	"\033[48;5;255m",
+	"\033[48;5;252m",
+	"\033[48;5;250m",
+	"\033[48;5;248m",
+	"\033[48;5;246m",
+	"\033[48;5;244m",
+	"\033[48;5;242m",
+	"\033[48;5;240m",
+	"\033[48;5;238m",
+	"\033[48;5;236m",
+	"\033[48;5;234m",
+	"\033[48;5;232m",
+};
+const int num_colors = ARRAY_SIZE(color);
+
+const char nocolor[] = "\033[00m";
+
+static void print_banner(__u64 max_lat)
+{
+	printf("0 usec     ...          %lld usec\n", max_lat);
+}
+
+static void print_hist(int fd)
+{
+	int key;
+	__u64 value;
+	__u64 cnt[32];
+	__u64 max_cnt = 0;
+	__u64 total_events = 0;
+	int max_bucket = 0;
+
+	for (key = 0; key < 32; key++) {
+		value = 0;
+		bpf_lookup_elem(fd, &key, &value);
+		if (value > 0)
+			max_bucket = key;
+		cnt[key] = value;
+		total_events += value;
+		if (value > max_cnt)
+			max_cnt = value;
+	}
+	clear_stats(fd);
+	for (key = 0; key < 32; key++) {
+		int c = num_colors * cnt[key] / (max_cnt + 1);
+
+		printf("%s %s", color[c], nocolor);
+	}
+	printf(" captured=%lld", total_events);
+
+	key = 0;
+	struct globals g = {};
+
+	bpf_lookup_elem(map_fd[1], &key, &g);
+
+	printf(" missed=%lld max_lat=%lld usec\n",
+	       g.missed, g.max_lat);
+
+	if (g.missed > 10 && g.missed > total_events / 10) {
+		printf("adjusting range UP...\n");
+		g.lat_ave = g.max_lat / 2;
+		print_banner(g.lat_ave * 2);
+	} else if (max_bucket < 4 && total_events > 100) {
+		printf("adjusting range DOWN...\n");
+		g.lat_ave = g.lat_ave / 4;
+		print_banner(g.lat_ave * 2);
+	}
+	/* clear some globals */
+	g.missed = 0;
+	g.max_lat = 0;
+	bpf_update_elem(map_fd[1], &key, &g, BPF_ANY);
+}
+
+static void int_exit(int sig)
+{
+	print_hist(map_fd[2]);
+	exit(0);
+}
+
+int main(int ac, char **argv)
+{
+	char filename[256];
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	if (load_bpf_file(filename)) {
+		printf("%s", bpf_log_buf);
+		return 1;
+	}
+
+	clear_stats(map_fd[2]);
+
+	signal(SIGINT, int_exit);
+
+	struct globals g;
+
+	printf("waiting for events to determine average latency...\n");
+	for (;;) {
+		int key = 0;
+
+		bpf_lookup_elem(map_fd[1], &key, &g);
+		if (g.lat_ave)
+			break;
+		sleep(1);
+	}
+
+	printf("  IO latency in usec\n"
+	       "  %s %s - many events with this latency\n"
+	       "  %s %s - few events\n",
+	       color[num_colors - 1], nocolor,
+	       color[0], nocolor);
+	print_banner(g.lat_ave * 2);
+	for (;;) {
+		print_hist(map_fd[2]);
+		sleep(2);
+	}
+
+	return 0;
+}
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v3 linux-trace 7/8] tracing: attach eBPF programs to kprobe/kretprobe
From: Alexei Starovoitov @ 2015-02-10  3:46 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Ingo Molnar, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, linux-api, netdev, linux-kernel
In-Reply-To: <1423539961-21792-1-git-send-email-ast@plumgrid.com>

introduce new type of eBPF programs BPF_PROG_TYPE_KPROBE.
Such programs are allowed to call the same helper functions
as tracing filters, but bpf_context is different:
For tracing filters bpf_context is 6 arguments of tracepoints or syscalls
For kprobe filters bpf_context == pt_regs

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
 include/linux/ftrace_event.h |    3 +++
 include/uapi/linux/bpf.h     |    1 +
 kernel/events/core.c         |    5 ++++-
 kernel/trace/bpf_trace.c     |   39 +++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace_kprobe.c  |   10 +++++++++-
 5 files changed, 56 insertions(+), 2 deletions(-)

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 479d0a4a42b3..cd6efd23bfae 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -249,6 +249,7 @@ enum {
 	TRACE_EVENT_FL_WAS_ENABLED_BIT,
 	TRACE_EVENT_FL_USE_CALL_FILTER_BIT,
 	TRACE_EVENT_FL_TRACEPOINT_BIT,
+	TRACE_EVENT_FL_KPROBE_BIT,
 };
 
 /*
@@ -262,6 +263,7 @@ enum {
  *                     it is best to clear the buffers that used it).
  *  USE_CALL_FILTER - For ftrace internal events, don't use file filter
  *  TRACEPOINT    - Event is a tracepoint
+ *  KPROBE        - Event is a kprobe
  */
 enum {
 	TRACE_EVENT_FL_FILTERED		= (1 << TRACE_EVENT_FL_FILTERED_BIT),
@@ -271,6 +273,7 @@ enum {
 	TRACE_EVENT_FL_WAS_ENABLED	= (1 << TRACE_EVENT_FL_WAS_ENABLED_BIT),
 	TRACE_EVENT_FL_USE_CALL_FILTER	= (1 << TRACE_EVENT_FL_USE_CALL_FILTER_BIT),
 	TRACE_EVENT_FL_TRACEPOINT	= (1 << TRACE_EVENT_FL_TRACEPOINT_BIT),
+	TRACE_EVENT_FL_KPROBE		= (1 << TRACE_EVENT_FL_KPROBE_BIT),
 };
 
 struct ftrace_event_call {
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index ecae21e58ba3..cf443900318d 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -119,6 +119,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_UNSPEC,
 	BPF_PROG_TYPE_SOCKET_FILTER,
 	BPF_PROG_TYPE_TRACEPOINT,
+	BPF_PROG_TYPE_KPROBE,
 };
 
 /* flags for BPF_MAP_UPDATE_ELEM command */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 674a8ca17190..94de727a8c0c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6288,7 +6288,10 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
 	if (IS_ERR(prog))
 		return PTR_ERR(prog);
 
-	if (prog->aux->prog_type != BPF_PROG_TYPE_TRACEPOINT) {
+	if (((event->tp_event->flags & TRACE_EVENT_FL_KPROBE) &&
+	     prog->aux->prog_type != BPF_PROG_TYPE_KPROBE) ||
+	    (!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE) &&
+	     prog->aux->prog_type != BPF_PROG_TYPE_TRACEPOINT)) {
 		/* valid fd, but invalid bpf program type */
 		bpf_prog_put(prog);
 		return -EINVAL;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index e3196266b72f..52dbd1e7dc28 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -153,3 +153,42 @@ static int __init register_tp_prog_ops(void)
 	return 0;
 }
 late_initcall(register_tp_prog_ops);
+
+/* check access to fields of 'struct pt_regs' from BPF program */
+static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type)
+{
+	/* check bounds */
+	if (off < 0 || off >= sizeof(struct pt_regs))
+		return false;
+
+	/* only read is allowed */
+	if (type != BPF_READ)
+		return false;
+
+	/* disallow misaligned access */
+	if (off % size != 0)
+		return false;
+
+	return true;
+}
+/* kprobe filter programs are allowed to call the same helper functions
+ * as tracing filters, but bpf_context is different:
+ * For tracing filters bpf_context is 6 arguments of tracepoints or syscalls
+ * For kprobe filters bpf_context == pt_regs
+ */
+static struct bpf_verifier_ops kprobe_prog_ops = {
+	.get_func_proto = tp_prog_func_proto,
+	.is_valid_access = kprobe_prog_is_valid_access,
+};
+
+static struct bpf_prog_type_list kprobe_tl = {
+	.ops = &kprobe_prog_ops,
+	.type = BPF_PROG_TYPE_KPROBE,
+};
+
+static int __init register_kprobe_prog_ops(void)
+{
+	bpf_register_prog_type(&kprobe_tl);
+	return 0;
+}
+late_initcall(register_kprobe_prog_ops);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 5edb518be345..d503ac43b85b 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1134,11 +1134,15 @@ static void
 kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
 {
 	struct ftrace_event_call *call = &tk->tp.call;
+	struct bpf_prog *prog = call->prog;
 	struct kprobe_trace_entry_head *entry;
 	struct hlist_head *head;
 	int size, __size, dsize;
 	int rctx;
 
+	if (prog && !trace_call_bpf(prog, regs))
+		return;
+
 	head = this_cpu_ptr(call->perf_events);
 	if (hlist_empty(head))
 		return;
@@ -1165,11 +1169,15 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
 		    struct pt_regs *regs)
 {
 	struct ftrace_event_call *call = &tk->tp.call;
+	struct bpf_prog *prog = call->prog;
 	struct kretprobe_trace_entry_head *entry;
 	struct hlist_head *head;
 	int size, __size, dsize;
 	int rctx;
 
+	if (prog && !trace_call_bpf(prog, regs))
+		return;
+
 	head = this_cpu_ptr(call->perf_events);
 	if (hlist_empty(head))
 		return;
@@ -1286,7 +1294,7 @@ static int register_kprobe_event(struct trace_kprobe *tk)
 		kfree(call->print_fmt);
 		return -ENODEV;
 	}
-	call->flags = 0;
+	call->flags = TRACE_EVENT_FL_KPROBE;
 	call->class->reg = kprobe_register;
 	call->data = tk;
 	ret = trace_add_event_call(call);
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v3 linux-trace 8/8] samples: bpf: simple kprobe example
From: Alexei Starovoitov @ 2015-02-10  3:46 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Ingo Molnar, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, linux-api, netdev, linux-kernel
In-Reply-To: <1423539961-21792-1-git-send-email-ast@plumgrid.com>

the logic of the example is similar to tracex2, but syscall 'write' statistics
is capturead from kprobe placed at sys_write function instead of through
syscall instrumentation.
Also tracex4_kern.c has a different way of doing log2 in C.
Note, unlike tracepoint and syscall programs, kprobe programs receive
'struct pt_regs' as an input. It's responsibility of the program author
or higher level dynamic tracing tool to match registers to function arguments.
Since pt_regs are architecture dependent, programs are also arch dependent,
unlike tracepoint/syscalls programs which are universal.

Usage:
$ sudo tracex4
2216443+0 records in
2216442+0 records out
1134818304 bytes (1.1 GB) copied, 2.00746 s, 565 MB/s

           kprobe sys_write() stats
     byte_size       : count     distribution
       1 -> 1        : 0        |                                      |
       2 -> 3        : 0        |                                      |
       4 -> 7        : 0        |                                      |
       8 -> 15       : 0        |                                      |
      16 -> 31       : 0        |                                      |
      32 -> 63       : 0        |                                      |
      64 -> 127      : 1        |                                      |
     128 -> 255      : 0        |                                      |
     256 -> 511      : 0        |                                      |
     512 -> 1023     : 2214734  |************************************* |

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
 samples/bpf/Makefile       |    4 +++
 samples/bpf/bpf_load.c     |    3 ++
 samples/bpf/tracex4_kern.c |   36 +++++++++++++++++++
 samples/bpf/tracex4_user.c |   83 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 126 insertions(+)
 create mode 100644 samples/bpf/tracex4_kern.c
 create mode 100644 samples/bpf/tracex4_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index da0efd8032ab..22c7a38f3f95 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -10,6 +10,7 @@ hostprogs-y += dropmon
 hostprogs-y += tracex1
 hostprogs-y += tracex2
 hostprogs-y += tracex3
+hostprogs-y += tracex4
 
 dropmon-objs := dropmon.o libbpf.o
 test_verifier-objs := test_verifier.o libbpf.o
@@ -20,6 +21,7 @@ sockex2-objs := bpf_load.o libbpf.o sockex2_user.o
 tracex1-objs := bpf_load.o libbpf.o tracex1_user.o
 tracex2-objs := bpf_load.o libbpf.o tracex2_user.o
 tracex3-objs := bpf_load.o libbpf.o tracex3_user.o
+tracex4-objs := bpf_load.o libbpf.o tracex4_user.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -28,6 +30,7 @@ always += sockex2_kern.o
 always += tracex1_kern.o
 always += tracex2_kern.o
 always += tracex3_kern.o
+always += tracex4_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 
@@ -37,6 +40,7 @@ HOSTLOADLIBES_sockex2 += -lelf
 HOSTLOADLIBES_tracex1 += -lelf
 HOSTLOADLIBES_tracex2 += -lelf
 HOSTLOADLIBES_tracex3 += -lelf
+HOSTLOADLIBES_tracex4 += -lelf
 
 # point this to your LLVM backend with bpf support
 LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index 2aece65963e4..2206b49df625 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -32,6 +32,7 @@ int prog_cnt;
 static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 {
 	bool is_socket = strncmp(event, "socket", 6) == 0;
+	bool is_kprobe = strncmp(event, "events/kprobes/", 15) == 0;
 	enum bpf_prog_type prog_type;
 	char path[256] = DEBUGFS;
 	char buf[32];
@@ -45,6 +46,8 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 
 	if (is_socket)
 		prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
+	else if (is_kprobe)
+		prog_type = BPF_PROG_TYPE_KPROBE;
 	else
 		prog_type = BPF_PROG_TYPE_TRACEPOINT;
 
diff --git a/samples/bpf/tracex4_kern.c b/samples/bpf/tracex4_kern.c
new file mode 100644
index 000000000000..9646f9e43417
--- /dev/null
+++ b/samples/bpf/tracex4_kern.c
@@ -0,0 +1,36 @@
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <uapi/linux/bpf.h>
+#include <trace/bpf_trace.h>
+#include "bpf_helpers.h"
+
+static unsigned int log2l(unsigned long long n)
+{
+#define S(k) if (n >= (1ull << k)) { i += k; n >>= k; }
+	int i = -(n == 0);
+	S(32); S(16); S(8); S(4); S(2); S(1);
+	return i;
+#undef S
+}
+
+struct bpf_map_def SEC("maps") my_hist_map = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(u32),
+	.value_size = sizeof(long),
+	.max_entries = 64,
+};
+
+SEC("events/kprobes/sys_write")
+int bpf_prog4(struct pt_regs *regs)
+{
+	long write_size = regs->dx; /* $rdx contains 3rd argument to a function */
+	long init_val = 1;
+	void *value;
+	u32 index = log2l(write_size);
+
+	value = bpf_map_lookup_elem(&my_hist_map, &index);
+	if (value)
+		__sync_fetch_and_add((long *)value, 1);
+	return 0;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/tracex4_user.c b/samples/bpf/tracex4_user.c
new file mode 100644
index 000000000000..741206127768
--- /dev/null
+++ b/samples/bpf/tracex4_user.c
@@ -0,0 +1,83 @@
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <linux/bpf.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+#define MAX_INDEX	64
+#define MAX_STARS	38
+
+static void stars(char *str, long val, long max, int width)
+{
+	int i;
+
+	for (i = 0; i < (width * val / max) - 1 && i < width - 1; i++)
+		str[i] = '*';
+	if (val > max)
+		str[i - 1] = '+';
+	str[i] = '\0';
+}
+
+static void print_hist(int fd)
+{
+	int key;
+	long value;
+	long data[MAX_INDEX] = {};
+	char starstr[MAX_STARS];
+	int i;
+	int max_ind = -1;
+	long max_value = 0;
+
+	for (key = 0; key < MAX_INDEX; key++) {
+		bpf_lookup_elem(fd, &key, &value);
+		data[key] = value;
+		if (value && key > max_ind)
+			max_ind = key;
+		if (value > max_value)
+			max_value = value;
+	}
+
+	printf("\n           kprobe sys_write() stats\n");
+	printf("     byte_size       : count     distribution\n");
+	for (i = 1; i <= max_ind + 1; i++) {
+		stars(starstr, data[i - 1], max_value, MAX_STARS);
+		printf("%8ld -> %-8ld : %-8ld |%-*s|\n",
+		       (1l << i) >> 1, (1l << i) - 1, data[i - 1],
+		       MAX_STARS, starstr);
+	}
+}
+static void int_exit(int sig)
+{
+	print_hist(map_fd[0]);
+	exit(0);
+}
+
+int main(int ac, char **argv)
+{
+	char filename[256];
+	FILE *f;
+	int i;
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	signal(SIGINT, int_exit);
+
+	i = system("echo 'p:sys_write sys_write' > /sys/kernel/debug/tracing/kprobe_events");
+	(void) i;
+
+	/* start 'dd' in the background to have plenty of 'write' syscalls */
+	f = popen("dd if=/dev/zero of=/dev/null count=5000000", "r");
+	(void) f;
+
+	if (load_bpf_file(filename)) {
+		printf("%s", bpf_log_buf);
+		return 1;
+	}
+
+	sleep(2);
+	kill(0, SIGINT); /* send Ctrl-C to self and to 'dd' */
+
+	return 0;
+}
-- 
1.7.9.5

^ permalink raw reply related

* Re: [PATCH 2/2] epoll: introduce EPOLLEXCLUSIVE and EPOLLROUNDROBIN
From: Jason Baron @ 2015-02-10  3:59 UTC (permalink / raw)
  To: Andy Lutomirski, Linux API
  Cc: Peter Zijlstra, Ingo Molnar, Al Viro, Andrew Morton, Eric Wong,
	Davide Libenzi, Michael Kerrisk-manpages,
	linux-kernel@vger.kernel.org, Linux FS Devel
In-Reply-To: <CALCETrUQZHHE_PJYoMw8g9C5AAxSdF-jU85EF317O4Krk+bV9w@mail.gmail.com>

On 02/09/2015 05:45 PM, Andy Lutomirski wrote:
> On Mon, Feb 9, 2015 at 1:32 PM, Jason Baron <jbaron@akamai.com> wrote:
>> On 02/09/2015 03:18 PM, Andy Lutomirski wrote:
>>> On 02/09/2015 12:06 PM, Jason Baron wrote:
>>>> Epoll file descriptors that are added to a shared wakeup source are always
>>>> added in a non-exclusive manner. That means that when we have multiple epoll
>>>> fds attached to a shared wakeup source they are all woken up. This can
>>>> lead to excessive cpu usage and uneven load distribution.
>>>>
>>>> This patch introduces two new 'events' flags that are intended to be used
>>>> with EPOLL_CTL_ADD operations. EPOLLEXCLUSIVE, adds the epoll fd to the event
>>>> source in an exclusive manner such that the minimum number of threads are
>>>> woken. EPOLLROUNDROBIN, which depends on EPOLLEXCLUSIVE also being set, can
>>>> also be added to the 'events' flag, such that we round robin around the set
>>>> of waiting threads.
>>>>
>>>> An implementation note is that in the epoll wakeup routine,
>>>> 'ep_poll_callback()', if EPOLLROUNDROBIN is set, we return 1, for a successful
>>>> wakeup, only when there are current waiters. The idea is to use this additional
>>>> heuristic in order minimize wakeup latencies.
>>> I don't understand what this is intended to do.
>>>
>>> If an event has EPOLLONESHOT, then this only one thread should be woken regardless, right?  If not, isn't that just a bug that should be fixed?
>>>
>> hmm...so with EPOLLONESHOT you basically get notified once about an event. If i have multiple epoll fds (say 1 per-thread) attached to a single source in EPOLLONESHOT, then all threads will potentially get woken up once per event. Then, I would have to re-arm all of them. So I don't think this addresses this particular usecase...what I am trying to avoid is this mass wakeup or thundering herd for a shared event source.
> Now I understand.  Why are you using multiple epollfds?
>
> --Andy

So the multiple epollfds is really a way to partition the set of events. Otherwise, I have all the threads contending on all the events that are being generated. So I'm not sure if that is scalable.

In the use-case I'm trying to describe, I've partitioned a large set of the events, but there may still be some event sources that we wish to share among all of the threads (or even subsets of them), so as not to overload any one in particular.

More specifically, in the case of a single listen socket, its natural to call accept() on the thread that has been woken up, but without doing round robin, you quickly get into a very unbalanced load, and in addition you waste a lot of cpu doing unnecessary wakeups. There are other approaches to solve this, specifically using SO_REUSEPORT, which creates a separate socket per-thread and gets one back to the separately partitioned events case previously described. However, SO_REUSEPORT, I believe is very specific to tcp/udp, and in addition does not have knowledge of the threads that are actively waiting as the epoll code does.

Thanks,

-Jason

^ permalink raw reply

* Re: [PATCH v3 linux-trace 4/8] samples: bpf: simple tracing example in C
From: Steven Rostedt @ 2015-02-10  4:08 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Ingo Molnar, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, linux-api, netdev, linux-kernel
In-Reply-To: <1423539961-21792-5-git-send-email-ast@plumgrid.com>

On Mon,  9 Feb 2015 19:45:57 -0800
Alexei Starovoitov <ast@plumgrid.com> wrote:
 
> +int perf_event_mmap(int fd);
> +int perf_event_poll(int fd);
> +typedef void (*print_fn)(void *data, int size);
> +void perf_event_read(print_fn fn);
> +struct trace_entry {
> +	unsigned short          type;
> +	unsigned char           flags;
> +	unsigned char           preempt_count;
> +	int                     pid;
> +};
> +

Please do not hard code any structures. This is not a stable ABI, and
it may not even match if you are running 32 bit userspace on top of a
64 bit kernel.

Please parse the format files. libtraceevent does this for you. If need
be, link to that. But if you look at the event format files you'll see
the offsets and sizes in the binary code:

	field:unsigned short common_type;	offset:0;	size:2;	signed:0;
	field:unsigned char common_flags;	offset:2;	size:1;	signed:0;
	field:unsigned char common_preempt_count;	offset:3;	size:1;	signed:0;
	field:int common_pid;	offset:4;	size:4;	signed:1;

I don't want to get stuck with pinned kernel data structures again. We
had 4 blank bytes of data for every event, because latency top hard
coded the field. Luckily, the 64 bit / 32 bit interface caused latency
top to have to use the event_parse code to work, and we were able to
remove that field after it was converted.

-- Steve

^ permalink raw reply

* Re: [PATCH v3 linux-trace 4/8] samples: bpf: simple tracing example in C
From: Steven Rostedt @ 2015-02-10  4:12 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Ingo Molnar, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, linux-api-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1423539961-21792-5-git-send-email-ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>

On Mon,  9 Feb 2015 19:45:57 -0800
Alexei Starovoitov <ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org> wrote:

> +static void print_netif_receive_skb(void *data, int size)
> +{
> +	struct ftrace_raw_netif_receive_skb {
> +		struct trace_entry t;
> +		void *skb;
> +		__u32 len;
> +		__u32 name;
> +	} *e = data;

Same here, there's no guarantee that this structure will always match.

cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/format 
name: netif_receive_skb
ID: 975
format:
	field:unsigned short common_type;	offset:0;	size:2;	signed:0;
	field:unsigned char common_flags;	offset:2;	size:1;	signed:0;
	field:unsigned char common_preempt_count;	offset:3;	size:1;	signed:0;
	field:int common_pid;	offset:4;	size:4;	signed:1;

	field:void * skbaddr;	offset:8;	size:8;	signed:0;
	field:unsigned int len;	offset:16;	size:4;	signed:0;
	field:__data_loc char[] name;	offset:20;	size:4;	signed:1;

print fmt: "dev=%s skbaddr=%p len=%u", __get_str(name), REC->skbaddr, REC->len

-- Steve

^ permalink raw reply

* Re: [PATCH v3 linux-trace 1/8] tracing: attach eBPF programs to tracepoints and syscalls
From: Steven Rostedt @ 2015-02-10  4:46 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Ingo Molnar, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, linux-api-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1423539961-21792-2-git-send-email-ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>

On Mon,  9 Feb 2015 19:45:54 -0800
Alexei Starovoitov <ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org> wrote:

> +#endif /* _LINUX_KERNEL_BPF_TRACE_H */
> diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
> index 139b5067345b..4c275ce2dcf0 100644
> --- a/include/trace/ftrace.h
> +++ b/include/trace/ftrace.h
> @@ -17,6 +17,7 @@
>   */
>  
>  #include <linux/ftrace_event.h>
> +#include <trace/bpf_trace.h>
>  
>  /*
>   * DECLARE_EVENT_CLASS can be used to add a generic function
> @@ -755,12 +756,32 @@ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call
>  #undef __perf_task
>  #define __perf_task(t)	(__task = (t))
>  
> +/* zero extend integer, pointer or aggregate type to u64 without warnings */
> +#define __CAST_TO_U64(EXPR) ({ \
> +	u64 ret = 0; \
> +	typeof(EXPR) expr = EXPR; \
> +	switch (sizeof(expr)) { \
> +	case 8: ret = *(u64 *) &expr; break; \
> +	case 4: ret = *(u32 *) &expr; break; \
> +	case 2: ret = *(u16 *) &expr; break; \
> +	case 1: ret = *(u8 *) &expr; break; \
> +	} \
> +	ret; })
> +
> +#define __BPF_CAST1(a,...) __CAST_TO_U64(a)
> +#define __BPF_CAST2(a,...) __CAST_TO_U64(a), __BPF_CAST1(__VA_ARGS__)
> +#define __BPF_CAST3(a,...) __CAST_TO_U64(a), __BPF_CAST2(__VA_ARGS__)
> +#define __BPF_CAST4(a,...) __CAST_TO_U64(a), __BPF_CAST3(__VA_ARGS__)
> +#define __BPF_CAST5(a,...) __CAST_TO_U64(a), __BPF_CAST4(__VA_ARGS__)
> +#define __BPF_CAST6(a,...) __CAST_TO_U64(a), __BPF_CAST5(__VA_ARGS__)
> +
>  #undef DECLARE_EVENT_CLASS
>  #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print)	\
>  static notrace void							\
>  perf_trace_##call(void *__data, proto)					\
>  {									\
>  	struct ftrace_event_call *event_call = __data;			\
> +	struct bpf_prog *prog = event_call->prog;			\


Looks like this is entirely perf based and does not interact with
ftrace at all. In other words, it's perf not tracing.

It makes more sense to go through tip than the tracing tree.

But I still do not want any hard coded event structures. All access to
data from the binary code must be parsed by looking at the event/format
files. Otherwise you will lock internals of the kernel as userspace
ABI, because eBPF programs will break if those internals change, and
that could severely limit progress in the future.

-- Steve

>  	struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\
>  	struct ftrace_raw_##call *entry;				\
>  	struct pt_regs __regs;						\
> @@ -771,6 +792,16 @@ perf_trace_##call(void *__data, proto)					\
>  	int __data_size;						\
>  	int rctx;							\
>  									\
> +	if (prog) {							\
> +		__maybe_unused const u64 z = 0;				\
> +		struct bpf_context __ctx = ((struct bpf_context) {	\
> +				__BPF_CAST6(args, z, z, z, z, z)	\
> +			});						\
> +									\
> +		if (!trace_call_bpf(prog, &__ctx))			\
> +			return;						\
> +	}								\
> +									\
>  	__data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
>  									\
>  	head = this_cpu_ptr(event_call->perf_events);			\

^ permalink raw reply

* Re: [PATCH 2/2] epoll: introduce EPOLLEXCLUSIVE and EPOLLROUNDROBIN
From: Eric Wong @ 2015-02-10  4:49 UTC (permalink / raw)
  To: Jason Baron
  Cc: Andy Lutomirski, Linux API, Peter Zijlstra, Ingo Molnar, Al Viro,
	Andrew Morton, Davide Libenzi, Michael Kerrisk-manpages,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	Linux FS Devel
In-Reply-To: <54D98209.2080901-JqFfY2XvxFXQT0dZR+AlfA@public.gmane.org>

Jason Baron <jbaron-JqFfY2XvxFXQT0dZR+AlfA@public.gmane.org> wrote:
> On 02/09/2015 05:45 PM, Andy Lutomirski wrote:
> > On Mon, Feb 9, 2015 at 1:32 PM, Jason Baron <jbaron-JqFfY2XvxFXQT0dZR+AlfA@public.gmane.org> wrote:
> >> On 02/09/2015 03:18 PM, Andy Lutomirski wrote:
> >>> On 02/09/2015 12:06 PM, Jason Baron wrote:
> >>>> Epoll file descriptors that are added to a shared wakeup source are always
> >>>> added in a non-exclusive manner. That means that when we have multiple epoll
> >>>> fds attached to a shared wakeup source they are all woken up. This can
> >>>> lead to excessive cpu usage and uneven load distribution.
> >>>>
> >>>> This patch introduces two new 'events' flags that are intended to be used
> >>>> with EPOLL_CTL_ADD operations. EPOLLEXCLUSIVE, adds the epoll fd to the event
> >>>> source in an exclusive manner such that the minimum number of threads are
> >>>> woken. EPOLLROUNDROBIN, which depends on EPOLLEXCLUSIVE also being set, can
> >>>> also be added to the 'events' flag, such that we round robin around the set
> >>>> of waiting threads.
> >>>>
> >>>> An implementation note is that in the epoll wakeup routine,
> >>>> 'ep_poll_callback()', if EPOLLROUNDROBIN is set, we return 1, for a successful
> >>>> wakeup, only when there are current waiters. The idea is to use this additional
> >>>> heuristic in order minimize wakeup latencies.
> >>> I don't understand what this is intended to do.
> >>>
> >>> If an event has EPOLLONESHOT, then this only one thread should be woken regardless, right?  If not, isn't that just a bug that should be fixed?
> >>>
> >> hmm...so with EPOLLONESHOT you basically get notified once about an event. If i have multiple epoll fds (say 1 per-thread) attached to a single source in EPOLLONESHOT, then all threads will potentially get woken up once per event. Then, I would have to re-arm all of them. So I don't think this addresses this particular usecase...what I am trying to avoid is this mass wakeup or thundering herd for a shared event source.
> > Now I understand.  Why are you using multiple epollfds?
> >
> > --Andy
> 
> So the multiple epollfds is really a way to partition the set of
> events. Otherwise, I have all the threads contending on all the events
> that are being generated. So I'm not sure if that is scalable.

I wonder if EPOLLONESHOT + epoll_wait with a sufficiently large
maxevents value is sufficient for you.  All events would be shared, so
they can migrate between threads(*).  Each thread takes a largish set of
events on every epoll_wait call and doesn't call epoll_wait again until
it's done with the whole set it got.

You'll hit more contention on EPOLL_CTL_MOD with shared events and a
single epoll, but I think it's a better goal to make that lock-free.

(*) Too large a maxevents will lead to head-of-line blocking, but from
what I'm inferring, you already risk that with multiple epollfds and
separate threads working on them.

Do you have a userland use case to share?

> In the use-case I'm trying to describe, I've partitioned a large set
> of the events, but there may still be some event sources that we wish
> to share among all of the threads (or even subsets of them), so as not
> to overload any one in particular.
 
> More specifically, in the case of a single listen socket, its natural
> to call accept() on the thread that has been woken up, but without
> doing round robin, you quickly get into a very unbalanced load, and in
> addition you waste a lot of cpu doing unnecessary wakeups. There are
> other approaches to solve this, specifically using SO_REUSEPORT, which
> creates a separate socket per-thread and gets one back to the
> separately partitioned events case previously described. However,
> SO_REUSEPORT, I believe is very specific to tcp/udp, and in addition
> does not have knowledge of the threads that are actively waiting as
> the epoll code does.

Did you try my suggestion of using a dedicated thread (or thread pool)
which does nothing but loop on accept() + EPOLL_CTL_ADD?

Those dedicated threads could do its own round-robin in userland to pick
a different epollfd to call EPOLL_CTL_ADD on.

^ permalink raw reply

* Re: [PATCH v3 linux-trace 1/8] tracing: attach eBPF programs to tracepoints and syscalls
From: Steven Rostedt @ 2015-02-10  5:13 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Ingo Molnar, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, linux-api-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1423539961-21792-2-git-send-email-ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>

On Mon,  9 Feb 2015 19:45:54 -0800
Alexei Starovoitov <ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org> wrote:

> +/* For tracepoint filters argN fields match one to one to arguments
> + * passed to tracepoint events
> + *
> + * For syscall entry filters argN fields match syscall arguments
> + * For syscall exit filters arg1 is a return value
> + */
> +struct bpf_context {
> +	u64 arg1;
> +	u64 arg2;
> +	u64 arg3;
> +	u64 arg4;
> +	u64 arg5;
> +	u64 arg6;
> +};
> +
> +#endif /* _LINUX_KERNEL_BPF_TRACE_H */
> diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
> index 139b5067345b..4c275ce2dcf0 100644
> --- a/include/trace/ftrace.h
> +++ b/include/trace/ftrace.h
> @@ -17,6 +17,7 @@
>   */
>  
>  #include <linux/ftrace_event.h>
> +#include <trace/bpf_trace.h>
>  
>  /*
>   * DECLARE_EVENT_CLASS can be used to add a generic function
> @@ -755,12 +756,32 @@ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call
>  #undef __perf_task
>  #define __perf_task(t)	(__task = (t))
>  
> +/* zero extend integer, pointer or aggregate type to u64 without warnings */
> +#define __CAST_TO_U64(EXPR) ({ \
> +	u64 ret = 0; \
> +	typeof(EXPR) expr = EXPR; \
> +	switch (sizeof(expr)) { \
> +	case 8: ret = *(u64 *) &expr; break; \
> +	case 4: ret = *(u32 *) &expr; break; \
> +	case 2: ret = *(u16 *) &expr; break; \
> +	case 1: ret = *(u8 *) &expr; break; \
> +	} \
> +	ret; })
> +
> +#define __BPF_CAST1(a,...) __CAST_TO_U64(a)
> +#define __BPF_CAST2(a,...) __CAST_TO_U64(a), __BPF_CAST1(__VA_ARGS__)
> +#define __BPF_CAST3(a,...) __CAST_TO_U64(a), __BPF_CAST2(__VA_ARGS__)
> +#define __BPF_CAST4(a,...) __CAST_TO_U64(a), __BPF_CAST3(__VA_ARGS__)
> +#define __BPF_CAST5(a,...) __CAST_TO_U64(a), __BPF_CAST4(__VA_ARGS__)
> +#define __BPF_CAST6(a,...) __CAST_TO_U64(a), __BPF_CAST5(__VA_ARGS__)
> +
>  #undef DECLARE_EVENT_CLASS
>  #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print)	\
>  static notrace void							\
>  perf_trace_##call(void *__data, proto)					\
>  {									\
>  	struct ftrace_event_call *event_call = __data;			\
> +	struct bpf_prog *prog = event_call->prog;			\
>  	struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\
>  	struct ftrace_raw_##call *entry;				\
>  	struct pt_regs __regs;						\
> @@ -771,6 +792,16 @@ perf_trace_##call(void *__data, proto)					\
>  	int __data_size;						\
>  	int rctx;							\
>  									\
> +	if (prog) {							\
> +		__maybe_unused const u64 z = 0;				\
> +		struct bpf_context __ctx = ((struct bpf_context) {	\
> +				__BPF_CAST6(args, z, z, z, z, z)	\

Note, there is no guarantee that args is at most 6. For example, in
drivers/net/wireless/brcm80211/brcmsmac/brcms_trace_events.h, the
trace_event brcms_txstatus has 8 args.

But I guess that's OK if you do not need those last args, right?

Also, there's no interface the lets us know what the args are. I may be
able to come up with something. That's the reason I never filtered
before tracing. Because we had no way of knowing what to filter on,
because the args were never visible.

I'm nervous about showing args of tracepoints too, because we don't want
that to become a strict ABI either.

-- Steve



> +			});						\
> +									\
> +		if (!trace_call_bpf(prog, &__ctx))			\
> +			return;						\
> +	}								\
> +									\
>  	__data_size = ftrace_get_offsets_##call(&__data_offsets, args); \

^ permalink raw reply

* Re: [PATCH v3 linux-trace 4/8] samples: bpf: simple tracing example in C
From: Steven Rostedt @ 2015-02-10  5:16 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Ingo Molnar, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, linux-api-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20150209230836.7f913c60-2kNGR76GQU9OHLTnHDQRgA@public.gmane.org>

On Mon, 9 Feb 2015 23:08:36 -0500
Steven Rostedt <rostedt-nx8X9YLhiw1AfugRpC6u6w@public.gmane.org> wrote:
 
> I don't want to get stuck with pinned kernel data structures again. We
> had 4 blank bytes of data for every event, because latency top hard
> coded the field. Luckily, the 64 bit / 32 bit interface caused latency
> top to have to use the event_parse code to work, and we were able to
> remove that field after it was converted.

I'm wondering if we should label eBPF programs as "modules". That is,
they have no guarantee of working from one kernel to the next. They
execute in the kernel, thus they are very similar to modules.

If we can get Linus to say that eBPF programs are not user space, and
that they are treated the same as modules (no internal ABI), then I
think we can be a bit more free at what we allow.

-- Steve

^ permalink raw reply

* Re: [PATCH v3 linux-trace 4/8] samples: bpf: simple tracing example in C
From: Alexei Starovoitov @ 2015-02-10  5:45 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Ingo Molnar, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, Linux API, Network Development, LKML
In-Reply-To: <20150210001608.157a9190@grimm.local.home>

On Mon, Feb 9, 2015 at 9:16 PM, Steven Rostedt <rostedt@goodmis.org> wrote:
> On Mon, 9 Feb 2015 23:08:36 -0500
> Steven Rostedt <rostedt@goodmis.org> wrote:
>
>> I don't want to get stuck with pinned kernel data structures again. We
>> had 4 blank bytes of data for every event, because latency top hard
>> coded the field. Luckily, the 64 bit / 32 bit interface caused latency
>> top to have to use the event_parse code to work, and we were able to
>> remove that field after it was converted.

I think your main point boils down to:

> But I still do not want any hard coded event structures. All access to
> data from the binary code must be parsed by looking at the event/format
> files. Otherwise you will lock internals of the kernel as userspace
> ABI, because eBPF programs will break if those internals change, and
> that could severely limit progress in the future.

and I completely agree.

the patch 4 is an example. It doesn't mean in any way
that structs defined here is an ABI.
To be compatible across kernels the user space must read
format file as you mentioned in your other reply.

> I'm wondering if we should label eBPF programs as "modules". That is,
> they have no guarantee of working from one kernel to the next. They
> execute in the kernel, thus they are very similar to modules.
>
> If we can get Linus to say that eBPF programs are not user space, and
> that they are treated the same as modules (no internal ABI), then I
> think we can be a bit more free at what we allow.

I thought we already stated that.
Here is the quote from perf_event.h:
         *      # The RAW record below is opaque data wrt the ABI
         *      #
         *      # That is, the ABI doesn't make any promises wrt to
         *      # the stability of its content, it may vary depending
         *      # on event, hardware, kernel version and phase of
         *      # the moon.
         *      #
         *      # In other words, PERF_SAMPLE_RAW contents are not an ABI.

and this example is reading PERF_SAMPLE_RAW events and
uses locally defined structs to print them for simplicity.

^ permalink raw reply

* Re: [PATCH v3 linux-trace 4/8] samples: bpf: simple tracing example in C
From: Alexei Starovoitov @ 2015-02-10  5:47 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Ingo Molnar, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, Linux API, Network Development, LKML
In-Reply-To: <CAMEtUuzon5LfG7PS9YmuW+0GzYMKehz1Ddk+6tXogZOZYdpb3g-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

On Mon, Feb 9, 2015 at 9:45 PM, Alexei Starovoitov <ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org> wrote:
> I thought we already stated that.
> Here is the quote from perf_event.h:
>          *      # The RAW record below is opaque data wrt the ABI
>          *      #
>          *      # That is, the ABI doesn't make any promises wrt to
>          *      # the stability of its content, it may vary depending
>          *      # on event, hardware, kernel version and phase of
>          *      # the moon.
>          *      #
>          *      # In other words, PERF_SAMPLE_RAW contents are not an ABI.
>
> and this example is reading PERF_SAMPLE_RAW events and
> uses locally defined structs to print them for simplicity.

to underline my point once more:
addition of bpf doesn't change at all what PERF_SAMPLE_RAW already
delivers to user space.
so no new ABIs anywhere.

^ permalink raw reply

* Re: [PATCH v3 linux-trace 1/8] tracing: attach eBPF programs to tracepoints and syscalls
From: Alexei Starovoitov @ 2015-02-10  5:51 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Ingo Molnar, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, Linux API, Network Development, LKML

On Mon, Feb 9, 2015 at 8:46 PM, Steven Rostedt <rostedt-nx8X9YLhiw1AfugRpC6u6w@public.gmane.org> wrote:
>
> Looks like this is entirely perf based and does not interact with
> ftrace at all. In other words, it's perf not tracing.
>
> It makes more sense to go through tip than the tracing tree.

well, all of earlier series were based on ftrace only,
but I was given convincing enough arguments that
perf_even_open+ioctl is a better interface :)
Ok. will rebase on tip in the next version.

^ permalink raw reply

* Re: [PATCH v3 linux-trace 1/8] tracing: attach eBPF programs to tracepoints and syscalls
From: Alexei Starovoitov @ 2015-02-10  6:10 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Ingo Molnar, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, Linux API, Network Development, LKML

On Mon, Feb 9, 2015 at 9:13 PM, Steven Rostedt <rostedt-nx8X9YLhiw1AfugRpC6u6w@public.gmane.org> wrote:
>>                                                                       \
>> +     if (prog) {                                                     \
>> +             __maybe_unused const u64 z = 0;                         \
>> +             struct bpf_context __ctx = ((struct bpf_context) {      \
>> +                             __BPF_CAST6(args, z, z, z, z, z)        \
>
> Note, there is no guarantee that args is at most 6. For example, in
> drivers/net/wireless/brcm80211/brcmsmac/brcms_trace_events.h, the
> trace_event brcms_txstatus has 8 args.
>
> But I guess that's OK if you do not need those last args, right?

yeah, some tracepoints pass a lot of things.
That's rare and in most of the cases they can be fetched
from parent structure.

> I'm nervous about showing args of tracepoints too, because we don't want
> that to become a strict ABI either.

One can argue that current TP_printk format is already an ABI,
because somebody might be parsing the text output.
so in some cases we cannot change tracepoints without
somebody complaining that his tool broke.
In other cases tracepoints are used for debugging only
and no one will notice when they change...
It was and still a grey area.
bpf doesn't change any of that.
It actually makes addition of new tracepoints easier.
In the future we might add a tracepoint and pass a single
pointer to interesting data struct to it. bpf programs will walk
data structures 'as safe modules' via bpf_fetch*() methods
without exposing it as ABI.
whereas today we pass a lot of fields to tracepoints and
make all of these fields immutable.

To me tracepoints are like gdb breakpoints.
and bpf programs like live debugger that examine things.

the next step is to be able to write bpf scripts on the fly
without leaving debugger. Something like perf probe +
editor + live execution. Truly like gdb for kernel.
while kernel is running.

^ permalink raw reply

* Re: [RFC PATCH] iio: Export userspace IIO headers
From: Daniel Baluta @ 2015-02-10  8:05 UTC (permalink / raw)
  To: Lars-Peter Clausen
  Cc: Daniel Baluta, Jonathan Cameron, Hartmut Knaack, Peter Meerwald,
	Irina Tirdea, Roberta Dobrescu, Linux Kernel Mailing List,
	linux-iio-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	linux-api-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <54D8E8BD.3090904-Qo5EllUWu/uELgA04lAiVw@public.gmane.org>

On Mon, Feb 9, 2015 at 7:05 PM, Lars-Peter Clausen <lars-Qo5EllUWu/uELgA04lAiVw@public.gmane.org> wrote:
> On 02/09/2015 05:49 PM, Daniel Baluta wrote:
>>
>> After UAPI header file split [1] all user-kernel interfaces were
>> placed under include/uapi/.
>>
>> This patch moves IIO user specific API from:
>>         * include/linux/iio/events.h => include/uapi/linux/iio/events.h
>>         * include/linux/iio/types.h => include/uapi/linux/iio/types.h
>>
>> Now there is no need for nasty tricks to compile userspace programs
>> (e.g iio_event_monitor). Just installing the kernel headers with
>> make headers_install command does the job.
>>
>> [1] http://lwn.net/Articles/507794/
>
>
> Thanks for taking care of this, this is something that should have done a
> while ago.

Hi Lars,

Thanks for the feedback!
>
> [...]
>
>> index 580ed5b..146cda1 100644

<snip>

>
>
> I think everything in this file below is not part of the ABI and should not
> be exported to userspace.

I agree about iio_event_info and IIO_VAL_* constants.

>
>> -
>> -enum iio_event_info {
>> -       IIO_EV_INFO_ENABLE,
>> -       IIO_EV_INFO_VALUE,
>> -       IIO_EV_INFO_HYSTERESIS,
>> -       IIO_EV_INFO_PERIOD,
>> -};
>> -
>> -enum iio_event_direction {
>> -       IIO_EV_DIR_EITHER,
>> -       IIO_EV_DIR_RISING,
>> -       IIO_EV_DIR_FALLING,
>> -       IIO_EV_DIR_NONE,
>> -};

iio_event_direction is exported to userspace via IIO_EVENT_CODE and
I think it belongs to this file.

>> -
>> -#define IIO_VAL_INT 1
>> -#define IIO_VAL_INT_PLUS_MICRO 2
>> -#define IIO_VAL_INT_PLUS_NANO 3
>> -#define IIO_VAL_INT_PLUS_MICRO_DB 4
>> -#define IIO_VAL_INT_MULTIPLE 5
>> -#define IIO_VAL_FRACTIONAL 10
>> -#define IIO_VAL_FRACTIONAL_LOG2 11
>> -

I will send v2 in few hours, hopefully I'll get more feedback meanwhile.

Do you thing should I remove the RFC tag?

thanks,
Daniel.

^ permalink raw reply

* Re: [Patch v4] firmware: dmi-sysfs: add SMBIOS entry point area attribute
From: Ivan Khoronzhuk @ 2015-02-10  9:51 UTC (permalink / raw)
  To: matt, Grant Likely, Ard Biesheuvel
  Cc: linux-kernel@vger.kernel.org, linux-api, linux-doc, Leif Lindholm,
	Mark Salter
In-Reply-To: <1423069563-26467-1-git-send-email-ivan.khoronzhuk@linaro.org>

Hi Matt,

On 02/04/2015 07:06 PM, Ivan Khoronzhuk wrote:
> Some utils, like dmidecode and smbios, need to access SMBIOS entry
> table area in order to get information like SMBIOS version, size, etc.
> Currently it's done via /dev/mem. But for situation when /dev/mem
> usage is disabled, the utils have to use dmi sysfs instead, which
> doesn't represent SMBIOS entry. So this patch adds SMBIOS area to
> dmi-sysfs in order to allow utils in question to work correctly with
> dmi sysfs interface.
>
> Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
> Signed-off-by: Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org>
> ---
>
> v1: https://lkml.org/lkml/2015/1/23/643
> v2: https://lkml.org/lkml/2015/1/26/345
> v3: https://lkml.org/lkml/2015/1/28/768
>
> v4..v2:
>    firmware: dmi_scan: add symbol to get SMBIOS entry area
>    	- used u8 type for smbios_header var
>    firmware: dmi-sysfs: add SMBIOS entry point area attribute
>    	- replaced -ENODATA on -EINVAL
>
> v3..v2:
>    firmware: dmi_scan: add symbol to get SMBIOS entry area
>    firmware: dmi-sysfs: add SMBIOS entry point area attribute
> 	- combined in one patch
> 	- added SMBIOS information to ABI sysfs-dmi documentaton
>
> v2..v1:
>    firmware: dmi_scan: add symbol to get SMBIOS entry area
> 	- used additional static var to hold SMBIOS raw table size
> 	- changed format of get_smbios_entry_area symbol
> 	  returned pointer on const smbios table
>
>    firmware: dmi-sysfs: add SMBIOS entry point area attribute
> 	- adopted to updated get_smbios_entry_area symbol
>    	- removed redundant array to save smbios table
>
>   Documentation/ABI/testing/sysfs-firmware-dmi | 10 +++++++
>   drivers/firmware/dmi-sysfs.c                 | 42 ++++++++++++++++++++++++++++
>   drivers/firmware/dmi_scan.c                  | 26 +++++++++++++++++
>   include/linux/dmi.h                          |  3 ++
>   4 files changed, 81 insertions(+)
>
> diff --git a/Documentation/ABI/testing/sysfs-firmware-dmi b/Documentation/ABI/testing/sysfs-firmware-dmi
> index c78f9ab..3a9ffe8 100644
> --- a/Documentation/ABI/testing/sysfs-firmware-dmi
> +++ b/Documentation/ABI/testing/sysfs-firmware-dmi
> @@ -12,6 +12,16 @@ Description:
>   		cannot ensure that the data as exported to userland is
>   		without error either.
>   
> +		The firmware provides DMI structures as a packed list of
> +		data referenced by a SMBIOS table entry point. The SMBIOS
> +		entry point contains general information, like SMBIOS
> +		version, DMI table size, etc. The structure, content and
> +		size of SMBIOS entry point is dependent on SMBIOS version.
> +		That's why SMBIOS entry point is represented in dmi sysfs
> +		like a raw attribute and is accessible via
> +		/sys/firmware/dmi/smbios_raw_header. The format of SMBIOS
> +		entry point header can be read in SMBIOS specification.
> +
>   		DMI is structured as a large table of entries, where
>   		each entry has a common header indicating the type and
>   		length of the entry, as well as a firmware-provided
> diff --git a/drivers/firmware/dmi-sysfs.c b/drivers/firmware/dmi-sysfs.c
> index e0f1cb3..9b396d7 100644
> --- a/drivers/firmware/dmi-sysfs.c
> +++ b/drivers/firmware/dmi-sysfs.c
> @@ -29,6 +29,8 @@
>   #define MAX_ENTRY_TYPE 255 /* Most of these aren't used, but we consider
>   			      the top entry type is only 8 bits */
>   
> +static const u8 *smbios_raw_header;
> +
>   struct dmi_sysfs_entry {
>   	struct dmi_header dh;
>   	struct kobject kobj;
> @@ -646,9 +648,37 @@ static void cleanup_entry_list(void)
>   	}
>   }
>   
> +static ssize_t smbios_entry_area_raw_read(struct file *filp,
> +					  struct kobject *kobj,
> +					  struct bin_attribute *bin_attr,
> +					  char *buf, loff_t pos, size_t count)
> +{
> +	ssize_t size;
> +
> +	size = bin_attr->size;
> +
> +	if (size > pos)
> +		size -= pos;
> +	else
> +		return 0;
> +
> +	if (count < size)
> +		size = count;
> +
> +	memcpy(buf, &smbios_raw_header[pos], size);
> +
> +	return size;
> +}
> +
> +static struct bin_attribute smbios_raw_area_attr = {
> +	.read = smbios_entry_area_raw_read,
> +	.attr = {.name = "smbios_raw_header", .mode = 0400},
> +};
> +
>   static int __init dmi_sysfs_init(void)
>   {
>   	int error = -ENOMEM;
> +	int size;
>   	int val;
>   
>   	/* Set up our directory */
> @@ -669,6 +699,18 @@ static int __init dmi_sysfs_init(void)
>   		goto err;
>   	}
>   
> +	smbios_raw_header = dmi_get_smbios_entry_area(&size);
> +	if (!smbios_raw_header) {
> +		pr_debug("dmi-sysfs: SMBIOS raw data is not available.\n");
> +		error = -EINVAL;
> +		goto err;
> +	}
> +
> +	/* Create the raw binary file to access the entry area */
> +	smbios_raw_area_attr.size = size;
> +	if (sysfs_create_bin_file(dmi_kobj, &smbios_raw_area_attr))
> +		goto err;
> +
>   	pr_debug("dmi-sysfs: loaded.\n");
>   
>   	return 0;
> diff --git a/drivers/firmware/dmi_scan.c b/drivers/firmware/dmi_scan.c
> index 420c8d8..99c5f6c 100644
> --- a/drivers/firmware/dmi_scan.c
> +++ b/drivers/firmware/dmi_scan.c
> @@ -113,6 +113,8 @@ static void dmi_table(u8 *buf, int len, int num,
>   	}
>   }
>   
> +static u8 smbios_header[32];
> +static int smbios_header_size;
>   static phys_addr_t dmi_base;
>   static u16 dmi_len;
>   static u16 dmi_num;
> @@ -474,6 +476,8 @@ static int __init dmi_present(const u8 *buf)
>   	if (memcmp(buf, "_SM_", 4) == 0 &&
>   	    buf[5] < 32 && dmi_checksum(buf, buf[5])) {
>   		smbios_ver = get_unaligned_be16(buf + 6);
> +		smbios_header_size = buf[5];
> +		memcpy(smbios_header, buf, smbios_header_size);
>   
>   		/* Some BIOS report weird SMBIOS version, fix that up */
>   		switch (smbios_ver) {
> @@ -505,6 +509,8 @@ static int __init dmi_present(const u8 *buf)
>   				pr_info("SMBIOS %d.%d present.\n",
>   				       dmi_ver >> 8, dmi_ver & 0xFF);
>   			} else {
> +				smbios_header_size = 15;
> +				memcpy(smbios_header, buf, smbios_header_size);
>   				dmi_ver = (buf[14] & 0xF0) << 4 |
>   					   (buf[14] & 0x0F);
>   				pr_info("Legacy DMI %d.%d present.\n",
> @@ -531,6 +537,8 @@ static int __init dmi_smbios3_present(const u8 *buf)
>   		dmi_ver &= 0xFFFFFF;
>   		dmi_len = get_unaligned_le32(buf + 12);
>   		dmi_base = get_unaligned_le64(buf + 16);
> +		smbios_header_size = buf[6];
> +		memcpy(smbios_header, buf, smbios_header_size);
>   
>   		/*
>   		 * The 64-bit SMBIOS 3.0 entry point no longer has a field
> @@ -944,3 +952,21 @@ void dmi_memdev_name(u16 handle, const char **bank, const char **device)
>   	}
>   }
>   EXPORT_SYMBOL_GPL(dmi_memdev_name);
> +
> +/**
> + * dmi_get_smbios_entry_area - copy SMBIOS entry point area to array.
> + * @size - pointer to assign actual size of SMBIOS entry point area.
> + *
> + * returns NULL if table is not available, otherwise returns pointer on
> + * SMBIOS entry point area array.
> + */
> +const u8 *dmi_get_smbios_entry_area(int *size)
> +{
> +	if (!smbios_header_size || !dmi_available)
> +		return NULL;
> +
> +	*size = smbios_header_size;
> +
> +	return smbios_header;
> +}
> +EXPORT_SYMBOL_GPL(dmi_get_smbios_entry_area);
> diff --git a/include/linux/dmi.h b/include/linux/dmi.h
> index f820f0a..8e1a28d 100644
> --- a/include/linux/dmi.h
> +++ b/include/linux/dmi.h
> @@ -109,6 +109,7 @@ extern int dmi_walk(void (*decode)(const struct dmi_header *, void *),
>   	void *private_data);
>   extern bool dmi_match(enum dmi_field f, const char *str);
>   extern void dmi_memdev_name(u16 handle, const char **bank, const char **device);
> +const u8 *dmi_get_smbios_entry_area(int *size);
>   
>   #else
>   
> @@ -140,6 +141,8 @@ static inline void dmi_memdev_name(u16 handle, const char **bank,
>   		const char **device) { }
>   static inline const struct dmi_system_id *
>   	dmi_first_match(const struct dmi_system_id *list) { return NULL; }
> +static inline const u8 *dmi_get_smbios_entry_area(int *size)
> +	{ return NULL; }
>   
>   #endif
>   

If you are Ok with this patch, could you please pickup it?

^ permalink raw reply

* Re: [RFC PATCH] iio: Export userspace IIO headers
From: Lars-Peter Clausen @ 2015-02-10  9:57 UTC (permalink / raw)
  To: Daniel Baluta
  Cc: Jonathan Cameron, Hartmut Knaack, Peter Meerwald, Irina Tirdea,
	Roberta Dobrescu, Linux Kernel Mailing List,
	linux-iio-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	linux-api-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <CAEnQRZBWRmWcSe1izK7cEikwfG_oAEty05eCQXxWr71=8jbDEw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

On 02/10/2015 09:05 AM, Daniel Baluta wrote:
> On Mon, Feb 9, 2015 at 7:05 PM, Lars-Peter Clausen <lars-Qo5EllUWu/uELgA04lAiVw@public.gmane.org> wrote:
>> On 02/09/2015 05:49 PM, Daniel Baluta wrote:
>>>
>>> After UAPI header file split [1] all user-kernel interfaces were
>>> placed under include/uapi/.
>>>
>>> This patch moves IIO user specific API from:
>>>          * include/linux/iio/events.h => include/uapi/linux/iio/events.h
>>>          * include/linux/iio/types.h => include/uapi/linux/iio/types.h
>>>
>>> Now there is no need for nasty tricks to compile userspace programs
>>> (e.g iio_event_monitor). Just installing the kernel headers with
>>> make headers_install command does the job.
>>>
>>> [1] http://lwn.net/Articles/507794/
>>
>>
>> Thanks for taking care of this, this is something that should have done a
>> while ago.
>
> Hi Lars,
>
> Thanks for the feedback!
>>
>> [...]
>>
>>> index 580ed5b..146cda1 100644
>
> <snip>
>
>>
>>
>> I think everything in this file below is not part of the ABI and should not
>> be exported to userspace.
>
> I agree about iio_event_info and IIO_VAL_* constants.
>
>>
>>> -
>>> -enum iio_event_info {
>>> -       IIO_EV_INFO_ENABLE,
>>> -       IIO_EV_INFO_VALUE,
>>> -       IIO_EV_INFO_HYSTERESIS,
>>> -       IIO_EV_INFO_PERIOD,
>>> -};
>>> -
>>> -enum iio_event_direction {
>>> -       IIO_EV_DIR_EITHER,
>>> -       IIO_EV_DIR_RISING,
>>> -       IIO_EV_DIR_FALLING,
>>> -       IIO_EV_DIR_NONE,
>>> -};
>
> iio_event_direction is exported to userspace via IIO_EVENT_CODE and
> I think it belongs to this file.

Right.

>
>>> -
>>> -#define IIO_VAL_INT 1
>>> -#define IIO_VAL_INT_PLUS_MICRO 2
>>> -#define IIO_VAL_INT_PLUS_NANO 3
>>> -#define IIO_VAL_INT_PLUS_MICRO_DB 4
>>> -#define IIO_VAL_INT_MULTIPLE 5
>>> -#define IIO_VAL_FRACTIONAL 10
>>> -#define IIO_VAL_FRACTIONAL_LOG2 11
>>> -
>
> I will send v2 in few hours, hopefully I'll get more feedback meanwhile.
>
> Do you thing should I remove the RFC tag?

Patch looks good to me.

^ permalink raw reply

* Re: [PATCH] tpm, tpm_tis: fix TPM 2.0 probing
From: Stefan Berger @ 2015-02-10 12:16 UTC (permalink / raw)
  To: Jarkko Sakkinen, Peter Hüwe
  Cc: Ashley Lai, Marcel Selhorst,
	tpmdd-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, josh-iaAMLnmF4UmaiuxdJuQwMA,
	christophe.ricard-Re5JQEeQqe8AvxtiuMwx3w,
	jason.gunthorpe-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/,
	linux-api-u79uwXL29TY76Z2rM5mHXA,
	trousers-tech-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f
In-Reply-To: <20150209083947.GC29987-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>

On 02/09/2015 03:39 AM, Jarkko Sakkinen wrote:
> On Mon, Feb 09, 2015 at 12:08:46AM +0100, Peter Hüwe wrote:
>> Am Mittwoch, 4. Februar 2015, 15:21:09 schrieb Jarkko Sakkinen:
>>> If during transmission system error was returned, the logic was to
>>> incorrectly deduce that chip is a TPM 1.x chip. This patch fixes this
>>> issue. Also, this patch changes probing so that message tag is used as the
>>> measure for TPM 2.x, which should be much more stable.
>> Is it aware that some TPMs may respond with 0x00C1 as TAG for TPM1.2 commands?
> I guess none of the TPM 1.2 command answer with the tag 0x8002?


FYI: pdf page 26 , section 6.1 explains the predictable return value for 
a TPM1.2 command seen by a TPM2

http://www.trustedcomputinggroup.org/files/static_page_files/8C68ADA8-1A4B-B294-D0FC06D3773F7DAA/TPM%20Rev%202.0%20Part%203%20-%20Commands%2001.16-code.pdf

Following this:

Sending a TPM1.2 command to a TPM2 should return a TPM1.2 header (tag = 
0xc4) and error code (TPM_BADTAG = 0x1e)

Sending a TPM 2 command to a TPM 2 will give a TPM 2 tag in the header.
Sending a TPM 2 command to a TPM 1.2 will give a TPM 1.2 tag in the 
header and an error code.


    Stefan

^ permalink raw reply

* Re: [PATCH v3 linux-trace 4/8] samples: bpf: simple tracing example in C
From: Steven Rostedt @ 2015-02-10 12:24 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Ingo Molnar, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, Linux API, Network Development, LKML,
	Linus Torvalds
In-Reply-To: <CAMEtUuzon5LfG7PS9YmuW+0GzYMKehz1Ddk+6tXogZOZYdpb3g-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

Added Linus because he's the one that would revert changes on breakage.

On Mon, 9 Feb 2015 21:45:21 -0800
Alexei Starovoitov <ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org> wrote:

> On Mon, Feb 9, 2015 at 9:16 PM, Steven Rostedt <rostedt-nx8X9YLhiw1AfugRpC6u6w@public.gmane.org> wrote:
> > On Mon, 9 Feb 2015 23:08:36 -0500
> > Steven Rostedt <rostedt-nx8X9YLhiw1AfugRpC6u6w@public.gmane.org> wrote:
> >
> >> I don't want to get stuck with pinned kernel data structures again. We
> >> had 4 blank bytes of data for every event, because latency top hard
> >> coded the field. Luckily, the 64 bit / 32 bit interface caused latency
> >> top to have to use the event_parse code to work, and we were able to
> >> remove that field after it was converted.
> 
> I think your main point boils down to:
> 
> > But I still do not want any hard coded event structures. All access to
> > data from the binary code must be parsed by looking at the event/format
> > files. Otherwise you will lock internals of the kernel as userspace
> > ABI, because eBPF programs will break if those internals change, and
> > that could severely limit progress in the future.
> 
> and I completely agree.
> 
> the patch 4 is an example. It doesn't mean in any way
> that structs defined here is an ABI.
> To be compatible across kernels the user space must read
> format file as you mentioned in your other reply.

The thing is, this is a sample. Which means it will be cut and pasted
into other programs. If the sample does not follow the way we want
users to use this, then how can we complain if they hard code it as
well?

> 
> > I'm wondering if we should label eBPF programs as "modules". That is,
> > they have no guarantee of working from one kernel to the next. They
> > execute in the kernel, thus they are very similar to modules.
> >
> > If we can get Linus to say that eBPF programs are not user space, and
> > that they are treated the same as modules (no internal ABI), then I
> > think we can be a bit more free at what we allow.
> 
> I thought we already stated that.
> Here is the quote from perf_event.h:
>          *      # The RAW record below is opaque data wrt the ABI
>          *      #
>          *      # That is, the ABI doesn't make any promises wrt to
>          *      # the stability of its content, it may vary depending
>          *      # on event, hardware, kernel version and phase of
>          *      # the moon.
>          *      #
>          *      # In other words, PERF_SAMPLE_RAW contents are not an ABI.
> 
> and this example is reading PERF_SAMPLE_RAW events and
> uses locally defined structs to print them for simplicity.

As we found out the hard way with latencytop, comments like this does
not matter. If an application does something like this, it's our fault
if it breaks later. We can't say "hey you were suppose to do it this
way". That argument breaks down even more if our own examples do not
follow the way we want others to do things.

-- Steve

^ permalink raw reply

* Re: [PATCH v3 linux-trace 4/8] samples: bpf: simple tracing example in C
From: Steven Rostedt @ 2015-02-10 12:27 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Ingo Molnar, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, Linux API, Network Development, LKML,
	Linus Torvalds
In-Reply-To: <CAMEtUuyCmO+N2SOTNXfOVnRA59TWsZczPCJBWsB+h0E45NxHPA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

On Mon, 9 Feb 2015 21:47:42 -0800
Alexei Starovoitov <ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org> wrote:

> On Mon, Feb 9, 2015 at 9:45 PM, Alexei Starovoitov <ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org> wrote:
> > I thought we already stated that.
> > Here is the quote from perf_event.h:
> >          *      # The RAW record below is opaque data wrt the ABI
> >          *      #
> >          *      # That is, the ABI doesn't make any promises wrt to
> >          *      # the stability of its content, it may vary depending
> >          *      # on event, hardware, kernel version and phase of
> >          *      # the moon.
> >          *      #
> >          *      # In other words, PERF_SAMPLE_RAW contents are not an ABI.
> >
> > and this example is reading PERF_SAMPLE_RAW events and
> > uses locally defined structs to print them for simplicity.
> 
> to underline my point once more:
> addition of bpf doesn't change at all what PERF_SAMPLE_RAW already
> delivers to user space.
> so no new ABIs anywhere.

Again, it we give an example of how to hard code the data, definitely
expect this to show up in user space. Users are going to look at this
code to learn how to use eBPF. I really want it to do it the correct
way instead of the 'easy' way. Because whatever way we have it here,
will be the way we will see it out in the wild.

-- Steve

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox