Linux userland API discussions

Linux userland API discussions
 help / color / mirror / Atom feed

* [PATCH v6 tip 2/8] tracing: attach BPF programs to kprobes
From: Alexei Starovoitov @ 2015-03-11  4:18 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Steven Rostedt, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, David S. Miller, Daniel Borkmann,
	Peter Zijlstra, linux-api, netdev, linux-kernel
In-Reply-To: <1426047534-8148-1-git-send-email-ast@plumgrid.com>

User interface:
struct perf_event_attr attr = {.type = PERF_TYPE_TRACEPOINT, .config = event_id, ...};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);

prog_fd is a file descriptor associated with BPF program previously loaded.
event_id is an ID of created kprobe

close(event_fd) - automatically detaches BPF program from it

BPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- probe_read - wraper of probe_kernel_read() used to access any kernel
  data structures

BPF programs receive 'struct pt_regs *' as an input
('struct pt_regs' is architecture dependent)

Note, kprobes are _not_ a stable kernel ABI, so bpf programs attached to
kprobes must be recompiled for every kernel version and user must supply correct
LINUX_VERSION_CODE in attr.kern_version during bpf_prog_load() call.

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
 include/linux/ftrace_event.h    |   14 +++++
 include/uapi/linux/bpf.h        |    3 +
 include/uapi/linux/perf_event.h |    1 +
 kernel/bpf/syscall.c            |    7 ++-
 kernel/events/core.c            |   59 +++++++++++++++++++
 kernel/trace/Makefile           |    1 +
 kernel/trace/bpf_trace.c        |  119 +++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace_kprobe.c     |   10 +++-
 8 files changed, 212 insertions(+), 2 deletions(-)
 create mode 100644 kernel/trace/bpf_trace.c

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index c674ee8f7fca..0aa535bc9f05 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -13,6 +13,7 @@ struct trace_array;
 struct trace_buffer;
 struct tracer;
 struct dentry;
+struct bpf_prog;
 
 struct trace_print_flags {
 	unsigned long		mask;
@@ -252,6 +253,7 @@ enum {
 	TRACE_EVENT_FL_WAS_ENABLED_BIT,
 	TRACE_EVENT_FL_USE_CALL_FILTER_BIT,
 	TRACE_EVENT_FL_TRACEPOINT_BIT,
+	TRACE_EVENT_FL_KPROBE_BIT,
 };
 
 /*
@@ -265,6 +267,7 @@ enum {
  *                     it is best to clear the buffers that used it).
  *  USE_CALL_FILTER - For ftrace internal events, don't use file filter
  *  TRACEPOINT    - Event is a tracepoint
+ *  KPROBE        - Event is a kprobe
  */
 enum {
 	TRACE_EVENT_FL_FILTERED		= (1 << TRACE_EVENT_FL_FILTERED_BIT),
@@ -274,6 +277,7 @@ enum {
 	TRACE_EVENT_FL_WAS_ENABLED	= (1 << TRACE_EVENT_FL_WAS_ENABLED_BIT),
 	TRACE_EVENT_FL_USE_CALL_FILTER	= (1 << TRACE_EVENT_FL_USE_CALL_FILTER_BIT),
 	TRACE_EVENT_FL_TRACEPOINT	= (1 << TRACE_EVENT_FL_TRACEPOINT_BIT),
+	TRACE_EVENT_FL_KPROBE		= (1 << TRACE_EVENT_FL_KPROBE_BIT),
 };
 
 struct ftrace_event_call {
@@ -303,6 +307,7 @@ struct ftrace_event_call {
 #ifdef CONFIG_PERF_EVENTS
 	int				perf_refcount;
 	struct hlist_head __percpu	*perf_events;
+	struct bpf_prog			*prog;
 
 	int	(*perf_perm)(struct ftrace_event_call *,
 			     struct perf_event *);
@@ -548,6 +553,15 @@ event_trigger_unlock_commit_regs(struct ftrace_event_file *file,
 		event_triggers_post_call(file, tt);
 }
 
+#ifdef CONFIG_BPF_SYSCALL
+unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx);
+#else
+static inline unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
+{
+	return 1;
+}
+#endif
+
 enum {
 	FILTER_OTHER = 0,
 	FILTER_STATIC_STRING,
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 45da7ec7d274..4486d36d2e9e 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -118,6 +118,7 @@ enum bpf_map_type {
 enum bpf_prog_type {
 	BPF_PROG_TYPE_UNSPEC,
 	BPF_PROG_TYPE_SOCKET_FILTER,
+	BPF_PROG_TYPE_KPROBE,
 };
 
 /* flags for BPF_MAP_UPDATE_ELEM command */
@@ -151,6 +152,7 @@ union bpf_attr {
 		__u32		log_level;	/* verbosity level of verifier */
 		__u32		log_size;	/* size of user buffer */
 		__aligned_u64	log_buf;	/* user supplied buffer */
+		__u32		kern_version;	/* checked when type=kprobe */
 	};
 } __attribute__((aligned(8)));
 
@@ -162,6 +164,7 @@ enum bpf_func_id {
 	BPF_FUNC_map_lookup_elem, /* void *map_lookup_elem(&map, &key) */
 	BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, flags) */
 	BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */
+	BPF_FUNC_probe_read,      /* int bpf_probe_read(void *dst, int size, void *src) */
 	__BPF_FUNC_MAX_ID,
 };
 
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 3c8b45de57ec..ad4dade2a502 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -382,6 +382,7 @@ struct perf_event_attr {
 #define PERF_EVENT_IOC_SET_OUTPUT	_IO ('$', 5)
 #define PERF_EVENT_IOC_SET_FILTER	_IOW('$', 6, char *)
 #define PERF_EVENT_IOC_ID		_IOR('$', 7, __u64 *)
+#define PERF_EVENT_IOC_SET_BPF		_IOW('$', 8, __u32)
 
 enum perf_event_ioc_flags {
 	PERF_IOC_FLAG_GROUP		= 1U << 0,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 536edc2be307..504c10b990ef 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -16,6 +16,7 @@
 #include <linux/file.h>
 #include <linux/license.h>
 #include <linux/filter.h>
+#include <linux/version.h>
 
 static LIST_HEAD(bpf_map_types);
 
@@ -467,7 +468,7 @@ struct bpf_prog *bpf_prog_get(u32 ufd)
 }
 
 /* last field in 'union bpf_attr' used by this command */
-#define	BPF_PROG_LOAD_LAST_FIELD log_buf
+#define	BPF_PROG_LOAD_LAST_FIELD kern_version
 
 static int bpf_prog_load(union bpf_attr *attr)
 {
@@ -492,6 +493,10 @@ static int bpf_prog_load(union bpf_attr *attr)
 	if (attr->insn_cnt >= BPF_MAXINSNS)
 		return -EINVAL;
 
+	if (type == BPF_PROG_TYPE_KPROBE &&
+	    attr->kern_version != LINUX_VERSION_CODE)
+		return -EINVAL;
+
 	/* plain bpf_prog allocation */
 	prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
 	if (!prog)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index a83581c35f79..1310406007f4 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -42,6 +42,8 @@
 #include <linux/module.h>
 #include <linux/mman.h>
 #include <linux/compat.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
 
 #include "internal.h"
 
@@ -3402,6 +3404,7 @@ errout:
 }
 
 static void perf_event_free_filter(struct perf_event *event);
+static void perf_event_free_bpf_prog(struct perf_event *event);
 
 static void free_event_rcu(struct rcu_head *head)
 {
@@ -3411,6 +3414,7 @@ static void free_event_rcu(struct rcu_head *head)
 	if (event->ns)
 		put_pid_ns(event->ns);
 	perf_event_free_filter(event);
+	perf_event_free_bpf_prog(event);
 	kfree(event);
 }
 
@@ -3923,6 +3927,7 @@ static inline int perf_fget_light(int fd, struct fd *p)
 static int perf_event_set_output(struct perf_event *event,
 				 struct perf_event *output_event);
 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
+static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
 
 static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
 {
@@ -3976,6 +3981,9 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
 	case PERF_EVENT_IOC_SET_FILTER:
 		return perf_event_set_filter(event, (void __user *)arg);
 
+	case PERF_EVENT_IOC_SET_BPF:
+		return perf_event_set_bpf_prog(event, arg);
+
 	default:
 		return -ENOTTY;
 	}
@@ -6436,6 +6444,49 @@ static void perf_event_free_filter(struct perf_event *event)
 	ftrace_profile_free_filter(event);
 }
 
+static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
+{
+	struct bpf_prog *prog;
+
+	if (event->attr.type != PERF_TYPE_TRACEPOINT)
+		return -EINVAL;
+
+	if (event->tp_event->prog)
+		return -EEXIST;
+
+	if (!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE))
+		/* bpf programs can only be attached to kprobes */
+		return -EINVAL;
+
+	prog = bpf_prog_get(prog_fd);
+	if (IS_ERR(prog))
+		return PTR_ERR(prog);
+
+	if (prog->aux->prog_type != BPF_PROG_TYPE_KPROBE) {
+		/* valid fd, but invalid bpf program type */
+		bpf_prog_put(prog);
+		return -EINVAL;
+	}
+
+	event->tp_event->prog = prog;
+
+	return 0;
+}
+
+static void perf_event_free_bpf_prog(struct perf_event *event)
+{
+	struct bpf_prog *prog;
+
+	if (!event->tp_event)
+		return;
+
+	prog = event->tp_event->prog;
+	if (prog) {
+		event->tp_event->prog = NULL;
+		bpf_prog_put(prog);
+	}
+}
+
 #else
 
 static inline void perf_tp_register(void)
@@ -6451,6 +6502,14 @@ static void perf_event_free_filter(struct perf_event *event)
 {
 }
 
+static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
+{
+	return -ENOENT;
+}
+
+static void perf_event_free_bpf_prog(struct perf_event *event)
+{
+}
 #endif /* CONFIG_EVENT_TRACING */
 
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 98f26588255e..c575a300103b 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -53,6 +53,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
 endif
 obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
 obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o
+obj-$(CONFIG_BPF_SYSCALL) += bpf_trace.o
 obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
 obj-$(CONFIG_TRACEPOINTS) += power-traces.o
 ifeq ($(CONFIG_PM),y)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
new file mode 100644
index 000000000000..450ea93ac4ab
--- /dev/null
+++ b/kernel/trace/bpf_trace.c
@@ -0,0 +1,119 @@
+/* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/uaccess.h>
+#include "trace.h"
+
+static DEFINE_PER_CPU(int, bpf_prog_active);
+
+unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
+{
+	unsigned int ret;
+	int cpu;
+
+	if (in_nmi()) /* not supported yet */
+		return 1;
+
+	preempt_disable_notrace();
+
+	cpu = raw_smp_processor_id();
+	if (unlikely(per_cpu(bpf_prog_active, cpu)++ != 0)) {
+		/* since some bpf program is already running on this cpu,
+		 * don't call into another bpf program (same or different)
+		 * and don't send kprobe event into ring-buffer,
+		 * so return zero here
+		 */
+		ret = 0;
+		goto out;
+	}
+
+	rcu_read_lock();
+	ret = BPF_PROG_RUN(prog, ctx);
+	rcu_read_unlock();
+
+ out:
+	per_cpu(bpf_prog_active, cpu)--;
+	preempt_enable_notrace();
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(trace_call_bpf);
+
+static u64 bpf_probe_read(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	void *dst = (void *) (long) r1;
+	int size = (int) r2;
+	void *unsafe_ptr = (void *) (long) r3;
+
+	return probe_kernel_read(dst, unsafe_ptr, size);
+}
+
+static struct bpf_func_proto kprobe_prog_funcs[] = {
+	[BPF_FUNC_probe_read] = {
+		.func = bpf_probe_read,
+		.gpl_only = true,
+		.ret_type = RET_INTEGER,
+		.arg1_type = ARG_PTR_TO_STACK,
+		.arg2_type = ARG_CONST_STACK_SIZE,
+		.arg3_type = ARG_ANYTHING,
+	},
+};
+
+static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
+{
+	switch (func_id) {
+	case BPF_FUNC_map_lookup_elem:
+		return &bpf_map_lookup_elem_proto;
+	case BPF_FUNC_map_update_elem:
+		return &bpf_map_update_elem_proto;
+	case BPF_FUNC_map_delete_elem:
+		return &bpf_map_delete_elem_proto;
+	default:
+		if (func_id < 0 || func_id >= ARRAY_SIZE(kprobe_prog_funcs))
+			return NULL;
+		return &kprobe_prog_funcs[func_id];
+	}
+}
+
+/* bpf+kprobe programs can access fields of 'struct pt_regs' */
+static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type)
+{
+	/* check bounds */
+	if (off < 0 || off >= sizeof(struct pt_regs))
+		return false;
+
+	/* only read is allowed */
+	if (type != BPF_READ)
+		return false;
+
+	/* disallow misaligned access */
+	if (off % size != 0)
+		return false;
+
+	return true;
+}
+
+static struct bpf_verifier_ops kprobe_prog_ops = {
+	.get_func_proto = kprobe_prog_func_proto,
+	.is_valid_access = kprobe_prog_is_valid_access,
+};
+
+static struct bpf_prog_type_list kprobe_tl = {
+	.ops = &kprobe_prog_ops,
+	.type = BPF_PROG_TYPE_KPROBE,
+};
+
+static int __init register_kprobe_prog_ops(void)
+{
+	bpf_register_prog_type(&kprobe_tl);
+	return 0;
+}
+late_initcall(register_kprobe_prog_ops);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index d73f565b4e06..dc3462507d7c 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1134,11 +1134,15 @@ static void
 kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
 {
 	struct ftrace_event_call *call = &tk->tp.call;
+	struct bpf_prog *prog = call->prog;
 	struct kprobe_trace_entry_head *entry;
 	struct hlist_head *head;
 	int size, __size, dsize;
 	int rctx;
 
+	if (prog && !trace_call_bpf(prog, regs))
+		return;
+
 	head = this_cpu_ptr(call->perf_events);
 	if (hlist_empty(head))
 		return;
@@ -1165,11 +1169,15 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
 		    struct pt_regs *regs)
 {
 	struct ftrace_event_call *call = &tk->tp.call;
+	struct bpf_prog *prog = call->prog;
 	struct kretprobe_trace_entry_head *entry;
 	struct hlist_head *head;
 	int size, __size, dsize;
 	int rctx;
 
+	if (prog && !trace_call_bpf(prog, regs))
+		return;
+
 	head = this_cpu_ptr(call->perf_events);
 	if (hlist_empty(head))
 		return;
@@ -1286,7 +1294,7 @@ static int register_kprobe_event(struct trace_kprobe *tk)
 		kfree(call->print_fmt);
 		return -ENODEV;
 	}
-	call->flags = 0;
+	call->flags = TRACE_EVENT_FL_KPROBE;
 	call->class->reg = kprobe_register;
 	call->data = tk;
 	ret = trace_add_event_call(call);
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v6 tip 3/8] tracing: allow BPF programs to call bpf_ktime_get_ns()
From: Alexei Starovoitov @ 2015-03-11  4:18 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Steven Rostedt, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, David S. Miller, Daniel Borkmann,
	Peter Zijlstra, linux-api, netdev, linux-kernel
In-Reply-To: <1426047534-8148-1-git-send-email-ast@plumgrid.com>

bpf_ktime_get_ns() is used by programs to compue time delta between events
or as a timestamp

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
 include/uapi/linux/bpf.h |    1 +
 kernel/trace/bpf_trace.c |   11 +++++++++++
 2 files changed, 12 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 4486d36d2e9e..101e509d1001 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -165,6 +165,7 @@ enum bpf_func_id {
 	BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, flags) */
 	BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */
 	BPF_FUNC_probe_read,      /* int bpf_probe_read(void *dst, int size, void *src) */
+	BPF_FUNC_ktime_get_ns,    /* u64 bpf_ktime_get_ns(void) */
 	__BPF_FUNC_MAX_ID,
 };
 
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 450ea93ac4ab..ee7c2c629e75 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -56,6 +56,12 @@ static u64 bpf_probe_read(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 	return probe_kernel_read(dst, unsafe_ptr, size);
 }
 
+static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	/* NMI safe access to clock monotonic */
+	return ktime_get_mono_fast_ns();
+}
+
 static struct bpf_func_proto kprobe_prog_funcs[] = {
 	[BPF_FUNC_probe_read] = {
 		.func = bpf_probe_read,
@@ -65,6 +71,11 @@ static struct bpf_func_proto kprobe_prog_funcs[] = {
 		.arg2_type = ARG_CONST_STACK_SIZE,
 		.arg3_type = ARG_ANYTHING,
 	},
+	[BPF_FUNC_ktime_get_ns] = {
+		.func = bpf_ktime_get_ns,
+		.gpl_only = true,
+		.ret_type = RET_INTEGER,
+	},
 };
 
 static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v6 tip 4/8] tracing: allow BPF programs to call bpf_trace_printk()
From: Alexei Starovoitov @ 2015-03-11  4:18 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Steven Rostedt, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, David S. Miller, Daniel Borkmann,
	Peter Zijlstra, linux-api, netdev, linux-kernel
In-Reply-To: <1426047534-8148-1-git-send-email-ast@plumgrid.com>

Debugging of BPF programs needs some form of printk from the program,
so let programs call limited trace_printk() with %d %u %x %p modifiers only.

Similar to kernel modules, during program load verifier checks whether program
is calling bpf_trace_printk() and if so, kernel allocates trace_printk buffers
and emits big 'this is debug only' banner.

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
 include/uapi/linux/bpf.h |    1 +
 kernel/trace/bpf_trace.c |   68 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 69 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 101e509d1001..4095f3d9a716 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -166,6 +166,7 @@ enum bpf_func_id {
 	BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */
 	BPF_FUNC_probe_read,      /* int bpf_probe_read(void *dst, int size, void *src) */
 	BPF_FUNC_ktime_get_ns,    /* u64 bpf_ktime_get_ns(void) */
+	BPF_FUNC_trace_printk,    /* int bpf_trace_printk(const char *fmt, int fmt_size, ...) */
 	__BPF_FUNC_MAX_ID,
 };
 
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index ee7c2c629e75..367235d8576b 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -62,6 +62,60 @@ static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 	return ktime_get_mono_fast_ns();
 }
 
+/* limited trace_printk()
+ * only %d %u %x %ld %lu %lx %lld %llu %llx %p conversion specifiers allowed
+ */
+static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5)
+{
+	char *fmt = (char *) (long) r1;
+	int fmt_cnt = 0;
+	bool mod_l[3] = {};
+	int i;
+
+	/* bpf_check() guarantees that fmt points to bpf program stack and
+	 * fmt_size bytes of it were initialized by bpf program
+	 */
+	if (fmt[fmt_size - 1] != 0)
+		return -EINVAL;
+
+	/* check format string for allowed specifiers */
+	for (i = 0; i < fmt_size; i++)
+		if (fmt[i] == '%') {
+			if (fmt_cnt >= 3)
+				return -EINVAL;
+			i++;
+			if (i >= fmt_size)
+				return -EINVAL;
+
+			if (fmt[i] == 'l') {
+				mod_l[fmt_cnt] = true;
+				i++;
+				if (i >= fmt_size)
+					return -EINVAL;
+			} else if (fmt[i] == 'p') {
+				mod_l[fmt_cnt] = true;
+				fmt_cnt++;
+				continue;
+			}
+
+			if (fmt[i] == 'l') {
+				mod_l[fmt_cnt] = true;
+				i++;
+				if (i >= fmt_size)
+					return -EINVAL;
+			}
+
+			if (fmt[i] != 'd' && fmt[i] != 'u' && fmt[i] != 'x')
+				return -EINVAL;
+			fmt_cnt++;
+		}
+
+	return __trace_printk(1/* fake ip will not be printed */, fmt,
+			      mod_l[0] ? r3 : (u32) r3,
+			      mod_l[1] ? r4 : (u32) r4,
+			      mod_l[2] ? r5 : (u32) r5);
+}
+
 static struct bpf_func_proto kprobe_prog_funcs[] = {
 	[BPF_FUNC_probe_read] = {
 		.func = bpf_probe_read,
@@ -76,6 +130,13 @@ static struct bpf_func_proto kprobe_prog_funcs[] = {
 		.gpl_only = true,
 		.ret_type = RET_INTEGER,
 	},
+	[BPF_FUNC_trace_printk] = {
+		.func = bpf_trace_printk,
+		.gpl_only = true,
+		.ret_type = RET_INTEGER,
+		.arg1_type = ARG_PTR_TO_STACK,
+		.arg2_type = ARG_CONST_STACK_SIZE,
+	},
 };
 
 static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
@@ -90,6 +151,13 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
 	default:
 		if (func_id < 0 || func_id >= ARRAY_SIZE(kprobe_prog_funcs))
 			return NULL;
+
+		if (func_id == BPF_FUNC_trace_printk)
+			/* this program might be calling bpf_trace_printk,
+			 * so allocate per-cpu printk buffers
+			 */
+			trace_printk_init_buffers();
+
 		return &kprobe_prog_funcs[func_id];
 	}
 }
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v6 tip 5/8] samples: bpf: simple non-portable kprobe filter example
From: Alexei Starovoitov @ 2015-03-11  4:18 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Steven Rostedt, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, David S. Miller, Daniel Borkmann,
	Peter Zijlstra, linux-api, netdev, linux-kernel
In-Reply-To: <1426047534-8148-1-git-send-email-ast@plumgrid.com>

tracex1_kern.c - C program compiled into BPF.
It attaches to kprobe:netif_receive_skb
When skb->dev->name == "lo", it prints sample debug message into trace_pipe
via bpf_trace_printk() helper function.

tracex1_user.c - corresponding user space component that:
- loads bpf program via bpf() syscall
- opens kprobes:netif_receive_skb event via perf_event_open() syscall
- attaches the program to event via ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
- prints from trace_pipe

Note, this bpf program is completely non-portable. It must be recompiled
with current kernel headers. kprobe is not a stable ABI and bpf+kprobe scripts
may stop working any time.

bpf verifier will detect that it's using bpf_trace_printk() and kernel will
print warning banner:
** trace_printk() being used. Allocating extra memory.  **
**                                                      **
** This means that this is a DEBUG kernel and it is     **
** unsafe for production use.                           **

bpf_trace_printk() should be used for debugging of bpf program only.

Usage:
$ sudo tracex1
            ping-19826 [000] d.s2 63103.382648: : skb ffff880466b1ca00 len 84
            ping-19826 [000] d.s2 63103.382684: : skb ffff880466b1d300 len 84

            ping-19826 [000] d.s2 63104.382533: : skb ffff880466b1ca00 len 84
            ping-19826 [000] d.s2 63104.382594: : skb ffff880466b1d300 len 84

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
 samples/bpf/Makefile        |    4 ++
 samples/bpf/bpf_helpers.h   |    6 +++
 samples/bpf/bpf_load.c      |  125 ++++++++++++++++++++++++++++++++++++++++---
 samples/bpf/bpf_load.h      |    3 ++
 samples/bpf/libbpf.c        |   14 ++++-
 samples/bpf/libbpf.h        |    5 +-
 samples/bpf/sock_example.c  |    2 +-
 samples/bpf/test_verifier.c |    2 +-
 samples/bpf/tracex1_kern.c  |   50 +++++++++++++++++
 samples/bpf/tracex1_user.c  |   25 +++++++++
 10 files changed, 224 insertions(+), 12 deletions(-)
 create mode 100644 samples/bpf/tracex1_kern.c
 create mode 100644 samples/bpf/tracex1_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index b5b3600dcdf5..51f6f01e5a3a 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -6,23 +6,27 @@ hostprogs-y := test_verifier test_maps
 hostprogs-y += sock_example
 hostprogs-y += sockex1
 hostprogs-y += sockex2
+hostprogs-y += tracex1
 
 test_verifier-objs := test_verifier.o libbpf.o
 test_maps-objs := test_maps.o libbpf.o
 sock_example-objs := sock_example.o libbpf.o
 sockex1-objs := bpf_load.o libbpf.o sockex1_user.o
 sockex2-objs := bpf_load.o libbpf.o sockex2_user.o
+tracex1-objs := bpf_load.o libbpf.o tracex1_user.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
 always += sockex1_kern.o
 always += sockex2_kern.o
+always += tracex1_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 
 HOSTCFLAGS_bpf_load.o += -I$(objtree)/usr/include -Wno-unused-variable
 HOSTLOADLIBES_sockex1 += -lelf
 HOSTLOADLIBES_sockex2 += -lelf
+HOSTLOADLIBES_tracex1 += -lelf
 
 # point this to your LLVM backend with bpf support
 LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h
index ca0333146006..1c872bcf5a80 100644
--- a/samples/bpf/bpf_helpers.h
+++ b/samples/bpf/bpf_helpers.h
@@ -15,6 +15,12 @@ static int (*bpf_map_update_elem)(void *map, void *key, void *value,
 	(void *) BPF_FUNC_map_update_elem;
 static int (*bpf_map_delete_elem)(void *map, void *key) =
 	(void *) BPF_FUNC_map_delete_elem;
+static int (*bpf_probe_read)(void *dst, int size, void *unsafe_ptr) =
+	(void *) BPF_FUNC_probe_read;
+static unsigned long long (*bpf_ktime_get_ns)(void) =
+	(void *) BPF_FUNC_ktime_get_ns;
+static int (*bpf_trace_printk)(const char *fmt, int fmt_size, ...) =
+	(void *) BPF_FUNC_trace_printk;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index 1831d236382b..95c106e4bcdb 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -8,29 +8,70 @@
 #include <unistd.h>
 #include <string.h>
 #include <stdbool.h>
+#include <stdlib.h>
 #include <linux/bpf.h>
 #include <linux/filter.h>
+#include <linux/perf_event.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <poll.h>
 #include "libbpf.h"
 #include "bpf_helpers.h"
 #include "bpf_load.h"
 
+#define DEBUGFS "/sys/kernel/debug/tracing/"
+
 static char license[128];
+static int kern_version;
 static bool processed_sec[128];
 int map_fd[MAX_MAPS];
 int prog_fd[MAX_PROGS];
+int event_fd[MAX_PROGS];
 int prog_cnt;
 
 static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 {
-	int fd;
 	bool is_socket = strncmp(event, "socket", 6) == 0;
-
-	if (!is_socket)
-		/* tracing events tbd */
+	bool is_kprobe = strncmp(event, "kprobe/", 7) == 0;
+	bool is_kretprobe = strncmp(event, "kretprobe/", 10) == 0;
+	enum bpf_prog_type prog_type;
+	char buf[256];
+	int fd, efd, err, id;
+	struct perf_event_attr attr = {};
+
+	attr.type = PERF_TYPE_TRACEPOINT;
+	attr.sample_type = PERF_SAMPLE_RAW;
+	attr.sample_period = 1;
+	attr.wakeup_events = 1;
+
+	if (is_socket) {
+		prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
+	} else if (is_kprobe || is_kretprobe) {
+		prog_type = BPF_PROG_TYPE_KPROBE;
+	} else {
+		printf("Unknown event '%s'\n", event);
 		return -1;
+	}
+
+	if (is_kprobe || is_kretprobe) {
+		if (is_kprobe)
+			event += 7;
+		else
+			event += 10;
+
+		snprintf(buf, sizeof(buf),
+			 "echo '%c:%s %s' >> /sys/kernel/debug/tracing/kprobe_events",
+			 is_kprobe ? 'p' : 'r', event, event);
+		err = system(buf);
+		if (err < 0) {
+			printf("failed to create kprobe '%s' error '%s'\n",
+			       event, strerror(errno));
+			return -1;
+		}
+	}
 
-	fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER,
-			   prog, size, license);
+	fd = bpf_prog_load(prog_type, prog, size, license, kern_version);
 
 	if (fd < 0) {
 		printf("bpf_prog_load() err=%d\n%s", errno, bpf_log_buf);
@@ -39,6 +80,41 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 
 	prog_fd[prog_cnt++] = fd;
 
+	if (is_socket)
+		return 0;
+
+	strcpy(buf, DEBUGFS);
+	strcat(buf, "events/kprobes/");
+	strcat(buf, event);
+	strcat(buf, "/id");
+
+	efd = open(buf, O_RDONLY, 0);
+	if (efd < 0) {
+		printf("failed to open event %s\n", event);
+		return -1;
+	}
+
+	err = read(efd, buf, sizeof(buf));
+	if (err < 0 || err >= sizeof(buf)) {
+		printf("read from '%s' failed '%s'\n", event, strerror(errno));
+		return -1;
+	}
+
+	close(efd);
+
+	buf[err] = 0;
+	id = atoi(buf);
+	attr.config = id;
+
+	efd = perf_event_open(&attr, -1/*pid*/, 0/*cpu*/, -1/*group_fd*/, 0);
+	if (efd < 0) {
+		printf("event %d fd %d err %s\n", id, efd, strerror(errno));
+		return -1;
+	}
+	event_fd[prog_cnt - 1] = efd;
+	ioctl(efd, PERF_EVENT_IOC_ENABLE, 0);
+	ioctl(efd, PERF_EVENT_IOC_SET_BPF, fd);
+
 	return 0;
 }
 
@@ -135,6 +211,9 @@ int load_bpf_file(char *path)
 	if (gelf_getehdr(elf, &ehdr) != &ehdr)
 		return 1;
 
+	/* clear all kprobes */
+	i = system("echo \"\" > /sys/kernel/debug/tracing/kprobe_events");
+
 	/* scan over all elf sections to get license and map info */
 	for (i = 1; i < ehdr.e_shnum; i++) {
 
@@ -149,6 +228,14 @@ int load_bpf_file(char *path)
 		if (strcmp(shname, "license") == 0) {
 			processed_sec[i] = true;
 			memcpy(license, data->d_buf, data->d_size);
+		} else if (strcmp(shname, "version") == 0) {
+			processed_sec[i] = true;
+			if (data->d_size != sizeof(int)) {
+				printf("invalid size of version section %zd\n",
+				       data->d_size);
+				return 1;
+			}
+			memcpy(&kern_version, data->d_buf, sizeof(int));
 		} else if (strcmp(shname, "maps") == 0) {
 			processed_sec[i] = true;
 			if (load_maps(data->d_buf, data->d_size))
@@ -178,7 +265,8 @@ int load_bpf_file(char *path)
 			if (parse_relo_and_apply(data, symbols, &shdr, insns))
 				continue;
 
-			if (memcmp(shname_prog, "events/", 7) == 0 ||
+			if (memcmp(shname_prog, "kprobe/", 7) == 0 ||
+			    memcmp(shname_prog, "kretprobe/", 10) == 0 ||
 			    memcmp(shname_prog, "socket", 6) == 0)
 				load_and_attach(shname_prog, insns, data_prog->d_size);
 		}
@@ -193,7 +281,8 @@ int load_bpf_file(char *path)
 		if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
 			continue;
 
-		if (memcmp(shname, "events/", 7) == 0 ||
+		if (memcmp(shname, "kprobe/", 7) == 0 ||
+		    memcmp(shname, "kretprobe/", 10) == 0 ||
 		    memcmp(shname, "socket", 6) == 0)
 			load_and_attach(shname, data->d_buf, data->d_size);
 	}
@@ -201,3 +290,23 @@ int load_bpf_file(char *path)
 	close(fd);
 	return 0;
 }
+
+void read_trace_pipe(void)
+{
+	int trace_fd;
+
+	trace_fd = open(DEBUGFS "trace_pipe", O_RDONLY, 0);
+	if (trace_fd < 0)
+		return;
+
+	while (1) {
+		static char buf[4096];
+		ssize_t sz;
+
+		sz = read(trace_fd, buf, sizeof(buf));
+		if (sz) {
+			buf[sz] = 0;
+			puts(buf);
+		}
+	}
+}
diff --git a/samples/bpf/bpf_load.h b/samples/bpf/bpf_load.h
index 27789a34f5e6..cbd7c2b532b9 100644
--- a/samples/bpf/bpf_load.h
+++ b/samples/bpf/bpf_load.h
@@ -6,6 +6,7 @@
 
 extern int map_fd[MAX_MAPS];
 extern int prog_fd[MAX_PROGS];
+extern int event_fd[MAX_PROGS];
 
 /* parses elf file compiled by llvm .c->.o
  * . parses 'maps' section and creates maps via BPF syscall
@@ -21,4 +22,6 @@ extern int prog_fd[MAX_PROGS];
  */
 int load_bpf_file(char *path);
 
+void read_trace_pipe(void);
+
 #endif
diff --git a/samples/bpf/libbpf.c b/samples/bpf/libbpf.c
index 46d50b7ddf79..7e1efa7e2ed7 100644
--- a/samples/bpf/libbpf.c
+++ b/samples/bpf/libbpf.c
@@ -81,7 +81,7 @@ char bpf_log_buf[LOG_BUF_SIZE];
 
 int bpf_prog_load(enum bpf_prog_type prog_type,
 		  const struct bpf_insn *insns, int prog_len,
-		  const char *license)
+		  const char *license, int kern_version)
 {
 	union bpf_attr attr = {
 		.prog_type = prog_type,
@@ -93,6 +93,11 @@ int bpf_prog_load(enum bpf_prog_type prog_type,
 		.log_level = 1,
 	};
 
+	/* assign one field outside of struct init to make sure any
+	 * padding is zero initialized
+	 */
+	attr.kern_version = kern_version;
+
 	bpf_log_buf[0] = 0;
 
 	return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
@@ -121,3 +126,10 @@ int open_raw_sock(const char *name)
 
 	return sock;
 }
+
+int perf_event_open(struct perf_event_attr *attr, int pid, int cpu,
+		    int group_fd, unsigned long flags)
+{
+	return syscall(__NR_perf_event_open, attr, pid, cpu,
+		       group_fd, flags);
+}
diff --git a/samples/bpf/libbpf.h b/samples/bpf/libbpf.h
index 58c5fe1bdba1..ac7b09672b46 100644
--- a/samples/bpf/libbpf.h
+++ b/samples/bpf/libbpf.h
@@ -13,7 +13,7 @@ int bpf_get_next_key(int fd, void *key, void *next_key);
 
 int bpf_prog_load(enum bpf_prog_type prog_type,
 		  const struct bpf_insn *insns, int insn_len,
-		  const char *license);
+		  const char *license, int kern_version);
 
 #define LOG_BUF_SIZE 65536
 extern char bpf_log_buf[LOG_BUF_SIZE];
@@ -182,4 +182,7 @@ extern char bpf_log_buf[LOG_BUF_SIZE];
 /* create RAW socket and bind to interface 'name' */
 int open_raw_sock(const char *name);
 
+struct perf_event_attr;
+int perf_event_open(struct perf_event_attr *attr, int pid, int cpu,
+		    int group_fd, unsigned long flags);
 #endif
diff --git a/samples/bpf/sock_example.c b/samples/bpf/sock_example.c
index c8ad0404416f..a0ce251c5390 100644
--- a/samples/bpf/sock_example.c
+++ b/samples/bpf/sock_example.c
@@ -56,7 +56,7 @@ static int test_sock(void)
 	};
 
 	prog_fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, prog, sizeof(prog),
-				"GPL");
+				"GPL", 0);
 	if (prog_fd < 0) {
 		printf("failed to load prog '%s'\n", strerror(errno));
 		goto cleanup;
diff --git a/samples/bpf/test_verifier.c b/samples/bpf/test_verifier.c
index b96175e90363..740ce97cda5e 100644
--- a/samples/bpf/test_verifier.c
+++ b/samples/bpf/test_verifier.c
@@ -689,7 +689,7 @@ static int test(void)
 
 		prog_fd = bpf_prog_load(BPF_PROG_TYPE_UNSPEC, prog,
 					prog_len * sizeof(struct bpf_insn),
-					"GPL");
+					"GPL", 0);
 
 		if (tests[i].result == ACCEPT) {
 			if (prog_fd < 0) {
diff --git a/samples/bpf/tracex1_kern.c b/samples/bpf/tracex1_kern.c
new file mode 100644
index 000000000000..42176fce4847
--- /dev/null
+++ b/samples/bpf/tracex1_kern.c
@@ -0,0 +1,50 @@
+/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <uapi/linux/bpf.h>
+#include <linux/version.h>
+#include "bpf_helpers.h"
+
+#define _(P) ({typeof(P) val = 0; bpf_probe_read(&val, sizeof(val), &P); val;})
+
+/* kprobe is NOT a stable ABI
+ * kernel functions can be removed, renamed or completely change semantics.
+ * Number of argumnets and their posistions can change, etc.
+ * This bpf+kprobe example can stop working any time.
+ */
+SEC("kprobe/__netif_receive_skb_core")
+int bpf_prog1(struct pt_regs *ctx)
+{
+	/* attaches to kprobe netif_receive_skb,
+	 * looks for packets on loobpack device and prints them
+	 */
+	char devname[IFNAMSIZ] = {};
+	struct net_device *dev;
+	struct sk_buff *skb;
+	int len;
+
+	/* non-portable! works for the given kernel only */
+	skb = (struct sk_buff *) ctx->di;
+
+	dev = _(skb->dev);
+
+	len = _(skb->len);
+
+	bpf_probe_read(devname, sizeof(devname), dev->name);
+
+	if (devname[0] == 'l' && devname[1] == 'o') {
+		char fmt[] = "skb %p len %d\n";
+		/* using bpf_trace_printk() for DEBUG ONLY */
+		bpf_trace_printk(fmt, sizeof(fmt), skb, len);
+	}
+
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex1_user.c b/samples/bpf/tracex1_user.c
new file mode 100644
index 000000000000..31a48183beea
--- /dev/null
+++ b/samples/bpf/tracex1_user.c
@@ -0,0 +1,25 @@
+#include <stdio.h>
+#include <linux/bpf.h>
+#include <unistd.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+int main(int ac, char **argv)
+{
+	FILE *f;
+	char filename[256];
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	if (load_bpf_file(filename)) {
+		printf("%s", bpf_log_buf);
+		return 1;
+	}
+
+	f = popen("taskset 1 ping -c5 localhost", "r");
+	(void) f;
+
+	read_trace_pipe();
+
+	return 0;
+}
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v6 tip 6/8] samples: bpf: counting example for kfree_skb and write syscall
From: Alexei Starovoitov @ 2015-03-11  4:18 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Steven Rostedt, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, David S. Miller, Daniel Borkmann,
	Peter Zijlstra, linux-api, netdev, linux-kernel
In-Reply-To: <1426047534-8148-1-git-send-email-ast@plumgrid.com>

this example has two probes in one C file that attach to different kprove events
and use two different maps.

1st probe is x64 specific equivalent of dropmon. It attaches to kfree_skb,
retrevies 'ip' address of kfree_skb() caller and counts number of packet drops
at that 'ip' address. User space prints 'location - count' map every second.

2nd probe attaches to kprobe:sys_write and computes a histogram of different
write sizes

Usage:
$ sudo tracex2
location 0xffffffff81695995 count 1
location 0xffffffff816d0da9 count 2

location 0xffffffff81695995 count 2
location 0xffffffff816d0da9 count 2

location 0xffffffff81695995 count 3
location 0xffffffff816d0da9 count 2

557145+0 records in
557145+0 records out
285258240 bytes (285 MB) copied, 1.02379 s, 279 MB/s
           syscall write() stats
     byte_size       : count     distribution
       1 -> 1        : 3        |                                      |
       2 -> 3        : 0        |                                      |
       4 -> 7        : 0        |                                      |
       8 -> 15       : 0        |                                      |
      16 -> 31       : 2        |                                      |
      32 -> 63       : 3        |                                      |
      64 -> 127      : 1        |                                      |
     128 -> 255      : 1        |                                      |
     256 -> 511      : 0        |                                      |
     512 -> 1023     : 1118968  |************************************* |

Ctrl-C at any time. Kernel will auto cleanup maps and programs

$ addr2line -ape ./bld_x64/vmlinux 0xffffffff81695995 0xffffffff816d0da9
0xffffffff81695995: ./bld_x64/../net/ipv4/icmp.c:1038
0xffffffff816d0da9: ./bld_x64/../net/unix/af_unix.c:1231

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
 samples/bpf/Makefile       |    4 ++
 samples/bpf/tracex2_kern.c |   86 +++++++++++++++++++++++++++++++++++++++
 samples/bpf/tracex2_user.c |   95 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 185 insertions(+)
 create mode 100644 samples/bpf/tracex2_kern.c
 create mode 100644 samples/bpf/tracex2_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 51f6f01e5a3a..6dd272143733 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -7,6 +7,7 @@ hostprogs-y += sock_example
 hostprogs-y += sockex1
 hostprogs-y += sockex2
 hostprogs-y += tracex1
+hostprogs-y += tracex2
 
 test_verifier-objs := test_verifier.o libbpf.o
 test_maps-objs := test_maps.o libbpf.o
@@ -14,12 +15,14 @@ sock_example-objs := sock_example.o libbpf.o
 sockex1-objs := bpf_load.o libbpf.o sockex1_user.o
 sockex2-objs := bpf_load.o libbpf.o sockex2_user.o
 tracex1-objs := bpf_load.o libbpf.o tracex1_user.o
+tracex2-objs := bpf_load.o libbpf.o tracex2_user.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
 always += sockex1_kern.o
 always += sockex2_kern.o
 always += tracex1_kern.o
+always += tracex2_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 
@@ -27,6 +30,7 @@ HOSTCFLAGS_bpf_load.o += -I$(objtree)/usr/include -Wno-unused-variable
 HOSTLOADLIBES_sockex1 += -lelf
 HOSTLOADLIBES_sockex2 += -lelf
 HOSTLOADLIBES_tracex1 += -lelf
+HOSTLOADLIBES_tracex2 += -lelf
 
 # point this to your LLVM backend with bpf support
 LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
diff --git a/samples/bpf/tracex2_kern.c b/samples/bpf/tracex2_kern.c
new file mode 100644
index 000000000000..334348335795
--- /dev/null
+++ b/samples/bpf/tracex2_kern.c
@@ -0,0 +1,86 @@
+/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/version.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") my_map = {
+	.type = BPF_MAP_TYPE_HASH,
+	.key_size = sizeof(long),
+	.value_size = sizeof(long),
+	.max_entries = 1024,
+};
+
+/* kprobe is NOT a stable ABI
+ * This bpf+kprobe example can stop working any time.
+ */
+SEC("kprobe/kfree_skb")
+int bpf_prog2(struct pt_regs *ctx)
+{
+	long loc = 0;
+	long init_val = 1;
+	long *value;
+
+	/* x64 specific: read ip of kfree_skb caller.
+	 * non-portable version of __builtin_return_address(0)
+	 */
+	bpf_probe_read(&loc, sizeof(loc), (void *)ctx->sp);
+
+	value = bpf_map_lookup_elem(&my_map, &loc);
+	if (value)
+		*value += 1;
+	else
+		bpf_map_update_elem(&my_map, &loc, &init_val, BPF_ANY);
+	return 0;
+}
+
+static unsigned int log2(unsigned int v)
+{
+	unsigned int r;
+	unsigned int shift;
+
+	r = (v > 0xFFFF) << 4; v >>= r;
+	shift = (v > 0xFF) << 3; v >>= shift; r |= shift;
+	shift = (v > 0xF) << 2; v >>= shift; r |= shift;
+	shift = (v > 0x3) << 1; v >>= shift; r |= shift;
+	r |= (v >> 1);
+	return r;
+}
+
+static unsigned int log2l(unsigned long v)
+{
+	unsigned int hi = v >> 32;
+	if (hi)
+		return log2(hi) + 32;
+	else
+		return log2(v);
+}
+
+struct bpf_map_def SEC("maps") my_hist_map = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(u32),
+	.value_size = sizeof(long),
+	.max_entries = 64,
+};
+
+SEC("kprobe/sys_write")
+int bpf_prog3(struct pt_regs *ctx)
+{
+	long write_size = ctx->dx; /* arg3 */
+	long init_val = 1;
+	long *value;
+	u32 index = log2l(write_size);
+
+	value = bpf_map_lookup_elem(&my_hist_map, &index);
+	if (value)
+		__sync_fetch_and_add(value, 1);
+	return 0;
+}
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex2_user.c b/samples/bpf/tracex2_user.c
new file mode 100644
index 000000000000..91b8d0896fbb
--- /dev/null
+++ b/samples/bpf/tracex2_user.c
@@ -0,0 +1,95 @@
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <linux/bpf.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+#define MAX_INDEX	64
+#define MAX_STARS	38
+
+static void stars(char *str, long val, long max, int width)
+{
+	int i;
+
+	for (i = 0; i < (width * val / max) - 1 && i < width - 1; i++)
+		str[i] = '*';
+	if (val > max)
+		str[i - 1] = '+';
+	str[i] = '\0';
+}
+
+static void print_hist(int fd)
+{
+	int key;
+	long value;
+	long data[MAX_INDEX] = {};
+	char starstr[MAX_STARS];
+	int i;
+	int max_ind = -1;
+	long max_value = 0;
+
+	for (key = 0; key < MAX_INDEX; key++) {
+		bpf_lookup_elem(fd, &key, &value);
+		data[key] = value;
+		if (value && key > max_ind)
+			max_ind = key;
+		if (value > max_value)
+			max_value = value;
+	}
+
+	printf("           syscall write() stats\n");
+	printf("     byte_size       : count     distribution\n");
+	for (i = 1; i <= max_ind + 1; i++) {
+		stars(starstr, data[i - 1], max_value, MAX_STARS);
+		printf("%8ld -> %-8ld : %-8ld |%-*s|\n",
+		       (1l << i) >> 1, (1l << i) - 1, data[i - 1],
+		       MAX_STARS, starstr);
+	}
+}
+static void int_exit(int sig)
+{
+	print_hist(map_fd[1]);
+	exit(0);
+}
+
+int main(int ac, char **argv)
+{
+	char filename[256];
+	long key, next_key, value;
+	FILE *f;
+	int i;
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	signal(SIGINT, int_exit);
+
+	/* start 'ping' in the background to have some kfree_skb events */
+	f = popen("ping -c5 localhost", "r");
+	(void) f;
+
+	/* start 'dd' in the background to have plenty of 'write' syscalls */
+	f = popen("dd if=/dev/zero of=/dev/null count=5000000", "r");
+	(void) f;
+
+	if (load_bpf_file(filename)) {
+		printf("%s", bpf_log_buf);
+		return 1;
+	}
+
+	for (i = 0; i < 5; i++) {
+		key = 0;
+		while (bpf_get_next_key(map_fd[0], &key, &next_key) == 0) {
+			bpf_lookup_elem(map_fd[0], &next_key, &value);
+			printf("location 0x%lx count %ld\n", next_key, value);
+			key = next_key;
+		}
+		if (key)
+			printf("\n");
+		sleep(1);
+	}
+	print_hist(map_fd[1]);
+
+	return 0;
+}
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v6 tip 7/8] samples: bpf: IO latency analysis (iosnoop/heatmap)
From: Alexei Starovoitov @ 2015-03-11  4:18 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Steven Rostedt, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, David S. Miller, Daniel Borkmann,
	Peter Zijlstra, linux-api-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1426047534-8148-1-git-send-email-ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>

BPF C program attaches to blk_mq_start_request/blk_update_request kprobe events
to calculate IO latency.
For every completed block IO event it computes the time delta in nsec
and records in a histogram map: map[log10(delta)*10]++
User space reads this histogram map every 2 seconds and prints it as a 'heatmap'
using gray shades of text terminal. Black spaces have many events and white
spaces have very few events. Left most space is the smallest latency, right most
space is the largest latency in the range.

Usage:
$ sudo ./tracex3
and do 'sudo dd if=/dev/sda of=/dev/null' in other terminal.
Observe IO latencies and how different activity (like 'make kernel') affects it.

Similar experiments can be done for network transmit latencies, syscalls, etc

'-t' flag prints the heatmap using normal ascii characters:

$ sudo ./tracex3 -t
  heatmap of IO latency
  # - many events with this latency
    - few events
|1us      |10us     |100us    |1ms      |10ms     |100ms    |1s       |10s
                         *ooo. *O.#.                                    # 221
                      .  *#     .                                       # 125
                         ..   .o#*..                                    # 55
                    .  . .  .  .#O                                      # 37
                         .#                                             # 175
                               .#*.                                     # 37
                          #                                             # 199
              .              . *#*.                                     # 55
                               *#..*                                    # 42
                          #                                             # 266
                      ...***Oo#*OO**o#* .                               # 629
                          #                                             # 271
                              . .#o* o.*o*                              # 221
                        . . o* *#O..                                    # 50

Signed-off-by: Alexei Starovoitov <ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>
---
 samples/bpf/Makefile       |    4 ++
 samples/bpf/tracex3_kern.c |   89 ++++++++++++++++++++++++++
 samples/bpf/tracex3_user.c |  150 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 243 insertions(+)
 create mode 100644 samples/bpf/tracex3_kern.c
 create mode 100644 samples/bpf/tracex3_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 6dd272143733..dcd850546d52 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -8,6 +8,7 @@ hostprogs-y += sockex1
 hostprogs-y += sockex2
 hostprogs-y += tracex1
 hostprogs-y += tracex2
+hostprogs-y += tracex3
 
 test_verifier-objs := test_verifier.o libbpf.o
 test_maps-objs := test_maps.o libbpf.o
@@ -16,6 +17,7 @@ sockex1-objs := bpf_load.o libbpf.o sockex1_user.o
 sockex2-objs := bpf_load.o libbpf.o sockex2_user.o
 tracex1-objs := bpf_load.o libbpf.o tracex1_user.o
 tracex2-objs := bpf_load.o libbpf.o tracex2_user.o
+tracex3-objs := bpf_load.o libbpf.o tracex3_user.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -23,6 +25,7 @@ always += sockex1_kern.o
 always += sockex2_kern.o
 always += tracex1_kern.o
 always += tracex2_kern.o
+always += tracex3_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 
@@ -31,6 +34,7 @@ HOSTLOADLIBES_sockex1 += -lelf
 HOSTLOADLIBES_sockex2 += -lelf
 HOSTLOADLIBES_tracex1 += -lelf
 HOSTLOADLIBES_tracex2 += -lelf
+HOSTLOADLIBES_tracex3 += -lelf
 
 # point this to your LLVM backend with bpf support
 LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
diff --git a/samples/bpf/tracex3_kern.c b/samples/bpf/tracex3_kern.c
new file mode 100644
index 000000000000..b9d66766f07a
--- /dev/null
+++ b/samples/bpf/tracex3_kern.c
@@ -0,0 +1,89 @@
+/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/version.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") my_map = {
+	.type = BPF_MAP_TYPE_HASH,
+	.key_size = sizeof(long),
+	.value_size = sizeof(u64),
+	.max_entries = 4096,
+};
+
+/* kprobe is NOT a stable ABI
+ * This bpf+kprobe example can stop working any time.
+ */
+SEC("kprobe/blk_mq_start_request")
+int bpf_prog1(struct pt_regs *ctx)
+{
+	long rq = ctx->di;
+	u64 val = bpf_ktime_get_ns();
+
+	bpf_map_update_elem(&my_map, &rq, &val, BPF_ANY);
+	return 0;
+}
+
+static unsigned int log2l(unsigned long long n)
+{
+#define S(k) if (n >= (1ull << k)) { i += k; n >>= k; }
+	int i = -(n == 0);
+	S(32); S(16); S(8); S(4); S(2); S(1);
+	return i;
+#undef S
+}
+
+#define SLOTS 100
+
+struct bpf_map_def SEC("maps") lat_map = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(u32),
+	.value_size = sizeof(u64),
+	.max_entries = SLOTS,
+};
+
+SEC("kprobe/blk_update_request")
+int bpf_prog2(struct pt_regs *ctx)
+{
+	long rq = ctx->di;
+	u64 *value, l, base;
+	u32 index;
+
+	value = bpf_map_lookup_elem(&my_map, &rq);
+	if (!value)
+		return 0;
+
+	u64 cur_time = bpf_ktime_get_ns();
+	u64 delta = cur_time - *value;
+
+	bpf_map_delete_elem(&my_map, &rq);
+
+	/* the lines below are computing index = log10(delta)*10
+	 * using integer arithmetic
+	 * index = 29 ~ 1 usec
+	 * index = 59 ~ 1 msec
+	 * index = 89 ~ 1 sec
+	 * index = 99 ~ 10sec or more
+	 * log10(x)*10 = log2(x)*10/log2(10) = log2(x)*3
+	 */
+	l = log2l(delta);
+	base = 1ll << l;
+	index = (l * 64 + (delta - base) * 64 / base) * 3 / 64;
+
+	if (index >= SLOTS)
+		index = SLOTS - 1;
+
+	value = bpf_map_lookup_elem(&lat_map, &index);
+	if (value)
+		__sync_fetch_and_add((long *)value, 1);
+
+	return 0;
+}
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex3_user.c b/samples/bpf/tracex3_user.c
new file mode 100644
index 000000000000..0aaa933ab938
--- /dev/null
+++ b/samples/bpf/tracex3_user.c
@@ -0,0 +1,150 @@
+/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <unistd.h>
+#include <stdbool.h>
+#include <string.h>
+#include <linux/bpf.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
+
+#define SLOTS 100
+
+static void clear_stats(int fd)
+{
+	__u32 key;
+	__u64 value = 0;
+
+	for (key = 0; key < SLOTS; key++)
+		bpf_update_elem(fd, &key, &value, BPF_ANY);
+}
+
+const char *color[] = {
+	"\033[48;5;255m",
+	"\033[48;5;252m",
+	"\033[48;5;250m",
+	"\033[48;5;248m",
+	"\033[48;5;246m",
+	"\033[48;5;244m",
+	"\033[48;5;242m",
+	"\033[48;5;240m",
+	"\033[48;5;238m",
+	"\033[48;5;236m",
+	"\033[48;5;234m",
+	"\033[48;5;232m",
+};
+const int num_colors = ARRAY_SIZE(color);
+
+const char nocolor[] = "\033[00m";
+
+const char *sym[] = {
+	" ",
+	" ",
+	".",
+	".",
+	"*",
+	"*",
+	"o",
+	"o",
+	"O",
+	"O",
+	"#",
+	"#",
+};
+
+bool full_range = false;
+bool text_only = false;
+
+static void print_banner(void)
+{
+	if (full_range)
+		printf("|1ns     |10ns     |100ns    |1us      |10us     |100us"
+		       "    |1ms      |10ms     |100ms    |1s       |10s\n");
+	else
+		printf("|1us      |10us     |100us    |1ms      |10ms     "
+		       "|100ms    |1s       |10s\n");
+}
+
+static void print_hist(int fd)
+{
+	__u32 key;
+	__u64 value;
+	__u64 cnt[SLOTS];
+	__u64 max_cnt = 0;
+	__u64 total_events = 0;
+
+	for (key = 0; key < SLOTS; key++) {
+		value = 0;
+		bpf_lookup_elem(fd, &key, &value);
+		cnt[key] = value;
+		total_events += value;
+		if (value > max_cnt)
+			max_cnt = value;
+	}
+	clear_stats(fd);
+	for (key = full_range ? 0 : 29; key < SLOTS; key++) {
+		int c = num_colors * cnt[key] / (max_cnt + 1);
+
+		if (text_only)
+			printf("%s", sym[c]);
+		else
+			printf("%s %s", color[c], nocolor);
+	}
+	printf(" # %lld\n", total_events);
+}
+
+int main(int ac, char **argv)
+{
+	char filename[256];
+	int i;
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	if (load_bpf_file(filename)) {
+		printf("%s", bpf_log_buf);
+		return 1;
+	}
+
+	for (i = 1; i < ac; i++) {
+		if (strcmp(argv[i], "-a") == 0) {
+			full_range = true;
+		} else if (strcmp(argv[i], "-t") == 0) {
+			text_only = true;
+		} else if (strcmp(argv[i], "-h") == 0) {
+			printf("Usage:\n"
+			       "  -a display wider latency range\n"
+			       "  -t text only\n");
+			return 1;
+		}
+	}
+
+	printf("  heatmap of IO latency\n");
+	if (text_only)
+		printf("  %s", sym[num_colors - 1]);
+	else
+		printf("  %s %s", color[num_colors - 1], nocolor);
+	printf(" - many events with this latency\n");
+
+	if (text_only)
+		printf("  %s", sym[0]);
+	else
+		printf("  %s %s", color[0], nocolor);
+	printf(" - few events\n");
+
+	for (i = 0; ; i++) {
+		if (i % 20 == 0)
+			print_banner();
+		print_hist(map_fd[1]);
+		sleep(2);
+	}
+
+	return 0;
+}
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v6 tip 8/8] samples: bpf: kmem_alloc/free tracker
From: Alexei Starovoitov @ 2015-03-11  4:18 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Steven Rostedt, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, David S. Miller, Daniel Borkmann,
	Peter Zijlstra, linux-api, netdev, linux-kernel
In-Reply-To: <1426047534-8148-1-git-send-email-ast@plumgrid.com>

One bpf program attaches to kmem_cache_alloc_node() and remembers all allocated
objects in the map.
Another program attaches to kmem_cache_free() and deletes corresponding
object from the map.

User space walks the map every second and prints any objects which are
older than 1 second.

Usage:
$ sudo tracex4

Then start few long living processes. The tracex4 will print:

obj 0xffff880465928000 is 13sec old was allocated at ip ffffffff8105dc32
obj 0xffff88043181c280 is 13sec old was allocated at ip ffffffff8105dc32
obj 0xffff880465848000 is  8sec old was allocated at ip ffffffff8105dc32
obj 0xffff8804338bc280 is 15sec old was allocated at ip ffffffff8105dc32

$ addr2line -fispe vmlinux ffffffff8105dc32
do_fork at fork.c:1665

As soon as processes exit the memory is reclaimed and tracex4 prints nothing.

Similar experiment can be done with __kmalloc/kfree pair.

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
 samples/bpf/Makefile       |    4 +++
 samples/bpf/tracex4_kern.c |   54 ++++++++++++++++++++++++++++++++++
 samples/bpf/tracex4_user.c |   69 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 127 insertions(+)
 create mode 100644 samples/bpf/tracex4_kern.c
 create mode 100644 samples/bpf/tracex4_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index dcd850546d52..fe98fb226e6e 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -9,6 +9,7 @@ hostprogs-y += sockex2
 hostprogs-y += tracex1
 hostprogs-y += tracex2
 hostprogs-y += tracex3
+hostprogs-y += tracex4
 
 test_verifier-objs := test_verifier.o libbpf.o
 test_maps-objs := test_maps.o libbpf.o
@@ -18,6 +19,7 @@ sockex2-objs := bpf_load.o libbpf.o sockex2_user.o
 tracex1-objs := bpf_load.o libbpf.o tracex1_user.o
 tracex2-objs := bpf_load.o libbpf.o tracex2_user.o
 tracex3-objs := bpf_load.o libbpf.o tracex3_user.o
+tracex4-objs := bpf_load.o libbpf.o tracex4_user.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -26,6 +28,7 @@ always += sockex2_kern.o
 always += tracex1_kern.o
 always += tracex2_kern.o
 always += tracex3_kern.o
+always += tracex4_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 
@@ -35,6 +38,7 @@ HOSTLOADLIBES_sockex2 += -lelf
 HOSTLOADLIBES_tracex1 += -lelf
 HOSTLOADLIBES_tracex2 += -lelf
 HOSTLOADLIBES_tracex3 += -lelf
+HOSTLOADLIBES_tracex4 += -lelf -lrt
 
 # point this to your LLVM backend with bpf support
 LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
diff --git a/samples/bpf/tracex4_kern.c b/samples/bpf/tracex4_kern.c
new file mode 100644
index 000000000000..5754e21caf3b
--- /dev/null
+++ b/samples/bpf/tracex4_kern.c
@@ -0,0 +1,54 @@
+/* Copyright (c) 2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/ptrace.h>
+#include <linux/version.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+struct pair {
+	u64 val;
+	u64 ip;
+};
+
+struct bpf_map_def SEC("maps") my_map = {
+	.type = BPF_MAP_TYPE_HASH,
+	.key_size = sizeof(long),
+	.value_size = sizeof(struct pair),
+	.max_entries = 1000000,
+};
+
+/* kprobe is NOT a stable ABI
+ * This bpf+kprobe example can stop working any time.
+ */
+SEC("kprobe/kmem_cache_free")
+int bpf_prog1(struct pt_regs *ctx)
+{
+	long ptr = ctx->si;
+
+	bpf_map_delete_elem(&my_map, &ptr);
+	return 0;
+}
+
+SEC("kretprobe/kmem_cache_alloc_node")
+int bpf_prog2(struct pt_regs *ctx)
+{
+	long ptr = ctx->ax;
+	long ip = 0;
+
+	/* get ip address of kmem_cache_alloc_node() caller */
+	bpf_probe_read(&ip, sizeof(ip), (void *)(ctx->bp + sizeof(ip)));
+
+	struct pair v = {
+		.val = bpf_ktime_get_ns(),
+		.ip = ip,
+	};
+
+	bpf_map_update_elem(&my_map, &ptr, &v, BPF_ANY);
+	return 0;
+}
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex4_user.c b/samples/bpf/tracex4_user.c
new file mode 100644
index 000000000000..bc4a3bdea6ed
--- /dev/null
+++ b/samples/bpf/tracex4_user.c
@@ -0,0 +1,69 @@
+/* Copyright (c) 2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <unistd.h>
+#include <stdbool.h>
+#include <string.h>
+#include <time.h>
+#include <linux/bpf.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+struct pair {
+	long long val;
+	__u64 ip;
+};
+
+static __u64 time_get_ns(void)
+{
+	struct timespec ts;
+
+	clock_gettime(CLOCK_MONOTONIC, &ts);
+	return ts.tv_sec * 1000000000ull + ts.tv_nsec;
+}
+
+static void print_old_objects(int fd)
+{
+	long long val = time_get_ns();
+	__u64 key, next_key;
+	struct pair v;
+
+	key = write(1, "\e[1;1H\e[2J", 12); /* clear screen */
+
+	key = -1;
+	while (bpf_get_next_key(map_fd[0], &key, &next_key) == 0) {
+		bpf_lookup_elem(map_fd[0], &next_key, &v);
+		key = next_key;
+		if (val - v.val < 1000000000ll)
+			/* object was allocated more then 1 sec ago */
+			continue;
+		printf("obj 0x%llx is %2lldsec old was allocated at ip %llx\n",
+		       next_key, (val - v.val) / 1000000000ll, v.ip);
+	}
+}
+
+int main(int ac, char **argv)
+{
+	char filename[256];
+	int i;
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	if (load_bpf_file(filename)) {
+		printf("%s", bpf_log_buf);
+		return 1;
+	}
+
+	for (i = 0; ; i++) {
+		print_old_objects(map_fd[1]);
+		sleep(1);
+	}
+
+	return 0;
+}
-- 
1.7.9.5

^ permalink raw reply related

* Re: [PATCH 24/45] hdspm.h: include stdint.h in userspace
From: Takashi Iwai @ 2015-03-11  6:09 UTC (permalink / raw)
  To: Mikko Rapeli
  Cc: Arnd Bergmann, linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	Jaroslav Kysela, alsa-devel-K7yf7f+aM1XWsZ/bQMPhNw,
	linux-api-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20150311002812.GB12550-dqH1CgrzRhOk/eJAJmRu5A@public.gmane.org>

At Wed, 11 Mar 2015 02:28:12 +0200,
Mikko Rapeli wrote:
> 
> On Tue, Feb 17, 2015 at 07:46:02AM +0100, Takashi Iwai wrote:
> > At Tue, 17 Feb 2015 00:05:27 +0100,
> > Mikko Rapeli wrote:
> > > 
> > > Fixes compilation error:
> > > 
> > > sound/hdspm.h:43:2: error: unknown type name ‘uint32_t’
> > > 
> > > Signed-off-by: Mikko Rapeli <mikko.rapeli-X3B1VOXEql0@public.gmane.org>
> > 
> > Applied for 3.21, thanks.
> 
> Sorry, but this should maybe be dropped or reverted from the sound tree.
> 
> Arnd and others pointed out that kernel headers should be using __u32 etc
> types from linux/types.h instead of including stdint.h from libc and using
> uint32_t et al.
> 
> I'll post a new patch version for hdspm.h too in the next revision.

Then please post an incremental patch to for-next branch of sound git
tree (or the current linux-next tree).


Takashi

^ permalink raw reply

* Re: [PATCH 41/45] include/uapi/sound/emu10k1.h: hide gpr_valid, tram_valid and code_valid in userspace
From: Takashi Iwai @ 2015-03-11  6:11 UTC (permalink / raw)
  To: Mikko Rapeli
  Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA, Jaroslav Kysela,
	alsa-devel-K7yf7f+aM1XWsZ/bQMPhNw,
	linux-api-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20150311012204.GE12550-dqH1CgrzRhOk/eJAJmRu5A@public.gmane.org>

At Wed, 11 Mar 2015 03:22:04 +0200,
Mikko Rapeli wrote:
> 
> On Tue, Feb 17, 2015 at 07:27:38AM +0100, Takashi Iwai wrote:
> > At Tue, 17 Feb 2015 00:05:44 +0100,
> > Mikko Rapeli wrote:
> > > 
> > > The DECLARE_BITMAP macro is not available in userspace headers.
> > > Fixes userspace compile error:
> > > error: expected specifier-qualifier-list before ‘DECLARE_BITMAP’
> > 
> > It's nonsense.  This results in an incompatible structure, thus ABI
> > would be broken completely (actually this will break the compile of
> > ld10k1).
> 
> None of the exported headers after 'make headers_install' have definition
> of DECLARE_BITMAP macro. It is defined in include/linux/types.h which is
> different from include/uapi/linux/types.h and missing this definition and
> a few other things.
> 
> One option would be add DECLARE_BITMAP macro to include/uapi/linux/types.h
> and add include/linux/bitops.h to uapi.
> 
> Thoughts?

Are there any other headers like that?  If this is the only one, leave
it as is.  The only program that reads this are some alsa-tools ones
and they have already own DECLARE_BITMAP() definition.  Adding the
extra definition here will even break the compilation out of sudden.


Takashi

^ permalink raw reply

* Re: [v9 4/5] ext4: adds FS_IOC_FSSETXATTR/FS_IOC_FSGETXATTR interface support
From: Konstantin Khlebnikov @ 2015-03-11  7:33 UTC (permalink / raw)
  To: Li Xi
  Cc: linux-fsdevel, linux-ext4-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	Linux API, Theodore Ts'o, Andreas Dilger, Jan Kara, Al Viro,
	Christoph Hellwig,
	Дмитрий Монахов
In-Reply-To: <1426043003-31043-5-git-send-email-lixi-LfVdkaOWEx8@public.gmane.org>

On Wed, Mar 11, 2015 at 6:03 AM, Li Xi <pkuelelixi-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:
> This patch adds FS_IOC_FSSETXATTR/FS_IOC_FSGETXATTR ioctl interface
> support for ext4. The interface is kept consistent with
> XFS_IOC_FSGETXATTR/XFS_IOC_FSGETXATTR.
>
> Signed-off-by: Li Xi <lixi-LfVdkaOWEx8@public.gmane.org>
> ---


> @@ -503,8 +490,8 @@ typedef struct xfs_swapext
>  #define XFS_IOC_ALLOCSP                _IOW ('X', 10, struct xfs_flock64)
>  #define XFS_IOC_FREESP         _IOW ('X', 11, struct xfs_flock64)
>  #define XFS_IOC_DIOINFO                _IOR ('X', 30, struct dioattr)
> -#define XFS_IOC_FSGETXATTR     _IOR ('X', 31, struct fsxattr)
> -#define XFS_IOC_FSSETXATTR     _IOW ('X', 32, struct fsxattr)
> +#define XFS_IOC_FSGETXATTR     FS_IOC_FSGETXATTR
> +#define XFS_IOC_FSSETXATTR     FS_IOC_FSSETXATTR

XFS ioctls use letter 'X'.

>
> @@ -163,6 +193,8 @@ struct inodes_stat_t {
>  #define        FS_IOC_GETVERSION               _IOR('v', 1, long)
>  #define        FS_IOC_SETVERSION               _IOW('v', 2, long)
>  #define FS_IOC_FIEMAP                  _IOWR('f', 11, struct fiemap)
> +#define FS_IOC_FSGETXATTR              _IOR('f', 31, struct fsxattr)
> +#define FS_IOC_FSSETXATTR              _IOW('f', 32, struct fsxattr)

You add 'f'. They're different.

>  #define FS_IOC32_GETFLAGS              _IOR('f', 1, int)
>  #define FS_IOC32_SETFLAGS              _IOW('f', 2, int)
>  #define FS_IOC32_GETVERSION            _IOR('v', 1, int)
> --
> 1.7.1
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [v9 3/5] ext4: adds project quota support
From: Konstantin Khlebnikov @ 2015-03-11  7:40 UTC (permalink / raw)
  To: Li Xi
  Cc: linux-fsdevel, linux-ext4-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	Linux API, Theodore Ts'o, Andreas Dilger, Jan Kara, Al Viro,
	Christoph Hellwig,
	Дмитрий Монахов
In-Reply-To: <1426043003-31043-4-git-send-email-lixi-LfVdkaOWEx8@public.gmane.org>

On Wed, Mar 11, 2015 at 6:03 AM, Li Xi <pkuelelixi-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:
> This patch adds mount options for enabling/disabling project quota
> accounting and enforcement. A new specific inode is also used for
> project quota accounting.
>
> Signed-off-by: Li Xi <lixi-LfVdkaOWEx8@public.gmane.org>
> Signed-off-by: Dmitry Monakhov <dmonakhov-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
> Reviewed-by: Jan Kara <jack-AlSwsSmVLrQ@public.gmane.org>
> ---
>  fs/ext4/ext4.h  |    8 +++-
>  fs/ext4/super.c |   99 ++++++++++++++++++++++++++++++++++++++++++++++++-------
>  2 files changed, 93 insertions(+), 14 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 7acb2da..3443456 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -208,6 +208,7 @@ struct ext4_io_submit {
>  #define EXT4_UNDEL_DIR_INO      6      /* Undelete directory inode */
>  #define EXT4_RESIZE_INO                 7      /* Reserved group descriptors inode */
>  #define EXT4_JOURNAL_INO        8      /* Journal inode */
> +#define EXT4_PRJ_QUOTA_INO      9      /* Project quota inode */

This special inode is reserved for:  EXT2_EXCLUDE_INO 9 /* The
"exclude" inode, for snapshots */
I'm not sure if it's ok to use it for project quota.

>
>  /* First non-reserved inode for old ext4 filesystems */
>  #define EXT4_GOOD_OLD_FIRST_INO        11
> @@ -987,6 +988,7 @@ struct ext4_inode_info {
>  #define EXT4_MOUNT_DIOREAD_NOLOCK      0x400000 /* Enable support for dio read nolocking */
>  #define EXT4_MOUNT_JOURNAL_CHECKSUM    0x800000 /* Journal checksums */
>  #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT        0x1000000 /* Journal Async Commit */
> +#define EXT4_MOUNT_PRJQUOTA            0x2000000 /* Project quota support */
>  #define EXT4_MOUNT_DELALLOC            0x8000000 /* Delalloc support */
>  #define EXT4_MOUNT_DATA_ERR_ABORT      0x10000000 /* Abort on file data write */
>  #define EXT4_MOUNT_BLOCK_VALIDITY      0x20000000 /* Block validity checking */
> @@ -1169,7 +1171,8 @@ struct ext4_super_block {
>         __le32  s_overhead_clusters;    /* overhead blocks/clusters in fs */
>         __le32  s_backup_bgs[2];        /* groups with sparse_super2 SBs */
>         __u8    s_encrypt_algos[4];     /* Encryption algorithms in use  */
> -       __le32  s_reserved[105];        /* Padding to the end of the block */
> +       __le32  s_prj_quota_inum;       /* inode for tracking project quota */
> +       __le32  s_reserved[104];        /* Padding to the end of the block */
>         __le32  s_checksum;             /* crc32c(superblock) */
>  };
>
> @@ -1184,7 +1187,7 @@ struct ext4_super_block {
>  #define EXT4_MF_FS_ABORTED     0x0002  /* Fatal error detected */
>
>  /* Number of quota types we support */
> -#define EXT4_MAXQUOTAS 2
> +#define EXT4_MAXQUOTAS 3
>
>  /*
>   * fourth extended-fs super-block data in memory
> @@ -1376,6 +1379,7 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
>                 ino == EXT4_BOOT_LOADER_INO ||
>                 ino == EXT4_JOURNAL_INO ||
>                 ino == EXT4_RESIZE_INO ||
> +               ino == EXT4_PRJ_QUOTA_INO ||
>                 (ino >= EXT4_FIRST_INO(sb) &&
>                  ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count));
>  }
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index 04c6cc3..e057daa 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -1036,8 +1036,8 @@ static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
>  }
>
>  #ifdef CONFIG_QUOTA
> -#define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group")
> -#define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
> +static char *quotatypes[] = INITQFNAMES;
> +#define QTYPE2NAME(t) (quotatypes[t])
>
>  static int ext4_write_dquot(struct dquot *dquot);
>  static int ext4_acquire_dquot(struct dquot *dquot);
> @@ -1135,7 +1135,8 @@ enum {
>         Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit,
>         Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
>         Opt_data_err_abort, Opt_data_err_ignore,
> -       Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
> +       Opt_usrjquota, Opt_grpjquota, Opt_prjjquota,
> +       Opt_offusrjquota, Opt_offgrpjquota, Opt_offprjjquota,
>         Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
>         Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
>         Opt_usrquota, Opt_grpquota, Opt_i_version,
> @@ -1190,6 +1191,8 @@ static const match_table_t tokens = {
>         {Opt_usrjquota, "usrjquota=%s"},
>         {Opt_offgrpjquota, "grpjquota="},
>         {Opt_grpjquota, "grpjquota=%s"},
> +       {Opt_prjjquota, "prjjquota"},
> +       {Opt_offprjjquota, "offprjjquota"},
>         {Opt_jqfmt_vfsold, "jqfmt=vfsold"},
>         {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"},
>         {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"},
> @@ -1412,11 +1415,14 @@ static const struct mount_opts {
>         {Opt_grpquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_GRPQUOTA,
>                                                         MOPT_SET | MOPT_Q},
>         {Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA |
> -                      EXT4_MOUNT_GRPQUOTA), MOPT_CLEAR | MOPT_Q},
> +                      EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA),
> +                                                       MOPT_CLEAR | MOPT_Q},
>         {Opt_usrjquota, 0, MOPT_Q},
>         {Opt_grpjquota, 0, MOPT_Q},
> +       {Opt_prjjquota, 0, MOPT_Q},
>         {Opt_offusrjquota, 0, MOPT_Q},
>         {Opt_offgrpjquota, 0, MOPT_Q},
> +       {Opt_offprjjquota, 0, MOPT_Q},
>         {Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT},
>         {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT},
>         {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT},
> @@ -1433,16 +1439,25 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
>         kuid_t uid;
>         kgid_t gid;
>         int arg = 0;
> -
>  #ifdef CONFIG_QUOTA
> +       char *prj_qf_name = "aquota.project";

If you already have inode numer in super-block why do you need this name here?
Modern journalled quota is stored in special inodes and invisible from
user-space.

> +       substring_t prj_qf_string = {
> +               .from = prj_qf_name,
> +               .to = &prj_qf_name[strlen(prj_qf_name)],
> +       };
> +
>         if (token == Opt_usrjquota)
>                 return set_qf_name(sb, USRQUOTA, &args[0]);
>         else if (token == Opt_grpjquota)
>                 return set_qf_name(sb, GRPQUOTA, &args[0]);
> +       else if (token == Opt_prjjquota)
> +               return set_qf_name(sb, PRJQUOTA, &prj_qf_string);
>         else if (token == Opt_offusrjquota)
>                 return clear_qf_name(sb, USRQUOTA);
>         else if (token == Opt_offgrpjquota)
>                 return clear_qf_name(sb, GRPQUOTA);
> +       else if (token == Opt_offprjjquota)
> +               return clear_qf_name(sb, PRJQUOTA);
>  #endif
>         switch (token) {
>         case Opt_noacl:
> @@ -1668,19 +1683,28 @@ static int parse_options(char *options, struct super_block *sb,
>         }
>  #ifdef CONFIG_QUOTA
>         if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) &&
> -           (test_opt(sb, USRQUOTA) || test_opt(sb, GRPQUOTA))) {
> +           (test_opt(sb, USRQUOTA) ||
> +            test_opt(sb, GRPQUOTA) ||
> +            test_opt(sb, PRJQUOTA))) {
>                 ext4_msg(sb, KERN_ERR, "Cannot set quota options when QUOTA "
>                          "feature is enabled");
>                 return 0;
>         }
> -       if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
> +       if (sbi->s_qf_names[USRQUOTA] ||
> +           sbi->s_qf_names[GRPQUOTA] ||
> +           sbi->s_qf_names[PRJQUOTA]) {
>                 if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA])
>                         clear_opt(sb, USRQUOTA);
>
>                 if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA])
>                         clear_opt(sb, GRPQUOTA);
>
> -               if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
> +               if (test_opt(sb, PRJQUOTA) && sbi->s_qf_names[PRJQUOTA])
> +                       clear_opt(sb, PRJQUOTA);
> +
> +               if (test_opt(sb, GRPQUOTA) ||
> +                   test_opt(sb, USRQUOTA) ||
> +                   test_opt(sb, PRJQUOTA)) {
>                         ext4_msg(sb, KERN_ERR, "old and new quota "
>                                         "format mixing");
>                         return 0;
> @@ -1740,6 +1764,9 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
>
>         if (sbi->s_qf_names[GRPQUOTA])
>                 seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
> +
> +       if (sbi->s_qf_names[PRJQUOTA])
> +               seq_printf(seq, ",prjjquota=%s", sbi->s_qf_names[PRJQUOTA]);
>  #endif
>  }
>
> @@ -3944,7 +3971,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
>                 sb->s_qcop = &ext4_qctl_sysfile_operations;
>         else
>                 sb->s_qcop = &ext4_qctl_operations;
> -       sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
> +       sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ;
>  #endif
>         memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
>
> @@ -5040,6 +5067,46 @@ restore_opts:
>         return err;
>  }
>
> +static int ext4_statfs_project(struct super_block *sb,
> +                              kprojid_t projid, struct kstatfs *buf)
> +{
> +       struct kqid qid;
> +       struct dquot *dquot;
> +       u64 limit;
> +       u64 curblock;
> +
> +       qid = make_kqid_projid(projid);
> +       dquot = dqget(sb, qid);
> +       if (!dquot)
> +               return -ESRCH;
> +       spin_lock(&dq_data_lock);
> +
> +       limit = dquot->dq_dqb.dqb_bsoftlimit ?
> +               dquot->dq_dqb.dqb_bsoftlimit :
> +               dquot->dq_dqb.dqb_bhardlimit;
> +       if (limit && buf->f_blocks * buf->f_bsize > limit) {
> +               curblock = dquot->dq_dqb.dqb_curspace / buf->f_bsize;
> +               buf->f_blocks = limit / buf->f_bsize;
> +               buf->f_bfree = buf->f_bavail =
> +                       (buf->f_blocks > curblock) ?
> +                        (buf->f_blocks - curblock) : 0;
> +       }
> +
> +       limit = dquot->dq_dqb.dqb_isoftlimit ?
> +               dquot->dq_dqb.dqb_isoftlimit :
> +               dquot->dq_dqb.dqb_ihardlimit;
> +       if (limit && buf->f_files > limit) {
> +               buf->f_files = limit;
> +               buf->f_ffree =
> +                       (buf->f_files > dquot->dq_dqb.dqb_curinodes) ?
> +                        (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0;
> +       }
> +
> +       spin_unlock(&dq_data_lock);
> +       dqput(dquot);
> +       return 0;
> +}
> +
>  static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
>  {
>         struct super_block *sb = dentry->d_sb;
> @@ -5048,6 +5115,7 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
>         ext4_fsblk_t overhead = 0, resv_blocks;
>         u64 fsid;
>         s64 bfree;
> +       struct inode *inode = dentry->d_inode;
>         resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters));
>
>         if (!test_opt(sb, MINIX_DF))
> @@ -5072,6 +5140,9 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
>         buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
>         buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
>
> +       if (ext4_test_inode_flag(inode, EXT4_INODE_PROJINHERIT) &&
> +           sb_has_quota_limits_enabled(sb, PRJQUOTA))
> +               ext4_statfs_project(sb, EXT4_I(inode)->i_projid, buf);
>         return 0;
>  }
>
> @@ -5152,7 +5223,9 @@ static int ext4_mark_dquot_dirty(struct dquot *dquot)
>
>         /* Are we journaling quotas? */
>         if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) ||
> -           sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
> +           sbi->s_qf_names[USRQUOTA] ||
> +           sbi->s_qf_names[GRPQUOTA] ||
> +           sbi->s_qf_names[PRJQUOTA]) {
>                 dquot_mark_dquot_dirty(dquot);
>                 return ext4_write_dquot(dquot);
>         } else {
> @@ -5236,7 +5309,8 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
>         struct inode *qf_inode;
>         unsigned long qf_inums[EXT4_MAXQUOTAS] = {
>                 le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum),
> -               le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum)
> +               le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum),
> +               le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum)
>         };
>
>         BUG_ON(!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA));
> @@ -5264,7 +5338,8 @@ static int ext4_enable_quotas(struct super_block *sb)
>         int type, err = 0;
>         unsigned long qf_inums[EXT4_MAXQUOTAS] = {
>                 le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum),
> -               le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum)
> +               le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum),
> +               le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum)
>         };
>
>         sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE;
> --
> 1.7.1
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH 41/45] include/uapi/sound/emu10k1.h: hide gpr_valid, tram_valid and code_valid in userspace
From: Arnd Bergmann @ 2015-03-11  9:46 UTC (permalink / raw)
  To: Takashi Iwai
  Cc: Mikko Rapeli, linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	Jaroslav Kysela, alsa-devel-K7yf7f+aM1XWsZ/bQMPhNw,
	linux-api-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <s5h4mpsnjw9.wl-tiwai-l3A5Bk7waGM@public.gmane.org>

On Wednesday 11 March 2015 07:11:18 Takashi Iwai wrote:
> At Wed, 11 Mar 2015 03:22:04 +0200,
> Mikko Rapeli wrote:
> > 
> > On Tue, Feb 17, 2015 at 07:27:38AM +0100, Takashi Iwai wrote:
> > > At Tue, 17 Feb 2015 00:05:44 +0100,
> > > Mikko Rapeli wrote:
> > > > 
> > > > The DECLARE_BITMAP macro is not available in userspace headers.
> > > > Fixes userspace compile error:
> > > > error: expected specifier-qualifier-list before ‘DECLARE_BITMAP’
> > > 
> > > It's nonsense.  This results in an incompatible structure, thus ABI
> > > would be broken completely (actually this will break the compile of
> > > ld10k1).
> > 
> > None of the exported headers after 'make headers_install' have definition
> > of DECLARE_BITMAP macro. It is defined in include/linux/types.h which is
> > different from include/uapi/linux/types.h and missing this definition and
> > a few other things.
> > 
> > One option would be add DECLARE_BITMAP macro to include/uapi/linux/types.h
> > and add include/linux/bitops.h to uapi.
> > 
> > Thoughts?
> 
> Are there any other headers like that?  If this is the only one, leave
> it as is.  The only program that reads this are some alsa-tools ones
> and they have already own DECLARE_BITMAP() definition.  Adding the
> extra definition here will even break the compilation out of sudden.

I think it's a worthy goal to have the header files be compilable
standalone, but I don't think we should make the DECLARE_BITMAP()
macro globally visible in user space, in particular because it will
clash with every instance in which user space has a macro of the
same name.

What we could do here is to add a private copy of the macro to emu10k1.h
under a different name, such as __EMU10K1_DECLARE_BITMAP().

	Arnd

^ permalink raw reply

* Re: [PATCH 33/45] include/uapi/asm-generic/signal.h: include stdlib.h in userspace
From: Arnd Bergmann @ 2015-03-11 10:02 UTC (permalink / raw)
  To: Mikko Rapeli
  Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-arch-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA, Geoff Levand, Paul Bolle
In-Reply-To: <20150310225828.GZ12550-dqH1CgrzRhOk/eJAJmRu5A@public.gmane.org>

On Wednesday 11 March 2015 00:58:28 Mikko Rapeli wrote:
> linux/kexec.h seems suspicious though:
> 
> #ifndef __KERNEL__
> /*
>  * This structure is used to hold the arguments that are used when
>  * loading  kernel binaries.
>  */
> struct kexec_segment {
>         const void *buf;
>         size_t bufsz;
>         const void *mem;
>         size_t memsz;
> };
> 
> #endif /* __KERNEL__ */
> 

Hmm, maybe that just needs to be removed as well, after we
have removed the function declaration that used to follow it
as part of this commit:

http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=9dc5c05f45ca8101025046cd

Using __kernel_size_t would work here as well, but I suspect that
the entire structure is useless as it is not used by kernel or
user space, and kexec_tools has its own copy of the structure.

	Arnd

^ permalink raw reply

* [PATCH] fix trailing whitespaces in if.h
From: Maxim Uvarov @ 2015-03-11 13:08 UTC (permalink / raw)
  To: linux-api-u79uwXL29TY76Z2rM5mHXA
  Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA, Maxim Uvarov

Signed-off-by: Maxim Uvarov <maxim.uvarov-QSEj5FYQhm4dnm+yROfE0A@public.gmane.org>
---
 include/uapi/linux/if.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/uapi/linux/if.h b/include/uapi/linux/if.h
index 9cf2394..ee68193 100644
--- a/include/uapi/linux/if.h
+++ b/include/uapi/linux/if.h
@@ -157,7 +157,7 @@ enum {
 };
 
 /*
- *	Device mapping structure. I'd just gone off and designed a 
+ *	Device mapping structure. I'd just gone off and designed a
  *	beautiful scheme using only loadable modules with arguments
  *	for driver options and along come the PCMCIA people 8)
  *
@@ -169,7 +169,7 @@ enum {
 struct ifmap {
 	unsigned long mem_start;
 	unsigned long mem_end;
-	unsigned short base_addr; 
+	unsigned short base_addr;
 	unsigned char irq;
 	unsigned char dma;
 	unsigned char port;
@@ -206,7 +206,7 @@ struct ifreq {
 	{
 		char	ifrn_name[IFNAMSIZ];		/* if name, e.g. "en0" */
 	} ifr_ifrn;
-	
+
 	union {
 		struct	sockaddr ifru_addr;
 		struct	sockaddr ifru_dstaddr;
-- 
1.9.1

^ permalink raw reply related

* Re: [PATCH v2 07/18] drivers: reset: Add STM32 reset driver
From: Philipp Zabel @ 2015-03-11 13:08 UTC (permalink / raw)
  To: Maxime Coquelin
  Cc: Arnd Bergmann, Uwe Kleine-König, Andreas Färber,
	Geert Uytterhoeven, Rob Herring, Jonathan Corbet, Pawel Moll,
	Mark Rutland, Ian Campbell, Kumar Gala, Russell King,
	Daniel Lezcano, Thomas Gleixner, Linus Walleij,
	Greg Kroah-Hartman, Jiri Slaby, Andrew Morton, David S. Miller,
	Mauro Carvalho Chehab, Joe Perches, Antti Palosaari, Tejun Heo
In-Reply-To: <CALszF6DENRS2L8mHD_iPsPwbAZndbHoB10oHaJ8-E4ajhog=8w@mail.gmail.com>

Am Dienstag, den 10.03.2015, 22:20 +0100 schrieb Maxime Coquelin:
> 2015-03-10 21:21 GMT+01:00 Arnd Bergmann <arnd@arndb.de>:
> > On Tuesday 10 March 2015 16:44:24 Maxime Coquelin wrote:
> >> 2015-03-10 16:02 GMT+01:00 Arnd Bergmann <arnd@arndb.de>:
> >> > On Friday 20 February 2015 19:01:06 Maxime Coquelin wrote:
> >> >> +/* AHB1 */
> >> >> +#define GPIOA_RESET    0
> >> >> +#define GPIOB_RESET    1
> >> >> +#define GPIOC_RESET    2
> >> >> +#define GPIOD_RESET    3
> >> >> +#define GPIOE_RESET    4
> >> >> +#define GPIOF_RESET    5
> >> >> +#define GPIOG_RESET    6
> >> >> +#define GPIOH_RESET    7
> >> >> +#define GPIOI_RESET    8
> >> >> +#define GPIOJ_RESET    9
> >> >> +#define GPIOK_RESET    10
> >> >>
> >> >
> >> > As these are just the hardware numbers, it's better to not make them
> >> > part of the binding at all. Instead, just document in the binding that
> >> > one is supposed to pass the hardware number as the argument.
> >>
> >> The reset controller is part of the RCC (Reset & Clock Controller) IP.
> >> In this version, I only provided the reset registers to the reset
> >> controller driver, but as per Andreas Färber remark, I should avec a
> >> single DT node for both the resets and clocks.
> >>
> >> In the next version I am preparing, the defines doesn't look as
> >> trivial as in this version, GPIOA_RESET being 128 for instance.
> >>
> >> Is it fine for you if I keep the defines part of the binding?
> >>
> >>
> >
> > It's always better to avoid these files entirely, as they are
> > a frequent source of merge dependencies, and they make it less
> > obvious what's going on than having binary values in the dtb
> > that make sense.
> 
> I agree it is always painful to have to have to manage these merge dependencies.
> What I will do, if Philipp agrees, is to list all the values in the
> binding documentation.
> 
> Doing that, the user of a reset won't have to do the calculation, and
> no more merge dependencies.

I'd prefer to have #defines for the reset bits if they are named in the
documentation and use the names in the dts. But if you want to reference
reset bits by number in the device tree instead, I won't insist.

Consider using two cells in the phandle for register and bit offset
instead of a single number that arbitrarily starts at 128.

regards
Philipp


^ permalink raw reply

* Re: [PATCH v4 4/9] selftests: Add install target
From: Shuah Khan @ 2015-03-11 13:18 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: linux-kernel, davej, mmarek, linux-api
In-Reply-To: <1426046765-19289-4-git-send-email-mpe@ellerman.id.au>

On 03/10/2015 10:06 PM, Michael Ellerman wrote:
> This adds make install support to selftests. The basic usage is:
> 
> $ cd tools/testing/selftests
> $ make install
> 
> That installs into tools/testing/selftests/install, which can then be
> copied where ever necessary.
> 
> The install destination is also configurable using eg:
> 
> $ INSTALL_PATH=/mnt/selftests make install
> 
> The implementation uses two targets in the child makefiles. The first
> "install" is expected to install all files into $(INSTALL_PATH).
> 
> The second, "emit_tests", is expected to emit the test instructions (ie.
> bash script) on stdout. Separating this from install means the child
> makefiles need no knowledge of the location of the test script.
> 
> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
> ---
> 
> v3: Rebase onto 4.0-rc2.
>     Rename all.sh to run_kselftest.sh.
>     Add --no-print-directory to emit_tests invocation.
> v4: Rebase onto 4.0-rc3, add TEST_FILES to efivars and vm tests, remove
>     newlines from echoes.

I don't see my comments addressed. If you want me to take
this work, please address the following comments:

- Name install directory kselftest. It should work with the
  the use-case.

  make INSTALL_PATH=/tmp make install
  The install directory should be /tmp/kselftest

- Flatten the directory with all tests under /tmp/kselftest

I am wasting lot of time because you don't fully address my
comments and send patches that dont' work correctly. Please
make sure your patches don't generate work for me.

thanks,
-- Shuah

-- 
Shuah Khan
Sr. Linux Kernel Developer
Open Source Innovation Group
Samsung Research America (Silicon Valley)
shuahkh@osg.samsung.com | (970) 217-8978

^ permalink raw reply

* Re: [PATCH 2/9] selftests: Add install target
From: Shuah Khan @ 2015-03-11 13:19 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA, Dave Jones, mmarek-AlSwsSmVLrQ
In-Reply-To: <1426044019.23148.2.camel-Gsx/Oe8HsFggBc27wqDAHg@public.gmane.org>

On 03/10/2015 09:20 PM, Michael Ellerman wrote:
> On Tue, 2015-03-10 at 09:11 -0600, Shuah Khan wrote:
>> On 03/09/2015 04:29 PM, Shuah Khan wrote:
>>> On 03/09/2015 08:20 AM, Shuah Khan wrote:
>>>> On 03/05/2015 11:53 AM, Dave Jones wrote:
>>>>> On Tue, Mar 03, 2015 at 03:51:35PM +1100, Michael Ellerman wrote:
>>>>>  > This adds make install support to selftests. The basic usage is:
>>>>>  > 
>>>>>  > $ cd tools/testing/selftests
>>>>>  > $ make install
>>>>>  > 
>>>>>  > That installs into tools/testing/selftests/install, which can then be
>>>>>  > copied where ever necessary.
>>>>>  > 
>>>>>  > The install destination is also configurable using eg:
>>>>>  > 
>>>>>  > $ INSTALL_PATH=/mnt/selftests make install
>>>>>
>>>>>  ...
>>>>>
>>>>>  > +	@# Ask all targets to emit their test scripts
>>>>>  > +	echo "#!/bin/bash\n\n" > $(ALL_SCRIPT)
>>>>>
>>>>> $ ./all.sh 
>>>>> -bash: ./all.sh: /bin/bash\n\n: bad interpreter: No such file or directory
>>>>>
>>>>> Removing the \n\n fixes it.
>>>>>
>>>>>  > +		echo "cd \$$ROOT\n" >> $(ALL_SCRIPT); \
>>>>>
>>>>> ditto
>>>>>
>>>>> 	Dave
>>>>
>>>> Michael,
>>>>
>>>> Could you please fix these problems and send the patch.
>>>>
>>>
>>> Michael,
>>>
>>> Did you happen to run run_kselftest.sh from the install
>>> directory to make sure all the dependent executables
>>> are installed? You are missing a few required dependencies.
>>> efivars test for example.
>>>
>>> Please run kselftest_install.sh I sent out for review and
>>> compare the following:
>>>
>>> - contents of install directory created with your patch vs.
>>>   my kselftest_install.sh tool
>>> - Compare your run_kselftest.sh run with the one that gets
>>>   generated with my kselftest_install.sh tool
>>>
>>> General rule is all tests that get run when run_tests target
>>> is run should run from the install directory using the
>>> run_kselftest.sh generated during install.
>>>
>>
>> Couple more things. Please change the install directory name
>> to kselftest
>>
>> tools/testing/selftests/kselftest
>> instead of
>> tools/testing/selftests/install
> 
> I prefer install, that's what it is after all. I don't know why you're so
> obsessed with the "kselftest" name.

It is the overall user-interface. I want to be able to call
install wrapper script and have the right directory generated.
If user passes in /tmp for example as a main directory, install
is so generic and doesn't make sense. I see that you sent the
patch v4 still with install and please change it to kselftest
> 
>> Also please flatten the directory structure under the install
>> directory. I don't see any value in creating directory for each
>> test for install. Also it is makes it cumbersome for users
>> to navigate and work with after the install. This would mean cpu
>> and memory hot-plug scripts need unique names.
> 
> That's not a good idea. To start with different tests might the same name, as
> already happens with the memory & cpu hot-plug tests. They may also have data
> files that would clobber each other. We'd need to make sure all files used in
> all tests have different names, that would be a total mess.

No I want the just one container directory for the tests. Again I am
looking at the bigger picture user-interface angle. Please flatten it,
and resend it.

I responded to your patch v4. Please re-do the patch to address my
comments if you want your patch to be included.

thanks,
-- Shuah


-- 
Shuah Khan
Sr. Linux Kernel Developer
Open Source Innovation Group
Samsung Research America (Silicon Valley)
shuahkh-JPH+aEBZ4P+UEJcrhfAQsw@public.gmane.org | (970) 217-8978

^ permalink raw reply

* [PATCH RFCv2 00/21] Drivers: hv: utils: re-implement the kernel/userspace communication layer
From: Vitaly Kuznetsov @ 2015-03-11 13:29 UTC (permalink / raw)
  To: K. Y. Srinivasan, devel-tBiZLqfeLfOHmIFyCCdPziST3g8Odh+X
  Cc: Haiyang Zhang, linux-kernel-u79uwXL29TY76Z2rM5mHXA, Dexuan Cui,
	Radim Krcmar, Greg Kroah-Hartman,
	linux-api-u79uwXL29TY76Z2rM5mHXA

Changes in RFCv2:
- Preserve backwards compatibility with netlink-speaking daemons. [K. Y. Srinivasan]
- Introduce transport abstraction layer. [K. Y. Srinivasan]
- Get rid of ioctls [Radim Krcmar]
- Make the series reviewable by splitting it into smaller patches.

Anatomy of the series:
Patches 01 - 07 are cleanup with minor functional change.
Patch 08 defines the state machine.
Patches 09-11 convert all 3 drivers to using the state machine.
Patch 12 fixes a bug in fcopy. This change is going away in Patch 15,
 I just want to highlight the fix.
Patch 13 introduces a transport abstraction.
Patch 14-16 convert all drivers to using the transport abstraction.
Patches 17-18 switch KVP and VSS daemon to using char devices.
Patches 19-20 convert FCOPY and VSS to hull handshake (the same we have in
 KVP). These two can be postponed till we really need to distinguish between
 different kernels in the daemon code.
Patch 21 unifies log messages on daemons connect across all drivers and moves
 these messages to debug level.

I smoke-tested this series with both old (netlink) and new (char devices)
daemons and tested the daemon upgrade procedure.

Original description:
This series converts kvp/vss daemons to use misc char devices instead of
netlink for userspace/kernel communication and then updates fcopy to be
consistent with kvp/vss.

Userspace/kernel communication via netlink has a number of issues:
- It is hard for userspace to figure out if the kernel part was loaded or not
  and this fact can change as there is a way to enable/disable the service from
  host side. Racy daemon startup is also a problem.
- When the userspace daemon restarts/dies kernel part doesn't receive a
  notification.
- Netlink communication is not stable under heavy load.
- ...

Vitaly Kuznetsov (21):
  Drivers: hv: util: move kvp/vss function declarations to
    hyperv_vmbus.h
  Drivers: hv: kvp: reset kvp_context
  Drivers: hv: kvp: move poll_channel() to hyperv_vmbus.h
  Drivers: hv: fcopy: process deferred messages when we complete the
    transaction
  Drivers: hv: vss: process deferred messages when we complete the
    transaction
  Drivers: hv: kvp: rename kvp_work -> kvp_timeout_work
  Drivers: hv: fcopy: rename fcopy_work -> fcopy_timeout_work
  Drivers: hv: util: introduce state machine for util drivers
  Drivers: hv: kvp: switch to using the hvutil_device_state state
    machine
  Drivers: hv: vss: switch to using the hvutil_device_state state
    machine
  Drivers: hv: fcopy: switch to using the hvutil_device_state state
    machine
  Drivers: hv: fcopy: set .owner reference for file operations
  Drivers: hv: util: introduce hv_utils_transport abstraction
  Drivers: hv: vss: convert to hv_utils_transport
  Drivers: hv: fcopy: convert to hv_utils_transport
  Drivers: hv: kvp: convert to hv_utils_transport
  Tools: hv: kvp: use misc char device to communicate with kernel
  Tools: hv: vss: use misc char device to communicate with kernel
  Drivers: hv: vss: full handshake support
  Drivers: hv: fcopy: full handshake support
  Drivers: hv: utils: unify driver registration reporting

 drivers/hv/Makefile             |   2 +-
 drivers/hv/hv_fcopy.c           | 287 ++++++++++++++--------------------------
 drivers/hv/hv_kvp.c             | 192 +++++++++++++--------------
 drivers/hv/hv_snapshot.c        | 168 +++++++++++++++--------
 drivers/hv/hv_utils_transport.c | 276 ++++++++++++++++++++++++++++++++++++++
 drivers/hv/hv_utils_transport.h |  51 +++++++
 drivers/hv/hyperv_vmbus.h       |  29 ++++
 include/linux/hyperv.h          |   8 --
 include/uapi/linux/hyperv.h     |   8 +-
 tools/hv/hv_fcopy_daemon.c      |  15 +++
 tools/hv/hv_kvp_daemon.c        | 166 +++++------------------
 tools/hv/hv_vss_daemon.c        | 149 ++++++---------------
 12 files changed, 752 insertions(+), 599 deletions(-)
 create mode 100644 drivers/hv/hv_utils_transport.c
 create mode 100644 drivers/hv/hv_utils_transport.h

-- 
1.9.3

^ permalink raw reply

* [PATCH RFCv2 01/21] Drivers: hv: util: move kvp/vss function declarations to hyperv_vmbus.h
From: Vitaly Kuznetsov @ 2015-03-11 13:29 UTC (permalink / raw)
  To: K. Y. Srinivasan, devel
  Cc: Radim Krcmar, Greg Kroah-Hartman, Haiyang Zhang, linux-kernel,
	linux-api
In-Reply-To: <1426080574-9011-1-git-send-email-vkuznets@redhat.com>

These declarations are internal to hv_util module and hv_fcopy_* declarations
already reside there.

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
---
 drivers/hv/hv_kvp.c       | 1 +
 drivers/hv/hv_snapshot.c  | 2 ++
 drivers/hv/hyperv_vmbus.h | 8 ++++++++
 include/linux/hyperv.h    | 8 --------
 4 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/drivers/hv/hv_kvp.c b/drivers/hv/hv_kvp.c
index beb8105..a414c83 100644
--- a/drivers/hv/hv_kvp.c
+++ b/drivers/hv/hv_kvp.c
@@ -28,6 +28,7 @@
 #include <linux/workqueue.h>
 #include <linux/hyperv.h>
 
+#include "hyperv_vmbus.h"
 
 /*
  * Pre win8 version numbers used in ws2008 and ws 2008 r2 (win7)
diff --git a/drivers/hv/hv_snapshot.c b/drivers/hv/hv_snapshot.c
index 9d5e0d1..c1a3604 100644
--- a/drivers/hv/hv_snapshot.c
+++ b/drivers/hv/hv_snapshot.c
@@ -24,6 +24,8 @@
 #include <linux/workqueue.h>
 #include <linux/hyperv.h>
 
+#include "hyperv_vmbus.h"
+
 #define VSS_MAJOR  5
 #define VSS_MINOR  0
 #define VSS_VERSION    (VSS_MAJOR << 16 | VSS_MINOR)
diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
index 44b1c94..0d85dd1 100644
--- a/drivers/hv/hyperv_vmbus.h
+++ b/drivers/hv/hyperv_vmbus.h
@@ -699,6 +699,14 @@ int vmbus_set_event(struct vmbus_channel *channel);
 
 void vmbus_on_event(unsigned long data);
 
+int hv_kvp_init(struct hv_util_service *);
+void hv_kvp_deinit(void);
+void hv_kvp_onchannelcallback(void *);
+
+int hv_vss_init(struct hv_util_service *);
+void hv_vss_deinit(void);
+void hv_vss_onchannelcallback(void *);
+
 int hv_fcopy_init(struct hv_util_service *);
 void hv_fcopy_deinit(void);
 void hv_fcopy_onchannelcallback(void *);
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 5a2ba67..78cfd65 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -1206,14 +1206,6 @@ extern bool vmbus_prep_negotiate_resp(struct icmsg_hdr *,
 					struct icmsg_negotiate *, u8 *, int,
 					int);
 
-int hv_kvp_init(struct hv_util_service *);
-void hv_kvp_deinit(void);
-void hv_kvp_onchannelcallback(void *);
-
-int hv_vss_init(struct hv_util_service *);
-void hv_vss_deinit(void);
-void hv_vss_onchannelcallback(void *);
-
 extern struct resource hyperv_mmio;
 
 /*
-- 
1.9.3

^ permalink raw reply related

* [PATCH RFCv2 02/21] Drivers: hv: kvp: reset kvp_context
From: Vitaly Kuznetsov @ 2015-03-11 13:29 UTC (permalink / raw)
  To: K. Y. Srinivasan, devel
  Cc: Haiyang Zhang, linux-kernel, Dexuan Cui, Radim Krcmar,
	Greg Kroah-Hartman, linux-api
In-Reply-To: <1426080574-9011-1-git-send-email-vkuznets@redhat.com>

We set kvp_context when we want to postpone receiving a packet from vmbus due
to the previous transaction being unfinished. We, however, never reset this
state, all consequent kvp_respond_to_host() calls will result in poll_channel()
calling hv_kvp_onchannelcallback(). This doesn't cause real issues as:
1) Host is supposed to serialize transactions as well
2) If no message is pending vmbus_recvpacket() will return 0 recvlen.
This is just a cleanup.

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
---
 drivers/hv/hv_kvp.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/hv/hv_kvp.c b/drivers/hv/hv_kvp.c
index a414c83..caa467d 100644
--- a/drivers/hv/hv_kvp.c
+++ b/drivers/hv/hv_kvp.c
@@ -621,6 +621,7 @@ void hv_kvp_onchannelcallback(void *context)
 		kvp_transaction.kvp_context = context;
 		return;
 	}
+	kvp_transaction.kvp_context = NULL;

 	vmbus_recvpacket(channel, recv_buffer, PAGE_SIZE * 4, &recvlen,
 			 &requestid);
-- 
1.9.3

^ permalink raw reply related

* [PATCH RFCv2 03/21] Drivers: hv: kvp: move poll_channel() to hyperv_vmbus.h
From: Vitaly Kuznetsov @ 2015-03-11 13:29 UTC (permalink / raw)
  To: K. Y. Srinivasan, devel
  Cc: Radim Krcmar, Greg Kroah-Hartman, Haiyang Zhang, linux-kernel,
	linux-api
In-Reply-To: <1426080574-9011-1-git-send-email-vkuznets@redhat.com>

..., make it inline and rename it to hv_poll_channel() so it can be reused
in other hv_util modules.

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
---
 drivers/hv/hv_kvp.c       | 17 +++--------------
 drivers/hv/hyperv_vmbus.h | 12 ++++++++++++
 2 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/drivers/hv/hv_kvp.c b/drivers/hv/hv_kvp.c
index caa467d..939c3e7 100644
--- a/drivers/hv/hv_kvp.c
+++ b/drivers/hv/hv_kvp.c
@@ -128,17 +128,6 @@ kvp_work_func(struct work_struct *dummy)
 	kvp_respond_to_host(NULL, HV_E_FAIL);
 }
 
-static void poll_channel(struct vmbus_channel *channel)
-{
-	if (channel->target_cpu != smp_processor_id())
-		smp_call_function_single(channel->target_cpu,
-					 hv_kvp_onchannelcallback,
-					 channel, true);
-	else
-		hv_kvp_onchannelcallback(channel);
-}
-
-
 static int kvp_handle_handshake(struct hv_kvp_msg *msg)
 {
 	int ret = 1;
@@ -166,8 +155,8 @@ static int kvp_handle_handshake(struct hv_kvp_msg *msg)
 		pr_info("KVP: user-mode registering done.\n");
 		kvp_register(dm_reg_value);
 		kvp_transaction.active = false;
-		if (kvp_transaction.kvp_context)
-			poll_channel(kvp_transaction.kvp_context);
+		hv_poll_channel(kvp_transaction.kvp_context,
+				hv_kvp_onchannelcallback);
 	}
 	return ret;
 }
@@ -587,7 +576,7 @@ response_done:
 
 	vmbus_sendpacket(channel, recv_buffer, buf_len, req_id,
 				VM_PKT_DATA_INBAND, 0);
-	poll_channel(channel);
+	hv_poll_channel(channel, hv_kvp_onchannelcallback);
 }
 
 /*
diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
index 0d85dd1..177dbec 100644
--- a/drivers/hv/hyperv_vmbus.h
+++ b/drivers/hv/hyperv_vmbus.h
@@ -711,5 +711,17 @@ int hv_fcopy_init(struct hv_util_service *);
 void hv_fcopy_deinit(void);
 void hv_fcopy_onchannelcallback(void *);
 
+static inline void hv_poll_channel(struct vmbus_channel *channel,
+				   void (*cb)(void *))
+{
+	if (!channel)
+		return;
+
+	if (channel->target_cpu != smp_processor_id())
+		smp_call_function_single(channel->target_cpu,
+					 cb, channel, true);
+	else
+		cb(channel);
+}
 
 #endif /* _HYPERV_VMBUS_H */
-- 
1.9.3

^ permalink raw reply related

* [PATCH RFCv2 04/21] Drivers: hv: fcopy: process deferred messages when we complete the transaction
From: Vitaly Kuznetsov @ 2015-03-11 13:29 UTC (permalink / raw)
  To: K. Y. Srinivasan, devel
  Cc: Radim Krcmar, Greg Kroah-Hartman, Haiyang Zhang, linux-kernel,
	linux-api
In-Reply-To: <1426080574-9011-1-git-send-email-vkuznets@redhat.com>

In theory, the host is not supposed to issue any requests before be reply to
the previous one. In KVP we, however, support the following scenarios:
1) A message was received before userspace daemon registered;
2) A message was received while the previous one is still being processed.
In FCOPY we support only the former. Add support for the later, use
hv_poll_channel() to do the job.

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
---
 drivers/hv/hv_fcopy.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/drivers/hv/hv_fcopy.c b/drivers/hv/hv_fcopy.c
index cd453e4..8bdf752 100644
--- a/drivers/hv/hv_fcopy.c
+++ b/drivers/hv/hv_fcopy.c
@@ -98,6 +98,8 @@ static void fcopy_work_func(struct work_struct *dummy)
 	if (down_trylock(&fcopy_transaction.read_sema))
 		;
 
+	hv_poll_channel(fcopy_transaction.fcopy_context,
+			hv_fcopy_onchannelcallback);
 }
 
 static int fcopy_handle_handshake(u32 version)
@@ -117,8 +119,8 @@ static int fcopy_handle_handshake(u32 version)
 	pr_info("FCP: user-mode registering done. Daemon version: %d\n",
 		version);
 	fcopy_transaction.active = false;
-	if (fcopy_transaction.fcopy_context)
-		hv_fcopy_onchannelcallback(fcopy_transaction.fcopy_context);
+	hv_poll_channel(fcopy_transaction.fcopy_context,
+			hv_fcopy_onchannelcallback);
 	in_hand_shake = false;
 	return 0;
 }
@@ -226,6 +228,7 @@ void hv_fcopy_onchannelcallback(void *context)
 		fcopy_transaction.fcopy_context = context;
 		return;
 	}
+	fcopy_transaction.fcopy_context = NULL;
 
 	vmbus_recvpacket(channel, recv_buffer, PAGE_SIZE * 2, &recvlen,
 			 &requestid);
@@ -333,8 +336,11 @@ static ssize_t fcopy_write(struct file *file, const char __user *buf,
 	 * Complete the transaction by forwarding the result
 	 * to the host. But first, cancel the timeout.
 	 */
-	if (cancel_delayed_work_sync(&fcopy_work))
+	if (cancel_delayed_work_sync(&fcopy_work)) {
 		fcopy_respond_to_host(response);
+		hv_poll_channel(fcopy_transaction.fcopy_context,
+				hv_fcopy_onchannelcallback);
+	}
 
 	return sizeof(int);
 }
-- 
1.9.3

^ permalink raw reply related

* [PATCH RFCv2 05/21] Drivers: hv: vss: process deferred messages when we complete the transaction
From: Vitaly Kuznetsov @ 2015-03-11 13:29 UTC (permalink / raw)
  To: K. Y. Srinivasan, devel
  Cc: Radim Krcmar, Greg Kroah-Hartman, Haiyang Zhang, linux-kernel,
	linux-api
In-Reply-To: <1426080574-9011-1-git-send-email-vkuznets@redhat.com>

In theory, the host is not supposed to issue any requests before be reply to
the previous one. In KVP we, however, support the following scenarios:
1) A message was received before userspace daemon registered;
2) A message was received while the previous one is still being processed.
In VSS we support only the former. Add support for the later, use
hv_poll_channel() to do the job.

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
---
 drivers/hv/hv_snapshot.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/drivers/hv/hv_snapshot.c b/drivers/hv/hv_snapshot.c
index c1a3604..4bb9b1c 100644
--- a/drivers/hv/hv_snapshot.c
+++ b/drivers/hv/hv_snapshot.c
@@ -47,6 +47,7 @@ static struct {
 	struct vmbus_channel *recv_channel; /* chn we got the request */
 	u64 recv_req_id; /* request ID. */
 	struct hv_vss_msg  *msg; /* current message */
+	void *vss_context; /* for the channel callback */
 } vss_transaction;
 
 
@@ -73,6 +74,9 @@ static void vss_timeout_func(struct work_struct *dummy)
 	 */
 	pr_warn("VSS: timeout waiting for daemon to reply\n");
 	vss_respond_to_host(HV_E_FAIL);
+
+	hv_poll_channel(vss_transaction.vss_context,
+			hv_vss_onchannelcallback);
 }
 
 static void
@@ -85,13 +89,12 @@ vss_cn_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp)
 	if (vss_msg->vss_hdr.operation == VSS_OP_REGISTER) {
 		pr_info("VSS daemon registered\n");
 		vss_transaction.active = false;
-		if (vss_transaction.recv_channel != NULL)
-			hv_vss_onchannelcallback(vss_transaction.recv_channel);
-		return;
-
 	}
 	if (cancel_delayed_work_sync(&vss_timeout_work))
 		vss_respond_to_host(vss_msg->error);
+
+	hv_poll_channel(vss_transaction.vss_context,
+			hv_vss_onchannelcallback);
 }
 
 
@@ -198,9 +201,10 @@ void hv_vss_onchannelcallback(void *context)
 		 * We will defer processing this callback once
 		 * the current transaction is complete.
 		 */
-		vss_transaction.recv_channel = channel;
+		vss_transaction.vss_context = context;
 		return;
 	}
+	vss_transaction.vss_context = NULL;
 
 	vmbus_recvpacket(channel, recv_buffer, PAGE_SIZE * 2, &recvlen,
 			 &requestid);
-- 
1.9.3

^ permalink raw reply related

* [PATCH RFCv2 06/21] Drivers: hv: kvp: rename kvp_work -> kvp_timeout_work
From: Vitaly Kuznetsov @ 2015-03-11 13:29 UTC (permalink / raw)
  To: K. Y. Srinivasan, devel-tBiZLqfeLfOHmIFyCCdPziST3g8Odh+X
  Cc: Haiyang Zhang, linux-kernel-u79uwXL29TY76Z2rM5mHXA, Dexuan Cui,
	Radim Krcmar, Greg Kroah-Hartman,
	linux-api-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1426080574-9011-1-git-send-email-vkuznets-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>

'kvp_work' (and kvp_work_func) is a misnomer as it sounds like we expect
this useful work to happen and in reality it is just an emergency escape when
timeout happens.

Signed-off-by: Vitaly Kuznetsov <vkuznets-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
---
 drivers/hv/hv_kvp.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/hv/hv_kvp.c b/drivers/hv/hv_kvp.c
index 939c3e7..14c62e2 100644
--- a/drivers/hv/hv_kvp.c
+++ b/drivers/hv/hv_kvp.c
@@ -79,10 +79,10 @@ static void kvp_send_key(struct work_struct *dummy);
 
 
 static void kvp_respond_to_host(struct hv_kvp_msg *msg, int error);
-static void kvp_work_func(struct work_struct *dummy);
+static void kvp_timeout_func(struct work_struct *dummy);
 static void kvp_register(int);
 
-static DECLARE_DELAYED_WORK(kvp_work, kvp_work_func);
+static DECLARE_DELAYED_WORK(kvp_timeout_work, kvp_timeout_func);
 static DECLARE_WORK(kvp_sendkey_work, kvp_send_key);
 
 static struct cb_id kvp_id = { CN_KVP_IDX, CN_KVP_VAL };
@@ -118,8 +118,8 @@ kvp_register(int reg_value)
 		kfree(msg);
 	}
 }
-static void
-kvp_work_func(struct work_struct *dummy)
+
+static void kvp_timeout_func(struct work_struct *dummy)
 {
 	/*
 	 * If the timer fires, the user-mode component has not responded;
@@ -215,7 +215,7 @@ kvp_cn_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp)
 	 * Complete the transaction by forwarding the key value
 	 * to the host. But first, cancel the timeout.
 	 */
-	if (cancel_delayed_work_sync(&kvp_work))
+	if (cancel_delayed_work_sync(&kvp_timeout_work))
 		kvp_respond_to_host(message, error);
 }
 
@@ -440,7 +440,7 @@ kvp_send_key(struct work_struct *dummy)
 	rc = cn_netlink_send(msg, 0, 0, GFP_ATOMIC);
 	if (rc) {
 		pr_debug("KVP: failed to communicate to the daemon: %d\n", rc);
-		if (cancel_delayed_work_sync(&kvp_work))
+		if (cancel_delayed_work_sync(&kvp_timeout_work))
 			kvp_respond_to_host(message, HV_E_FAIL);
 	}
 
@@ -668,7 +668,7 @@ void hv_kvp_onchannelcallback(void *context)
 			 * user-mode not responding.
 			 */
 			schedule_work(&kvp_sendkey_work);
-			schedule_delayed_work(&kvp_work, 5*HZ);
+			schedule_delayed_work(&kvp_timeout_work, 5*HZ);
 
 			return;
 
@@ -708,6 +708,6 @@ hv_kvp_init(struct hv_util_service *srv)
 void hv_kvp_deinit(void)
 {
 	cn_del_callback(&kvp_id);
-	cancel_delayed_work_sync(&kvp_work);
+	cancel_delayed_work_sync(&kvp_timeout_work);
 	cancel_work_sync(&kvp_sendkey_work);
 }
-- 
1.9.3

^ permalink raw reply related

* [PATCH RFCv2 07/21] Drivers: hv: fcopy: rename fcopy_work -> fcopy_timeout_work
From: Vitaly Kuznetsov @ 2015-03-11 13:29 UTC (permalink / raw)
  To: K. Y. Srinivasan, devel
  Cc: Radim Krcmar, Greg Kroah-Hartman, Haiyang Zhang, linux-kernel,
	linux-api
In-Reply-To: <1426080574-9011-1-git-send-email-vkuznets@redhat.com>

'fcopy_work' (and fcopy_work_func) is a misnomer as it sounds like we expect
this useful work to happen and in reality it is just an emergency escape when
timeout happens.

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
---
 drivers/hv/hv_fcopy.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/hv/hv_fcopy.c b/drivers/hv/hv_fcopy.c
index 8bdf752..c14f0f4 100644
--- a/drivers/hv/hv_fcopy.c
+++ b/drivers/hv/hv_fcopy.c
@@ -75,11 +75,11 @@ static bool opened; /* currently device opened */
 static bool in_hand_shake = true;
 static void fcopy_send_data(void);
 static void fcopy_respond_to_host(int error);
-static void fcopy_work_func(struct work_struct *dummy);
-static DECLARE_DELAYED_WORK(fcopy_work, fcopy_work_func);
+static void fcopy_timeout_func(struct work_struct *dummy);
+static DECLARE_DELAYED_WORK(fcopy_timeout_work, fcopy_timeout_func);
 static u8 *recv_buffer;
 
-static void fcopy_work_func(struct work_struct *dummy)
+static void fcopy_timeout_func(struct work_struct *dummy)
 {
 	/*
 	 * If the timer fires, the user-mode component has not responded;
@@ -261,7 +261,7 @@ void hv_fcopy_onchannelcallback(void *context)
 		/*
 		 * Send the information to the user-level daemon.
 		 */
-		schedule_delayed_work(&fcopy_work, 5*HZ);
+		schedule_delayed_work(&fcopy_timeout_work, 5*HZ);
 		fcopy_send_data();
 		return;
 	}
@@ -336,7 +336,7 @@ static ssize_t fcopy_write(struct file *file, const char __user *buf,
 	 * Complete the transaction by forwarding the result
 	 * to the host. But first, cancel the timeout.
 	 */
-	if (cancel_delayed_work_sync(&fcopy_work)) {
+	if (cancel_delayed_work_sync(&fcopy_timeout_work))
 		fcopy_respond_to_host(response);
 		hv_poll_channel(fcopy_transaction.fcopy_context,
 				hv_fcopy_onchannelcallback);
@@ -378,7 +378,7 @@ static int fcopy_release(struct inode *inode, struct file *f)
 	in_hand_shake = true;
 	opened = false;
 
-	if (cancel_delayed_work_sync(&fcopy_work)) {
+	if (cancel_delayed_work_sync(&fcopy_timeout_work)) {
 		/* We haven't up()-ed the semaphore(very rare)? */
 		if (down_trylock(&fcopy_transaction.read_sema))
 			;
@@ -442,6 +442,6 @@ int hv_fcopy_init(struct hv_util_service *srv)
 
 void hv_fcopy_deinit(void)
 {
-	cancel_delayed_work_sync(&fcopy_work);
+	cancel_delayed_work_sync(&fcopy_timeout_work);
 	fcopy_dev_deinit();
 }
-- 
1.9.3

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox