Linux userland API discussions
 help / color / mirror / Atom feed
* [PATCH v3 linux-trace 1/8] tracing: attach eBPF programs to tracepoints and syscalls
From: Alexei Starovoitov @ 2015-02-10  3:45 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Ingo Molnar, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, linux-api-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1423539961-21792-1-git-send-email-ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>

User interface:
struct perf_event_attr attr = {.type = PERF_TYPE_TRACEPOINT, .config = event_id, ...};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);

prog_fd is a file descriptor associated with a previously loaded eBPF program.
event_id is an ID of static tracepoint event or syscall.
(kprobe support is in next patch)

close(event_fd) - automatically detaches eBPF program from it

eBPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- fetch_ptr/u64/u32/u16/u8 values from unsafe address via probe_kernel_read(),
  so that eBPF program can walk any kernel data structures
- probe_memcmp - combination of probe_kernel_read() and memcmp()

Signed-off-by: Alexei Starovoitov <ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>
---
 include/linux/bpf.h             |    6 +-
 include/linux/ftrace_event.h    |   11 +++
 include/trace/bpf_trace.h       |   25 +++++++
 include/trace/ftrace.h          |   31 +++++++++
 include/uapi/linux/bpf.h        |    7 ++
 include/uapi/linux/perf_event.h |    1 +
 kernel/events/core.c            |   55 +++++++++++++++
 kernel/trace/Makefile           |    1 +
 kernel/trace/bpf_trace.c        |  145 +++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace_syscalls.c   |   35 ++++++++++
 10 files changed, 316 insertions(+), 1 deletion(-)
 create mode 100644 include/trace/bpf_trace.h
 create mode 100644 kernel/trace/bpf_trace.c

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index bbfceb756452..a0f6f636ced0 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -130,10 +130,14 @@ struct bpf_prog_aux {
 
 #ifdef CONFIG_BPF_SYSCALL
 void bpf_prog_put(struct bpf_prog *prog);
+struct bpf_prog *bpf_prog_get(u32 ufd);
 #else
 static inline void bpf_prog_put(struct bpf_prog *prog) {}
+static inline struct bpf_prog *bpf_prog_get(u32 ufd)
+{
+	return ERR_PTR(-ENOENT);
+}
 #endif
-struct bpf_prog *bpf_prog_get(u32 ufd);
 /* verify correctness of eBPF program */
 int bpf_check(struct bpf_prog *fp, union bpf_attr *attr);
 
diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 0bebb5c348b8..479d0a4a42b3 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -13,6 +13,7 @@ struct trace_array;
 struct trace_buffer;
 struct tracer;
 struct dentry;
+struct bpf_prog;
 
 struct trace_print_flags {
 	unsigned long		mask;
@@ -299,6 +300,7 @@ struct ftrace_event_call {
 #ifdef CONFIG_PERF_EVENTS
 	int				perf_refcount;
 	struct hlist_head __percpu	*perf_events;
+	struct bpf_prog			*prog;
 
 	int	(*perf_perm)(struct ftrace_event_call *,
 			     struct perf_event *);
@@ -544,6 +546,15 @@ event_trigger_unlock_commit_regs(struct ftrace_event_file *file,
 		event_triggers_post_call(file, tt);
 }
 
+#ifdef CONFIG_BPF_SYSCALL
+unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx);
+#else
+static inline unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
+{
+	return 1;
+}
+#endif
+
 enum {
 	FILTER_OTHER = 0,
 	FILTER_STATIC_STRING,
diff --git a/include/trace/bpf_trace.h b/include/trace/bpf_trace.h
new file mode 100644
index 000000000000..4e64f61f484d
--- /dev/null
+++ b/include/trace/bpf_trace.h
@@ -0,0 +1,25 @@
+/* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#ifndef _LINUX_KERNEL_BPF_TRACE_H
+#define _LINUX_KERNEL_BPF_TRACE_H
+
+/* For tracepoint filters argN fields match one to one to arguments
+ * passed to tracepoint events
+ *
+ * For syscall entry filters argN fields match syscall arguments
+ * For syscall exit filters arg1 is a return value
+ */
+struct bpf_context {
+	u64 arg1;
+	u64 arg2;
+	u64 arg3;
+	u64 arg4;
+	u64 arg5;
+	u64 arg6;
+};
+
+#endif /* _LINUX_KERNEL_BPF_TRACE_H */
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 139b5067345b..4c275ce2dcf0 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -17,6 +17,7 @@
  */
 
 #include <linux/ftrace_event.h>
+#include <trace/bpf_trace.h>
 
 /*
  * DECLARE_EVENT_CLASS can be used to add a generic function
@@ -755,12 +756,32 @@ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call
 #undef __perf_task
 #define __perf_task(t)	(__task = (t))
 
+/* zero extend integer, pointer or aggregate type to u64 without warnings */
+#define __CAST_TO_U64(EXPR) ({ \
+	u64 ret = 0; \
+	typeof(EXPR) expr = EXPR; \
+	switch (sizeof(expr)) { \
+	case 8: ret = *(u64 *) &expr; break; \
+	case 4: ret = *(u32 *) &expr; break; \
+	case 2: ret = *(u16 *) &expr; break; \
+	case 1: ret = *(u8 *) &expr; break; \
+	} \
+	ret; })
+
+#define __BPF_CAST1(a,...) __CAST_TO_U64(a)
+#define __BPF_CAST2(a,...) __CAST_TO_U64(a), __BPF_CAST1(__VA_ARGS__)
+#define __BPF_CAST3(a,...) __CAST_TO_U64(a), __BPF_CAST2(__VA_ARGS__)
+#define __BPF_CAST4(a,...) __CAST_TO_U64(a), __BPF_CAST3(__VA_ARGS__)
+#define __BPF_CAST5(a,...) __CAST_TO_U64(a), __BPF_CAST4(__VA_ARGS__)
+#define __BPF_CAST6(a,...) __CAST_TO_U64(a), __BPF_CAST5(__VA_ARGS__)
+
 #undef DECLARE_EVENT_CLASS
 #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print)	\
 static notrace void							\
 perf_trace_##call(void *__data, proto)					\
 {									\
 	struct ftrace_event_call *event_call = __data;			\
+	struct bpf_prog *prog = event_call->prog;			\
 	struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\
 	struct ftrace_raw_##call *entry;				\
 	struct pt_regs __regs;						\
@@ -771,6 +792,16 @@ perf_trace_##call(void *__data, proto)					\
 	int __data_size;						\
 	int rctx;							\
 									\
+	if (prog) {							\
+		__maybe_unused const u64 z = 0;				\
+		struct bpf_context __ctx = ((struct bpf_context) {	\
+				__BPF_CAST6(args, z, z, z, z, z)	\
+			});						\
+									\
+		if (!trace_call_bpf(prog, &__ctx))			\
+			return;						\
+	}								\
+									\
 	__data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
 									\
 	head = this_cpu_ptr(event_call->perf_events);			\
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 45da7ec7d274..d73d7d0abe6e 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -118,6 +118,7 @@ enum bpf_map_type {
 enum bpf_prog_type {
 	BPF_PROG_TYPE_UNSPEC,
 	BPF_PROG_TYPE_SOCKET_FILTER,
+	BPF_PROG_TYPE_TRACEPOINT,
 };
 
 /* flags for BPF_MAP_UPDATE_ELEM command */
@@ -162,6 +163,12 @@ enum bpf_func_id {
 	BPF_FUNC_map_lookup_elem, /* void *map_lookup_elem(&map, &key) */
 	BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, flags) */
 	BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */
+	BPF_FUNC_fetch_ptr,       /* void *bpf_fetch_ptr(void *unsafe_ptr) */
+	BPF_FUNC_fetch_u64,       /* u64 bpf_fetch_u64(void *unsafe_ptr) */
+	BPF_FUNC_fetch_u32,       /* u32 bpf_fetch_u32(void *unsafe_ptr) */
+	BPF_FUNC_fetch_u16,       /* u16 bpf_fetch_u16(void *unsafe_ptr) */
+	BPF_FUNC_fetch_u8,        /* u8 bpf_fetch_u8(void *unsafe_ptr) */
+	BPF_FUNC_probe_memcmp,    /* int bpf_probe_memcmp(unsafe_ptr, safe_ptr, size) */
 	__BPF_FUNC_MAX_ID,
 };
 
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 9b79abbd1ab8..d7ba67234761 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -360,6 +360,7 @@ struct perf_event_attr {
 #define PERF_EVENT_IOC_SET_OUTPUT	_IO ('$', 5)
 #define PERF_EVENT_IOC_SET_FILTER	_IOW('$', 6, char *)
 #define PERF_EVENT_IOC_ID		_IOR('$', 7, __u64 *)
+#define PERF_EVENT_IOC_SET_BPF		_IOW('$', 8, __u32)
 
 enum perf_event_ioc_flags {
 	PERF_IOC_FLAG_GROUP		= 1U << 0,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 882f835a0d85..674a8ca17190 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -42,6 +42,8 @@
 #include <linux/module.h>
 #include <linux/mman.h>
 #include <linux/compat.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
 
 #include "internal.h"
 
@@ -3283,6 +3285,7 @@ errout:
 }
 
 static void perf_event_free_filter(struct perf_event *event);
+static void perf_event_free_bpf_prog(struct perf_event *event);
 
 static void free_event_rcu(struct rcu_head *head)
 {
@@ -3292,6 +3295,7 @@ static void free_event_rcu(struct rcu_head *head)
 	if (event->ns)
 		put_pid_ns(event->ns);
 	perf_event_free_filter(event);
+	perf_event_free_bpf_prog(event);
 	kfree(event);
 }
 
@@ -3795,6 +3799,7 @@ static inline int perf_fget_light(int fd, struct fd *p)
 static int perf_event_set_output(struct perf_event *event,
 				 struct perf_event *output_event);
 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
+static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
 
 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
@@ -3849,6 +3854,9 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case PERF_EVENT_IOC_SET_FILTER:
 		return perf_event_set_filter(event, (void __user *)arg);
 
+	case PERF_EVENT_IOC_SET_BPF:
+		return perf_event_set_bpf_prog(event, arg);
+
 	default:
 		return -ENOTTY;
 	}
@@ -6266,6 +6274,45 @@ static void perf_event_free_filter(struct perf_event *event)
 	ftrace_profile_free_filter(event);
 }
 
+static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
+{
+	struct bpf_prog *prog;
+
+	if (event->attr.type != PERF_TYPE_TRACEPOINT)
+		return -EINVAL;
+
+	if (event->tp_event->prog)
+		return -EEXIST;
+
+	prog = bpf_prog_get(prog_fd);
+	if (IS_ERR(prog))
+		return PTR_ERR(prog);
+
+	if (prog->aux->prog_type != BPF_PROG_TYPE_TRACEPOINT) {
+		/* valid fd, but invalid bpf program type */
+		bpf_prog_put(prog);
+		return -EINVAL;
+	}
+
+	event->tp_event->prog = prog;
+
+	return 0;
+}
+
+static void perf_event_free_bpf_prog(struct perf_event *event)
+{
+	struct bpf_prog *prog;
+
+	if (!event->tp_event)
+		return;
+
+	prog = event->tp_event->prog;
+	if (prog) {
+		event->tp_event->prog = NULL;
+		bpf_prog_put(prog);
+	}
+}
+
 #else
 
 static inline void perf_tp_register(void)
@@ -6281,6 +6328,14 @@ static void perf_event_free_filter(struct perf_event *event)
 {
 }
 
+static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
+{
+	return -ENOENT;
+}
+
+static void perf_event_free_bpf_prog(struct perf_event *event)
+{
+}
 #endif /* CONFIG_EVENT_TRACING */
 
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 979ccde26720..54ae225e5fc6 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -53,6 +53,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
 endif
 obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
 obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o
+obj-$(CONFIG_BPF_SYSCALL) += bpf_trace.o
 obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
 obj-$(CONFIG_TRACEPOINTS) += power-traces.o
 ifeq ($(CONFIG_PM),y)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
new file mode 100644
index 000000000000..ec065e0a364e
--- /dev/null
+++ b/kernel/trace/bpf_trace.c
@@ -0,0 +1,145 @@
+/* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/uaccess.h>
+#include <trace/bpf_trace.h>
+#include "trace.h"
+
+unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
+{
+	unsigned int ret;
+
+	if (in_nmi()) /* not supported yet */
+		return 1;
+
+	rcu_read_lock();
+	ret = BPF_PROG_RUN(prog, ctx);
+	rcu_read_unlock();
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(trace_call_bpf);
+
+static u64 bpf_fetch_ptr(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	void *unsafe_ptr = (void *) (long) r1;
+	void *ptr = NULL;
+
+	probe_kernel_read(&ptr, unsafe_ptr, sizeof(ptr));
+	return (u64) (unsigned long) ptr;
+}
+
+#define FETCH(SIZE) \
+static u64 bpf_fetch_##SIZE(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)	\
+{									\
+	void *unsafe_ptr = (void *) (long) r1;				\
+	SIZE val = 0;							\
+									\
+	probe_kernel_read(&val, unsafe_ptr, sizeof(val));		\
+	return (u64) (SIZE) val;					\
+}
+FETCH(u64)
+FETCH(u32)
+FETCH(u16)
+FETCH(u8)
+#undef FETCH
+
+static u64 bpf_probe_memcmp(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	void *unsafe_ptr = (void *) (long) r1;
+	void *safe_ptr = (void *) (long) r2;
+	u32 size = (u32) r3;
+	char buf[64];
+	int err;
+
+	if (size < 64) {
+		err = probe_kernel_read(buf, unsafe_ptr, size);
+		if (err)
+			return err;
+		return memcmp(buf, safe_ptr, size);
+	}
+	return -1;
+}
+
+static struct bpf_func_proto tp_prog_funcs[] = {
+#define FETCH(SIZE)				\
+	[BPF_FUNC_fetch_##SIZE] = {		\
+		.func = bpf_fetch_##SIZE,	\
+		.gpl_only = true,		\
+		.ret_type = RET_INTEGER,	\
+	},
+	FETCH(ptr)
+	FETCH(u64)
+	FETCH(u32)
+	FETCH(u16)
+	FETCH(u8)
+#undef FETCH
+	[BPF_FUNC_probe_memcmp] = {
+		.func = bpf_probe_memcmp,
+		.gpl_only = false,
+		.ret_type = RET_INTEGER,
+		.arg1_type = ARG_ANYTHING,
+		.arg2_type = ARG_PTR_TO_STACK,
+		.arg3_type = ARG_CONST_STACK_SIZE,
+	},
+};
+
+static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
+{
+	switch (func_id) {
+	case BPF_FUNC_map_lookup_elem:
+		return &bpf_map_lookup_elem_proto;
+	case BPF_FUNC_map_update_elem:
+		return &bpf_map_update_elem_proto;
+	case BPF_FUNC_map_delete_elem:
+		return &bpf_map_delete_elem_proto;
+	default:
+		if (func_id < 0 || func_id >= ARRAY_SIZE(tp_prog_funcs))
+			return NULL;
+		return &tp_prog_funcs[func_id];
+	}
+}
+
+/* check access to argN fields of 'struct bpf_context' from program */
+static bool tp_prog_is_valid_access(int off, int size,
+				    enum bpf_access_type type)
+{
+	/* check bounds */
+	if (off < 0 || off >= sizeof(struct bpf_context))
+		return false;
+
+	/* only read is allowed */
+	if (type != BPF_READ)
+		return false;
+
+	/* disallow misaligned access */
+	if (off % size != 0)
+		return false;
+
+	return true;
+}
+
+static struct bpf_verifier_ops tp_prog_ops = {
+	.get_func_proto = tp_prog_func_proto,
+	.is_valid_access = tp_prog_is_valid_access,
+};
+
+static struct bpf_prog_type_list tl = {
+	.ops = &tp_prog_ops,
+	.type = BPF_PROG_TYPE_TRACEPOINT,
+};
+
+static int __init register_tp_prog_ops(void)
+{
+	bpf_register_prog_type(&tl);
+	return 0;
+}
+late_initcall(register_tp_prog_ops);
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index c6ee36fcbf90..3487c41f4c0e 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -7,6 +7,7 @@
 #include <linux/ftrace.h>
 #include <linux/perf_event.h>
 #include <asm/syscall.h>
+#include <trace/bpf_trace.h>
 
 #include "trace_output.h"
 #include "trace.h"
@@ -545,11 +546,26 @@ static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
 static int sys_perf_refcount_enter;
 static int sys_perf_refcount_exit;
 
+static void populate_bpf_ctx(struct bpf_context *ctx, struct pt_regs *regs)
+{
+	struct task_struct *task = current;
+	unsigned long args[6];
+
+	syscall_get_arguments(task, regs, 0, 6, args);
+	ctx->arg1 = args[0];
+	ctx->arg2 = args[1];
+	ctx->arg3 = args[2];
+	ctx->arg4 = args[3];
+	ctx->arg5 = args[4];
+	ctx->arg6 = args[5];
+}
+
 static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 {
 	struct syscall_metadata *sys_data;
 	struct syscall_trace_enter *rec;
 	struct hlist_head *head;
+	struct bpf_prog *prog;
 	int syscall_nr;
 	int rctx;
 	int size;
@@ -564,6 +580,15 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 	if (!sys_data)
 		return;
 
+	prog = sys_data->enter_event->prog;
+	if (prog) {
+		struct bpf_context ctx;
+
+		populate_bpf_ctx(&ctx, regs);
+		if (!trace_call_bpf(prog, &ctx))
+			return;
+	}
+
 	head = this_cpu_ptr(sys_data->enter_event->perf_events);
 	if (hlist_empty(head))
 		return;
@@ -624,6 +649,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 	struct syscall_metadata *sys_data;
 	struct syscall_trace_exit *rec;
 	struct hlist_head *head;
+	struct bpf_prog *prog;
 	int syscall_nr;
 	int rctx;
 	int size;
@@ -638,6 +664,15 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 	if (!sys_data)
 		return;
 
+	prog = sys_data->exit_event->prog;
+	if (prog) {
+		struct bpf_context ctx = {};
+
+		ctx.arg1 = syscall_get_return_value(current, regs);
+		if (!trace_call_bpf(prog, &ctx))
+			return;
+	}
+
 	head = this_cpu_ptr(sys_data->exit_event->perf_events);
 	if (hlist_empty(head))
 		return;
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v3 linux-trace 0/8] tracing: attach eBPF programs to tracepoints/syscalls/kprobe
From: Alexei Starovoitov @ 2015-02-10  3:45 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Ingo Molnar, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, linux-api-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA

Hi Steven,

This patch set is for linux-trace/for-next
It adds ability to attach eBPF programs to tracepoints, syscalls and kprobes.
Obviously too late for 3.20, but please review. I'll rebase and repost when
merge window closes.

Main difference in V3 is different attaching mechanism:
- load program via bpf() syscall and receive prog_fd
- event_fd = perf_event_open()
- ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd) to attach program to event
- close(event_fd) will destroy event and detach the program
kernel diff became smaller and in general this approach is cleaner
(thanks to Masami and Namhyung for suggesting it)

The programs are run before ring buffer is allocated to have minimal
impact on a system, which can be demonstrated by
'dd if=/dev/zero of=/dev/null count=20000000' test:
4.80074 s, 2.1 GB/s - no tracing (raw base line)
5.62705 s, 1.8 GB/s - attached bpf program does 'map[log2(count)]++' without JIT
5.05963 s, 2.0 GB/s - attached bpf program does 'map[log2(count)]++' with JIT
4.91715 s, 2.1 GB/s - attached bpf program does 'return 0'

perf record -e skb:sys_write dd if=/dev/zero of=/dev/null count=20000000
8.75686 s, 1.2 GB/s
Warning: Processed 20355236 events and lost 44 chunks!

perf record -e skb:sys_write --filter cnt==1234 dd if=/dev/zero of=/dev/null count=20000000
5.69732 s, 1.8 GB/s

6.13730 s, 1.7 GB/s - echo 1 > /sys/../events/skb/sys_write/enable
6.50091 s, 1.6 GB/s - echo 'cnt == 1234' > /sys/../events/skb/sys_write/filter

(skb:sys_write is a temporary tracepoint in write() syscall)

So the overhead of realistic bpf program is 5.05963/4.80074 = ~5%
which is faster than perf_event filtering: 5.69732/4.80074 = ~18%
or ftrace filtering: 6.50091/4.80074 = ~35%

V2->V3:
- changed program attach interface from tracefs into perf_event ioctl
- rewrote user space helpers to use perf_events
- rewrote tracex1 example to use mmap-ed ring_buffer instead of trace_pipe
- as suggested by Arnaldo renamed bpf_memcmp to bpf_probe_memcmp to better
  indicate function logic
- added ifdefs to make bpf check a nop when CONFIG_BPF_SYSCALL is not set

V1->V2:
- dropped bpf_dump_stack() and bpf_printk() helpers
- disabled running programs in_nmi
- other minor cleanups

Program attach point and input arguments:
- programs attached to kprobes receive 'struct pt_regs *' as an input.
  See tracex4_kern.c that demonstrates how users can write a C program like:
  SEC("events/kprobes/sys_write")
  int bpf_prog4(struct pt_regs *regs)
  {
     long write_size = regs->dx; 
     // here the user needs to know the proto of sys_write() from kernel
     // sources and x64 calling convention to know that register $rdx
     // contains 3rd argument to sys_write() which is 'size_t count'

  it's obviously architecture dependent, but allows building sophisticated
  user tools on top, that can see from debug info of vmlinux which variables
  are in which registers or stack locations and fetch it from there.
  'perf probe' can potentially use this hook to generate programs in user space
  and insert them instead of letting kernel parse string during kprobe creation.

- programs attached to tracepoints and syscalls receive 'struct bpf_context *':
  u64 arg1, arg2, ..., arg6;
  for syscalls they match syscall arguments.
  for tracepoints these args match arguments passed to tracepoint.
  For example:
  trace_sched_migrate_task(p, new_cpu); from sched/core.c
  arg1 <- p        which is 'struct task_struct *'
  arg2 <- new_cpu  which is 'unsigned int'
  arg3..arg6 = 0
  the program can use bpf_fetch_u8/16/32/64/ptr() helpers to walk 'task_struct'
  or any other kernel data structures.
  These helpers are using probe_kernel_read() similar to 'perf probe' which is
  not 100% safe in both cases, but good enough.
  To access task_struct's pid inside 'sched_migrate_task' tracepoint
  the program can do:
  struct task_struct *task = (struct task_struct *)ctx->arg1;
  u32 pid = bpf_fetch_u32(&task->pid);
  Since struct layout is kernel configuration specific such programs are not
  portable and require access to kernel headers to be compiled,
  but in this case we don't need debug info.
  llvm with bpf backend will statically compute task->pid offset as a constant
  based on kernel headers only.
  The example of this arbitrary pointer walking is tracex1_kern.c
  which does skb->dev->name == "lo" filtering.

In all cases the programs are called before ring buffer is allocated to
minimize the overhead, since we want to filter huge number of events, but
perf_trace_buf_prepare/submit and argument copy for every event is too costly.

Note, tracepoint/syscall and kprobe programs are two different types:
BPF_PROG_TYPE_TRACEPOINT and BPF_PROG_TYPE_KPROBE,
since they expect different input.
Both use the same set of helper functions:
- map access (lookup/update/delete)
- fetch (probe_kernel_read wrappers)
- probe_memcmp (probe_kernel_read + memcmp)

Portability:
- kprobe programs are architecture dependent and need user scripting
  language like ktap/stap/dtrace/perf that will dynamically generate
  them based on debug info in vmlinux
- tracepoint programs are architecture independent, but if arbitrary pointer
  walking (with fetch() helpers) is used, they need data struct layout to match.
  Debug info is not necessary
- for networking use case we need to access 'struct sk_buff' fields in portable
  way (user space needs to fetch packet length without knowing layout of sk_buff),
  so for some frequently used data structures there will be a way to access them
  efficiently without bpf_fetch* helpers. Once it's ready tracepoint programs
  that access common data structs will be kernel independent.

Program return value:
- programs return 0 to discard an event
- and return non-zero to proceed with event (get ring buffer, copy
  arguments there and pass to user space via mmap-ed area)

Examples:
- dropmon.c - simple kfree_skb() accounting in eBPF assembler, similar
  to dropmon tool
- tracex1_kern.c - does net/netif_receive_skb event filtering
  for skb->dev->name == "lo" condition
  tracex1_user.c - receives PERF_SAMPLE_RAW events into mmap-ed buffer and
  prints them
- tracex2_kern.c - same kfree_skb() accounting like dropmon, but now in C
  plus computes histogram of all write sizes from sys_write syscall
  and prints the histogram in userspace
- tracex3_kern.c - most sophisticated example that computes IO latency
  between block/block_rq_issue and block/block_rq_complete events
  and prints 'heatmap' using gray shades of text terminal.
  Useful to analyze disk performance.
- tracex4_kern.c - computes histogram of write sizes from sys_write syscall
  using kprobe mechanism instead of syscall. Since kprobe is optimized into
  ftrace the overhead of instrumentation is smaller than in example 2.

The user space tools like ktap/dtrace/systemtap/perf that have access
to debug info would probably want to use kprobe attachment point, since kprobe
can be inserted anywhere and all registers are available in the program.
tracepoint attachments are useful without debug info, so standalone tools
like iosnoop will use them.

The main difference vs existing perf_probe/ftrace infra is in kernel aggregation
and conditional walking of arbitrary data structures.

Thanks!

Alexei Starovoitov (8):
  tracing: attach eBPF programs to tracepoints and syscalls
  tracing: allow eBPF programs to call ktime_get_ns()
  samples: bpf: simple tracing example in eBPF assembler
  samples: bpf: simple tracing example in C
  samples: bpf: counting example for kfree_skb tracepoint and write
    syscall
  samples: bpf: IO latency analysis (iosnoop/heatmap)
  tracing: attach eBPF programs to kprobe/kretprobe
  samples: bpf: simple kprobe example

 include/linux/bpf.h             |    6 +-
 include/linux/ftrace_event.h    |   14 +++
 include/trace/bpf_trace.h       |   25 +++++
 include/trace/ftrace.h          |   31 +++++++
 include/uapi/linux/bpf.h        |    9 ++
 include/uapi/linux/perf_event.h |    1 +
 kernel/events/core.c            |   58 ++++++++++++
 kernel/trace/Makefile           |    1 +
 kernel/trace/bpf_trace.c        |  194 +++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace_kprobe.c     |   10 +-
 kernel/trace/trace_syscalls.c   |   35 +++++++
 samples/bpf/Makefile            |   18 ++++
 samples/bpf/bpf_helpers.h       |   14 +++
 samples/bpf/bpf_load.c          |  136 +++++++++++++++++++++++++--
 samples/bpf/bpf_load.h          |   12 +++
 samples/bpf/dropmon.c           |  143 +++++++++++++++++++++++++++++
 samples/bpf/libbpf.c            |    7 ++
 samples/bpf/libbpf.h            |    4 +
 samples/bpf/tracex1_kern.c      |   28 ++++++
 samples/bpf/tracex1_user.c      |   50 ++++++++++
 samples/bpf/tracex2_kern.c      |   71 ++++++++++++++
 samples/bpf/tracex2_user.c      |   95 +++++++++++++++++++
 samples/bpf/tracex3_kern.c      |   98 ++++++++++++++++++++
 samples/bpf/tracex3_user.c      |  152 ++++++++++++++++++++++++++++++
 samples/bpf/tracex4_kern.c      |   36 ++++++++
 samples/bpf/tracex4_user.c      |   83 +++++++++++++++++
 26 files changed, 1321 insertions(+), 10 deletions(-)
 create mode 100644 include/trace/bpf_trace.h
 create mode 100644 kernel/trace/bpf_trace.c
 create mode 100644 samples/bpf/dropmon.c
 create mode 100644 samples/bpf/tracex1_kern.c
 create mode 100644 samples/bpf/tracex1_user.c
 create mode 100644 samples/bpf/tracex2_kern.c
 create mode 100644 samples/bpf/tracex2_user.c
 create mode 100644 samples/bpf/tracex3_kern.c
 create mode 100644 samples/bpf/tracex3_user.c
 create mode 100644 samples/bpf/tracex4_kern.c
 create mode 100644 samples/bpf/tracex4_user.c

-- 
1.7.9.5

^ permalink raw reply

* Re: [GIT PULL] Kselftest updates for 3.20-rc1
From: Michael Ellerman @ 2015-02-10  3:16 UTC (permalink / raw)
  To: Shuah Khan
  Cc: Linus Torvalds, linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <54D958A0.1020404-JPH+aEBZ4P+UEJcrhfAQsw@public.gmane.org>

On Mon, 2015-02-09 at 18:02 -0700, Shuah Khan wrote:
> On 02/09/2015 05:43 PM, Michael Ellerman wrote:
> > On Mon, 2015-02-09 at 17:36 -0700, Shuah Khan wrote:
> >> On 02/09/2015 05:30 PM, Michael Ellerman wrote:
> >>> On Mon, 2015-02-09 at 11:36 -0700, Shuah Khan wrote:
> >>>> Hi Linus,
> >>>>
> >>>> Please pull the following Kselftest updates for 3.20-rc1
> >>>>
> >>>> thanks,
> >>>> -- Shuah
> >>>>
> >>>> The following changes since commit 97bf6af1f928216fd6c5a66e8a57bfa95a659672:
> >>>>
> >>>>   Linux 3.19-rc1 (2014-12-20 17:08:50 -0800)
> >>>>
> >>>> are available in the git repository at:
> >>>>
> >>>>   git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest
> >>>> tags/linux-kselftest-3.20-rc1
> >>>>
> >>>> for you to fetch changes up to 6ddf898c23d62c974e148efd9e509731324a167a:
> >>>>
> >>>>   selftests/exec: Check if the syscall exists and bail if not
> >>>> (2015-02-04 10:17:35 -0700)
> >>>>
> >>>> ----------------------------------------------------------------
> >>>> Kselftest updates for 3.20-rc1
> >>>>
> >>>> This update adds:
> >>>> - Kselftest install target feature
> >>>> - Fix for selftests/exec test
> >>>>
> >>>> ----------------------------------------------------------------
> >>>> Michael Ellerman (1):
> >>>>       selftests/exec: Check if the syscall exists and bail if not
> >>>>
> >>>> Shuah Khan (20):
> >>>>       selftests/breakpoints: add install target to enable test install
> >>>>       selftests/cpu-hotplug: add install target to enable test install
> >>>>       selftests/efivarfs: add install target to enable test install
> >>>>       selftests/firmware: add install target to enable test install
> >>>>       selftests/ftrace: add install target to enable test install
> >>>>       selftests/ipc: add install target to enable test install
> >>>>       selftests/kcmp: add install target to enable test install
> >>>>       selftests/memfd: add install target to enable test install
> >>>>       selftests/memory-hotplug: add install target to enable test install
> >>>>       selftests/mount: add install target to enable test install
> >>>>       selftests/mqueue: add install target to enable test install
> >>>>       selftests/net: add install target to enable test install
> >>>>       selftests/ptrace: add install target to enable test install
> >>>>       selftests/size: add install target to enable test install
> >>>>       selftests/sysctl: add install target to enable test install
> >>>>       selftests/timers: add install target to enable test install
> >>>>       selftests/user: add install target to enable test install
> >>>>       selftests/vm: add install target to enable test install
> >>>>       selftests: add install target to enable test install
> >>>>       kbuild: add a new kselftest_install make target to install selftests
> >>>
> >>>
> >>> I don't understand why you insist on merging this series with the logic copied
> >>> 18 times.
> >>>
> >>> I'm happy to tweak my series that uses an include file, but I don't see the
> >>> point of merging this series first when almost every line will be removed when
> >>> my series goes in.
> >>
> >> Please work on the suggestions I made and rework the patches
> >> and resend. As I mentioned earlier, I want to enable this work
> >> and then make improvements.
> > 
> > Yes I would like install to work too. I'd also like it to work for the powerpc
> > tests you ignored. But I don't want it to involve copying the same logic into
> > every Makefile in the tree.
> 
> Michael,
> 
> powerpc tests aren't ignored. They are in the list to do as
> the next step.

They are ignored by this series, unlike my series.

> > My series was sent over a month ago, with plenty of time for you to merge it
> > instead of this cut-and-paste solution.
> 
> I asked you to re-work the patches based on my suggestions
> and resend. I didn't see any patches from you that addressed
> the comments. I can't merge the patches you sent without
> addressing the comments.

Your comments were "please rebase on my series", and as I explained that is
pointless because my series replaces your series.

cheers

^ permalink raw reply

* Re: [GIT PULL] Kselftest updates for 3.20-rc1
From: Shuah Khan @ 2015-02-10  1:02 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: Linus Torvalds, linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1423529039.19657.6.camel-Gsx/Oe8HsFggBc27wqDAHg@public.gmane.org>

On 02/09/2015 05:43 PM, Michael Ellerman wrote:
> On Mon, 2015-02-09 at 17:36 -0700, Shuah Khan wrote:
>> On 02/09/2015 05:30 PM, Michael Ellerman wrote:
>>> On Mon, 2015-02-09 at 11:36 -0700, Shuah Khan wrote:
>>>> Hi Linus,
>>>>
>>>> Please pull the following Kselftest updates for 3.20-rc1
>>>>
>>>> thanks,
>>>> -- Shuah
>>>>
>>>> The following changes since commit 97bf6af1f928216fd6c5a66e8a57bfa95a659672:
>>>>
>>>>   Linux 3.19-rc1 (2014-12-20 17:08:50 -0800)
>>>>
>>>> are available in the git repository at:
>>>>
>>>>   git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest
>>>> tags/linux-kselftest-3.20-rc1
>>>>
>>>> for you to fetch changes up to 6ddf898c23d62c974e148efd9e509731324a167a:
>>>>
>>>>   selftests/exec: Check if the syscall exists and bail if not
>>>> (2015-02-04 10:17:35 -0700)
>>>>
>>>> ----------------------------------------------------------------
>>>> Kselftest updates for 3.20-rc1
>>>>
>>>> This update adds:
>>>> - Kselftest install target feature
>>>> - Fix for selftests/exec test
>>>>
>>>> ----------------------------------------------------------------
>>>> Michael Ellerman (1):
>>>>       selftests/exec: Check if the syscall exists and bail if not
>>>>
>>>> Shuah Khan (20):
>>>>       selftests/breakpoints: add install target to enable test install
>>>>       selftests/cpu-hotplug: add install target to enable test install
>>>>       selftests/efivarfs: add install target to enable test install
>>>>       selftests/firmware: add install target to enable test install
>>>>       selftests/ftrace: add install target to enable test install
>>>>       selftests/ipc: add install target to enable test install
>>>>       selftests/kcmp: add install target to enable test install
>>>>       selftests/memfd: add install target to enable test install
>>>>       selftests/memory-hotplug: add install target to enable test install
>>>>       selftests/mount: add install target to enable test install
>>>>       selftests/mqueue: add install target to enable test install
>>>>       selftests/net: add install target to enable test install
>>>>       selftests/ptrace: add install target to enable test install
>>>>       selftests/size: add install target to enable test install
>>>>       selftests/sysctl: add install target to enable test install
>>>>       selftests/timers: add install target to enable test install
>>>>       selftests/user: add install target to enable test install
>>>>       selftests/vm: add install target to enable test install
>>>>       selftests: add install target to enable test install
>>>>       kbuild: add a new kselftest_install make target to install selftests
>>>
>>>
>>> I don't understand why you insist on merging this series with the logic copied
>>> 18 times.
>>>
>>> I'm happy to tweak my series that uses an include file, but I don't see the
>>> point of merging this series first when almost every line will be removed when
>>> my series goes in.
>>
>> Please work on the suggestions I made and rework the patches
>> and resend. As I mentioned earlier, I want to enable this work
>> and them make improvements.
> 
> Yes I would like install to work to. I'd also like it to work for the powerpc
> tests you ignored. But I don't want it to involve copying the same logic into
> every Makefile in the tree.

Michael,

powerpc tests aren't ignored. They are in the list to do as
the next step.

> 
> My series was sent over a month ago, with plenty of time for you to merge it
> instead of this cut-and-paste solution.

I asked you to re-work the patches based on my suggestions
and resend. I didn't see any patches from you that addressed
the comments. I can't merge the patches you sent without
addressing the comments.

I want to get this feature implemented in this series as the
first step and then make improvements that are isolated to the
selftests makefile hierarchy. This series I am requesting to be
pulled in has been in progress for a while now and the series is
v4.

Please plan on reworking and resending the patches for the next
release.

thanks,
-- Shuah

-- 
Shuah Khan
Sr. Linux Kernel Developer
Open Source Innovation Group
Samsung Research America (Silicon Valley)
shuahkh-JPH+aEBZ4P+UEJcrhfAQsw@public.gmane.org | (970) 217-8978

^ permalink raw reply

* Re: [GIT PULL] Kselftest updates for 3.20-rc1
From: Michael Ellerman @ 2015-02-10  0:43 UTC (permalink / raw)
  To: Shuah Khan
  Cc: Linus Torvalds, linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <54D95271.7070708-JPH+aEBZ4P+UEJcrhfAQsw@public.gmane.org>

On Mon, 2015-02-09 at 17:36 -0700, Shuah Khan wrote:
> On 02/09/2015 05:30 PM, Michael Ellerman wrote:
> > On Mon, 2015-02-09 at 11:36 -0700, Shuah Khan wrote:
> >> Hi Linus,
> >>
> >> Please pull the following Kselftest updates for 3.20-rc1
> >>
> >> thanks,
> >> -- Shuah
> >>
> >> The following changes since commit 97bf6af1f928216fd6c5a66e8a57bfa95a659672:
> >>
> >>   Linux 3.19-rc1 (2014-12-20 17:08:50 -0800)
> >>
> >> are available in the git repository at:
> >>
> >>   git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest
> >> tags/linux-kselftest-3.20-rc1
> >>
> >> for you to fetch changes up to 6ddf898c23d62c974e148efd9e509731324a167a:
> >>
> >>   selftests/exec: Check if the syscall exists and bail if not
> >> (2015-02-04 10:17:35 -0700)
> >>
> >> ----------------------------------------------------------------
> >> Kselftest updates for 3.20-rc1
> >>
> >> This update adds:
> >> - Kselftest install target feature
> >> - Fix for selftests/exec test
> >>
> >> ----------------------------------------------------------------
> >> Michael Ellerman (1):
> >>       selftests/exec: Check if the syscall exists and bail if not
> >>
> >> Shuah Khan (20):
> >>       selftests/breakpoints: add install target to enable test install
> >>       selftests/cpu-hotplug: add install target to enable test install
> >>       selftests/efivarfs: add install target to enable test install
> >>       selftests/firmware: add install target to enable test install
> >>       selftests/ftrace: add install target to enable test install
> >>       selftests/ipc: add install target to enable test install
> >>       selftests/kcmp: add install target to enable test install
> >>       selftests/memfd: add install target to enable test install
> >>       selftests/memory-hotplug: add install target to enable test install
> >>       selftests/mount: add install target to enable test install
> >>       selftests/mqueue: add install target to enable test install
> >>       selftests/net: add install target to enable test install
> >>       selftests/ptrace: add install target to enable test install
> >>       selftests/size: add install target to enable test install
> >>       selftests/sysctl: add install target to enable test install
> >>       selftests/timers: add install target to enable test install
> >>       selftests/user: add install target to enable test install
> >>       selftests/vm: add install target to enable test install
> >>       selftests: add install target to enable test install
> >>       kbuild: add a new kselftest_install make target to install selftests
> > 
> > 
> > I don't understand why you insist on merging this series with the logic copied
> > 18 times.
> > 
> > I'm happy to tweak my series that uses an include file, but I don't see the
> > point of merging this series first when almost every line will be removed when
> > my series goes in.
> 
> Please work on the suggestions I made and rework the patches
> and resend. As I mentioned earlier, I want to enable this work
> and them make improvements.

Yes, I would like install to work too. I'd also like it to work for the powerpc
tests you ignored. But I don't want it to involve copying the same logic into
every Makefile in the tree.

My series was sent over a month ago, with plenty of time for you to merge it
instead of this cut-and-paste solution.

cheers

^ permalink raw reply

* Re: [GIT PULL] Kselftest updates for 3.20-rc1
From: Shuah Khan @ 2015-02-10  0:36 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: Linus Torvalds, linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1423528239.19657.3.camel-Gsx/Oe8HsFggBc27wqDAHg@public.gmane.org>

On 02/09/2015 05:30 PM, Michael Ellerman wrote:
> On Mon, 2015-02-09 at 11:36 -0700, Shuah Khan wrote:
>> Hi Linus,
>>
>> Please pull the following Kselftest updates for 3.20-rc1
>>
>> thanks,
>> -- Shuah
>>
>> The following changes since commit 97bf6af1f928216fd6c5a66e8a57bfa95a659672:
>>
>>   Linux 3.19-rc1 (2014-12-20 17:08:50 -0800)
>>
>> are available in the git repository at:
>>
>>   git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest
>> tags/linux-kselftest-3.20-rc1
>>
>> for you to fetch changes up to 6ddf898c23d62c974e148efd9e509731324a167a:
>>
>>   selftests/exec: Check if the syscall exists and bail if not
>> (2015-02-04 10:17:35 -0700)
>>
>> ----------------------------------------------------------------
>> Kselftest updates for 3.20-rc1
>>
>> This update adds:
>> - Kselftest install target feature
>> - Fix for selftests/exec test
>>
>> ----------------------------------------------------------------
>> Michael Ellerman (1):
>>       selftests/exec: Check if the syscall exists and bail if not
>>
>> Shuah Khan (20):
>>       selftests/breakpoints: add install target to enable test install
>>       selftests/cpu-hotplug: add install target to enable test install
>>       selftests/efivarfs: add install target to enable test install
>>       selftests/firmware: add install target to enable test install
>>       selftests/ftrace: add install target to enable test install
>>       selftests/ipc: add install target to enable test install
>>       selftests/kcmp: add install target to enable test install
>>       selftests/memfd: add install target to enable test install
>>       selftests/memory-hotplug: add install target to enable test install
>>       selftests/mount: add install target to enable test install
>>       selftests/mqueue: add install target to enable test install
>>       selftests/net: add install target to enable test install
>>       selftests/ptrace: add install target to enable test install
>>       selftests/size: add install target to enable test install
>>       selftests/sysctl: add install target to enable test install
>>       selftests/timers: add install target to enable test install
>>       selftests/user: add install target to enable test install
>>       selftests/vm: add install target to enable test install
>>       selftests: add install target to enable test install
>>       kbuild: add a new kselftest_install make target to install selftests
> 
> 
> I don't understand why you insist on merging this series with the logic copied
> 18 times.
> 
> I'm happy to tweak my series that uses an include file, but I don't see the
> point of merging this series first when almost every line will be removed when
> my series goes in.

Please work on the suggestions I made and rework the patches
and resend. As I mentioned earlier, I want to enable this work
and then make improvements.

thanks,
-- Shuah


-- 
Shuah Khan
Sr. Linux Kernel Developer
Open Source Innovation Group
Samsung Research America (Silicon Valley)
shuahkh-JPH+aEBZ4P+UEJcrhfAQsw@public.gmane.org | (970) 217-8978

^ permalink raw reply

* Re: [GIT PULL] Kselftest updates for 3.20-rc1
From: Michael Ellerman @ 2015-02-10  0:30 UTC (permalink / raw)
  To: Shuah Khan
  Cc: Linus Torvalds, linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <54D8FE1C.5040303-JPH+aEBZ4P+UEJcrhfAQsw@public.gmane.org>

On Mon, 2015-02-09 at 11:36 -0700, Shuah Khan wrote:
> Hi Linus,
> 
> Please pull the following Kselftest updates for 3.20-rc1
> 
> thanks,
> -- Shuah
> 
> The following changes since commit 97bf6af1f928216fd6c5a66e8a57bfa95a659672:
> 
>   Linux 3.19-rc1 (2014-12-20 17:08:50 -0800)
> 
> are available in the git repository at:
> 
>   git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest
> tags/linux-kselftest-3.20-rc1
> 
> for you to fetch changes up to 6ddf898c23d62c974e148efd9e509731324a167a:
> 
>   selftests/exec: Check if the syscall exists and bail if not
> (2015-02-04 10:17:35 -0700)
> 
> ----------------------------------------------------------------
> Kselftest updates for 3.20-rc1
> 
> This update adds:
> - Kselftest install target feature
> - Fix for selftests/exec test
> 
> ----------------------------------------------------------------
> Michael Ellerman (1):
>       selftests/exec: Check if the syscall exists and bail if not
> 
> Shuah Khan (20):
>       selftests/breakpoints: add install target to enable test install
>       selftests/cpu-hotplug: add install target to enable test install
>       selftests/efivarfs: add install target to enable test install
>       selftests/firmware: add install target to enable test install
>       selftests/ftrace: add install target to enable test install
>       selftests/ipc: add install target to enable test install
>       selftests/kcmp: add install target to enable test install
>       selftests/memfd: add install target to enable test install
>       selftests/memory-hotplug: add install target to enable test install
>       selftests/mount: add install target to enable test install
>       selftests/mqueue: add install target to enable test install
>       selftests/net: add install target to enable test install
>       selftests/ptrace: add install target to enable test install
>       selftests/size: add install target to enable test install
>       selftests/sysctl: add install target to enable test install
>       selftests/timers: add install target to enable test install
>       selftests/user: add install target to enable test install
>       selftests/vm: add install target to enable test install
>       selftests: add install target to enable test install
>       kbuild: add a new kselftest_install make target to install selftests


I don't understand why you insist on merging this series with the logic copied
18 times.

I'm happy to tweak my series that uses an include file, but I don't see the
point of merging this series first when almost every line will be removed when
my series goes in.

  https://lkml.org/lkml/2015/1/9/45

cheers

^ permalink raw reply

* Re: [PATCH 2/2] epoll: introduce EPOLLEXCLUSIVE and EPOLLROUNDROBIN
From: Andy Lutomirski @ 2015-02-09 22:45 UTC (permalink / raw)
  To: Jason Baron, Linux API
  Cc: Peter Zijlstra, Ingo Molnar, Al Viro, Andrew Morton, Eric Wong,
	Davide Libenzi, Michael Kerrisk-manpages,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	Linux FS Devel
In-Reply-To: <54D92780.4000303-JqFfY2XvxFXQT0dZR+AlfA@public.gmane.org>

On Mon, Feb 9, 2015 at 1:32 PM, Jason Baron <jbaron-JqFfY2XvxFXQT0dZR+AlfA@public.gmane.org> wrote:
> On 02/09/2015 03:18 PM, Andy Lutomirski wrote:
>> On 02/09/2015 12:06 PM, Jason Baron wrote:
>>> Epoll file descriptors that are added to a shared wakeup source are always
>>> added in a non-exclusive manner. That means that when we have multiple epoll
>>> fds attached to a shared wakeup source they are all woken up. This can
>>> lead to excessive cpu usage and uneven load distribution.
>>>
>>> This patch introduces two new 'events' flags that are intended to be used
>>> with EPOLL_CTL_ADD operations. EPOLLEXCLUSIVE, adds the epoll fd to the event
>>> source in an exclusive manner such that the minimum number of threads are
>>> woken. EPOLLROUNDROBIN, which depends on EPOLLEXCLUSIVE also being set, can
>>> also be added to the 'events' flag, such that we round robin around the set
>>> of waiting threads.
>>>
>>> An implementation note is that in the epoll wakeup routine,
>>> 'ep_poll_callback()', if EPOLLROUNDROBIN is set, we return 1, for a successful
>>> wakeup, only when there are current waiters. The idea is to use this additional
>>> heuristic in order minimize wakeup latencies.
>>
>> I don't understand what this is intended to do.
>>
>> If an event has EPOLLONESHOT, then this only one thread should be woken regardless, right?  If not, isn't that just a bug that should be fixed?
>>
>
> hmm...so with EPOLLONESHOT you basically get notified once about an event. If i have multiple epoll fds (say 1 per-thread) attached to a single source in EPOLLONESHOT, then all threads will potentially get woken up once per event. Then, I would have to re-arm all of them. So I don't think this addresses this particular usecase...what I am trying to avoid is this mass wakeup or thundering herd for a shared event source.

Now I understand.  Why are you using multiple epollfds?

--Andy

>
>> If an event has EPOLLET, then the considerations are similar to EPOLLONESHOT, right?
>>
>
> EPOLLET is still going to cause this thundering herd.
>
>> If an event is a normal level-triggered non-one-shot event, then I don't understand how a round-robin wakeup makes any sense.  It's level-triggered, after all.
>
> Yeah, so the current behavior is to wake up all of the threads. I'm trying to add a new mode where it load balances among the threads interested in the event. Perhaps, the test program I attached to 0/2 will show the issue better?
>
> Also, this originally came up in the context of a single listening socket which was attached to multiple epoll fds each in a separate thread. With the attached patch, I can measure a large decrease in cpu usage and better balancing behavior among the accepting threads.
>
> Thanks,
>
> -Jason



-- 
Andy Lutomirski
AMA Capital Management, LLC

^ permalink raw reply

* Re: [PATCH 2/2] epoll: introduce EPOLLEXCLUSIVE and EPOLLROUNDROBIN
From: Michael Kerrisk @ 2015-02-09 20:27 UTC (permalink / raw)
  To: Jason Baron
  Cc: Peter Zijlstra, Ingo Molnar, Al Viro, Andrew Morton, normalperson,
	Davide Libenzi, Linux Kernel, Linux-Fsdevel, Linux API
In-Reply-To: <68a0ad4a99551ea3bfff89da461bb490d63b0ca8.1423509605.git.jbaron-JqFfY2XvxFXQT0dZR+AlfA@public.gmane.org>

[CC += linux-api-u79uwXL29TY76Z2rM5mHXA@public.gmane.org]


On Mon, Feb 9, 2015 at 9:06 PM, Jason Baron <jbaron-JqFfY2XvxFXQT0dZR+AlfA@public.gmane.org> wrote:
> Epoll file descriptors that are added to a shared wakeup source are always
> added in a non-exclusive manner. That means that when we have multiple epoll
> fds attached to a shared wakeup source they are all woken up. This can
> lead to excessive cpu usage and uneven load distribution.
>
> This patch introduces two new 'events' flags that are intended to be used
> with EPOLL_CTL_ADD operations. EPOLLEXCLUSIVE, adds the epoll fd to the event
> source in an exclusive manner such that the minimum number of threads are
> woken. EPOLLROUNDROBIN, which depends on EPOLLEXCLUSIVE also being set, can
> also be added to the 'events' flag, such that we round robin around the set
> of waiting threads.
>
> An implementation note is that in the epoll wakeup routine,
> 'ep_poll_callback()', if EPOLLROUNDROBIN is set, we return 1, for a successful
> wakeup, only when there are current waiters. The idea is to use this additional
> heuristic in order to minimize wakeup latencies.
>
> Signed-off-by: Jason Baron <jbaron-JqFfY2XvxFXQT0dZR+AlfA@public.gmane.org>
> ---
>  fs/eventpoll.c                 | 25 ++++++++++++++++++++-----
>  include/uapi/linux/eventpoll.h |  6 ++++++
>  2 files changed, 26 insertions(+), 5 deletions(-)
>
> diff --git a/fs/eventpoll.c b/fs/eventpoll.c
> index d77f944..382c832 100644
> --- a/fs/eventpoll.c
> +++ b/fs/eventpoll.c
> @@ -92,7 +92,8 @@
>   */
>
>  /* Epoll private bits inside the event mask */
> -#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET)
> +#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | \
> +                        EPOLLEXCLUSIVE | EPOLLROUNDROBIN)
>
>  /* Maximum number of nesting allowed inside epoll sets */
>  #define EP_MAX_NESTS 4
> @@ -1002,6 +1003,7 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
>         unsigned long flags;
>         struct epitem *epi = ep_item_from_wait(wait);
>         struct eventpoll *ep = epi->ep;
> +       int ewake = 0;
>
>         if ((unsigned long)key & POLLFREE) {
>                 ep_pwq_from_wait(wait)->whead = NULL;
> @@ -1066,8 +1068,10 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
>          * Wake up ( if active ) both the eventpoll wait list and the ->poll()
>          * wait list.
>          */
> -       if (waitqueue_active(&ep->wq))
> +       if (waitqueue_active(&ep->wq)) {
> +               ewake = 1;
>                 wake_up_locked(&ep->wq);
> +       }
>         if (waitqueue_active(&ep->poll_wait))
>                 pwake++;
>
> @@ -1078,6 +1082,8 @@ out_unlock:
>         if (pwake)
>                 ep_poll_safewake(&ep->poll_wait);
>
> +       if (epi->event.events & EPOLLROUNDROBIN)
> +               return ewake;
>         return 1;
>  }
>
> @@ -1095,7 +1101,12 @@ static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
>                 init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
>                 pwq->whead = whead;
>                 pwq->base = epi;
> -               add_wait_queue(whead, &pwq->wait);
> +               if (epi->event.events & EPOLLROUNDROBIN)
> +                       add_wait_queue_rr(whead, &pwq->wait);
> +               else if (epi->event.events & EPOLLEXCLUSIVE)
> +                       add_wait_queue_exclusive(whead, &pwq->wait);
> +               else
> +                       add_wait_queue(whead, &pwq->wait);
>                 list_add_tail(&pwq->llink, &epi->pwqlist);
>                 epi->nwait++;
>         } else {
> @@ -1820,8 +1831,7 @@ SYSCALL_DEFINE1(epoll_create, int, size)
>  SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
>                 struct epoll_event __user *, event)
>  {
> -       int error;
> -       int full_check = 0;
> +       int error, full_check = 0, wait_flags = 0;
>         struct fd f, tf;
>         struct eventpoll *ep;
>         struct epitem *epi;
> @@ -1861,6 +1871,11 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
>         if (f.file == tf.file || !is_file_epoll(f.file))
>                 goto error_tgt_fput;
>
> +       wait_flags = epds.events & (EPOLLEXCLUSIVE | EPOLLROUNDROBIN);
> +       if (wait_flags && ((op == EPOLL_CTL_MOD) || ((op == EPOLL_CTL_ADD) &&
> +           ((wait_flags == EPOLLROUNDROBIN) || (is_file_epoll(tf.file))))))
> +               goto error_tgt_fput;
> +
>         /*
>          * At this point it is safe to assume that the "private_data" contains
>          * our own data structure.
> diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h
> index bc81fb2..10260a1 100644
> --- a/include/uapi/linux/eventpoll.h
> +++ b/include/uapi/linux/eventpoll.h
> @@ -26,6 +26,12 @@
>  #define EPOLL_CTL_DEL 2
>  #define EPOLL_CTL_MOD 3
>
> +/* Balance wakeups for a shared event source */
> +#define EPOLLROUNDROBIN (1 << 27)
> +
> +/* Add exclusively */
> +#define EPOLLEXCLUSIVE (1 << 28)
> +
>  /*
>   * Request the handling of system wakeup events so as to prevent system suspends
>   * from happening while those events are being processed.
> --
> 1.8.2.rc2
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html



-- 
Michael Kerrisk Linux man-pages maintainer;
http://www.kernel.org/doc/man-pages/
Author of "The Linux Programming Interface", http://blog.man7.org/

^ permalink raw reply

* Re: [PATCH 1/2] sched/wait: add round robin wakeup mode
From: Michael Kerrisk @ 2015-02-09 20:26 UTC (permalink / raw)
  To: Jason Baron
  Cc: Peter Zijlstra, Ingo Molnar, Al Viro, Andrew Morton, normalperson,
	Davide Libenzi, Linux Kernel, Linux-Fsdevel, Linux API
In-Reply-To: <f382714bd4ba5df3589e2e2e8bac114cdd4d7bb3.1423509605.git.jbaron@akamai.com>

[CC += linux-api@vger.kernel.org]


On Mon, Feb 9, 2015 at 9:05 PM, Jason Baron <jbaron@akamai.com> wrote:
> The motivation for this flag is to allow the distribution of wakeups from
> a shared source in a balanced manner. Currently, we can add threads exclusively
> but that often results in the same thread woken up again and again. In the case
> where we are trying to balance work across threads this is not desirable.
>
> The WQ_FLAG_ROUND_ROBIN is restricted to being exclusive as well, otherwise we
> do not know who is being woken up.
>
> Signed-off-by: Jason Baron <jbaron@akamai.com>
> ---
>  include/linux/wait.h | 11 +++++++++++
>  kernel/sched/wait.c  |  5 ++++-
>  2 files changed, 15 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/wait.h b/include/linux/wait.h
> index 2232ed1..bbdef98 100644
> --- a/include/linux/wait.h
> +++ b/include/linux/wait.h
> @@ -16,6 +16,7 @@ int default_wake_function(wait_queue_t *wait, unsigned mode, int flags, void *ke
>  /* __wait_queue::flags */
>  #define WQ_FLAG_EXCLUSIVE      0x01
>  #define WQ_FLAG_WOKEN          0x02
> +#define WQ_FLAG_ROUND_ROBIN    0x04
>
>  struct __wait_queue {
>         unsigned int            flags;
> @@ -109,6 +110,16 @@ static inline int waitqueue_active(wait_queue_head_t *q)
>
>  extern void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait);
>  extern void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait);
> +
> +/*
> + * rr relies on exclusive, otherwise we don't know which entry was woken
> + */
> +static inline void add_wait_queue_rr(wait_queue_head_t *q, wait_queue_t *wait)
> +{
> +       wait->flags |= WQ_FLAG_ROUND_ROBIN;
> +       add_wait_queue_exclusive(q, wait);
> +}
> +
>  extern void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait);
>
>  static inline void __add_wait_queue(wait_queue_head_t *head, wait_queue_t *new)
> diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
> index 852143a..17d1039 100644
> --- a/kernel/sched/wait.c
> +++ b/kernel/sched/wait.c
> @@ -71,8 +71,11 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
>                 unsigned flags = curr->flags;
>
>                 if (curr->func(curr, mode, wake_flags, key) &&
> -                               (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
> +                              (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) {
> +                       if (flags & WQ_FLAG_ROUND_ROBIN)
> +                               list_move_tail(&curr->task_list, &q->task_list);
>                         break;
> +               }
>         }
>  }
>
> --
> 1.8.2.rc2
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html



-- 
Michael Kerrisk Linux man-pages maintainer;
http://www.kernel.org/doc/man-pages/
Author of "The Linux Programming Interface", http://blog.man7.org/

^ permalink raw reply

* Re: [PATCH 0/2] Add epoll round robin wakeup mode
From: Michael Kerrisk @ 2015-02-09 20:25 UTC (permalink / raw)
  To: Jason Baron
  Cc: Peter Zijlstra, Ingo Molnar, Al Viro, Andrew Morton, normalperson,
	Davide Libenzi, Linux Kernel, Linux-Fsdevel, Linux API
In-Reply-To: <cover.1423509605.git.jbaron@akamai.com>

[CC += linux-api@vger.kernel.org]

Jason,

Since this is a kernel-user-space API change, please CC linux-api@.
The kernel source file Documentation/SubmitChecklist notes that all
Linux kernel patches that change userspace interfaces should be CCed
to linux-api@vger.kernel.org, so that the various parties who are
interested in API changes are informed. For further information, see
https://www.kernel.org/doc/man-pages/linux-api-ml.html


Thanks,

Michael


On Mon, Feb 9, 2015 at 9:05 PM, Jason Baron <jbaron@akamai.com> wrote:
> Hi,
>
> When we are sharing a wakeup source among multiple epoll fds, we end up with
> thundering herd wakeups, since there is currently no way to add to the
> wakeup source exclusively. This series introduces 2 new epoll flags,
> EPOLLEXCLUSIVE for adding to a wakeup source exclusively. And EPOLLROUNDROBIN
> which is to be used in conjunction to EPOLLEXCLUSIVE to evenly
> distribute the wakeups. I'm showing perf results from the simple pipe() usecase
> below. But this patch was originally motivated by a desire to improve
> wakeup balance and cpu usage for a shared listen socket().
>
> Perf stat, 3.19.0-rc7+, 4 core, Intel(R) Xeon(R) CPU E3-1265L v3 @ 2.50GHz:
>
> pipe test wake all:
>
>  Performance counter stats for './wake':
>
>       10837.480396      task-clock (msec)         #    1.879 CPUs utilized
>            2047108      context-switches          #    0.189 M/sec
>             214491      cpu-migrations            #    0.020 M/sec
>                247      page-faults               #    0.023 K/sec
>        23655687888      cycles                    #    2.183 GHz
>    <not supported>      stalled-cycles-frontend
>    <not supported>      stalled-cycles-backend
>        11242141621      instructions              #    0.48  insns per cycle
>         2313479486      branches                  #  213.470 M/sec
>           13679036      branch-misses             #    0.59% of all branches
>
>        5.768295821 seconds time elapsed
>
> pipe test wake balanced:
>
>  Performance counter stats for './wake -o':
>
>         291.250312      task-clock (msec)         #    0.094 CPUs utilized
>              40308      context-switches          #    0.138 M/sec
>               1448      cpu-migrations            #    0.005 M/sec
>                248      page-faults               #    0.852 K/sec
>          646407197      cycles                    #    2.219 GHz
>    <not supported>      stalled-cycles-frontend
>    <not supported>      stalled-cycles-backend
>          364256883      instructions              #    0.56  insns per cycle
>           65775397      branches                  #  225.838 M/sec
>             535637      branch-misses             #    0.81% of all branches
>
>        3.086694452 seconds time elapsed
>
> Rough epoll manpage text:
>
> EPOLLEXCLUSIVE
>         Provides exclusive wakeups when attaching multiple epoll fds to a
>         shared wakeup source. Must be specified on an EPOLL_CTL_ADD operation.
>
> EPOLLROUNDROBIN
>         Provides balancing for exclusive wakeups when attaching multiple epoll
>         fds to a shared wakeup source. Must be specified with EPOLLEXCLUSIVE
>         during an EPOLL_CTL_ADD operation.
>
>
> Thanks,
>
> -Jason
>
> #include <unistd.h>
> #include <sys/epoll.h>
> #include <stdio.h>
> #include <stdlib.h>
> #include <pthread.h>
>
> #define NUM_THREADS 100
> #define NUM_EVENTS 20000
> #define EPOLLEXCLUSIVE (1 << 28)
> #define EPOLLBALANCED (1 << 27)
>
> int optimize, exclusive;
> int p[2];
> pthread_t threads[NUM_THREADS];
> int event_count[NUM_THREADS];
>
> struct epoll_event evt = {
>         .events = EPOLLIN
> };
>
> void die(const char *msg) {
>     perror(msg);
>     exit(-1);
> }
>
> void *run_func(void *ptr)
> {
>         int i = 0;
>         int j = 0;
>         int ret;
>         int epfd;
>         char buf[4];
>         int id = *(int *)ptr;
>         int *contents;
>
>         if ((epfd = epoll_create(1)) < 0)
>                 die("create");
>
>         if (optimize)
>                 evt.events |= ((EPOLLBALANCED | EPOLLEXCLUSIVE));
>         else if (exclusive)
>                 evt.events |= EPOLLEXCLUSIVE;
>         ret = epoll_ctl(epfd, EPOLL_CTL_ADD, p[0], &evt);
>         if (ret)
>                 perror("epoll_ctl add error!\n");
>
>         while (1) {
>                 ret = epoll_wait(epfd, &evt, 10000, -1);
>                 ret = read(p[0], buf, sizeof(int));
>                 if (ret == 4)
>                         event_count[id]++;
>         }
> }
>
> int main(int argc, char *argv[])
> {
>         int ret, i, j;
>         int id[NUM_THREADS];
>         int total = 0;
>         int nohit = 0;
>         int extra_wakeups = 0;
>
>         if (argc == 2) {
>                 if (strcmp(argv[1], "-o") == 0)
>                         optimize = 1;
>                 if (strcmp(argv[1], "-e") == 0)
>                         exclusive = 1;
>         }
>
>         if (pipe(p) < 0)
>                 die("pipe");
>
>         for (i = 0; i < NUM_THREADS; i++) {
>                 id[i] = i;
>                 pthread_create(&threads[i], NULL, run_func, &id[i]);
>         }
>
>         for (j = 0; j < NUM_EVENTS; j++) {
>                 write(p[1], p, sizeof(int));
>                 usleep(100);
>         }
>
>         for (i = 0; i < NUM_THREADS; i++) {
>                 pthread_cancel(threads[i]);
>                 printf("joined: %d\n", i);
>                 printf("event count: %d\n", event_count[i]);
>                 total += event_count[i];
>                 if (!event_count[i])
>                         nohit++;
>         }
>
>         printf("total events is: %d\n", total);
>         printf("nohit is: %d\n", nohit);
> }
>
>
> Jason Baron (2):
>   sched/wait: add round robin wakeup mode
>   epoll: introduce EPOLLEXCLUSIVE and EPOLLROUNDROBIN
>
>  fs/eventpoll.c                 | 25 ++++++++++++++++++++-----
>  include/linux/wait.h           | 11 +++++++++++
>  include/uapi/linux/eventpoll.h |  6 ++++++
>  kernel/sched/wait.c            |  5 ++++-
>  4 files changed, 41 insertions(+), 6 deletions(-)
>
> --
> 1.8.2.rc2
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html



-- 
Michael Kerrisk Linux man-pages maintainer;
http://www.kernel.org/doc/man-pages/
Author of "The Linux Programming Interface", http://blog.man7.org/

^ permalink raw reply

* [GIT PULL] Kselftest updates for 3.20-rc1
From: Shuah Khan @ 2015-02-09 18:36 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Shuah Khan, linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA

Hi Linus,

Please pull the following Kselftest updates for 3.20-rc1

thanks,
-- Shuah

The following changes since commit 97bf6af1f928216fd6c5a66e8a57bfa95a659672:

  Linux 3.19-rc1 (2014-12-20 17:08:50 -0800)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest
tags/linux-kselftest-3.20-rc1

for you to fetch changes up to 6ddf898c23d62c974e148efd9e509731324a167a:

  selftests/exec: Check if the syscall exists and bail if not
(2015-02-04 10:17:35 -0700)

----------------------------------------------------------------
Kselftest updates for 3.20-rc1

This update adds:
- Kselftest install target feature
- Fix for selftests/exec test

----------------------------------------------------------------
Michael Ellerman (1):
      selftests/exec: Check if the syscall exists and bail if not

Shuah Khan (20):
      selftests/breakpoints: add install target to enable test install
      selftests/cpu-hotplug: add install target to enable test install
      selftests/efivarfs: add install target to enable test install
      selftests/firmware: add install target to enable test install
      selftests/ftrace: add install target to enable test install
      selftests/ipc: add install target to enable test install
      selftests/kcmp: add install target to enable test install
      selftests/memfd: add install target to enable test install
      selftests/memory-hotplug: add install target to enable test install
      selftests/mount: add install target to enable test install
      selftests/mqueue: add install target to enable test install
      selftests/net: add install target to enable test install
      selftests/ptrace: add install target to enable test install
      selftests/size: add install target to enable test install
      selftests/sysctl: add install target to enable test install
      selftests/timers: add install target to enable test install
      selftests/user: add install target to enable test install
      selftests/vm: add install target to enable test install
      selftests: add install target to enable test install
      kbuild: add a new kselftest_install make target to install selftests

 Makefile                                           | 14 +++++-
 tools/testing/selftests/Makefile                   | 54
+++++++++++++++++++++-
 tools/testing/selftests/breakpoints/Makefile       | 19 +++++++-
 tools/testing/selftests/cpu-hotplug/Makefile       | 14 +++++-
 .../{on-off-test.sh => cpu-on-off-test.sh}         |  0
 tools/testing/selftests/efivarfs/Makefile          | 16 ++++++-
 tools/testing/selftests/exec/execveat.c            | 10 +++-
 tools/testing/selftests/firmware/Makefile          | 43 ++++++++++-------
 tools/testing/selftests/ftrace/Makefile            | 13 +++++-
 tools/testing/selftests/ipc/Makefile               | 19 +++++++-
 tools/testing/selftests/kcmp/Makefile              | 13 +++++-
 tools/testing/selftests/memfd/Makefile             | 17 +++++--
 tools/testing/selftests/memory-hotplug/Makefile    | 14 +++++-
 .../{on-off-test.sh => mem-on-off-test.sh}         |  0
 tools/testing/selftests/mount/Makefile             | 12 ++++-
 tools/testing/selftests/mqueue/Makefile            | 18 ++++++--
 tools/testing/selftests/net/Makefile               | 20 ++++++--
 tools/testing/selftests/ptrace/Makefile            | 16 +++++--
 tools/testing/selftests/size/Makefile              | 12 ++++-
 tools/testing/selftests/sysctl/Makefile            | 17 ++++++-
 tools/testing/selftests/timers/Makefile            | 12 ++++-
 tools/testing/selftests/user/Makefile              | 12 ++++-
 tools/testing/selftests/vm/Makefile                | 11 ++++-
 23 files changed, 326 insertions(+), 50 deletions(-)
 rename tools/testing/selftests/cpu-hotplug/{on-off-test.sh =>
cpu-on-off-test.sh} (100%)
 rename tools/testing/selftests/memory-hotplug/{on-off-test.sh =>
mem-on-off-test.sh} (100%)

-- 
Shuah Khan
Sr. Linux Kernel Developer
Open Source Innovation Group
Samsung Research America (Silicon Valley)
shuahkh-JPH+aEBZ4P+UEJcrhfAQsw@public.gmane.org | (970) 217-8978

^ permalink raw reply

* Re: [RFC PATCH] iio: Export userspace IIO headers
From: Lars-Peter Clausen @ 2015-02-09 17:05 UTC (permalink / raw)
  To: Daniel Baluta, jic23-DgEjT+Ai2ygdnm+yROfE0A, knaack.h-Mmb7MZpHnFY,
	pmeerw-jW+XmwGofnusTnJN9+BGXg
  Cc: irina.tirdea-ral2JQCrhuEAvxtiuMwx3w,
	roberta.dobrescu-Re5JQEeQqe8AvxtiuMwx3w,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-iio-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1423500586-26480-1-git-send-email-daniel.baluta-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>

On 02/09/2015 05:49 PM, Daniel Baluta wrote:
> After UAPI header file split [1] all user-kernel interfaces were
> placed under include/uapi/.
>
> This patch moves IIO user specific API from:
> 	* include/linux/iio/events.h => include/uapi/linux/iio/events.h
> 	* include/linux/iio/types.h => include/uapi/linux/iio/types.h
>
> Now there is no need for nasty tricks to compile userspace programs
> (e.g iio_event_monitor). Just installing the kernel headers with
> make headers_install command does the job.
>
> [1] http://lwn.net/Articles/507794/

Thanks for taking care of this, this is something that should have been done a
while ago.

[...]
> index 580ed5b..146cda1 100644
> --- a/include/linux/iio/types.h
> +++ b/include/linux/iio/types.h
> @@ -10,97 +10,5 @@
>   #ifndef _IIO_TYPES_H_
>   #define _IIO_TYPES_H_
>
> -enum iio_chan_type {
> -	IIO_VOLTAGE,
> -	IIO_CURRENT,
> -	IIO_POWER,
> -	IIO_ACCEL,
> -	IIO_ANGL_VEL,
> -	IIO_MAGN,
> -	IIO_LIGHT,
> -	IIO_INTENSITY,
> -	IIO_PROXIMITY,
> -	IIO_TEMP,
> -	IIO_INCLI,
> -	IIO_ROT,
> -	IIO_ANGL,
> -	IIO_TIMESTAMP,
> -	IIO_CAPACITANCE,
> -	IIO_ALTVOLTAGE,
> -	IIO_CCT,
> -	IIO_PRESSURE,
> -	IIO_HUMIDITYRELATIVE,
> -	IIO_ACTIVITY,
> -	IIO_STEPS,
> -	IIO_ENERGY,
> -	IIO_DISTANCE,
> -	IIO_VELOCITY,
> -};
> -
> -enum iio_modifier {
> -	IIO_NO_MOD,
> -	IIO_MOD_X,
> -	IIO_MOD_Y,
> -	IIO_MOD_Z,
> -	IIO_MOD_X_AND_Y,
> -	IIO_MOD_X_AND_Z,
> -	IIO_MOD_Y_AND_Z,
> -	IIO_MOD_X_AND_Y_AND_Z,
> -	IIO_MOD_X_OR_Y,
> -	IIO_MOD_X_OR_Z,
> -	IIO_MOD_Y_OR_Z,
> -	IIO_MOD_X_OR_Y_OR_Z,
> -	IIO_MOD_LIGHT_BOTH,
> -	IIO_MOD_LIGHT_IR,
> -	IIO_MOD_ROOT_SUM_SQUARED_X_Y,
> -	IIO_MOD_SUM_SQUARED_X_Y_Z,
> -	IIO_MOD_LIGHT_CLEAR,
> -	IIO_MOD_LIGHT_RED,
> -	IIO_MOD_LIGHT_GREEN,
> -	IIO_MOD_LIGHT_BLUE,
> -	IIO_MOD_QUATERNION,
> -	IIO_MOD_TEMP_AMBIENT,
> -	IIO_MOD_TEMP_OBJECT,
> -	IIO_MOD_NORTH_MAGN,
> -	IIO_MOD_NORTH_TRUE,
> -	IIO_MOD_NORTH_MAGN_TILT_COMP,
> -	IIO_MOD_NORTH_TRUE_TILT_COMP,
> -	IIO_MOD_RUNNING,
> -	IIO_MOD_JOGGING,
> -	IIO_MOD_WALKING,
> -	IIO_MOD_STILL,
> -	IIO_MOD_ROOT_SUM_SQUARED_X_Y_Z,
> -};
> -
> -enum iio_event_type {
> -	IIO_EV_TYPE_THRESH,
> -	IIO_EV_TYPE_MAG,
> -	IIO_EV_TYPE_ROC,
> -	IIO_EV_TYPE_THRESH_ADAPTIVE,
> -	IIO_EV_TYPE_MAG_ADAPTIVE,
> -	IIO_EV_TYPE_CHANGE,
> -};

I think everything in this file below is not part of the ABI and should not 
be exported to userspace.

> -
> -enum iio_event_info {
> -	IIO_EV_INFO_ENABLE,
> -	IIO_EV_INFO_VALUE,
> -	IIO_EV_INFO_HYSTERESIS,
> -	IIO_EV_INFO_PERIOD,
> -};
> -
> -enum iio_event_direction {
> -	IIO_EV_DIR_EITHER,
> -	IIO_EV_DIR_RISING,
> -	IIO_EV_DIR_FALLING,
> -	IIO_EV_DIR_NONE,
> -};
> -
> -#define IIO_VAL_INT 1
> -#define IIO_VAL_INT_PLUS_MICRO 2
> -#define IIO_VAL_INT_PLUS_NANO 3
> -#define IIO_VAL_INT_PLUS_MICRO_DB 4
> -#define IIO_VAL_INT_MULTIPLE 5
> -#define IIO_VAL_FRACTIONAL 10
> -#define IIO_VAL_FRACTIONAL_LOG2 11
> -
[...]

^ permalink raw reply

* [RFC PATCH] iio: Export userspace IIO headers
From: Daniel Baluta @ 2015-02-09 16:49 UTC (permalink / raw)
  To: jic23-DgEjT+Ai2ygdnm+yROfE0A, knaack.h-Mmb7MZpHnFY,
	lars-Qo5EllUWu/uELgA04lAiVw, pmeerw-jW+XmwGofnusTnJN9+BGXg
  Cc: irina.tirdea-ral2JQCrhuEAvxtiuMwx3w,
	roberta.dobrescu-Re5JQEeQqe8AvxtiuMwx3w,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-iio-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA

After UAPI header file split [1] all user-kernel interfaces were
placed under include/uapi/.

This patch moves IIO user specific API from:
	* include/linux/iio/events.h => include/uapi/linux/iio/events.h
	* include/linux/iio/types.h => include/uapi/linux/iio/types.h

Now there is no need for nasty tricks to compile userspace programs
(e.g iio_event_monitor). Just installing the kernel headers with
make headers_install command does the job.

[1] http://lwn.net/Articles/507794/

Signed-off-by: Daniel Baluta <daniel.baluta-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
---
I am not sure if this is the right thing to do. I am still 
trying to understand the inner workings of make headers_install.

 include/linux/iio/events.h      |  30 +-----------
 include/linux/iio/types.h       |  94 +----------------------------------
 include/uapi/linux/Kbuild       |   1 +
 include/uapi/linux/iio/Kbuild   |   3 ++
 include/uapi/linux/iio/events.h |  43 ++++++++++++++++
 include/uapi/linux/iio/types.h  | 106 ++++++++++++++++++++++++++++++++++++++++
 6 files changed, 155 insertions(+), 122 deletions(-)
 create mode 100644 include/uapi/linux/iio/Kbuild
 create mode 100644 include/uapi/linux/iio/events.h
 create mode 100644 include/uapi/linux/iio/types.h

diff --git a/include/linux/iio/events.h b/include/linux/iio/events.h
index 03fa332..8ad87d1 100644
--- a/include/linux/iio/events.h
+++ b/include/linux/iio/events.h
@@ -9,22 +9,8 @@
 #ifndef _IIO_EVENTS_H_
 #define _IIO_EVENTS_H_
 
-#include <linux/ioctl.h>
-#include <linux/types.h>
 #include <linux/iio/types.h>
-
-/**
- * struct iio_event_data - The actual event being pushed to userspace
- * @id:		event identifier
- * @timestamp:	best estimate of time of event occurrence (often from
- *		the interrupt handler)
- */
-struct iio_event_data {
-	__u64	id;
-	__s64	timestamp;
-};
-
-#define IIO_GET_EVENT_FD_IOCTL _IOR('i', 0x90, int)
+#include <uapi/linux/iio/events.h>
 
 /**
  * IIO_EVENT_CODE() - create event identifier
@@ -70,18 +56,4 @@ struct iio_event_data {
 #define IIO_UNMOD_EVENT_CODE(chan_type, number, type, direction)	\
 	IIO_EVENT_CODE(chan_type, 0, 0, direction, type, number, 0, 0)
 
-#define IIO_EVENT_CODE_EXTRACT_TYPE(mask) ((mask >> 56) & 0xFF)
-
-#define IIO_EVENT_CODE_EXTRACT_DIR(mask) ((mask >> 48) & 0x7F)
-
-#define IIO_EVENT_CODE_EXTRACT_CHAN_TYPE(mask) ((mask >> 32) & 0xFF)
-
-/* Event code number extraction depends on which type of event we have.
- * Perhaps review this function in the future*/
-#define IIO_EVENT_CODE_EXTRACT_CHAN(mask) ((__s16)(mask & 0xFFFF))
-#define IIO_EVENT_CODE_EXTRACT_CHAN2(mask) ((__s16)(((mask) >> 16) & 0xFFFF))
-
-#define IIO_EVENT_CODE_EXTRACT_MODIFIER(mask) ((mask >> 40) & 0xFF)
-#define IIO_EVENT_CODE_EXTRACT_DIFF(mask) (((mask) >> 55) & 0x1)
-
 #endif
diff --git a/include/linux/iio/types.h b/include/linux/iio/types.h
index 580ed5b..146cda1 100644
--- a/include/linux/iio/types.h
+++ b/include/linux/iio/types.h
@@ -10,97 +10,5 @@
 #ifndef _IIO_TYPES_H_
 #define _IIO_TYPES_H_
 
-enum iio_chan_type {
-	IIO_VOLTAGE,
-	IIO_CURRENT,
-	IIO_POWER,
-	IIO_ACCEL,
-	IIO_ANGL_VEL,
-	IIO_MAGN,
-	IIO_LIGHT,
-	IIO_INTENSITY,
-	IIO_PROXIMITY,
-	IIO_TEMP,
-	IIO_INCLI,
-	IIO_ROT,
-	IIO_ANGL,
-	IIO_TIMESTAMP,
-	IIO_CAPACITANCE,
-	IIO_ALTVOLTAGE,
-	IIO_CCT,
-	IIO_PRESSURE,
-	IIO_HUMIDITYRELATIVE,
-	IIO_ACTIVITY,
-	IIO_STEPS,
-	IIO_ENERGY,
-	IIO_DISTANCE,
-	IIO_VELOCITY,
-};
-
-enum iio_modifier {
-	IIO_NO_MOD,
-	IIO_MOD_X,
-	IIO_MOD_Y,
-	IIO_MOD_Z,
-	IIO_MOD_X_AND_Y,
-	IIO_MOD_X_AND_Z,
-	IIO_MOD_Y_AND_Z,
-	IIO_MOD_X_AND_Y_AND_Z,
-	IIO_MOD_X_OR_Y,
-	IIO_MOD_X_OR_Z,
-	IIO_MOD_Y_OR_Z,
-	IIO_MOD_X_OR_Y_OR_Z,
-	IIO_MOD_LIGHT_BOTH,
-	IIO_MOD_LIGHT_IR,
-	IIO_MOD_ROOT_SUM_SQUARED_X_Y,
-	IIO_MOD_SUM_SQUARED_X_Y_Z,
-	IIO_MOD_LIGHT_CLEAR,
-	IIO_MOD_LIGHT_RED,
-	IIO_MOD_LIGHT_GREEN,
-	IIO_MOD_LIGHT_BLUE,
-	IIO_MOD_QUATERNION,
-	IIO_MOD_TEMP_AMBIENT,
-	IIO_MOD_TEMP_OBJECT,
-	IIO_MOD_NORTH_MAGN,
-	IIO_MOD_NORTH_TRUE,
-	IIO_MOD_NORTH_MAGN_TILT_COMP,
-	IIO_MOD_NORTH_TRUE_TILT_COMP,
-	IIO_MOD_RUNNING,
-	IIO_MOD_JOGGING,
-	IIO_MOD_WALKING,
-	IIO_MOD_STILL,
-	IIO_MOD_ROOT_SUM_SQUARED_X_Y_Z,
-};
-
-enum iio_event_type {
-	IIO_EV_TYPE_THRESH,
-	IIO_EV_TYPE_MAG,
-	IIO_EV_TYPE_ROC,
-	IIO_EV_TYPE_THRESH_ADAPTIVE,
-	IIO_EV_TYPE_MAG_ADAPTIVE,
-	IIO_EV_TYPE_CHANGE,
-};
-
-enum iio_event_info {
-	IIO_EV_INFO_ENABLE,
-	IIO_EV_INFO_VALUE,
-	IIO_EV_INFO_HYSTERESIS,
-	IIO_EV_INFO_PERIOD,
-};
-
-enum iio_event_direction {
-	IIO_EV_DIR_EITHER,
-	IIO_EV_DIR_RISING,
-	IIO_EV_DIR_FALLING,
-	IIO_EV_DIR_NONE,
-};
-
-#define IIO_VAL_INT 1
-#define IIO_VAL_INT_PLUS_MICRO 2
-#define IIO_VAL_INT_PLUS_NANO 3
-#define IIO_VAL_INT_PLUS_MICRO_DB 4
-#define IIO_VAL_INT_MULTIPLE 5
-#define IIO_VAL_FRACTIONAL 10
-#define IIO_VAL_FRACTIONAL_LOG2 11
-
+#include <uapi/linux/iio/types.h>
 #endif /* _IIO_TYPES_H_ */
diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index 00b10002..5bfc5bd 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -6,6 +6,7 @@ header-y += caif/
 header-y += dvb/
 header-y += hdlc/
 header-y += hsi/
+header-y += iio/
 header-y += isdn/
 header-y += mmc/
 header-y += nfsd/
diff --git a/include/uapi/linux/iio/Kbuild b/include/uapi/linux/iio/Kbuild
new file mode 100644
index 0000000..86f76d8
--- /dev/null
+++ b/include/uapi/linux/iio/Kbuild
@@ -0,0 +1,3 @@
+# UAPI Header export list
+header-y += events.h
+header-y += types.h
diff --git a/include/uapi/linux/iio/events.h b/include/uapi/linux/iio/events.h
new file mode 100644
index 0000000..4b06477
--- /dev/null
+++ b/include/uapi/linux/iio/events.h
@@ -0,0 +1,43 @@
+/* The industrial I/O - event passing to userspace
+ *
+ * Copyright (c) 2008-2011 Jonathan Cameron
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+#ifndef _UAPI_IIO_EVENTS_H_
+#define _UAPI_IIO_EVENTS_H_
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+/**
+ * struct iio_event_data - The actual event being pushed to userspace
+ * @id:		event identifier
+ * @timestamp:	best estimate of time of event occurrence (often from
+ *		the interrupt handler)
+ */
+struct iio_event_data {
+	__u64	id;
+	__s64	timestamp;
+};
+
+#define IIO_GET_EVENT_FD_IOCTL _IOR('i', 0x90, int)
+
+#define IIO_EVENT_CODE_EXTRACT_TYPE(mask) ((mask >> 56) & 0xFF)
+
+#define IIO_EVENT_CODE_EXTRACT_DIR(mask) ((mask >> 48) & 0x7F)
+
+#define IIO_EVENT_CODE_EXTRACT_CHAN_TYPE(mask) ((mask >> 32) & 0xFF)
+
+/* Event code number extraction depends on which type of event we have.
+ * Perhaps review this function in the future*/
+#define IIO_EVENT_CODE_EXTRACT_CHAN(mask) ((__s16)(mask & 0xFFFF))
+#define IIO_EVENT_CODE_EXTRACT_CHAN2(mask) ((__s16)(((mask) >> 16) & 0xFFFF))
+
+#define IIO_EVENT_CODE_EXTRACT_MODIFIER(mask) ((mask >> 40) & 0xFF)
+#define IIO_EVENT_CODE_EXTRACT_DIFF(mask) (((mask) >> 55) & 0x1)
+
+#endif /* _UAPI_IIO_EVENTS_H_ */
+
diff --git a/include/uapi/linux/iio/types.h b/include/uapi/linux/iio/types.h
new file mode 100644
index 0000000..1aa8dcf
--- /dev/null
+++ b/include/uapi/linux/iio/types.h
@@ -0,0 +1,106 @@
+/* industrial I/O data types needed both in and out of kernel
+ *
+ * Copyright (c) 2008 Jonathan Cameron
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#ifndef _UAPI_IIO_TYPES_H_
+#define _UAPI_IIO_TYPES_H_
+
+enum iio_chan_type {
+	IIO_VOLTAGE,
+	IIO_CURRENT,
+	IIO_POWER,
+	IIO_ACCEL,
+	IIO_ANGL_VEL,
+	IIO_MAGN,
+	IIO_LIGHT,
+	IIO_INTENSITY,
+	IIO_PROXIMITY,
+	IIO_TEMP,
+	IIO_INCLI,
+	IIO_ROT,
+	IIO_ANGL,
+	IIO_TIMESTAMP,
+	IIO_CAPACITANCE,
+	IIO_ALTVOLTAGE,
+	IIO_CCT,
+	IIO_PRESSURE,
+	IIO_HUMIDITYRELATIVE,
+	IIO_ACTIVITY,
+	IIO_STEPS,
+	IIO_ENERGY,
+	IIO_DISTANCE,
+	IIO_VELOCITY,
+};
+
+enum iio_modifier {
+	IIO_NO_MOD,
+	IIO_MOD_X,
+	IIO_MOD_Y,
+	IIO_MOD_Z,
+	IIO_MOD_X_AND_Y,
+	IIO_MOD_X_AND_Z,
+	IIO_MOD_Y_AND_Z,
+	IIO_MOD_X_AND_Y_AND_Z,
+	IIO_MOD_X_OR_Y,
+	IIO_MOD_X_OR_Z,
+	IIO_MOD_Y_OR_Z,
+	IIO_MOD_X_OR_Y_OR_Z,
+	IIO_MOD_LIGHT_BOTH,
+	IIO_MOD_LIGHT_IR,
+	IIO_MOD_ROOT_SUM_SQUARED_X_Y,
+	IIO_MOD_SUM_SQUARED_X_Y_Z,
+	IIO_MOD_LIGHT_CLEAR,
+	IIO_MOD_LIGHT_RED,
+	IIO_MOD_LIGHT_GREEN,
+	IIO_MOD_LIGHT_BLUE,
+	IIO_MOD_QUATERNION,
+	IIO_MOD_TEMP_AMBIENT,
+	IIO_MOD_TEMP_OBJECT,
+	IIO_MOD_NORTH_MAGN,
+	IIO_MOD_NORTH_TRUE,
+	IIO_MOD_NORTH_MAGN_TILT_COMP,
+	IIO_MOD_NORTH_TRUE_TILT_COMP,
+	IIO_MOD_RUNNING,
+	IIO_MOD_JOGGING,
+	IIO_MOD_WALKING,
+	IIO_MOD_STILL,
+	IIO_MOD_ROOT_SUM_SQUARED_X_Y_Z,
+};
+
+enum iio_event_type {
+	IIO_EV_TYPE_THRESH,
+	IIO_EV_TYPE_MAG,
+	IIO_EV_TYPE_ROC,
+	IIO_EV_TYPE_THRESH_ADAPTIVE,
+	IIO_EV_TYPE_MAG_ADAPTIVE,
+	IIO_EV_TYPE_CHANGE,
+};
+
+enum iio_event_info {
+	IIO_EV_INFO_ENABLE,
+	IIO_EV_INFO_VALUE,
+	IIO_EV_INFO_HYSTERESIS,
+	IIO_EV_INFO_PERIOD,
+};
+
+enum iio_event_direction {
+	IIO_EV_DIR_EITHER,
+	IIO_EV_DIR_RISING,
+	IIO_EV_DIR_FALLING,
+	IIO_EV_DIR_NONE,
+};
+
+#define IIO_VAL_INT 1
+#define IIO_VAL_INT_PLUS_MICRO 2
+#define IIO_VAL_INT_PLUS_NANO 3
+#define IIO_VAL_INT_PLUS_MICRO_DB 4
+#define IIO_VAL_INT_MULTIPLE 5
+#define IIO_VAL_FRACTIONAL 10
+#define IIO_VAL_FRACTIONAL_LOG2 11
+
+#endif /* _UAPI_IIO_TYPES_H_ */
-- 
1.9.1

^ permalink raw reply related

* Re: [tpmdd-devel] [PATCH] tpm, tpm_tis: fix TPM 2.0 probing
From: peterhuewe @ 2015-02-09  9:20 UTC (permalink / raw)
  To: Jarkko Sakkinen
  Cc: christophe.ricard, linux-api, Ashley Lai, linux-kernel, josh,
	tpmdd-devel, jason.gunthorpe, trousers-tech
In-Reply-To: <20150209083947.GC29987@intel.com>


[-- Attachment #1.1: Type: text/plain, Size: 72 bytes --]

Ok, good. 
I'll apply it later today.
Peter
-- 
Sent from my mobile.

[-- Attachment #1.2: Type: text/html, Size: 92 bytes --]

[-- Attachment #2: Type: text/plain, Size: 441 bytes --]

------------------------------------------------------------------------------
Dive into the World of Parallel Programming. The Go Parallel Website,
sponsored by Intel and developed in partnership with Slashdot Media, is your
hub for all things parallel software development, from weekly thought
leadership blogs to news, videos, case studies, tutorials and more. Take a
look and join the conversation now. http://goparallel.sourceforge.net/

[-- Attachment #3: Type: text/plain, Size: 170 bytes --]

_______________________________________________
TrouSerS-tech mailing list
TrouSerS-tech@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/trousers-tech

^ permalink raw reply

* Re: MADV_DONTNEED semantics? Was: [RFC PATCH] mm: madvise: Ignore repeated MADV_DONTNEED hints
From: Michael Kerrisk (man-pages) @ 2015-02-09  9:13 UTC (permalink / raw)
  To: Minchan Kim
  Cc: mtk.manpages, Vlastimil Babka, Kirill A. Shutemov, Dave Hansen,
	Mel Gorman, linux-mm@kvack.org, Andrew Morton, lkml, Linux API,
	linux-man, Hugh Dickins
In-Reply-To: <20150209064600.GA32300@blaptop>

Hello Minchan

On 02/09/2015 07:46 AM, Minchan Kim wrote:
> Hello, Michael
> 
> On Fri, Feb 06, 2015 at 04:41:12PM +0100, Michael Kerrisk (man-pages) wrote:
>> On 02/05/2015 02:07 AM, Minchan Kim wrote:
>>> Hello,
>>>
>>> On Wed, Feb 04, 2015 at 08:24:27PM +0100, Michael Kerrisk (man-pages) wrote:
>>>> On 4 February 2015 at 18:02, Vlastimil Babka <vbabka@suse.cz> wrote:
>>>>> On 02/04/2015 03:00 PM, Michael Kerrisk (man-pages) wrote:

[...]

>>> And we should make error section, too.
>>> "locked" covers mlock(2) and you said you will add hugetlb. Then,
>>> VM_PFNMAP? In that case, it fails. How can we say about VM_PFNMAP?
>>> special mapping for some drivers?
>>
>> I'm open for offers on what to add.
> 
> I suggest this quote from LWN, http://lwn.net/Articles/162860/ :
> "*special mapping* which is not made up of "normal" pages.
> It is usually created by device drivers which map special memory areas
> into user space"

Thanks. I've added mention of VM_PFNMAP in the discussion of both 
MADV_DONTNEED and MADV_REMOVE, and noted that both of those
operations will give an error when applied to VM_PFNMAP pages.

Cheers,

Michael


-- 
Michael Kerrisk
Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/
Linux/UNIX System Programming Training: http://man7.org/training/

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH] tpm, tpm_tis: fix TPM 2.0 probing
From: Jarkko Sakkinen @ 2015-02-09  8:39 UTC (permalink / raw)
  To: Peter Hüwe
  Cc: Ashley Lai, Marcel Selhorst, tpmdd-devel, linux-kernel, josh,
	christophe.ricard, jason.gunthorpe, stefanb, linux-api,
	trousers-tech
In-Reply-To: <201502090008.47986.PeterHuewe@gmx.de>

On Mon, Feb 09, 2015 at 12:08:46AM +0100, Peter Hüwe wrote:
> Am Mittwoch, 4. Februar 2015, 15:21:09 schrieb Jarkko Sakkinen:
> > If during transmission system error was returned, the logic was to
> > incorrectly deduce that chip is a TPM 1.x chip. This patch fixes this
> > issue. Also, this patch changes probing so that message tag is used as the
> > measure for TPM 2.x, which should be much more stable.
> Is it aware that some TPMs may respond with 0x00C1 as TAG for TPM1.2 commands?

I guess none of the TPM 1.2 commands answer with the tag 0x8002?

> > A separate function
> > called tpm2_probe() is encapsulated because it can be used with any
> > chipset.
> 
> > 
> > Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
> > ---
> >  drivers/char/tpm/tpm.h      |  3 ++-
> >  drivers/char/tpm/tpm2-cmd.c | 40 +++++++++++++++++++++++++++++++++-------
> >  drivers/char/tpm/tpm_tis.c  | 11 ++++-------
> >  3 files changed, 39 insertions(+), 15 deletions(-)
> > 
> > diff --git a/drivers/char/tpm/tpm.h b/drivers/char/tpm/tpm.h
> > index 7b0727c..a4b0f5e 100644
> > --- a/drivers/char/tpm/tpm.h
> > +++ b/drivers/char/tpm/tpm.h
> > @@ -435,4 +435,5 @@ extern int tpm2_startup(struct tpm_chip *chip, u16
> > startup_type); extern int tpm2_shutdown(struct tpm_chip *chip, u16
> > shutdown_type); extern unsigned long tpm2_calc_ordinal_duration(struct
> > tpm_chip *, u32); extern int tpm2_do_selftest(struct tpm_chip *chip);
> > -extern int tpm2_gen_interrupt(struct tpm_chip *chip, bool quiet);
> > +extern int tpm2_gen_interrupt(struct tpm_chip *chip);
> > +extern int tpm2_probe(struct tpm_chip *chip);
> > diff --git a/drivers/char/tpm/tpm2-cmd.c b/drivers/char/tpm/tpm2-cmd.c
> > index 1abe650..49cd354 100644
> > --- a/drivers/char/tpm/tpm2-cmd.c
> > +++ b/drivers/char/tpm/tpm2-cmd.c
> > @@ -598,20 +598,46 @@ EXPORT_SYMBOL_GPL(tpm2_do_selftest);
> >  /**
> >   * tpm2_gen_interrupt() - generate an interrupt
> >   * @chip: TPM chip to use
> > - * @quiet: surpress the error message
> >   *
> >   * 0 is returned when the operation is successful. If a negative number is
> >   * returned it remarks a POSIX error code. If a positive number is
> > returned * it remarks a TPM error.
> >   */
> > -int tpm2_gen_interrupt(struct tpm_chip *chip, bool quiet)
> > +int tpm2_gen_interrupt(struct tpm_chip *chip)
> >  {
> > -	const char *desc = NULL;
> >  	u32 dummy;
> > 
> > -	if (!quiet)
> > -		desc = "attempting to generate an interrupt";
> > -
> > -	return tpm2_get_tpm_pt(chip, TPM2_CAP_TPM_PROPERTIES, &dummy, desc);
> > +	return tpm2_get_tpm_pt(chip, 0x100, &dummy,
> > +			       "attempting to generate an interrupt");
> Why the change from TPM2_CAP_TPM_PROPERTIES = 6 to 0x100 and what does 0x100 
> stand for?

In TPM 2.0 there are two levels: capabilities and properties. Using
capability ID of "TPM properties" property set was a sloppy mistake
although it didn't matter because an interrupt is still generated.

The properties in the "TPM properties" property set start with the
index 0x100.

> >  }
> >  EXPORT_SYMBOL_GPL(tpm2_gen_interrupt);
> > +
> > +/**
> > + * tpm2_probe() - probe TPM 2.0
> > + * @chip: TPM chip to use
> > + *
> > + * Send idempotent TPM 2.0 command and see whether TPM 2.0 chip replied
> > based on + * the reply tag.
> > + */
> > +int tpm2_probe(struct tpm_chip *chip)
> > +{
> > +	struct tpm2_cmd cmd;
> > +	int rc;
> > +
> > +	cmd.header.in = tpm2_get_tpm_pt_header;
> > +	cmd.params.get_tpm_pt_in.cap_id = cpu_to_be32(TPM2_CAP_TPM_PROPERTIES);
> > +	cmd.params.get_tpm_pt_in.property_id = cpu_to_be32(0x100);
> > +	cmd.params.get_tpm_pt_in.property_cnt = cpu_to_be32(1);
> > +
> > +	rc = tpm_transmit(chip, (const char *) &cmd, sizeof(cmd));
> > +	if (rc <  0)
> > +		return rc;
> > +	else if (rc < TPM_HEADER_SIZE)
> > +		return -EFAULT;
> > +
> > +	if (be16_to_cpu(cmd.header.out.tag) == TPM2_ST_NO_SESSIONS)
> > +		chip->flags |= TPM_CHIP_FLAG_TPM2;
> > +
> > +	return 0;
> > +}
> > +EXPORT_SYMBOL_GPL(tpm2_probe);
> > diff --git a/drivers/char/tpm/tpm_tis.c b/drivers/char/tpm/tpm_tis.c
> > index 6725bef..ee6e0bd 100644
> > --- a/drivers/char/tpm/tpm_tis.c
> > +++ b/drivers/char/tpm/tpm_tis.c
> > @@ -639,12 +639,9 @@ static int tpm_tis_init(struct device *dev,
> > acpi_handle acpi_dev_handle, goto out_err;
> >  	}
> > 
> > -	/* Every TPM 2.x command has a higher ordinal than TPM 1.x commands.
> > -	 * Therefore, we can use an idempotent TPM 2.x command to probe TPM 2.x.
> > -	 */
> > -	rc = tpm2_gen_interrupt(chip, true);
> > -	if (rc == 0 || rc == TPM2_RC_INITIALIZE)
> > -		chip->flags |= TPM_CHIP_FLAG_TPM2;
> > +	rc = tpm2_probe(chip);
> > +	if (rc)
> > +		goto out_err;
> > 
> >  	vendor = ioread32(chip->vendor.iobase + TPM_DID_VID(0));
> >  	chip->vendor.manufacturer_id = vendor;
> > @@ -747,7 +744,7 @@ static int tpm_tis_init(struct device *dev, acpi_handle
> > acpi_dev_handle,
> > 
> >  			/* Generate Interrupts */
> >  			if (chip->flags & TPM_CHIP_FLAG_TPM2)
> > -				tpm2_gen_interrupt(chip, false);
> > +				tpm2_gen_interrupt(chip);
> >  			else
> >  				tpm_gen_interrupt(chip);

/Jarkko

^ permalink raw reply

* [PATCH 3.18 29/39] arm64: Fix up /proc/cpuinfo
From: Greg Kroah-Hartman @ 2015-02-09  8:34 UTC (permalink / raw)
  To: linux-kernel
  Cc: Greg Kroah-Hartman, stable, Greg Hackmann, Ian Campbell,
	Serban Constantinescu, Will Deacon, cross-distro, linux-api,
	linux-arm-kernel, Catalin Marinas, Mark Rutland
In-Reply-To: <20150209083328.753647350@linuxfoundation.org>

3.18-stable review patch.  If anyone has any objections, please let me know.

------------------

From: Mark Rutland <mark.rutland@arm.com>

commit 44b82b7700d05a52cd983799d3ecde1a976b3bed upstream.

Commit d7a49086f263164a (arm64: cpuinfo: print info for all CPUs)
attempted to clean up /proc/cpuinfo, but due to concerns regarding
further changes was reverted in commit 5e39977edf6500fd (Revert "arm64:
cpuinfo: print info for all CPUs").

There are two major issues with the arm64 /proc/cpuinfo format
currently:

* The "Features" line describes (only) the 64-bit hwcaps, which is
  problematic for some 32-bit applications which attempt to parse it. As
  the same names are used for analogous ISA features (e.g. aes) despite
  these generally being architecturally unrelated, it is not possible to
  simply append the 64-bit and 32-bit hwcaps in a manner that might not
  be misleading to some applications.

  Various potential solutions have appeared in vendor kernels. Typically
  the format of the Features line varies depending on whether the task
  is 32-bit.

* Information is only printed regarding a single CPU. This does not
  match the ARM format, and does not provide sufficient information in
  big.LITTLE systems where CPUs are heterogeneous. The CPU information
  printed is queried from the current CPU's registers, which is racy
  w.r.t. cross-cpu migration.

This patch attempts to solve these issues. The following changes are
made:

* When a task with a LINUX32 personality attempts to read /proc/cpuinfo,
  the "Features" line contains the decoded 32-bit hwcaps, as with the
  arm port. Otherwise, the decoded 64-bit hwcaps are shown. This aligns
  with the behaviour of COMPAT_UTS_MACHINE and COMPAT_ELF_PLATFORM. In
  the absence of compat support, the Features line is empty.

  The set of hwcaps injected into a task's auxval are unaffected.

* Properties are printed per-cpu, as with the ARM port. The per-cpu
  information is queried from pre-recorded cpu information (as used by
  the sanity checks).

* As with the previous attempt at fixing up /proc/cpuinfo, the hardware
  field is removed. The only users so far are 32-bit applications tied
  to particular boards, so no portable applications should be affected,
  and this should prevent future tying to particular boards.

The following differences remain:

* No model_name is printed, as this cannot be queried from the hardware
  and cannot be provided in a stable fashion. Use of the CPU
  {implementor,variant,part,revision} fields is sufficient to identify a
  CPU and is portable across arm and arm64.

* The following system-wide properties are not provided, as they are not
  possible to provide generally. Programs relying on these are already
  tied to particular (32-bit only) boards:
  - Hardware
  - Revision
  - Serial

No software has yet been identified for which these remaining
differences are problematic.

Cc: Greg Hackmann <ghackmann@google.com>
Cc: Ian Campbell <ijc@hellion.org.uk>
Cc: Serban Constantinescu <serban.constantinescu@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: cross-distro@lists.linaro.org
Cc: linux-api@vger.kernel.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

---
 arch/arm64/kernel/setup.c |   94 ++++++++++++++++++++++++++++++++++------------
 1 file changed, 71 insertions(+), 23 deletions(-)

--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -43,6 +43,7 @@
 #include <linux/of_fdt.h>
 #include <linux/of_platform.h>
 #include <linux/efi.h>
+#include <linux/personality.h>
 
 #include <asm/fixmap.h>
 #include <asm/cpu.h>
@@ -79,7 +80,6 @@ unsigned int compat_elf_hwcap2 __read_mo
 #endif
 
 static const char *cpu_name;
-static const char *machine_name;
 phys_addr_t __fdt_pointer __initdata;
 
 /*
@@ -311,8 +311,6 @@ static void __init setup_machine_fdt(phy
 		while (true)
 			cpu_relax();
 	}
-
-	machine_name = of_flat_dt_get_machine_name();
 }
 
 /*
@@ -449,14 +447,50 @@ static const char *hwcap_str[] = {
 	NULL
 };
 
+#ifdef CONFIG_COMPAT
+static const char *compat_hwcap_str[] = {
+	"swp",
+	"half",
+	"thumb",
+	"26bit",
+	"fastmult",
+	"fpa",
+	"vfp",
+	"edsp",
+	"java",
+	"iwmmxt",
+	"crunch",
+	"thumbee",
+	"neon",
+	"vfpv3",
+	"vfpv3d16",
+	"tls",
+	"vfpv4",
+	"idiva",
+	"idivt",
+	"vfpd32",
+	"lpae",
+	"evtstrm"
+};
+
+static const char *compat_hwcap2_str[] = {
+	"aes",
+	"pmull",
+	"sha1",
+	"sha2",
+	"crc32",
+	NULL
+};
+#endif /* CONFIG_COMPAT */
+
 static int c_show(struct seq_file *m, void *v)
 {
-	int i;
-
-	seq_printf(m, "Processor\t: %s rev %d (%s)\n",
-		   cpu_name, read_cpuid_id() & 15, ELF_PLATFORM);
+	int i, j;
 
 	for_each_online_cpu(i) {
+		struct cpuinfo_arm64 *cpuinfo = &per_cpu(cpu_data, i);
+		u32 midr = cpuinfo->reg_midr;
+
 		/*
 		 * glibc reads /proc/cpuinfo to determine the number of
 		 * online processors, looking for lines beginning with
@@ -465,24 +499,38 @@ static int c_show(struct seq_file *m, vo
 #ifdef CONFIG_SMP
 		seq_printf(m, "processor\t: %d\n", i);
 #endif
-	}
-
-	/* dump out the processor features */
-	seq_puts(m, "Features\t: ");
-
-	for (i = 0; hwcap_str[i]; i++)
-		if (elf_hwcap & (1 << i))
-			seq_printf(m, "%s ", hwcap_str[i]);
-
-	seq_printf(m, "\nCPU implementer\t: 0x%02x\n", read_cpuid_id() >> 24);
-	seq_printf(m, "CPU architecture: AArch64\n");
-	seq_printf(m, "CPU variant\t: 0x%x\n", (read_cpuid_id() >> 20) & 15);
-	seq_printf(m, "CPU part\t: 0x%03x\n", (read_cpuid_id() >> 4) & 0xfff);
-	seq_printf(m, "CPU revision\t: %d\n", read_cpuid_id() & 15);
 
-	seq_puts(m, "\n");
+		/*
+		 * Dump out the common processor features in a single line.
+		 * Userspace should read the hwcaps with getauxval(AT_HWCAP)
+		 * rather than attempting to parse this, but there's a body of
+		 * software which does already (at least for 32-bit).
+		 */
+		seq_puts(m, "Features\t:");
+		if (personality(current->personality) == PER_LINUX32) {
+#ifdef CONFIG_COMPAT
+			for (j = 0; compat_hwcap_str[j]; j++)
+				if (compat_elf_hwcap & (1 << j))
+					seq_printf(m, " %s", compat_hwcap_str[j]);
+
+			for (j = 0; compat_hwcap2_str[j]; j++)
+				if (compat_elf_hwcap2 & (1 << j))
+					seq_printf(m, " %s", compat_hwcap2_str[j]);
+#endif /* CONFIG_COMPAT */
+		} else {
+			for (j = 0; hwcap_str[j]; j++)
+				if (elf_hwcap & (1 << j))
+					seq_printf(m, " %s", hwcap_str[j]);
+		}
+		seq_puts(m, "\n");
 
-	seq_printf(m, "Hardware\t: %s\n", machine_name);
+		seq_printf(m, "CPU implementer\t: 0x%02x\n",
+			   MIDR_IMPLEMENTOR(midr));
+		seq_printf(m, "CPU architecture: 8\n");
+		seq_printf(m, "CPU variant\t: 0x%x\n", MIDR_VARIANT(midr));
+		seq_printf(m, "CPU part\t: 0x%03x\n", MIDR_PARTNUM(midr));
+		seq_printf(m, "CPU revision\t: %d\n\n", MIDR_REVISION(midr));
+	}
 
 	return 0;
 }

^ permalink raw reply

* [PATCH 3.14 13/20] arm64: Fix up /proc/cpuinfo
From: Greg Kroah-Hartman @ 2015-02-09  8:34 UTC (permalink / raw)
  To: linux-kernel
  Cc: Greg Kroah-Hartman, stable, Greg Hackmann, Ian Campbell,
	Serban Constantinescu, Will Deacon, cross-distro, linux-api,
	linux-arm-kernel, Catalin Marinas, Mark Rutland
In-Reply-To: <20150209083042.033412726@linuxfoundation.org>

3.14-stable review patch.  If anyone has any objections, please let me know.

------------------

From: Mark Rutland <mark.rutland@arm.com>

commit 44b82b7700d05a52cd983799d3ecde1a976b3bed upstream.

Commit d7a49086f263164a (arm64: cpuinfo: print info for all CPUs)
attempted to clean up /proc/cpuinfo, but due to concerns regarding
further changes was reverted in commit 5e39977edf6500fd (Revert "arm64:
cpuinfo: print info for all CPUs").

There are two major issues with the arm64 /proc/cpuinfo format
currently:

* The "Features" line describes (only) the 64-bit hwcaps, which is
  problematic for some 32-bit applications which attempt to parse it. As
  the same names are used for analogous ISA features (e.g. aes) despite
  these generally being architecturally unrelated, it is not possible to
  simply append the 64-bit and 32-bit hwcaps in a manner that might not
  be misleading to some applications.

  Various potential solutions have appeared in vendor kernels. Typically
  the format of the Features line varies depending on whether the task
  is 32-bit.

* Information is only printed regarding a single CPU. This does not
  match the ARM format, and does not provide sufficient information in
  big.LITTLE systems where CPUs are heterogeneous. The CPU information
  printed is queried from the current CPU's registers, which is racy
  w.r.t. cross-cpu migration.

This patch attempts to solve these issues. The following changes are
made:

* When a task with a LINUX32 personality attempts to read /proc/cpuinfo,
  the "Features" line contains the decoded 32-bit hwcaps, as with the
  arm port. Otherwise, the decoded 64-bit hwcaps are shown. This aligns
  with the behaviour of COMPAT_UTS_MACHINE and COMPAT_ELF_PLATFORM. In
  the absence of compat support, the Features line is empty.

  The set of hwcaps injected into a task's auxval are unaffected.

* Properties are printed per-cpu, as with the ARM port. The per-cpu
  information is queried from pre-recorded cpu information (as used by
  the sanity checks).

* As with the previous attempt at fixing up /proc/cpuinfo, the hardware
  field is removed. The only users so far are 32-bit applications tied
  to particular boards, so no portable applications should be affected,
  and this should prevent future tying to particular boards.

The following differences remain:

* No model_name is printed, as this cannot be queried from the hardware
  and cannot be provided in a stable fashion. Use of the CPU
  {implementor,variant,part,revision} fields is sufficient to identify a
  CPU and is portable across arm and arm64.

* The following system-wide properties are not provided, as they are not
  possible to provide generally. Programs relying on these are already
  tied to particular (32-bit only) boards:
  - Hardware
  - Revision
  - Serial

No software has yet been identified for which these remaining
differences are problematic.

Cc: Greg Hackmann <ghackmann@google.com>
Cc: Ian Campbell <ijc@hellion.org.uk>
Cc: Serban Constantinescu <serban.constantinescu@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: cross-distro@lists.linaro.org
Cc: linux-api@vger.kernel.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

---
 arch/arm64/include/asm/cputype.h |    2 
 arch/arm64/kernel/setup.c        |   99 ++++++++++++++++++++++++++++-----------
 arch/arm64/kernel/smp.c          |    5 +
 3 files changed, 80 insertions(+), 26 deletions(-)

--- a/arch/arm64/include/asm/cputype.h
+++ b/arch/arm64/include/asm/cputype.h
@@ -77,6 +77,8 @@ static inline u32 __attribute_const__ re
 	return read_cpuid(CTR_EL0);
 }
 
+void cpuinfo_store_cpu(void);
+
 #endif /* __ASSEMBLY__ */
 
 #endif
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -41,6 +41,7 @@
 #include <linux/memblock.h>
 #include <linux/of_fdt.h>
 #include <linux/of_platform.h>
+#include <linux/personality.h>
 
 #include <asm/cputype.h>
 #include <asm/elf.h>
@@ -73,7 +74,6 @@ unsigned int compat_elf_hwcap __read_mos
 #endif
 
 static const char *cpu_name;
-static const char *machine_name;
 phys_addr_t __fdt_pointer __initdata;
 
 /*
@@ -193,6 +193,19 @@ static void __init smp_build_mpidr_hash(
 }
 #endif
 
+struct cpuinfo_arm64 {
+	struct cpu	cpu;
+	u32		reg_midr;
+};
+
+static DEFINE_PER_CPU(struct cpuinfo_arm64, cpu_data);
+
+void cpuinfo_store_cpu(void)
+{
+	struct cpuinfo_arm64 *info = this_cpu_ptr(&cpu_data);
+	info->reg_midr = read_cpuid_id();
+}
+
 static void __init setup_processor(void)
 {
 	struct cpu_info *cpu_info;
@@ -213,6 +226,8 @@ static void __init setup_processor(void)
 	sprintf(init_utsname()->machine, ELF_PLATFORM);
 	elf_hwcap = 0;
 
+	cpuinfo_store_cpu();
+
 	/*
 	 * ID_AA64ISAR0_EL1 contains 4-bit wide signed feature blocks.
 	 * The blocks we test below represent incremental functionality
@@ -257,8 +272,6 @@ static void __init setup_machine_fdt(phy
 		while (true)
 			cpu_relax();
 	}
-
-	machine_name = of_flat_dt_get_machine_name();
 }
 
 /*
@@ -363,14 +376,12 @@ static int __init arm64_device_init(void
 }
 arch_initcall(arm64_device_init);
 
-static DEFINE_PER_CPU(struct cpu, cpu_data);
-
 static int __init topology_init(void)
 {
 	int i;
 
 	for_each_possible_cpu(i) {
-		struct cpu *cpu = &per_cpu(cpu_data, i);
+		struct cpu *cpu = &per_cpu(cpu_data.cpu, i);
 		cpu->hotpluggable = 1;
 		register_cpu(cpu, i);
 	}
@@ -391,14 +402,41 @@ static const char *hwcap_str[] = {
 	NULL
 };
 
+#ifdef CONFIG_COMPAT
+static const char *compat_hwcap_str[] = {
+	"swp",
+	"half",
+	"thumb",
+	"26bit",
+	"fastmult",
+	"fpa",
+	"vfp",
+	"edsp",
+	"java",
+	"iwmmxt",
+	"crunch",
+	"thumbee",
+	"neon",
+	"vfpv3",
+	"vfpv3d16",
+	"tls",
+	"vfpv4",
+	"idiva",
+	"idivt",
+	"vfpd32",
+	"lpae",
+	"evtstrm"
+};
+#endif /* CONFIG_COMPAT */
+
 static int c_show(struct seq_file *m, void *v)
 {
-	int i;
-
-	seq_printf(m, "Processor\t: %s rev %d (%s)\n",
-		   cpu_name, read_cpuid_id() & 15, ELF_PLATFORM);
+	int i, j;
 
 	for_each_online_cpu(i) {
+		struct cpuinfo_arm64 *cpuinfo = &per_cpu(cpu_data, i);
+		u32 midr = cpuinfo->reg_midr;
+
 		/*
 		 * glibc reads /proc/cpuinfo to determine the number of
 		 * online processors, looking for lines beginning with
@@ -407,24 +445,33 @@ static int c_show(struct seq_file *m, vo
 #ifdef CONFIG_SMP
 		seq_printf(m, "processor\t: %d\n", i);
 #endif
-	}
-
-	/* dump out the processor features */
-	seq_puts(m, "Features\t: ");
-
-	for (i = 0; hwcap_str[i]; i++)
-		if (elf_hwcap & (1 << i))
-			seq_printf(m, "%s ", hwcap_str[i]);
-
-	seq_printf(m, "\nCPU implementer\t: 0x%02x\n", read_cpuid_id() >> 24);
-	seq_printf(m, "CPU architecture: AArch64\n");
-	seq_printf(m, "CPU variant\t: 0x%x\n", (read_cpuid_id() >> 20) & 15);
-	seq_printf(m, "CPU part\t: 0x%03x\n", (read_cpuid_id() >> 4) & 0xfff);
-	seq_printf(m, "CPU revision\t: %d\n", read_cpuid_id() & 15);
 
-	seq_puts(m, "\n");
+		/*
+		 * Dump out the common processor features in a single line.
+		 * Userspace should read the hwcaps with getauxval(AT_HWCAP)
+		 * rather than attempting to parse this, but there's a body of
+		 * software which does already (at least for 32-bit).
+		 */
+		seq_puts(m, "Features\t:");
+		if (personality(current->personality) == PER_LINUX32) {
+#ifdef CONFIG_COMPAT
+			for (j = 0; compat_hwcap_str[j]; j++)
+				if (compat_elf_hwcap & (1 << j))
+					seq_printf(m, " %s", compat_hwcap_str[j]);
+#endif /* CONFIG_COMPAT */
+		} else {
+			for (j = 0; hwcap_str[j]; j++)
+				if (elf_hwcap & (1 << j))
+					seq_printf(m, " %s", hwcap_str[j]);
+		}
+		seq_puts(m, "\n");
 
-	seq_printf(m, "Hardware\t: %s\n", machine_name);
+		seq_printf(m, "CPU implementer\t: 0x%02x\n", (midr >> 24));
+		seq_printf(m, "CPU architecture: 8\n");
+		seq_printf(m, "CPU variant\t: 0x%x\n", ((midr >> 20) & 0xf));
+		seq_printf(m, "CPU part\t: 0x%03x\n", ((midr >> 4) & 0xfff));
+		seq_printf(m, "CPU revision\t: %d\n\n", (midr & 0xf));
+	}
 
 	return 0;
 }
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -148,6 +148,11 @@ asmlinkage void secondary_start_kernel(v
 		cpu_ops[cpu]->cpu_postboot();
 
 	/*
+	 * Log the CPU info before it is marked online and might get read.
+	 */
+	cpuinfo_store_cpu();
+
+	/*
 	 * Enable GIC and timers.
 	 */
 	notify_cpu_starting(cpu);

^ permalink raw reply

* [PATCH 3.10 09/17] arm64: Fix up /proc/cpuinfo
From: Greg Kroah-Hartman @ 2015-02-09  8:33 UTC (permalink / raw)
  To: linux-kernel
  Cc: Mark Rutland, cross-distro, Catalin Marinas, Greg Kroah-Hartman,
	Serban Constantinescu, Will Deacon, Greg Hackmann, stable,
	Ian Campbell, linux-api, linux-arm-kernel
In-Reply-To: <20150209083039.240170510@linuxfoundation.org>

3.10-stable review patch.  If anyone has any objections, please let me know.

------------------

From: Mark Rutland <mark.rutland@arm.com>

commit 44b82b7700d05a52cd983799d3ecde1a976b3bed upstream.

Commit d7a49086f263164a (arm64: cpuinfo: print info for all CPUs)
attempted to clean up /proc/cpuinfo, but due to concerns regarding
further changes was reverted in commit 5e39977edf6500fd (Revert "arm64:
cpuinfo: print info for all CPUs").

There are two major issues with the arm64 /proc/cpuinfo format
currently:

* The "Features" line describes (only) the 64-bit hwcaps, which is
  problematic for some 32-bit applications which attempt to parse it. As
  the same names are used for analogous ISA features (e.g. aes) despite
  these generally being architecturally unrelated, it is not possible to
  simply append the 64-bit and 32-bit hwcaps in a manner that might not
  be misleading to some applications.

  Various potential solutions have appeared in vendor kernels. Typically
  the format of the Features line varies depending on whether the task
  is 32-bit.

* Information is only printed regarding a single CPU. This does not
  match the ARM format, and does not provide sufficient information in
  big.LITTLE systems where CPUs are heterogeneous. The CPU information
  printed is queried from the current CPU's registers, which is racy
  w.r.t. cross-cpu migration.

This patch attempts to solve these issues. The following changes are
made:

* When a task with a LINUX32 personality attempts to read /proc/cpuinfo,
  the "Features" line contains the decoded 32-bit hwcaps, as with the
  arm port. Otherwise, the decoded 64-bit hwcaps are shown. This aligns
  with the behaviour of COMPAT_UTS_MACHINE and COMPAT_ELF_PLATFORM. In
  the absence of compat support, the Features line is empty.

  The set of hwcaps injected into a task's auxval are unaffected.

* Properties are printed per-cpu, as with the ARM port. The per-cpu
  information is queried from pre-recorded cpu information (as used by
  the sanity checks).

* As with the previous attempt at fixing up /proc/cpuinfo, the hardware
  field is removed. The only users so far are 32-bit applications tied
  to particular boards, so no portable applications should be affected,
  and this should prevent future tying to particular boards.

The following differences remain:

* No model_name is printed, as this cannot be queried from the hardware
  and cannot be provided in a stable fashion. Use of the CPU
  {implementor,variant,part,revision} fields is sufficient to identify a
  CPU and is portable across arm and arm64.

* The following system-wide properties are not provided, as they are not
  possible to provide generally. Programs relying on these are already
  tied to particular (32-bit only) boards:
  - Hardware
  - Revision
  - Serial

No software has yet been identified for which these remaining
differences are problematic.

Cc: Greg Hackmann <ghackmann@google.com>
Cc: Ian Campbell <ijc@hellion.org.uk>
Cc: Serban Constantinescu <serban.constantinescu@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: cross-distro@lists.linaro.org
Cc: linux-api@vger.kernel.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
[Mark: backport to v3.10.x]
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

---
 arch/arm64/include/asm/cputype.h |    2 
 arch/arm64/kernel/setup.c        |  100 +++++++++++++++++++++++++++++----------
 arch/arm64/kernel/smp.c          |    5 +
 3 files changed, 82 insertions(+), 25 deletions(-)

--- a/arch/arm64/include/asm/cputype.h
+++ b/arch/arm64/include/asm/cputype.h
@@ -74,6 +74,8 @@ static inline u32 __attribute_const__ re
 	return read_cpuid(ID_CTR_EL0);
 }
 
+void cpuinfo_store_cpu(void);
+
 #endif /* __ASSEMBLY__ */
 
 #endif
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -41,6 +41,7 @@
 #include <linux/memblock.h>
 #include <linux/of_fdt.h>
 #include <linux/of_platform.h>
+#include <linux/personality.h>
 
 #include <asm/cputype.h>
 #include <asm/elf.h>
@@ -97,6 +98,19 @@ void __init early_print(const char *str,
 	printk("%s", buf);
 }
 
+struct cpuinfo_arm64 {
+	struct cpu	cpu;
+	u32		reg_midr;
+};
+
+static DEFINE_PER_CPU(struct cpuinfo_arm64, cpu_data);
+
+void cpuinfo_store_cpu(void)
+{
+	struct cpuinfo_arm64 *info = this_cpu_ptr(&cpu_data);
+	info->reg_midr = read_cpuid_id();
+}
+
 static void __init setup_processor(void)
 {
 	struct cpu_info *cpu_info;
@@ -127,6 +141,8 @@ static void __init setup_machine_fdt(phy
 	struct boot_param_header *devtree;
 	unsigned long dt_root;
 
+	cpuinfo_store_cpu();
+
 	/* Check we have a non-NULL DT pointer */
 	if (!dt_phys) {
 		early_print("\n"
@@ -290,14 +306,12 @@ static int __init arm64_device_init(void
 }
 arch_initcall(arm64_device_init);
 
-static DEFINE_PER_CPU(struct cpu, cpu_data);
-
 static int __init topology_init(void)
 {
 	int i;
 
 	for_each_possible_cpu(i) {
-		struct cpu *cpu = &per_cpu(cpu_data, i);
+		struct cpu *cpu = &per_cpu(cpu_data.cpu, i);
 		cpu->hotpluggable = 1;
 		register_cpu(cpu, i);
 	}
@@ -312,14 +326,41 @@ static const char *hwcap_str[] = {
 	NULL
 };
 
+#ifdef CONFIG_COMPAT
+static const char *compat_hwcap_str[] = {
+	"swp",
+	"half",
+	"thumb",
+	"26bit",
+	"fastmult",
+	"fpa",
+	"vfp",
+	"edsp",
+	"java",
+	"iwmmxt",
+	"crunch",
+	"thumbee",
+	"neon",
+	"vfpv3",
+	"vfpv3d16",
+	"tls",
+	"vfpv4",
+	"idiva",
+	"idivt",
+	"vfpd32",
+	"lpae",
+	"evtstrm"
+};
+#endif /* CONFIG_COMPAT */
+
 static int c_show(struct seq_file *m, void *v)
 {
-	int i;
-
-	seq_printf(m, "Processor\t: %s rev %d (%s)\n",
-		   cpu_name, read_cpuid_id() & 15, ELF_PLATFORM);
+	int i, j;
 
 	for_each_online_cpu(i) {
+		struct cpuinfo_arm64 *cpuinfo = &per_cpu(cpu_data, i);
+		u32 midr = cpuinfo->reg_midr;
+
 		/*
 		 * glibc reads /proc/cpuinfo to determine the number of
 		 * online processors, looking for lines beginning with
@@ -328,27 +369,36 @@ static int c_show(struct seq_file *m, vo
 #ifdef CONFIG_SMP
 		seq_printf(m, "processor\t: %d\n", i);
 #endif
-		seq_printf(m, "BogoMIPS\t: %lu.%02lu\n\n",
+		seq_printf(m, "BogoMIPS\t: %lu.%02lu\n",
 			   loops_per_jiffy / (500000UL/HZ),
 			   loops_per_jiffy / (5000UL/HZ) % 100);
-	}
-
-	/* dump out the processor features */
-	seq_puts(m, "Features\t: ");
 
-	for (i = 0; hwcap_str[i]; i++)
-		if (elf_hwcap & (1 << i))
-			seq_printf(m, "%s ", hwcap_str[i]);
-
-	seq_printf(m, "\nCPU implementer\t: 0x%02x\n", read_cpuid_id() >> 24);
-	seq_printf(m, "CPU architecture: AArch64\n");
-	seq_printf(m, "CPU variant\t: 0x%x\n", (read_cpuid_id() >> 20) & 15);
-	seq_printf(m, "CPU part\t: 0x%03x\n", (read_cpuid_id() >> 4) & 0xfff);
-	seq_printf(m, "CPU revision\t: %d\n", read_cpuid_id() & 15);
-
-	seq_puts(m, "\n");
-
-	seq_printf(m, "Hardware\t: %s\n", machine_name);
+		/*
+		 * Dump out the common processor features in a single line.
+		 * Userspace should read the hwcaps with getauxval(AT_HWCAP)
+		 * rather than attempting to parse this, but there's a body of
+		 * software which does already (at least for 32-bit).
+		 */
+		seq_puts(m, "Features\t:");
+		if (personality(current->personality) == PER_LINUX32) {
+#ifdef CONFIG_COMPAT
+			for (j = 0; compat_hwcap_str[j]; j++)
+				if (COMPAT_ELF_HWCAP & (1 << j))
+					seq_printf(m, " %s", compat_hwcap_str[j]);
+#endif /* CONFIG_COMPAT */
+		} else {
+			for (j = 0; hwcap_str[j]; j++)
+				if (elf_hwcap & (1 << j))
+					seq_printf(m, " %s", hwcap_str[j]);
+		}
+		seq_puts(m, "\n");
+
+		seq_printf(m, "CPU implementer\t: 0x%02x\n", (midr >> 24));
+		seq_printf(m, "CPU architecture: 8\n");
+		seq_printf(m, "CPU variant\t: 0x%x\n", ((midr >> 20) & 0xf));
+		seq_printf(m, "CPU part\t: 0x%03x\n", ((midr >> 4) & 0xfff));
+		seq_printf(m, "CPU revision\t: %d\n\n", (midr & 0xf));
+	}
 
 	return 0;
 }
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -200,6 +200,11 @@ asmlinkage void __cpuinit secondary_star
 	raw_spin_unlock(&boot_lock);
 
 	/*
+	 * Log the CPU info before it is marked online and might get read.
+	 */
+	cpuinfo_store_cpu();
+
+	/*
 	 * OK, now it's safe to let the boot CPU continue.  Wait for
 	 * the CPU migration code to notice that the CPU is online
 	 * before we continue.

^ permalink raw reply

* Re: [PATCH v17 1/7] mm: support madvise(MADV_FREE)
From: Minchan Kim @ 2015-02-09  7:15 UTC (permalink / raw)
  To: Shaohua Li
  Cc: Michael Kerrisk (man-pages), Michal Hocko, Andrew Morton,
	linux-kernel, linux-mm, linux-api, Hugh Dickins, Johannes Weiner,
	Rik van Riel, KOSAKI Motohiro, Mel Gorman, Jason Evans,
	zhangyanfei, Kirill A. Shutemov, Kirill A. Shutemov
In-Reply-To: <20150206182918.GA2290@kernel.org>

On Fri, Feb 06, 2015 at 10:29:18AM -0800, Shaohua Li wrote:
> On Fri, Feb 06, 2015 at 02:51:03PM +0900, Minchan Kim wrote:
> > Hi Shaohua,
> > 
> > On Thu, Feb 05, 2015 at 04:33:11PM -0800, Shaohua Li wrote:
> > > 
> > > Hi Minchan,
> > > 
> > > Sorry to jump into this thread so late, and apologies if some of these issues were discussed before.
> > > I'm interested in this patch, so I tried it here. I use a simple test with
> > 
> > No problem at all. Interest always wins over ignorance.
> > 
> > > jemalloc. Obviously this can improve performance when there is no memory
> > > pressure. Did you try setup with memory pressure?
> > 
> > Sure but it was not a huge memory system like yours.
> 
> Yes, I'd like to check the symptom in memory pressure, so choose such test.
> 
> > > In my test, jemalloc will map 61G vma, and use about 32G memory without
> > > MADV_FREE. If MADV_FREE is enabled, jemalloc will use whole 61G memory because
> > > madvise doesn't reclaim the unused memory. If I disable swap (tweak your patch
> > 
> > Yes, IIUC, jemalloc replaces MADV_DONTNEED with MADV_FREE completely.
> 
> right.
> > > slightly to make it work without swap), I got oom. If swap is enabled, my
> > 
> > You mean you modified anon aging logic so it works although there is no swap?
> > If so, I have no idea why OOM happens. I guess it should free all of freeable
> > pages during the aging so although system stall happens more, I don't expect
> > OOM. Anyway, with MADV_FREE with no swap, we should consider more things
> > about anonymous aging.
> 
> In the patch, MADV_FREE will be disabled and fallback to DONTNEED if no swap is
> enabled. Our production environment doesn't enable swap, so I tried to delete
> the 'no swap' check and make MADV_FREE always enabled regardless if swap is
> enabled. I didn't change anything else. With such change, I saw oom
> immediately. So definitely we have aging issue, the pages aren't reclaimed
> fast.

In the current VM implementation, the anonymous LRU list is not aged when
there is no swap. That is the reason for freeing the pages instantly in that case.
I think it could be enhanced later.
http://lists.infradead.org/pipermail/linux-arm-kernel/2014-December/311591.html

> 
> > > system is totally stalled because of swap activity. Without the MADV_FREE,
> > > everything is ok. Considering we definitely don't want to waste too much
> > > memory, a system with memory pressure is normal, so sounds MADV_FREE will
> > > introduce big trouble here.
> > > 
> > > Did you think about move the MADV_FREE pages to the head of inactive LRU, so
> > > they can be reclaimed easily?
> > 
> > I think it's desirable if the page lived in active LRU.
> > The reason I didn't do that comes from the volatile ranges system call, which
> > was the motivation for MADV_FREE in my mind.
> > In last LSF/MM, there was concern about data's hotness.
> > Some of users want to keep that as it is in LRU position, others want to
> > handle that as cold(tail of inactive list)/warm(head of inactive list)/
> > hot(head of active list), for example.
> > The vrange syscall was just about volatility and did not depend on page hotness,
> > so my decision was not to change the LRU order and to add a new
> > hotness advice if we need it later.
> > 
> > However, MADV_FREE's main customer is allocators and afaik, they want
> > to replace MADV_DONTNEED with MADV_FREE so I think it is really cold,
> > but we couldn't make sure so head of inactive is good compromise.
> > Another concern about the tail of the inactive list is that there could be
> > plenty of pages there which were written back asynchronously in a
> > previous reclaim pass but not yet reclaimed, because they cannot be
> > freed in the softirq context of writeback. It means we end up
> > freeing pages that could still become part of the working set ahead
> > of pages the VM has already decided to evict.
> 
> Yes, they are definitely cold pages. I thought We should make sure the
> MADV_FREE pages are reclaimed first before other pages, at least in the anon
> LRU list, though there might be difficult to determine if we should reclaim
> writeback pages first or MADV_FREE pages first.

Frankly speaking, the issue with writeback pages is just an implementation
hurdle, not a design one, so if we could fix it we might move
cold pages to the tail of the inactive LRU. I tried it but have not had
time to continue recently. I hope to find time to look at it soon.
https://lkml.org/lkml/2014/7/1/628
Even, it wouldn't be critical problem although we couldn't fix
the problem of writeback pages because they are already all
cold pages so it might be not important to keep order in LRU so
we could save working set and effort of VM to reclaim them
at the cost of moving all of hinting pages into tail of the LRU
whenever the syscall is called.

However, significant problem from my mind is we couldn't make
sure they are really cold pages. It would be true for allocators
but it's cache-friendly pages so it might be better to discard
tail pages of inactive LRU, which are really cold.
In addition, we couldn't expect all of usecase for MADV_FREE
so some of users might want to treat them as warm, not cold.

If we still see a lot of stalls after moving them to the head of the
inactive list, I think it's a sign that we need additional logic; for example,
we could drop MADV_FREEed pages instantly if the zone is below the
low/min watermark when the syscall is called, because nobody
likes direct reclaim.

> 
> Thanks,
> Shaohua

-- 
Kind regards,
Minchan Kim

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: MADV_DONTNEED semantics? Was: [RFC PATCH] mm: madvise: Ignore repeated MADV_DONTNEED hints
From: Minchan Kim @ 2015-02-09  6:50 UTC (permalink / raw)
  To: Michael Kerrisk (man-pages)
  Cc: Michal Hocko, Vlastimil Babka, Kirill A. Shutemov, Dave Hansen,
	Mel Gorman, linux-mm-Bw31MaZKKs3YtjvyW6yDsg@public.gmane.org,
	Andrew Morton, lkml, Linux API, linux-man, Hugh Dickins
In-Reply-To: <54D4E47E.4020509-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>

On Fri, Feb 06, 2015 at 04:57:50PM +0100, Michael Kerrisk (man-pages) wrote:
> Hi Michal
> 
> On 02/05/2015 04:41 PM, Michal Hocko wrote:
> > On Wed 04-02-15 20:24:27, Michael Kerrisk wrote:
> > [...]
> >> So, how about this text:
> >>
> >>               After a successful MADV_DONTNEED operation, the seman‐
> >>               tics  of  memory  access  in  the specified region are
> >>               changed: subsequent accesses of  pages  in  the  range
> >>               will  succeed,  but will result in either reloading of
> >>               the memory contents from the  underlying  mapped  file
> > 
> > "
> > result in either providing the up-to-date contents of the underlying
> > mapped file
> > "
> 
> Thanks! I did something like that. See below.
> 
> > Would be more precise IMO because reload might be interpreted as a major
> > fault which is not necessarily the case (see below).
> > 
> >>               (for  shared file mappings, shared anonymous mappings,
> >>               and shmem-based techniques such  as  System  V  shared
> >>               memory  segments)  or  zero-fill-on-demand  pages  for
> >>               anonymous private mappings.
> > 
> > Yes, this wording is better because many users are not aware of
> > MAP_ANON|MAP_SHARED being file backed in fact and mmap man page doesn't
> > mention that.
> 
> (Michal, would you have a text to propose to add to the mmap(2) page?
> Maybe it would be useful to add something there.)
> 
> > 
> > I am just wondering whether it makes sense to mention that MADV_DONTNEED
> > for shared mappings might be surprising and not freeing the backing
> > pages thus not really freeing memory until there is a memory
> > pressure. But maybe this is too implementation specific for a man
> > page. What about the following wording on top of yours?
> > "
> > Please note that the MADV_DONTNEED hint on shared mappings might not
> > lead to immediate freeing of pages in the range. The kernel is free to
> > delay this until an appropriate moment. RSS of the calling process will
> > be reduced however.
> > "
> 
> Thanks! I added this, but dropped in the word "immediately" in the last 
> sentence, since I assume that was implied. So now we have:
> 
>               After  a  successful MADV_DONTNEED operation, the seman‐
>               tics of  memory  access  in  the  specified  region  are
>               changed:  subsequent accesses of pages in the range will
>               succeed, but will result in either repopulating the mem‐
>               ory  contents from the up-to-date contents of the under‐
>               lying mapped file  (for  shared  file  mappings,  shared
>               anonymous  mappings,  and shmem-based techniques such as
>               System V shared memory segments) or  zero-fill-on-demand
>               pages for anonymous private mappings.
> 
>               Note  that,  when applied to shared mappings, MADV_DONT‐
>               NEED might not lead to immediate freeing of the pages in
>               the  range.   The  kernel  is  free to delay freeing the
>               pages until an appropriate  moment.   The  resident  set
>               size  (RSS)  of  the calling process will be immediately
>               reduced however.

Looks good. So I read it as saying that, for anonymous private mappings,
MADV_DONTNEED leads to immediate freeing of the pages in the range, which is
clearly different from MADV_FREE.

> 
> The current draft of the page can be found in a branch,
> http://git.kernel.org/cgit/docs/man-pages/man-pages.git/log/?h=draft_madvise
> 
> Thanks,
> 
> Michael
> 
> 
> 
> -- 
> Michael Kerrisk
> Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/
> Linux/UNIX System Programming Training: http://man7.org/training/

-- 
Kind regards,
Minchan Kim
--
To unsubscribe from this list: send the line "unsubscribe linux-man" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: MADV_DONTNEED semantics? Was: [RFC PATCH] mm: madvise: Ignore repeated MADV_DONTNEED hints
From: Minchan Kim @ 2015-02-09  6:46 UTC (permalink / raw)
  To: Michael Kerrisk (man-pages)
  Cc: Vlastimil Babka, Kirill A. Shutemov, Dave Hansen, Mel Gorman,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg@public.gmane.org, Andrew Morton,
	lkml, Linux API, linux-man, Hugh Dickins
In-Reply-To: <54D4E098.8050004-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>

Hello, Michael

On Fri, Feb 06, 2015 at 04:41:12PM +0100, Michael Kerrisk (man-pages) wrote:
> On 02/05/2015 02:07 AM, Minchan Kim wrote:
> > Hello,
> > 
> > On Wed, Feb 04, 2015 at 08:24:27PM +0100, Michael Kerrisk (man-pages) wrote:
> >> On 4 February 2015 at 18:02, Vlastimil Babka <vbabka-AlSwsSmVLrQ@public.gmane.org> wrote:
> >>> On 02/04/2015 03:00 PM, Michael Kerrisk (man-pages) wrote:
> >>>>
> >>>> Hello Vlastimil,
> >>>>
> >>>> On 4 February 2015 at 14:46, Vlastimil Babka <vbabka-AlSwsSmVLrQ@public.gmane.org> wrote:
> >>>>>>>
> >>>>>>> - that covers mlocking ok, not sure if the rest fits the "shared pages"
> >>>>>>> case
> >>>>>>> though. I dont see any check for other kinds of shared pages in the
> >>>>>>> code.
> >>>>>>
> >>>>>>
> >>>>>> Agreed. "shared" here seems confused. I've removed it. And I've
> >>>>>> added mention of "Huge TLB pages" for this error.
> >>>>>
> >>>>>
> >>>>> Thanks.
> >>>>
> >>>>
> >>>> I also added those cases for MADV_REMOVE, BTW.
> >>>
> >>>
> >>> Right. There's also the following for MADV_REMOVE that needs updating:
> >>>
> >>> "Currently, only shmfs/tmpfs supports this; other filesystems return with
> >>> the error ENOSYS."
> >>>
> >>> - it's not just shmem/tmpfs anymore. It should be best to refer to
> >>> fallocate(2) option FALLOC_FL_PUNCH_HOLE which seems to be (more) up to
> >>> date.
> >>>
> >>> - AFAICS it doesn't return ENOSYS but EOPNOTSUPP. Also neither error code is
> >>> listed in the ERRORS section.
> >>
> >> Yup, I recently added that as well, based on a patch from Jan Chaloupka.
> >>
> >>>>>>>>> - The word "will result" did sound as a guarantee at least to me. So
> >>>>>>>>> here it
> >>>>>>>>> could be changed to "may result (unless the advice is ignored)"?
> >>>>>>>>
> >>>>>>>> It's too late to fix documentation. Applications already depend on
> >>>>>>>> the
> >>>>>>>> behaviour.
> >>>>>>>
> >>>>>>> Right, so as long as they check for EINVAL, it should be safe. It
> >>>>>>> appears
> >>>>>>> that
> >>>>>>> jemalloc does.
> >>>>>>
> >>>>>> So, first a brief question: in the cases where the call does not error
> >>>>>> out,
> >>>>>> are we agreed that in the current implementation, MADV_DONTNEED will
> >>>>>> always result in zero-filled pages when the region is faulted back in
> >>>>>> (when we consider pages that are not backed by a file)?
> >>>>>
> >>>>> I'd agree at this point.
> >>>>
> >>>> Thanks for the confirmation.
> >>>>
> >>>>> Also we should probably mention anonymously shared pages (shmem). I think
> >>>>> they behave the same as file here.
> >>>>
> >>>> You mean tmpfs here, right? (I don't keep all of the synonyms straight.)
> >>>
> >>> shmem is tmpfs (that by itself would fit under "files" just fine), but also
> >>> sys V segments created by shmget(2) and also mappings created by mmap with
> >>> MAP_SHARED | MAP_ANONYMOUS. I'm not sure if there's a single manpage to
> >>> refer to the full list.
> >>
> >> So, how about this text:
> >>
> >>               After a successful MADV_DONTNEED operation, the seman‐
> >>               tics  of  memory  access  in  the specified region are
> >>               changed: subsequent accesses of  pages  in  the  range
> >>               will  succeed,  but will result in either reloading of
> >>               the memory contents from the  underlying  mapped  file
> >>               (for  shared file mappings, shared anonymous mappings,
> >>               and shmem-based techniques such  as  System  V  shared
> >>               memory  segments)  or  zero-fill-on-demand  pages  for
> >>               anonymous private mappings.
> > 
> > Hmm, I'd like to clarify.
> > 
> > Whether it was intentional or not, some userspace developers assumed
> > that the syscall drops pages instantly if it returns without error, so that
> > they will see more free pages (i.e., RSS for the process will be decreased)
> > while keeping the VMA. Can we rely on it?
> 
> I do not know. Michael?

It's important to identify the difference between MADV_DONTNEED and MADV_FREE,
so it would be better to clarify it while we have this chance.

> 
> > And we should make error section, too.
> > "locked" covers mlock(2) and you said you will add hugetlb. Then,
> > VM_PFNMAP? In that case, it fails. How can we say about VM_PFNMAP?
> > special mapping for some drivers?
> 
> I'm open for offers on what to add.

I suggest quoting from LWN, http://lwn.net/Articles/162860/:
"*special mapping* which is not made up of "normal" pages.
It is usually created by device drivers which map special memory areas
into user space"

>  
> > One more thing, "The kernel is free to ignore the advice".
> > It conflicts with "This call does not influence the semantics of the
> > application (except in the case of MADV_DONTNEED)" so
> > is it okay to assume "The kernel is free to ignore the advice
> > except for MADV_DONTNEED"?
> 
> I decided to just drop the sentence
> 
>      The kernel is free to ignore the advice.
> 
> It creates misunderstandings, and does not really add information.

Sounds good.

> 
> Cheers,
> 
> Michael
> 
> -- 
> Michael Kerrisk
> Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/
> Linux/UNIX System Programming Training: http://man7.org/training/

-- 
Kind regards,
Minchan Kim

^ permalink raw reply

* Re: [PATCH] tpm, tpm_tis: fix TPM 2.0 probing
From: Peter Hüwe @ 2015-02-08 23:08 UTC (permalink / raw)
  To: Jarkko Sakkinen
  Cc: Ashley Lai, Marcel Selhorst,
	tpmdd-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, josh-iaAMLnmF4UmaiuxdJuQwMA,
	christophe.ricard-Re5JQEeQqe8AvxtiuMwx3w,
	jason.gunthorpe-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/,
	stefanb-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8,
	linux-api-u79uwXL29TY76Z2rM5mHXA,
	trousers-tech-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f
In-Reply-To: <1423059669-31734-1-git-send-email-jarkko.sakkinen-VuQAYsv1563Yd54FQh9/CA@public.gmane.org>

Am Mittwoch, 4. Februar 2015, 15:21:09 schrieb Jarkko Sakkinen:
> If during transmission system error was returned, the logic was to
> incorrectly deduce that chip is a TPM 1.x chip. This patch fixes this
> issue. Also, this patch changes probing so that message tag is used as the
> measure for TPM 2.x, which should be much more stable.
Are you aware that some TPMs may respond with 0x00C1 as TAG for TPM1.2 commands?


> A separate function
> called tpm2_probe() is encapsulated because it can be used with any
> chipset.

> 
> Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen-VuQAYsv1563Yd54FQh9/CA@public.gmane.org>
> ---
>  drivers/char/tpm/tpm.h      |  3 ++-
>  drivers/char/tpm/tpm2-cmd.c | 40 +++++++++++++++++++++++++++++++++-------
>  drivers/char/tpm/tpm_tis.c  | 11 ++++-------
>  3 files changed, 39 insertions(+), 15 deletions(-)
> 
> diff --git a/drivers/char/tpm/tpm.h b/drivers/char/tpm/tpm.h
> index 7b0727c..a4b0f5e 100644
> --- a/drivers/char/tpm/tpm.h
> +++ b/drivers/char/tpm/tpm.h
> @@ -435,4 +435,5 @@ extern int tpm2_startup(struct tpm_chip *chip, u16
> startup_type); extern int tpm2_shutdown(struct tpm_chip *chip, u16
> shutdown_type); extern unsigned long tpm2_calc_ordinal_duration(struct
> tpm_chip *, u32); extern int tpm2_do_selftest(struct tpm_chip *chip);
> -extern int tpm2_gen_interrupt(struct tpm_chip *chip, bool quiet);
> +extern int tpm2_gen_interrupt(struct tpm_chip *chip);
> +extern int tpm2_probe(struct tpm_chip *chip);
> diff --git a/drivers/char/tpm/tpm2-cmd.c b/drivers/char/tpm/tpm2-cmd.c
> index 1abe650..49cd354 100644
> --- a/drivers/char/tpm/tpm2-cmd.c
> +++ b/drivers/char/tpm/tpm2-cmd.c
> @@ -598,20 +598,46 @@ EXPORT_SYMBOL_GPL(tpm2_do_selftest);
>  /**
>   * tpm2_gen_interrupt() - generate an interrupt
>   * @chip: TPM chip to use
> - * @quiet: surpress the error message
>   *
>   * 0 is returned when the operation is successful. If a negative number is
>   * returned it remarks a POSIX error code. If a positive number is
> returned * it remarks a TPM error.
>   */
> -int tpm2_gen_interrupt(struct tpm_chip *chip, bool quiet)
> +int tpm2_gen_interrupt(struct tpm_chip *chip)
>  {
> -	const char *desc = NULL;
>  	u32 dummy;
> 
> -	if (!quiet)
> -		desc = "attempting to generate an interrupt";
> -
> -	return tpm2_get_tpm_pt(chip, TPM2_CAP_TPM_PROPERTIES, &dummy, desc);
> +	return tpm2_get_tpm_pt(chip, 0x100, &dummy,
> +			       "attempting to generate an interrupt");
Why the change from TPM2_CAP_TPM_PROPERTIES = 6 to 0x100 and what does 0x100 
stand for?


>  }
>  EXPORT_SYMBOL_GPL(tpm2_gen_interrupt);
> +
> +/**
> + * tpm2_probe() - probe TPM 2.0
> + * @chip: TPM chip to use
> + *
> + * Send idempotent TPM 2.0 command and see whether TPM 2.0 chip replied
> based on + * the reply tag.
> + */
> +int tpm2_probe(struct tpm_chip *chip)
> +{
> +	struct tpm2_cmd cmd;
> +	int rc;
> +
> +	cmd.header.in = tpm2_get_tpm_pt_header;
> +	cmd.params.get_tpm_pt_in.cap_id = cpu_to_be32(TPM2_CAP_TPM_PROPERTIES);
> +	cmd.params.get_tpm_pt_in.property_id = cpu_to_be32(0x100);
> +	cmd.params.get_tpm_pt_in.property_cnt = cpu_to_be32(1);
> +
> +	rc = tpm_transmit(chip, (const char *) &cmd, sizeof(cmd));
> +	if (rc <  0)
> +		return rc;
> +	else if (rc < TPM_HEADER_SIZE)
> +		return -EFAULT;
> +
> +	if (be16_to_cpu(cmd.header.out.tag) == TPM2_ST_NO_SESSIONS)
> +		chip->flags |= TPM_CHIP_FLAG_TPM2;
> +
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(tpm2_probe);
> diff --git a/drivers/char/tpm/tpm_tis.c b/drivers/char/tpm/tpm_tis.c
> index 6725bef..ee6e0bd 100644
> --- a/drivers/char/tpm/tpm_tis.c
> +++ b/drivers/char/tpm/tpm_tis.c
> @@ -639,12 +639,9 @@ static int tpm_tis_init(struct device *dev,
> acpi_handle acpi_dev_handle, goto out_err;
>  	}
> 
> -	/* Every TPM 2.x command has a higher ordinal than TPM 1.x commands.
> -	 * Therefore, we can use an idempotent TPM 2.x command to probe TPM 2.x.
> -	 */
> -	rc = tpm2_gen_interrupt(chip, true);
> -	if (rc == 0 || rc == TPM2_RC_INITIALIZE)
> -		chip->flags |= TPM_CHIP_FLAG_TPM2;
> +	rc = tpm2_probe(chip);
> +	if (rc)
> +		goto out_err;
> 
>  	vendor = ioread32(chip->vendor.iobase + TPM_DID_VID(0));
>  	chip->vendor.manufacturer_id = vendor;
> @@ -747,7 +744,7 @@ static int tpm_tis_init(struct device *dev, acpi_handle
> acpi_dev_handle,
> 
>  			/* Generate Interrupts */
>  			if (chip->flags & TPM_CHIP_FLAG_TPM2)
> -				tpm2_gen_interrupt(chip, false);
> +				tpm2_gen_interrupt(chip);
>  			else
>  				tpm_gen_interrupt(chip);

^ permalink raw reply

* Re: [RFC] implementing tape statistics single file vs multi-file in sysfs
From: James Bottomley @ 2015-02-08 17:35 UTC (permalink / raw)
  To: Greg KH
  Cc: Laurence Oberman, Bryn M. Reeves, Seymour, Shane M,
	linux-api-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	linux-scsi-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	Kai.Makisara-9Aww8k/80nUxHbG02/KK1g@public.gmane.org,
	Laurence Oberman (loberman-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org)
In-Reply-To: <20150208024506.GC15396-U8xfFu+wG4EAvxtiuMwx3w@public.gmane.org>

On Sun, 2015-02-08 at 10:45 +0800, Greg KH wrote:
> On Sat, Feb 07, 2015 at 09:27:05PM -0500, Laurence Oberman wrote:
> > Hello
> > It's not going to be tens of thousands of devices. That count was an
> > aggregate based on 1000's of servers.
> > In reality its unlikely to ever be more than 100 tapes drives per
> > individual Linux kernel instance.
> > Therefore sysfs will be the valid way to do this and make the data
> > available to user space.
> 
> Even if it is only 2 tape drives, again, what's wrong with using the
> existing i/o statistic interfaces that all block devices have?

Tape is a character device.  It only uses block via SCSI (SCSI uses
block to give an issue queue for every device).  One of the problems
with this model is that the block kobj, where all the statistics hang,
is actually never exposed for these devices because they don't have a
block name.  Even granted that we could alter block to give names to the
nameless queues and expose them in /sys/block, we'd still have the
problem that the queue statistics are the property of the pluggable I/O
scheduler, so there's a disconnect between the SCSI upper layer drivers
and the block scheduler (since the latter is embedded by design).
Pulling that apart would get us into a fairly nasty layering violation
(drivers aren't supposed to care about the schedulers).

>   Don't go
> making special one-off interfaces for one type of device if at all
> possible.

I don't really see any way around this.  The statistics the block
schedulers collect are relevant to I/O load balancing; that's not at all
the same class of statistics as the users of tape are interested in.
This problem is equivalent to the fibrechannel one where we collect the
fc_host_statistics in the scsi_transport_fc.c class as an attribute
group (block doesn't want to see or know any of the information because
it's all relevant to the transport, not the block abstraction).

James

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox