* [PATCH v2 linux-trace 3/8] samples: bpf: simple tracing example in eBPF assembler
From: Alexei Starovoitov @ 2015-01-28 4:06 UTC (permalink / raw)
To: Steven Rostedt
Cc: Ingo Molnar, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
Masami Hiramatsu, linux-api, netdev, linux-kernel
In-Reply-To: <1422417973-10195-1-git-send-email-ast@plumgrid.com>
simple packet drop monitor:
- in-kernel eBPF program attaches to kfree_skb() event and records number
of packet drops at given location
- userspace iterates over the map every second and prints stats
Usage:
$ sudo dropmon
location 0xffffffff81695995 count 1
location 0xffffffff816d0da9 count 2
location 0xffffffff81695995 count 2
location 0xffffffff816d0da9 count 2
location 0xffffffff81695995 count 3
location 0xffffffff816d0da9 count 2
$ addr2line -ape ./bld_x64/vmlinux 0xffffffff81695995 0xffffffff816d0da9
0xffffffff81695995: ./bld_x64/../net/ipv4/icmp.c:1038
0xffffffff816d0da9: ./bld_x64/../net/unix/af_unix.c:1231
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
samples/bpf/Makefile | 2 +
samples/bpf/dropmon.c | 129 +++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 131 insertions(+)
create mode 100644 samples/bpf/dropmon.c
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index b5b3600dcdf5..789691374562 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -6,7 +6,9 @@ hostprogs-y := test_verifier test_maps
hostprogs-y += sock_example
hostprogs-y += sockex1
hostprogs-y += sockex2
+hostprogs-y += dropmon
+dropmon-objs := dropmon.o libbpf.o
test_verifier-objs := test_verifier.o libbpf.o
test_maps-objs := test_maps.o libbpf.o
sock_example-objs := sock_example.o libbpf.o
diff --git a/samples/bpf/dropmon.c b/samples/bpf/dropmon.c
new file mode 100644
index 000000000000..9a2cd3344d69
--- /dev/null
+++ b/samples/bpf/dropmon.c
@@ -0,0 +1,129 @@
+/* simple packet drop monitor:
+ * - in-kernel eBPF program attaches to kfree_skb() event and records number
+ * of packet drops at given location
+ * - userspace iterates over the map every second and prints stats
+ */
+#include <stdio.h>
+#include <unistd.h>
+#include <linux/bpf.h>
+#include <errno.h>
+#include <linux/unistd.h>
+#include <string.h>
+#include <linux/filter.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdbool.h>
+#include "libbpf.h"
+
+#define TRACEPOINT "/sys/kernel/debug/tracing/events/skb/kfree_skb/"
+
+static int write_to_file(const char *file, const char *str, bool keep_open)
+{
+ int fd, err;
+
+ fd = open(file, O_WRONLY);
+ err = write(fd, str, strlen(str));
+ (void) err;
+
+ if (keep_open) {
+ return fd;
+ } else {
+ close(fd);
+ return -1;
+ }
+}
+
+static int dropmon(void)
+{
+ long long key, next_key, value = 0;
+ int prog_fd, map_fd, i;
+ char fmt[32];
+
+ map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value), 1024);
+ if (map_fd < 0) {
+ printf("failed to create map '%s'\n", strerror(errno));
+ goto cleanup;
+ }
+
+ /* the following eBPF program is equivalent to C:
+ * int filter(struct bpf_context *ctx)
+ * {
+ * long loc = ctx->arg2;
+ * long init_val = 1;
+ * long *value;
+ *
+ * value = bpf_map_lookup_elem(MAP_ID, &loc);
+ * if (value) {
+ * __sync_fetch_and_add(value, 1);
+ * } else {
+ * bpf_map_update_elem(MAP_ID, &loc, &init_val, BPF_ANY);
+ * }
+ * return 0;
+ * }
+ */
+ struct bpf_insn prog[] = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, 8), /* r2 = *(u64 *)(r1 + 8) */
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -8), /* *(u64 *)(fp - 8) = r2 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = fp - 8 */
+ BPF_LD_MAP_FD(BPF_REG_1, map_fd),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4),
+ BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */
+ BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_EXIT_INSN(),
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, 1), /* *(u64 *)(fp - 16) = 1 */
+ BPF_MOV64_IMM(BPF_REG_4, BPF_ANY),
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -16), /* r3 = fp - 16 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = fp - 8 */
+ BPF_LD_MAP_FD(BPF_REG_1, map_fd),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_update_elem),
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_EXIT_INSN(),
+ };
+
+ prog_fd = bpf_prog_load(BPF_PROG_TYPE_TRACING_FILTER, prog,
+ sizeof(prog), "GPL");
+ if (prog_fd < 0) {
+ printf("failed to load prog '%s'\n%s", strerror(errno), bpf_log_buf);
+ return -1;
+ }
+
+ sprintf(fmt, "bpf_%d", prog_fd);
+
+ write_to_file(TRACEPOINT "filter", fmt, true);
+
+ for (i = 0; i < 10; i++) {
+ key = 0;
+ while (bpf_get_next_key(map_fd, &key, &next_key) == 0) {
+ bpf_lookup_elem(map_fd, &next_key, &value);
+ printf("location 0x%llx count %lld\n", next_key, value);
+ key = next_key;
+ }
+ if (key)
+ printf("\n");
+ sleep(1);
+ }
+
+cleanup:
+ /* maps, programs, tracepoint filters will auto cleanup on process exit */
+
+ return 0;
+}
+
+int main(void)
+{
+ FILE *f;
+
+ /* start ping in the background to get some kfree_skb events */
+ f = popen("ping -c5 localhost", "r");
+ (void) f;
+
+ dropmon();
+ return 0;
+}
--
1.7.9.5
^ permalink raw reply related
* [PATCH v2 linux-trace 4/8] samples: bpf: simple tracing example in C
From: Alexei Starovoitov @ 2015-01-28 4:06 UTC (permalink / raw)
To: Steven Rostedt
Cc: Ingo Molnar, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
Masami Hiramatsu, linux-api, netdev, linux-kernel
In-Reply-To: <1422417973-10195-1-git-send-email-ast@plumgrid.com>
tracex1_kern.c - C program which will be compiled into eBPF
to filter netif_receive_skb events on skb->dev->name == "lo"
The programs returns 1 to continue storing an event into trace buffer
and returns 0 - to discard an event.
tracex1_user.c - corresponding user space component that
forever reads /sys/.../trace_pipe
Usage:
$ sudo tracex1
should see:
writing bpf-4 -> /sys/kernel/debug/tracing/events/net/netif_receive_skb/filter
ping-364 [000] ..s2 8.089771: netif_receive_skb: dev=lo skbaddr=ffff88000dfcc100 len=84
ping-364 [000] ..s2 8.089889: netif_receive_skb: dev=lo skbaddr=ffff88000dfcc900 len=84
Ctrl-C at any time, kernel will auto cleanup
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
samples/bpf/Makefile | 4 +++
samples/bpf/bpf_helpers.h | 14 +++++++++++
samples/bpf/bpf_load.c | 59 +++++++++++++++++++++++++++++++++++++++-----
samples/bpf/bpf_load.h | 3 +++
samples/bpf/tracex1_kern.c | 28 +++++++++++++++++++++
samples/bpf/tracex1_user.c | 24 ++++++++++++++++++
6 files changed, 126 insertions(+), 6 deletions(-)
create mode 100644 samples/bpf/tracex1_kern.c
create mode 100644 samples/bpf/tracex1_user.c
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 789691374562..da28e1b6d3a6 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -7,6 +7,7 @@ hostprogs-y += sock_example
hostprogs-y += sockex1
hostprogs-y += sockex2
hostprogs-y += dropmon
+hostprogs-y += tracex1
dropmon-objs := dropmon.o libbpf.o
test_verifier-objs := test_verifier.o libbpf.o
@@ -14,17 +15,20 @@ test_maps-objs := test_maps.o libbpf.o
sock_example-objs := sock_example.o libbpf.o
sockex1-objs := bpf_load.o libbpf.o sockex1_user.o
sockex2-objs := bpf_load.o libbpf.o sockex2_user.o
+tracex1-objs := bpf_load.o libbpf.o tracex1_user.o
# Tell kbuild to always build the programs
always := $(hostprogs-y)
always += sockex1_kern.o
always += sockex2_kern.o
+always += tracex1_kern.o
HOSTCFLAGS += -I$(objtree)/usr/include
HOSTCFLAGS_bpf_load.o += -I$(objtree)/usr/include -Wno-unused-variable
HOSTLOADLIBES_sockex1 += -lelf
HOSTLOADLIBES_sockex2 += -lelf
+HOSTLOADLIBES_tracex1 += -lelf
# point this to your LLVM backend with bpf support
LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h
index ca0333146006..9c385c2eacf8 100644
--- a/samples/bpf/bpf_helpers.h
+++ b/samples/bpf/bpf_helpers.h
@@ -15,6 +15,20 @@ static int (*bpf_map_update_elem)(void *map, void *key, void *value,
(void *) BPF_FUNC_map_update_elem;
static int (*bpf_map_delete_elem)(void *map, void *key) =
(void *) BPF_FUNC_map_delete_elem;
+static void *(*bpf_fetch_ptr)(void *unsafe_ptr) =
+ (void *) BPF_FUNC_fetch_ptr;
+static unsigned long long (*bpf_fetch_u64)(void *unsafe_ptr) =
+ (void *) BPF_FUNC_fetch_u64;
+static unsigned int (*bpf_fetch_u32)(void *unsafe_ptr) =
+ (void *) BPF_FUNC_fetch_u32;
+static unsigned short (*bpf_fetch_u16)(void *unsafe_ptr) =
+ (void *) BPF_FUNC_fetch_u16;
+static unsigned char (*bpf_fetch_u8)(void *unsafe_ptr) =
+ (void *) BPF_FUNC_fetch_u8;
+static int (*bpf_memcmp)(void *unsafe_ptr, void *safe_ptr, int size) =
+ (void *) BPF_FUNC_memcmp;
+static unsigned long long (*bpf_ktime_get_ns)(void) =
+ (void *) BPF_FUNC_ktime_get_ns;
/* llvm builtin functions that eBPF C program may use to
* emit BPF_LD_ABS and BPF_LD_IND instructions
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index 1831d236382b..788ac51c1024 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -14,6 +14,8 @@
#include "bpf_helpers.h"
#include "bpf_load.h"
+#define DEBUGFS "/sys/kernel/debug/tracing/"
+
static char license[128];
static bool processed_sec[128];
int map_fd[MAX_MAPS];
@@ -22,15 +24,18 @@ int prog_cnt;
static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
{
- int fd;
bool is_socket = strncmp(event, "socket", 6) == 0;
+ enum bpf_prog_type prog_type;
+ char path[256] = DEBUGFS;
+ char fmt[32];
+ int fd, event_fd, err;
- if (!is_socket)
- /* tracing events tbd */
- return -1;
+ if (is_socket)
+ prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
+ else
+ prog_type = BPF_PROG_TYPE_TRACING_FILTER;
- fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER,
- prog, size, license);
+ fd = bpf_prog_load(prog_type, prog, size, license);
if (fd < 0) {
printf("bpf_prog_load() err=%d\n%s", errno, bpf_log_buf);
@@ -39,6 +44,28 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
prog_fd[prog_cnt++] = fd;
+ if (is_socket)
+ return 0;
+
+ snprintf(fmt, sizeof(fmt), "bpf-%d", fd);
+
+ strcat(path, event);
+ strcat(path, "/filter");
+
+ printf("writing %s -> %s\n", fmt, path);
+
+ event_fd = open(path, O_WRONLY, 0);
+ if (event_fd < 0) {
+ printf("failed to open event %s\n", event);
+ return -1;
+ }
+
+ err = write(event_fd, fmt, strlen(fmt));
+ if (err < 0) {
+ printf("write to '%s' failed '%s'\n", event, strerror(errno));
+ return -1;
+ }
+
return 0;
}
@@ -201,3 +228,23 @@ int load_bpf_file(char *path)
close(fd);
return 0;
}
+
+void read_trace_pipe(void)
+{
+ int trace_fd;
+
+ trace_fd = open(DEBUGFS "trace_pipe", O_RDONLY, 0);
+ if (trace_fd < 0)
+ return;
+
+ while (1) {
+ static char buf[4096];
+ ssize_t sz;
+
+ sz = read(trace_fd, buf, sizeof(buf));
+ if (sz) {
+ buf[sz] = 0;
+ puts(buf);
+ }
+ }
+}
diff --git a/samples/bpf/bpf_load.h b/samples/bpf/bpf_load.h
index 27789a34f5e6..d154fc2b0535 100644
--- a/samples/bpf/bpf_load.h
+++ b/samples/bpf/bpf_load.h
@@ -21,4 +21,7 @@ extern int prog_fd[MAX_PROGS];
*/
int load_bpf_file(char *path);
+/* forever reads /sys/.../trace_pipe */
+void read_trace_pipe(void);
+
#endif
diff --git a/samples/bpf/tracex1_kern.c b/samples/bpf/tracex1_kern.c
new file mode 100644
index 000000000000..7849ceb4bce6
--- /dev/null
+++ b/samples/bpf/tracex1_kern.c
@@ -0,0 +1,28 @@
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <uapi/linux/bpf.h>
+#include <trace/bpf_trace.h>
+#include "bpf_helpers.h"
+
+SEC("events/net/netif_receive_skb")
+int bpf_prog1(struct bpf_context *ctx)
+{
+ /*
+ * attaches to /sys/kernel/debug/tracing/events/net/netif_receive_skb
+ * prints events for loobpack device only
+ */
+ char devname[] = "lo";
+ struct net_device *dev;
+ struct sk_buff *skb = 0;
+
+ skb = (struct sk_buff *) ctx->arg1;
+ dev = bpf_fetch_ptr(&skb->dev);
+ if (bpf_memcmp(dev->name, devname, 2) == 0)
+ /* print event using default tracepoint format */
+ return 1;
+
+ /* drop event */
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/tracex1_user.c b/samples/bpf/tracex1_user.c
new file mode 100644
index 000000000000..e85c1b483f57
--- /dev/null
+++ b/samples/bpf/tracex1_user.c
@@ -0,0 +1,24 @@
+#include <stdio.h>
+#include <linux/bpf.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+int main(int ac, char **argv)
+{
+ FILE *f;
+ char filename[256];
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+
+ f = popen("ping -c5 localhost", "r");
+ (void) f;
+
+ read_trace_pipe();
+
+ return 0;
+}
--
1.7.9.5
^ permalink raw reply related
* [PATCH v2 linux-trace 5/8] samples: bpf: counting example for kfree_skb tracepoint and write syscall
From: Alexei Starovoitov @ 2015-01-28 4:06 UTC (permalink / raw)
To: Steven Rostedt
Cc: Ingo Molnar, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
Masami Hiramatsu, linux-api, netdev, linux-kernel
In-Reply-To: <1422417973-10195-1-git-send-email-ast@plumgrid.com>
this example has two probes in one C file that attach to different tracepoints
and use two different maps.
1st probe is the similar to dropmon.c. It attaches to kfree_skb tracepoint and
count number of packet drops at different locations
2nd probe attaches to syscalls/sys_enter_write and computes a histogram of different
write sizes
Usage:
$ sudo tracex2
writing bpf-8 -> /sys/kernel/debug/tracing/events/skb/kfree_skb/filter
writing bpf-10 -> /sys/kernel/debug/tracing/events/syscalls/sys_enter_write/filter
location 0xffffffff816959a5 count 1
location 0xffffffff816959a5 count 2
557145+0 records in
557145+0 records out
285258240 bytes (285 MB) copied, 1.02379 s, 279 MB/s
syscall write() stats
byte_size : count distribution
1 -> 1 : 3 | |
2 -> 3 : 0 | |
4 -> 7 : 0 | |
8 -> 15 : 0 | |
16 -> 31 : 2 | |
32 -> 63 : 3 | |
64 -> 127 : 1 | |
128 -> 255 : 1 | |
256 -> 511 : 0 | |
512 -> 1023 : 1118968 |************************************* |
Ctrl-C at any time. Kernel will auto cleanup maps and programs
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
samples/bpf/Makefile | 4 ++
samples/bpf/tracex2_kern.c | 71 +++++++++++++++++++++++++++++++++
samples/bpf/tracex2_user.c | 95 ++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 170 insertions(+)
create mode 100644 samples/bpf/tracex2_kern.c
create mode 100644 samples/bpf/tracex2_user.c
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index da28e1b6d3a6..416af24b01fd 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -8,6 +8,7 @@ hostprogs-y += sockex1
hostprogs-y += sockex2
hostprogs-y += dropmon
hostprogs-y += tracex1
+hostprogs-y += tracex2
dropmon-objs := dropmon.o libbpf.o
test_verifier-objs := test_verifier.o libbpf.o
@@ -16,12 +17,14 @@ sock_example-objs := sock_example.o libbpf.o
sockex1-objs := bpf_load.o libbpf.o sockex1_user.o
sockex2-objs := bpf_load.o libbpf.o sockex2_user.o
tracex1-objs := bpf_load.o libbpf.o tracex1_user.o
+tracex2-objs := bpf_load.o libbpf.o tracex2_user.o
# Tell kbuild to always build the programs
always := $(hostprogs-y)
always += sockex1_kern.o
always += sockex2_kern.o
always += tracex1_kern.o
+always += tracex2_kern.o
HOSTCFLAGS += -I$(objtree)/usr/include
@@ -29,6 +32,7 @@ HOSTCFLAGS_bpf_load.o += -I$(objtree)/usr/include -Wno-unused-variable
HOSTLOADLIBES_sockex1 += -lelf
HOSTLOADLIBES_sockex2 += -lelf
HOSTLOADLIBES_tracex1 += -lelf
+HOSTLOADLIBES_tracex2 += -lelf
# point this to your LLVM backend with bpf support
LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
diff --git a/samples/bpf/tracex2_kern.c b/samples/bpf/tracex2_kern.c
new file mode 100644
index 000000000000..a789c456c1b4
--- /dev/null
+++ b/samples/bpf/tracex2_kern.c
@@ -0,0 +1,71 @@
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <uapi/linux/bpf.h>
+#include <trace/bpf_trace.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") my_map = {
+ .type = BPF_MAP_TYPE_HASH,
+ .key_size = sizeof(long),
+ .value_size = sizeof(long),
+ .max_entries = 1024,
+};
+
+SEC("events/skb/kfree_skb")
+int bpf_prog2(struct bpf_context *ctx)
+{
+ long loc = ctx->arg2;
+ long init_val = 1;
+ long *value;
+
+ value = bpf_map_lookup_elem(&my_map, &loc);
+ if (value)
+ *value += 1;
+ else
+ bpf_map_update_elem(&my_map, &loc, &init_val, BPF_ANY);
+ return 0;
+}
+
+static unsigned int log2(unsigned int v)
+{
+ unsigned int r;
+ unsigned int shift;
+
+ r = (v > 0xFFFF) << 4; v >>= r;
+ shift = (v > 0xFF) << 3; v >>= shift; r |= shift;
+ shift = (v > 0xF) << 2; v >>= shift; r |= shift;
+ shift = (v > 0x3) << 1; v >>= shift; r |= shift;
+ r |= (v >> 1);
+ return r;
+}
+
+static unsigned int log2l(unsigned long v)
+{
+ unsigned int hi = v >> 32;
+ if (hi)
+ return log2(hi) + 32;
+ else
+ return log2(v);
+}
+
+struct bpf_map_def SEC("maps") my_hist_map = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(long),
+ .max_entries = 64,
+};
+
+SEC("events/syscalls/sys_enter_write")
+int bpf_prog3(struct bpf_context *ctx)
+{
+ long write_size = ctx->arg3;
+ long init_val = 1;
+ long *value;
+ u32 index = log2l(write_size);
+
+ value = bpf_map_lookup_elem(&my_hist_map, &index);
+ if (value)
+ __sync_fetch_and_add(value, 1);
+ return 0;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/tracex2_user.c b/samples/bpf/tracex2_user.c
new file mode 100644
index 000000000000..016a76e97cd7
--- /dev/null
+++ b/samples/bpf/tracex2_user.c
@@ -0,0 +1,95 @@
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <linux/bpf.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+#define MAX_INDEX 64
+#define MAX_STARS 38
+
+static void stars(char *str, long val, long max, int width)
+{
+ int i;
+
+ for (i = 0; i < (width * val / max) - 1 && i < width - 1; i++)
+ str[i] = '*';
+ if (val > max)
+ str[i - 1] = '+';
+ str[i] = '\0';
+}
+
+static void print_hist(int fd)
+{
+ int key;
+ long value;
+ long data[MAX_INDEX] = {};
+ char starstr[MAX_STARS];
+ int i;
+ int max_ind = -1;
+ long max_value = 0;
+
+ for (key = 0; key < MAX_INDEX; key++) {
+ bpf_lookup_elem(fd, &key, &value);
+ data[key] = value;
+ if (value && key > max_ind)
+ max_ind = key;
+ if (value > max_value)
+ max_value = value;
+ }
+
+ printf(" syscall write() stats\n");
+ printf(" byte_size : count distribution\n");
+ for (i = 1; i <= max_ind + 1; i++) {
+ stars(starstr, data[i - 1], max_value, MAX_STARS);
+ printf("%8ld -> %-8ld : %-8ld |%-*s|\n",
+ (1l << i) >> 1, (1l << i) - 1, data[i - 1],
+ MAX_STARS, starstr);
+ }
+}
+static void int_exit(int sig)
+{
+ print_hist(map_fd[1]);
+ exit(0);
+}
+
+int main(int ac, char **argv)
+{
+ char filename[256];
+ long key, next_key, value;
+ FILE *f;
+ int i;
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+ signal(SIGINT, int_exit);
+
+ /* start 'ping' in the background to have some kfree_skb events */
+ f = popen("ping -c5 localhost", "r");
+ (void) f;
+
+ /* start 'dd' in the background to have plenty of 'write' syscalls */
+ f = popen("dd if=/dev/zero of=/dev/null", "r");
+ (void) f;
+
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+
+ for (i = 0; i < 5; i++) {
+ key = 0;
+ while (bpf_get_next_key(map_fd[0], &key, &next_key) == 0) {
+ bpf_lookup_elem(map_fd[0], &next_key, &value);
+ printf("location 0x%lx count %ld\n", next_key, value);
+ key = next_key;
+ }
+ if (key)
+ printf("\n");
+ sleep(1);
+ }
+ print_hist(map_fd[1]);
+
+ return 0;
+}
--
1.7.9.5
^ permalink raw reply related
* [PATCH v2 linux-trace 6/8] samples: bpf: IO latency analysis (iosnoop/heatmap)
From: Alexei Starovoitov @ 2015-01-28 4:06 UTC (permalink / raw)
To: Steven Rostedt
Cc: Ingo Molnar, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
Masami Hiramatsu, linux-api, netdev, linux-kernel
In-Reply-To: <1422417973-10195-1-git-send-email-ast@plumgrid.com>
eBPF C program attaches to block_rq_issue/block_rq_complete events to calculate
IO latency. Then it waits for the first 100 events to compute average latency
and uses range [0 .. ave_lat * 2] to record histogram of events in this latency
range.
User space reads this histogram map every 2 seconds and prints it as a 'heatmap'
using gray shades of text terminal. Black spaces have many events and white
spaces have very few events. Left most space is the smallest latency, right most
space is the largest latency in the range.
If kernel sees too many events that fall out of histogram range, user space
adjusts the range up, so heatmap for next 2 seconds will be more accurate.
Usage:
$ sudo ./tracex3
and do 'sudo dd if=/dev/sda of=/dev/null' in other terminal.
Observe IO latencies and how different activity (like 'make kernel') affects it.
Similar experiments can be done for network transmit latencies, syscalls, etc
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
samples/bpf/Makefile | 4 ++
samples/bpf/tracex3_kern.c | 92 +++++++++++++++++++++++++++
samples/bpf/tracex3_user.c | 150 ++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 246 insertions(+)
create mode 100644 samples/bpf/tracex3_kern.c
create mode 100644 samples/bpf/tracex3_user.c
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 416af24b01fd..da0efd8032ab 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -9,6 +9,7 @@ hostprogs-y += sockex2
hostprogs-y += dropmon
hostprogs-y += tracex1
hostprogs-y += tracex2
+hostprogs-y += tracex3
dropmon-objs := dropmon.o libbpf.o
test_verifier-objs := test_verifier.o libbpf.o
@@ -18,6 +19,7 @@ sockex1-objs := bpf_load.o libbpf.o sockex1_user.o
sockex2-objs := bpf_load.o libbpf.o sockex2_user.o
tracex1-objs := bpf_load.o libbpf.o tracex1_user.o
tracex2-objs := bpf_load.o libbpf.o tracex2_user.o
+tracex3-objs := bpf_load.o libbpf.o tracex3_user.o
# Tell kbuild to always build the programs
always := $(hostprogs-y)
@@ -25,6 +27,7 @@ always += sockex1_kern.o
always += sockex2_kern.o
always += tracex1_kern.o
always += tracex2_kern.o
+always += tracex3_kern.o
HOSTCFLAGS += -I$(objtree)/usr/include
@@ -33,6 +36,7 @@ HOSTLOADLIBES_sockex1 += -lelf
HOSTLOADLIBES_sockex2 += -lelf
HOSTLOADLIBES_tracex1 += -lelf
HOSTLOADLIBES_tracex2 += -lelf
+HOSTLOADLIBES_tracex3 += -lelf
# point this to your LLVM backend with bpf support
LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
diff --git a/samples/bpf/tracex3_kern.c b/samples/bpf/tracex3_kern.c
new file mode 100644
index 000000000000..c31f29aa6fc1
--- /dev/null
+++ b/samples/bpf/tracex3_kern.c
@@ -0,0 +1,92 @@
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <uapi/linux/bpf.h>
+#include <trace/bpf_trace.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") my_map = {
+ .type = BPF_MAP_TYPE_HASH,
+ .key_size = sizeof(long),
+ .value_size = sizeof(u64),
+ .max_entries = 4096,
+};
+
+SEC("events/block/block_rq_issue")
+int bpf_prog1(struct bpf_context *ctx)
+{
+ long rq = ctx->arg2;
+ u64 val = bpf_ktime_get_ns();
+
+ bpf_map_update_elem(&my_map, &rq, &val, BPF_ANY);
+ return 0;
+}
+
+struct globals {
+ u64 lat_ave;
+ u64 lat_sum;
+ u64 missed;
+ u64 max_lat;
+ int num_samples;
+};
+
+struct bpf_map_def SEC("maps") global_map = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(int),
+ .value_size = sizeof(struct globals),
+ .max_entries = 1,
+};
+
+#define MAX_SLOT 32
+
+struct bpf_map_def SEC("maps") lat_map = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(int),
+ .value_size = sizeof(u64),
+ .max_entries = MAX_SLOT,
+};
+
+SEC("events/block/block_rq_complete")
+int bpf_prog2(struct bpf_context *ctx)
+{
+ long rq = ctx->arg2;
+ void *value;
+
+ value = bpf_map_lookup_elem(&my_map, &rq);
+ if (!value)
+ return 0;
+
+ u64 cur_time = bpf_ktime_get_ns();
+ u64 delta = (cur_time - *(u64 *)value) / 1000;
+
+ bpf_map_delete_elem(&my_map, &rq);
+
+ int ind = 0;
+ struct globals *g = bpf_map_lookup_elem(&global_map, &ind);
+
+ if (!g)
+ return 0;
+ if (g->lat_ave == 0) {
+ g->num_samples++;
+ g->lat_sum += delta;
+ if (g->num_samples >= 100)
+ g->lat_ave = g->lat_sum / g->num_samples;
+ } else {
+ u64 max_lat = g->lat_ave * 2;
+
+ if (delta > max_lat) {
+ g->missed++;
+ if (delta > g->max_lat)
+ g->max_lat = delta;
+ return 0;
+ }
+
+ ind = delta * MAX_SLOT / max_lat;
+ value = bpf_map_lookup_elem(&lat_map, &ind);
+ if (!value)
+ return 0;
+ (*(u64 *)value)++;
+ }
+
+ return 0;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/tracex3_user.c b/samples/bpf/tracex3_user.c
new file mode 100644
index 000000000000..b7549adbd981
--- /dev/null
+++ b/samples/bpf/tracex3_user.c
@@ -0,0 +1,150 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <unistd.h>
+#include <linux/bpf.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
+
+struct globals {
+ __u64 lat_ave;
+ __u64 lat_sum;
+ __u64 missed;
+ __u64 max_lat;
+ int num_samples;
+};
+
+static void clear_stats(int fd)
+{
+ int key;
+ __u64 value = 0;
+
+ for (key = 0; key < 32; key++)
+ bpf_update_elem(fd, &key, &value, BPF_ANY);
+}
+
+const char *color[] = {
+ "\033[48;5;255m",
+ "\033[48;5;252m",
+ "\033[48;5;250m",
+ "\033[48;5;248m",
+ "\033[48;5;246m",
+ "\033[48;5;244m",
+ "\033[48;5;242m",
+ "\033[48;5;240m",
+ "\033[48;5;238m",
+ "\033[48;5;236m",
+ "\033[48;5;234m",
+ "\033[48;5;232m",
+};
+const int num_colors = ARRAY_SIZE(color);
+
+const char nocolor[] = "\033[00m";
+
+static void print_banner(__u64 max_lat)
+{
+ printf("0 usec ... %lld usec\n", max_lat);
+}
+
+static void print_hist(int fd)
+{
+ int key;
+ __u64 value;
+ __u64 cnt[32];
+ __u64 max_cnt = 0;
+ __u64 total_events = 0;
+ int max_bucket = 0;
+
+ for (key = 0; key < 32; key++) {
+ value = 0;
+ bpf_lookup_elem(fd, &key, &value);
+ if (value > 0)
+ max_bucket = key;
+ cnt[key] = value;
+ total_events += value;
+ if (value > max_cnt)
+ max_cnt = value;
+ }
+ clear_stats(fd);
+ for (key = 0; key < 32; key++) {
+ int c = num_colors * cnt[key] / (max_cnt + 1);
+
+ printf("%s %s", color[c], nocolor);
+ }
+ printf(" captured=%lld", total_events);
+
+ key = 0;
+ struct globals g = {};
+
+ bpf_lookup_elem(map_fd[1], &key, &g);
+
+ printf(" missed=%lld max_lat=%lld usec\n",
+ g.missed, g.max_lat);
+
+ if (g.missed > 10 && g.missed > total_events / 10) {
+ printf("adjusting range UP...\n");
+ g.lat_ave = g.max_lat / 2;
+ print_banner(g.lat_ave * 2);
+ } else if (max_bucket < 4 && total_events > 100) {
+ printf("adjusting range DOWN...\n");
+ g.lat_ave = g.lat_ave / 4;
+ print_banner(g.lat_ave * 2);
+ }
+ /* clear some globals */
+ g.missed = 0;
+ g.max_lat = 0;
+ bpf_update_elem(map_fd[1], &key, &g, BPF_ANY);
+}
+
+static void int_exit(int sig)
+{
+ print_hist(map_fd[2]);
+ exit(0);
+}
+
+int main(int ac, char **argv)
+{
+ char filename[256];
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+
+ clear_stats(map_fd[2]);
+
+ signal(SIGINT, int_exit);
+
+ if (fork() == 0) {
+ read_trace_pipe();
+ } else {
+ struct globals g;
+
+ printf("waiting for events to determine average latency...\n");
+ for (;;) {
+ int key = 0;
+
+ bpf_lookup_elem(map_fd[1], &key, &g);
+ if (g.lat_ave)
+ break;
+ sleep(1);
+ }
+
+ printf(" IO latency in usec\n"
+ " %s %s - many events with this latency\n"
+ " %s %s - few events\n",
+ color[num_colors - 1], nocolor,
+ color[0], nocolor);
+ print_banner(g.lat_ave * 2);
+ for (;;) {
+ print_hist(map_fd[2]);
+ sleep(2);
+ }
+ }
+
+ return 0;
+}
--
1.7.9.5
^ permalink raw reply related
* [PATCH v2 linux-trace 7/8] tracing: attach eBPF programs to kprobe/kretprobe
From: Alexei Starovoitov @ 2015-01-28 4:06 UTC (permalink / raw)
To: Steven Rostedt
Cc: Ingo Molnar, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
Masami Hiramatsu, linux-api, netdev, linux-kernel
In-Reply-To: <1422417973-10195-1-git-send-email-ast@plumgrid.com>
introduce new type of eBPF programs BPF_PROG_TYPE_KPROBE_FILTER.
Such programs are allowed to call the same helper functions
as tracing filters, but bpf_context is different:
For tracing filters bpf_context is 6 arguments of tracepoints or syscalls
For kprobe filters bpf_context == pt_regs
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
include/linux/ftrace_event.h | 2 ++
include/uapi/linux/bpf.h | 1 +
kernel/trace/bpf_trace.c | 39 ++++++++++++++++++++++++++++++++++++
kernel/trace/trace_events_filter.c | 10 ++++++---
kernel/trace/trace_kprobe.c | 11 +++++++++-
5 files changed, 59 insertions(+), 4 deletions(-)
diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 79de230b7df3..b057ca0c5539 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -249,6 +249,7 @@ enum {
TRACE_EVENT_FL_USE_CALL_FILTER_BIT,
TRACE_EVENT_FL_TRACEPOINT_BIT,
TRACE_EVENT_FL_BPF_BIT,
+ TRACE_EVENT_FL_KPROBE_BIT,
};
/*
@@ -272,6 +273,7 @@ enum {
TRACE_EVENT_FL_USE_CALL_FILTER = (1 << TRACE_EVENT_FL_USE_CALL_FILTER_BIT),
TRACE_EVENT_FL_TRACEPOINT = (1 << TRACE_EVENT_FL_TRACEPOINT_BIT),
TRACE_EVENT_FL_BPF = (1 << TRACE_EVENT_FL_BPF_BIT),
+ TRACE_EVENT_FL_KPROBE = (1 << TRACE_EVENT_FL_KPROBE_BIT),
};
struct ftrace_event_call {
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 227a4e404726..974932b8b5c6 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -119,6 +119,7 @@ enum bpf_prog_type {
BPF_PROG_TYPE_UNSPEC,
BPF_PROG_TYPE_SOCKET_FILTER,
BPF_PROG_TYPE_TRACING_FILTER,
+ BPF_PROG_TYPE_KPROBE_FILTER,
};
/* flags for BPF_MAP_UPDATE_ELEM command */
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 1c07f55702d6..0fc50d3ecde1 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -137,3 +137,42 @@ static int __init register_tracing_filter_ops(void)
return 0;
}
late_initcall(register_tracing_filter_ops);
+
+/* check access to fields of 'struct pt_regs' from BPF program */
+static bool kprobe_filter_is_valid_access(int off, int size, enum bpf_access_type type)
+{
+ /* check bounds */
+ if (off < 0 || off >= sizeof(struct pt_regs))
+ return false;
+
+ /* only read is allowed */
+ if (type != BPF_READ)
+ return false;
+
+ /* disallow misaligned access */
+ if (off % size != 0)
+ return false;
+
+ return true;
+}
+/* kprobe filter programs are allowed to call the same helper functions
+ * as tracing filters, but bpf_context is different:
+ * For tracing filters bpf_context is 6 arguments of tracepoints or syscalls
+ * For kprobe filters bpf_context == pt_regs
+ */
+static struct bpf_verifier_ops kprobe_filter_ops = {
+ .get_func_proto = tracing_filter_func_proto,
+ .is_valid_access = kprobe_filter_is_valid_access,
+};
+
+static struct bpf_prog_type_list kprobe_tl = {
+ .ops = &kprobe_filter_ops,
+ .type = BPF_PROG_TYPE_KPROBE_FILTER,
+};
+
+static int __init register_kprobe_filter_ops(void)
+{
+ bpf_register_prog_type(&kprobe_tl);
+ return 0;
+}
+late_initcall(register_kprobe_filter_ops);
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index e0303b3cc9fb..e4a0268f2810 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1894,7 +1894,8 @@ static int create_filter_start(char *filter_str, bool set_str,
return err;
}
-static int create_filter_bpf(char *filter_str, struct event_filter **filterp)
+static int create_filter_bpf(struct ftrace_event_call *call, char *filter_str,
+ struct event_filter **filterp)
{
struct event_filter *filter;
struct bpf_prog *prog;
@@ -1923,7 +1924,10 @@ static int create_filter_bpf(char *filter_str, struct event_filter **filterp)
filter->prog = prog;
- if (prog->aux->prog_type != BPF_PROG_TYPE_TRACING_FILTER) {
+ if (((call->flags & TRACE_EVENT_FL_KPROBE) &&
+ prog->aux->prog_type != BPF_PROG_TYPE_KPROBE_FILTER) ||
+ (!(call->flags & TRACE_EVENT_FL_KPROBE) &&
+ prog->aux->prog_type != BPF_PROG_TYPE_TRACING_FILTER)) {
/* valid fd, but invalid bpf program type */
err = -EINVAL;
goto free_filter;
@@ -2054,7 +2058,7 @@ int apply_event_filter(struct ftrace_event_file *file, char *filter_string)
*/
if (memcmp(filter_string, "bpf", 3) == 0 && filter_string[3] != 0 &&
filter_string[4] != 0) {
- err = create_filter_bpf(filter_string, &filter);
+ err = create_filter_bpf(call, filter_string, &filter);
if (!err)
file->flags |= TRACE_EVENT_FL_BPF;
} else {
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 5edb518be345..ec62dd8cb35f 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -19,6 +19,7 @@
#include <linux/module.h>
#include <linux/uaccess.h>
+#include <trace/bpf_trace.h>
#include "trace_probe.h"
@@ -930,6 +931,10 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
if (ftrace_trigger_soft_disabled(ftrace_file))
return;
+ if (ftrace_file->flags & TRACE_EVENT_FL_BPF)
+ if (!trace_filter_call_bpf(ftrace_file->filter, regs))
+ return;
+
local_save_flags(irq_flags);
pc = preempt_count();
@@ -978,6 +983,10 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
if (ftrace_trigger_soft_disabled(ftrace_file))
return;
+ if (ftrace_file->flags & TRACE_EVENT_FL_BPF)
+ if (!trace_filter_call_bpf(ftrace_file->filter, regs))
+ return;
+
local_save_flags(irq_flags);
pc = preempt_count();
@@ -1286,7 +1295,7 @@ static int register_kprobe_event(struct trace_kprobe *tk)
kfree(call->print_fmt);
return -ENODEV;
}
- call->flags = 0;
+ call->flags = TRACE_EVENT_FL_KPROBE;
call->class->reg = kprobe_register;
call->data = tk;
ret = trace_add_event_call(call);
--
1.7.9.5
^ permalink raw reply related
* [PATCH v2 linux-trace 8/8] samples: bpf: simple kprobe example
From: Alexei Starovoitov @ 2015-01-28 4:06 UTC (permalink / raw)
To: Steven Rostedt
Cc: Ingo Molnar, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
Masami Hiramatsu, linux-api-u79uwXL29TY76Z2rM5mHXA,
netdev-u79uwXL29TY76Z2rM5mHXA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1422417973-10195-1-git-send-email-ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>
the logic of the example is similar to tracex2, but syscall 'write' statistics
is capturead from kprobe placed at sys_write function instead of through
syscall instrumentation.
Also tracex4_kern.c has a different way of doing log2 in C.
Note, unlike tracepoint and syscall programs, kprobe programs receive
'struct pt_regs' as an input. It's responsibility of the program author
or higher level dynamic tracing tool to match registers to function arguments.
Since pt_regs are architecture dependent, programs are also arch dependent,
unlike tracepoint/syscalls programs which are universal.
Usage:
$ sudo tracex4
writing bpf-6 -> /sys/kernel/debug/tracing/events/kprobes/sys_write/filter
2216443+0 records in
2216442+0 records out
1134818304 bytes (1.1 GB) copied, 2.00746 s, 565 MB/s
kprobe sys_write() stats
byte_size : count distribution
1 -> 1 : 0 | |
2 -> 3 : 0 | |
4 -> 7 : 0 | |
8 -> 15 : 0 | |
16 -> 31 : 0 | |
32 -> 63 : 0 | |
64 -> 127 : 1 | |
128 -> 255 : 0 | |
256 -> 511 : 0 | |
512 -> 1023 : 2214734 |************************************* |
Signed-off-by: Alexei Starovoitov <ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>
---
samples/bpf/Makefile | 4 +++
samples/bpf/bpf_load.c | 3 ++
samples/bpf/tracex4_kern.c | 36 +++++++++++++++++++
samples/bpf/tracex4_user.c | 83 ++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 126 insertions(+)
create mode 100644 samples/bpf/tracex4_kern.c
create mode 100644 samples/bpf/tracex4_user.c
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index da0efd8032ab..22c7a38f3f95 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -10,6 +10,7 @@ hostprogs-y += dropmon
hostprogs-y += tracex1
hostprogs-y += tracex2
hostprogs-y += tracex3
+hostprogs-y += tracex4
dropmon-objs := dropmon.o libbpf.o
test_verifier-objs := test_verifier.o libbpf.o
@@ -20,6 +21,7 @@ sockex2-objs := bpf_load.o libbpf.o sockex2_user.o
tracex1-objs := bpf_load.o libbpf.o tracex1_user.o
tracex2-objs := bpf_load.o libbpf.o tracex2_user.o
tracex3-objs := bpf_load.o libbpf.o tracex3_user.o
+tracex4-objs := bpf_load.o libbpf.o tracex4_user.o
# Tell kbuild to always build the programs
always := $(hostprogs-y)
@@ -28,6 +30,7 @@ always += sockex2_kern.o
always += tracex1_kern.o
always += tracex2_kern.o
always += tracex3_kern.o
+always += tracex4_kern.o
HOSTCFLAGS += -I$(objtree)/usr/include
@@ -37,6 +40,7 @@ HOSTLOADLIBES_sockex2 += -lelf
HOSTLOADLIBES_tracex1 += -lelf
HOSTLOADLIBES_tracex2 += -lelf
HOSTLOADLIBES_tracex3 += -lelf
+HOSTLOADLIBES_tracex4 += -lelf
# point this to your LLVM backend with bpf support
LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index 788ac51c1024..d8c5176f0564 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -25,6 +25,7 @@ int prog_cnt;
static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
{
bool is_socket = strncmp(event, "socket", 6) == 0;
+ bool is_kprobe = strncmp(event, "events/kprobes/", 15) == 0;
enum bpf_prog_type prog_type;
char path[256] = DEBUGFS;
char fmt[32];
@@ -32,6 +33,8 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
if (is_socket)
prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
+ else if (is_kprobe)
+ prog_type = BPF_PROG_TYPE_KPROBE_FILTER;
else
prog_type = BPF_PROG_TYPE_TRACING_FILTER;
diff --git a/samples/bpf/tracex4_kern.c b/samples/bpf/tracex4_kern.c
new file mode 100644
index 000000000000..9646f9e43417
--- /dev/null
+++ b/samples/bpf/tracex4_kern.c
@@ -0,0 +1,36 @@
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <uapi/linux/bpf.h>
+#include <trace/bpf_trace.h>
+#include "bpf_helpers.h"
+
+static unsigned int log2l(unsigned long long n)
+{
+#define S(k) if (n >= (1ull << k)) { i += k; n >>= k; }
+ int i = -(n == 0);
+ S(32); S(16); S(8); S(4); S(2); S(1);
+ return i;
+#undef S
+}
+
+struct bpf_map_def SEC("maps") my_hist_map = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(long),
+ .max_entries = 64,
+};
+
+SEC("events/kprobes/sys_write")
+int bpf_prog4(struct pt_regs *regs)
+{
+ long write_size = regs->dx; /* $rdx contains 3rd argument to a function */
+ long init_val = 1;
+ void *value;
+ u32 index = log2l(write_size);
+
+ value = bpf_map_lookup_elem(&my_hist_map, &index);
+ if (value)
+ __sync_fetch_and_add((long *)value, 1);
+ return 0;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/tracex4_user.c b/samples/bpf/tracex4_user.c
new file mode 100644
index 000000000000..47dde2791f9e
--- /dev/null
+++ b/samples/bpf/tracex4_user.c
@@ -0,0 +1,83 @@
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <linux/bpf.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+#define MAX_INDEX 64
+#define MAX_STARS 38
+
+static void stars(char *str, long val, long max, int width)
+{
+ int i;
+
+ for (i = 0; i < (width * val / max) - 1 && i < width - 1; i++)
+ str[i] = '*';
+ if (val > max)
+ str[i - 1] = '+';
+ str[i] = '\0';
+}
+
+static void print_hist(int fd)
+{
+ int key;
+ long value;
+ long data[MAX_INDEX] = {};
+ char starstr[MAX_STARS];
+ int i;
+ int max_ind = -1;
+ long max_value = 0;
+
+ for (key = 0; key < MAX_INDEX; key++) {
+ bpf_lookup_elem(fd, &key, &value);
+ data[key] = value;
+ if (value && key > max_ind)
+ max_ind = key;
+ if (value > max_value)
+ max_value = value;
+ }
+
+ printf("\n kprobe sys_write() stats\n");
+ printf(" byte_size : count distribution\n");
+ for (i = 1; i <= max_ind + 1; i++) {
+ stars(starstr, data[i - 1], max_value, MAX_STARS);
+ printf("%8ld -> %-8ld : %-8ld |%-*s|\n",
+ (1l << i) >> 1, (1l << i) - 1, data[i - 1],
+ MAX_STARS, starstr);
+ }
+}
+static void int_exit(int sig)
+{
+ print_hist(map_fd[0]);
+ exit(0);
+}
+
+int main(int ac, char **argv)
+{
+ char filename[256];
+ FILE *f;
+ int i;
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+ signal(SIGINT, int_exit);
+
+ i = system("echo 'p:sys_write sys_write' > /sys/kernel/debug/tracing/kprobe_events");
+ (void) i;
+
+ /* start 'dd' in the background to have plenty of 'write' syscalls */
+ f = popen("dd if=/dev/zero of=/dev/null", "r");
+ (void) f;
+
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+
+ sleep(2);
+ kill(0, SIGINT); /* send Ctrl-C to self and to 'dd' */
+
+ return 0;
+}
--
1.7.9.5
^ permalink raw reply related
* Re: [RFC][PATCH v2] procfs: Always expose /proc/<pid>/map_files/ and make it readable
From: Calvin Owens @ 2015-01-28 4:38 UTC (permalink / raw)
To: Andrew Morton
Cc: Cyrill Gorcunov, Kirill A. Shutemov, Alexey Dobriyan,
Oleg Nesterov, Eric W. Biederman, Al Viro, Kirill A. Shutemov,
Peter Feiner, Grant Likely, Siddhesh Poyarekar, linux-kernel,
kernel-team, Pavel Emelyanov, linux-api, Kees Cook
In-Reply-To: <20150126154346.c63c512e5821e9e0ea31f759@linux-foundation.org>
On Monday 01/26 at 15:43 -0800, Andrew Morton wrote:
> On Tue, 27 Jan 2015 00:00:54 +0300 Cyrill Gorcunov <gorcunov@gmail.com> wrote:
>
> > On Mon, Jan 26, 2015 at 02:47:31PM +0200, Kirill A. Shutemov wrote:
> > > On Fri, Jan 23, 2015 at 07:15:44PM -0800, Calvin Owens wrote:
> > > > Currently, /proc/<pid>/map_files/ is restricted to CAP_SYS_ADMIN, and
> > > > is only exposed if CONFIG_CHECKPOINT_RESTORE is set. This interface
> > > > is very useful for enumerating the files mapped into a process when
> > > > the more verbose information in /proc/<pid>/maps is not needed.
>
> This is the main (actually only) justification for the patch, and it it
> far too thin. What does "not needed" mean. Why can't people just use
> /proc/pid/maps?
The biggest difference is that if you do something like this:
fd = open("/stuff", O_BLAH);
map = mmap(NULL, 4096, PROT_BLAH, MAP_SHARED, fd, 0);
close(fd);
unlink("/stuff");
...then map_files/ gives you a way to get a file descriptor for
"/stuff", which you couldn't do with /proc/pid/maps.
It's also something of a win if you just want to see what is mapped at a
specific address, since you can just readlink() the symlink for the
address range you care about and it will go grab the appropriate VMA and
give you the answer. /proc/pid/maps requires walking the VMA tree, which
is quite expensive for processes with many thousands of threads, even
without the O(N^2) issue.
(You have to know what address range you want though, since readdir() on
map_files/ obviously has to walk the VMA tree just like /proc/N/maps.)
> > > > This patch moves the folder out from behind CHECKPOINT_RESTORE, and
> > > > removes the CAP_SYS_ADMIN restrictions. Following the links requires
> > > > the ability to ptrace the process in question, so this doesn't allow
> > > > an attacker to do anything they couldn't already do before.
> > > >
> > > > Signed-off-by: Calvin Owens <calvinowens@fb.com>
> > >
> > > Cc +linux-api@
> >
> > Looks good to me, thanks! Though I would really appreciate if someone
> > from security camp take a look as well.
>
> hm, who's that. Kees comes to mind.
>
> And reviewers' task would be a heck of a lot easier if they knew what
> /proc/pid/map_files actually does. This:
>
> akpm3:/usr/src/25> grep -r map_files Documentation
> akpm3:/usr/src/25>
>
> does not help.
>
> The 640708a2cff7f81 changelog says:
>
> : This one behaves similarly to the /proc/<pid>/fd/ one - it contains
> : symlinks one for each mapping with file, the name of a symlink is
> : "vma->vm_start-vma->vm_end", the target is the file. Opening a symlink
> : results in a file that point exactly to the same inode as them vma's one.
> :
> : For example the ls -l of some arbitrary /proc/<pid>/map_files/
> :
> : | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80403000-7f8f80404000 -> /lib64/libc-2.5.so
> : | lr-x------ 1 root root 64 Aug 26 06:40 7f8f8061e000-7f8f80620000 -> /lib64/libselinux.so.1
> : | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80826000-7f8f80827000 -> /lib64/libacl.so.1.1.0
> : | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a2f000-7f8f80a30000 -> /lib64/librt-2.5.so
> : | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a30000-7f8f80a4c000 -> /lib64/ld-2.5.so
>
> afacit this info is also available in /proc/pid/maps, so things
> shouldn't get worse if the /proc/pid/map_files permissions are at least
> as restrictive as the /proc/pid/maps permissions. Is that the case?
> (Please add to changelog).
Yes, the only difference is that you can follow the link as per above.
I'll resend with a new message explaining that and the deletion thing.
> There's one other problem here: we're assuming that the map_files
> implementation doesn't have bugs. If it does have bugs then relaxing
> permissions like this will create new vulnerabilities. And the
> map_files implementation is surprisingly complex. Is it bug-free?
While I was messing with it I used it a good bit and didn't see any
issues, although I didn't actively try to fuzz it or anything. I'd be
happy to write something to test hammering it in weird ways if you like.
I'm also happy to write testcases for namespaces.
So far as security issues, as others have pointed out you can't follow
the links unless you can ptrace the process in question, which seems
like a pretty solid guarantee. As Cyrill pointed out in the discussion
about the documentation, that's the same protection as /proc/N/fd/*, and
those links function in the same way.
Thanks,
Calvin
^ permalink raw reply
* Re: [PATCH v9 2/2] tty/serial: Add Spreadtrum sc9836-uart driver support
From: Baruch Siach @ 2015-01-28 6:12 UTC (permalink / raw)
To: Chunyan Zhang
Cc: gregkh-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r, mark.rutland-5wv7dgnIgG8,
gnomes-qBU/x9rampVanCEyBjwyrvXRex20P6io,
heiko-4mtYJXux2i+zQB+pC5nmwQ, andrew-g2DYL2Zd6BY,
jslaby-AlSwsSmVLrQ, lanqing.liu-lxIno14LUO0EEoCn2XhGlw,
arnd-r2nGTMty4D4, zhang.lyra-Re5JQEeQqe8AvxtiuMwx3w,
zhizhou.zhang-lxIno14LUO0EEoCn2XhGlw,
geng.ren-lxIno14LUO0EEoCn2XhGlw,
antonynpavlov-Re5JQEeQqe8AvxtiuMwx3w,
linux-serial-u79uwXL29TY76Z2rM5mHXA,
grant.likely-QSEj5FYQhm4dnm+yROfE0A,
orsonzhai-Re5JQEeQqe8AvxtiuMwx3w, florian.vaussard-p8DiymsW2f8,
devicetree-u79uwXL29TY76Z2rM5mHXA, jason-NLaQJdtUoK4Be96aLqz0jA,
pawel.moll-5wv7dgnIgG8, ijc+devicetree-KcIKpvwj1kUDXYZnReoRVg,
hytszk-Re5JQEeQqe8AvxtiuMwx3w, robh+dt-DgEjT+Ai2ygdnm+yROfE0A,
wei.qiao-lxIno14LUO0EEoCn2XhGlw,
linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r,
peter-WaGBZJeGNqdsbIuE7sb01tBPR1lH4CV8,
linux-api-u79uwXL29TY76Z2rM5mHXA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA, galak-sgV2jX0FEOL9JmXXK+q4OQ,
shawn.guo-QSEj5FYQhm4dnm+yROfE0A
In-Reply-To: <1422413261-17184-3-git-send-email-chunyan.zhang-lxIno14LUO0EEoCn2XhGlw@public.gmane.org>
Hi Chunyan Zhang,
On Wed, Jan 28, 2015 at 10:47:41AM +0800, Chunyan Zhang wrote:
[...]
> +static inline void sprd_rx(struct uart_port *port)
> +{
> + struct tty_port *tty = &port->state->port;
> + unsigned int ch, flag, lsr, max_count = SPRD_TIMEOUT;
> +
> + while ((serial_in(port, SPRD_STS1) & 0x00ff) && max_count--) {
> + lsr = serial_in(port, SPRD_LSR);
> + ch = serial_in(port, SPRD_RXD);
> + flag = TTY_NORMAL;
> + port->icount.rx++;
> +
> + if (lsr & (SPRD_LSR_BI | SPRD_LSR_PE |
> + SPRD_LSR_FE | SPRD_LSR_OE))
> + if (handle_lsr_errors(port, &lsr, &flag))
> + continue;
> + if (uart_handle_sysrq_char(port, ch))
> + continue;
My comment[1] on a previous version of this patch still stands.
uart_handle_sysrq_char is a NOP when SUPPORT_SYSRQ is not defined.
> +
> + uart_insert_char(port, lsr, SPRD_LSR_OE, ch, flag);
> + }
> +
> + tty_flip_buffer_push(tty);
> +}
[...]
> +static void sprd_console_write(struct console *co, const char *s,
> + unsigned int count)
> +{
> + struct uart_port *port = &sprd_port[co->index]->port;
> + int locked = 1;
> + unsigned long flags;
> +
> + if (port->sysrq)
> + locked = 0;
The sysrq field of struct uart_port is only defined when
CONFIG_SERIAL_CORE_CONSOLE or SUPPORT_SYSRQ are defined. You should #ifdef
this part accordingly to avoid build breakage.
> + else if (oops_in_progress)
> + locked = spin_trylock_irqsave(&port->lock, flags);
> + else
> + spin_lock_irqsave(&port->lock, flags);
> +
> + uart_console_write(port, s, count, sprd_console_putchar);
> +
> + /* wait for transmitter to become empty */
> + wait_for_xmitr(port);
> +
> + if (locked)
> + spin_unlock_irqrestore(&port->lock, flags);
> +}
[1]
http://article.gmane.org/gmane.linux.drivers.devicetree/106483
baruch
--
http://baruch.siach.name/blog/ ~. .~ Tk Open Systems
=}------------------------------------------------ooO--U--Ooo------------{=
- baruch-NswTu9S1W3P6gbPvEgmw2w@public.gmane.org - tel: +972.2.679.5364, http://www.tkos.co.il -
^ permalink raw reply
* Re: [PATCH v9 2/2] tty/serial: Add Spreadtrum sc9836-uart driver support
From: Lyra Zhang @ 2015-01-28 8:46 UTC (permalink / raw)
To: Baruch Siach
Cc: Mark Rutland, andrew@lunn.ch, Heiko Stübner,
gnomes@lxorguk.ukuu.org.uk, jslaby@suse.cz,
lanqing.liu@spreadtrum.com, Pawel Moll, Chunyan Zhang,
zhizhou.zhang, geng.ren@spreadtrum.com, antonynpavlov@gmail.com,
linux-serial@vger.kernel.org, Grant Likely, Orson Zhai,
florian.vaussard@epfl.ch, devicetree@vger.kernel.org,
jason@lakedaemon.net, Arnd Bergmann
In-Reply-To: <20150128061250.GQ2555@sapphire.tkos.co.il>
Hi, Baruch
I'm sorry that I didn't receive your previous email.
and my explanations are below.
On Wed, Jan 28, 2015 at 2:12 PM, Baruch Siach <baruch@tkos.co.il> wrote:
> Hi Chunyan Zhang,
>
> On Wed, Jan 28, 2015 at 10:47:41AM +0800, Chunyan Zhang wrote:
> [...]
>> +static inline void sprd_rx(struct uart_port *port)
>> +{
>> + struct tty_port *tty = &port->state->port;
>> + unsigned int ch, flag, lsr, max_count = SPRD_TIMEOUT;
>> +
>> + while ((serial_in(port, SPRD_STS1) & 0x00ff) && max_count--) {
>> + lsr = serial_in(port, SPRD_LSR);
>> + ch = serial_in(port, SPRD_RXD);
>> + flag = TTY_NORMAL;
>> + port->icount.rx++;
>> +
>> + if (lsr & (SPRD_LSR_BI | SPRD_LSR_PE |
>> + SPRD_LSR_FE | SPRD_LSR_OE))
>> + if (handle_lsr_errors(port, &lsr, &flag))
>> + continue;
>> + if (uart_handle_sysrq_char(port, ch))
>> + continue;
>
> My comment[1] on a previous version of this patch still stands.
> uart_handle_sysrq_char is a NOP when SUPPORT_SYSRQ is not defined.
>
yes, there are two definitions in serial_core.h, and it returns 0 when
SUPPORT_SYSRQ is not defined.
so, I think my code behavior does not hurt, right?
>> +
>> + uart_insert_char(port, lsr, SPRD_LSR_OE, ch, flag);
>> + }
>> +
>> + tty_flip_buffer_push(tty);
>> +}
>
> [...]
>
>> +static void sprd_console_write(struct console *co, const char *s,
>> + unsigned int count)
>> +{
>> + struct uart_port *port = &sprd_port[co->index]->port;
>> + int locked = 1;
>> + unsigned long flags;
>> +
>> + if (port->sysrq)
>> + locked = 0;
>
> The sysrq field of struct uart_port is only defined when
> CONFIG_SERIAL_CORE_CONSOLE or SUPPORT_SYSRQ are defined. You should #ifdef
> this part accordingly to avoid build breakage.
>
we have this part to been included under #ifdef
CONFIG_SERIAL_SPRD_CONSOLE which is selected with SERIAL_CORE_CONSOLE.
Thank you very much!
Chunyan
>> + else if (oops_in_progress)
>> + locked = spin_trylock_irqsave(&port->lock, flags);
>> + else
>> + spin_lock_irqsave(&port->lock, flags);
>> +
>> + uart_console_write(port, s, count, sprd_console_putchar);
>> +
>> + /* wait for transmitter to become empty */
>> + wait_for_xmitr(port);
>> +
>> + if (locked)
>> + spin_unlock_irqrestore(&port->lock, flags);
>> +}
>
> [1]
> http://article.gmane.org/gmane.linux.drivers.devicetree/106483
>
> baruch
>
> --
> http://baruch.siach.name/blog/ ~. .~ Tk Open Systems
> =}------------------------------------------------ooO--U--Ooo------------{=
> - baruch@tkos.co.il - tel: +972.2.679.5364, http://www.tkos.co.il -
^ permalink raw reply
* Re: [PATCH v9 2/2] tty/serial: Add Spreadtrum sc9836-uart driver support
From: Baruch Siach @ 2015-01-28 9:54 UTC (permalink / raw)
To: Lyra Zhang
Cc: Mark Rutland, andrew@lunn.ch, Heiko Stübner,
gnomes@lxorguk.ukuu.org.uk, jslaby@suse.cz,
lanqing.liu@spreadtrum.com, Pawel Moll, Chunyan Zhang,
zhizhou.zhang, geng.ren@spreadtrum.com, antonynpavlov@gmail.com,
linux-serial@vger.kernel.org, Grant Likely, Orson Zhai,
florian.vaussard@epfl.ch, devicetree@vger.kernel.org,
jason@lakedaemon.net, Arnd Bergmann
In-Reply-To: <CAAfSe-v4FGpHL3N3utiDwcoc7qus1XZZszuXBqB6LeE1frAfeA@mail.gmail.com>
Hi Lyra Zhang,
On Wed, Jan 28, 2015 at 04:46:33PM +0800, Lyra Zhang wrote:
> On Wed, Jan 28, 2015 at 2:12 PM, Baruch Siach <baruch@tkos.co.il> wrote:
> > On Wed, Jan 28, 2015 at 10:47:41AM +0800, Chunyan Zhang wrote:
> >> + if (uart_handle_sysrq_char(port, ch))
> >> + continue;
> >
> > My comment[1] on a previous version of this patch still stands.
> > uart_handle_sysrq_char is a NOP when SUPPORT_SYSRQ is not defined.
>
> yes, there are two definitions in serial_core.h, and it returns 0 when
> SUPPORT_SYSRQ is not defined.
> so, I think my code behavior does not hurt, right?
Right. I just wanted to point out that you can easily define SUPPORT_SYSRQ as
appropriate like many other serial drivers do to get sysrq support.
baruch
--
http://baruch.siach.name/blog/ ~. .~ Tk Open Systems
=}------------------------------------------------ooO--U--Ooo------------{=
- baruch@tkos.co.il - tel: +972.2.679.5364, http://www.tkos.co.il -
^ permalink raw reply
* Re: [PATCH 01/13] kdbus: add documentation
From: Michael Kerrisk (man-pages) @ 2015-01-28 10:46 UTC (permalink / raw)
To: Daniel Mack, David Herrmann
Cc: mtk.manpages, Tom Gundersen, Greg Kroah-Hartman, Arnd Bergmann,
Eric W. Biederman, One Thousand Gnomes, Jiri Kosina,
Andy Lutomirski, Linux API, LKML, Djalal Harouni,
Johannes Stezenbach, Theodore T'so, christoph Hellwig
In-Reply-To: <54C7D573.7080809@zonque.org>
Hello Daniel,
On 01/27/2015 07:14 PM, Daniel Mack wrote:
> Hi Michael,
>
> On 01/27/2015 06:53 PM, Michael Kerrisk (man-pages) wrote:
>> On 01/27/2015 04:23 PM, David Herrmann wrote:
>
>>> I only expect a handful of users to call the ioctls directly. The
>>> libraries that implement the payload-marshaling, in particular. It's a
>>> similar situation with netlink.
>>
>> Thanks, David, for the clarification. I think it would have been helpful
>> to have that more clearly stated up front, especially as some comments
>> in this thread, such as the above, could be interpreted to mean quite
>> the opposite. Can I suggest that some text on this point be added to
>> kdbus.txt?
>
> We're currently working on an a set of comprehensive man pages to
> document all the commands in the API, along with every struct, enum etc.
> We do that so that developers are able to actually understand every
> detail of the API, even though most people - as David explained - will
> not use that interface directly in the first place but let one of the
> high-level libraries help them integrate D-Bus functionality into their
> applications.
(I suggest that some text about this text go into the kdbus(7) page.)
> If you want, have a look at the upstream repository for a preliminary
> version of the new docs.
That's at https://code.google.com/p/d-bus/ , right? This looks like a
good direction to go in. Thanks for tackling that.
I hope to take a longer look sometime soon, but a few general conventions
for man-pages that you might want to consider following:
* When listing errors, I think you should change your
language/formatting somewhat. Examples here from kdbus.endpoint.7:
(1) The man page says
RETURN VALUE
On success, all mentioned ioctl commands return 0.
Better to write this from a user-space point of view:
RETURN VALUE
On success, all mentioned ioctl commands return 0; on
error, -1 is returned, and errno is set to indicate
the error.
(2) I would change the wording in the ERRORS sections from
"may return the following errors"
to
"may fail with the following errors"
(3) When listing the errors, drop the minus signs; that's not
what user-space sees. They see a positive value in errno.
(4) The usual formatting convention for constants, including
error constants in man pages is boldface, rather than
underline/emphasis.
(5) Insofar as it's possible, it would be good to make all
pages format nicely within 80 columns. Some of the literal
text and ASCII art could, I think, be narrowed.
Thanks,
Michael
--
Michael Kerrisk
Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/
Linux/UNIX System Programming Training: http://man7.org/training/
^ permalink raw reply
* Re: [PATCH v9 2/2] tty/serial: Add Spreadtrum sc9836-uart driver support
From: Lyra Zhang @ 2015-01-28 10:46 UTC (permalink / raw)
To: Baruch Siach
Cc: Mark Rutland, andrew@lunn.ch, Heiko Stübner,
gnomes@lxorguk.ukuu.org.uk, jslaby@suse.cz,
lanqing.liu@spreadtrum.com, Pawel Moll, Chunyan Zhang,
zhizhou.zhang, geng.ren@spreadtrum.com, antonynpavlov@gmail.com,
linux-serial@vger.kernel.org, Grant Likely, Orson Zhai,
florian.vaussard@epfl.ch, devicetree@vger.kernel.org,
jason@lakedaemon.net, Arnd Bergmann
In-Reply-To: <20150128095401.GC14175@tarshish>
On Wed, Jan 28, 2015 at 5:54 PM, Baruch Siach <baruch@tkos.co.il> wrote:
> Hi Lyra Zhang,
>
> On Wed, Jan 28, 2015 at 04:46:33PM +0800, Lyra Zhang wrote:
>> On Wed, Jan 28, 2015 at 2:12 PM, Baruch Siach <baruch@tkos.co.il> wrote:
>> > On Wed, Jan 28, 2015 at 10:47:41AM +0800, Chunyan Zhang wrote:
>> >> + if (uart_handle_sysrq_char(port, ch))
>> >> + continue;
>> >
>> > My comment[1] on a previous version of this patch still stands.
>> > uart_handle_sysrq_char is a NOP when SUPPORT_SYSRQ is not defined.
>>
>> yes, there are two definitions in serial_core.h, and it returns 0 when
>> SUPPORT_SYSRQ is not defined.
>> so, I think my code behavior does not hurt, right?
>
> Right. I just wanted to point out that you can easily define SUPPORT_SYSRQ as
> appropriate like many other serial drivers do to get sysrq support.
>
Ok, I got it. I'll add SUPPORT_SYSRQ definition, and send v10 to Greg.
thanks,
Chunyan
> baruch
>
> --
> http://baruch.siach.name/blog/ ~. .~ Tk Open Systems
> =}------------------------------------------------ooO--U--Ooo------------{=
> - baruch@tkos.co.il - tel: +972.2.679.5364, http://www.tkos.co.il -
^ permalink raw reply
* [PATCH v10 0/2] Add Spreadtrum SoC bindings and serial driver support
From: Chunyan Zhang @ 2015-01-28 11:08 UTC (permalink / raw)
To: gregkh
Cc: mark.rutland, gnomes, heiko, andrew, jslaby, lanqing.liu, arnd,
zhang.lyra, zhizhou.zhang, geng.ren, antonynpavlov, linux-serial,
grant.likely, orsonzhai, florian.vaussard, devicetree, jason,
pawel.moll, ijc+devicetree, hytszk, robh+dt, wei.qiao,
linux-arm-kernel, peter, linux-api, linux-kernel, galak,
shawn.guo
In-Reply-To: <sc9836-serial-v10>
Changes from v9:
- Added SUPPORT_SYSRQ definition for sprd serial
Changes from v8:
- Removed a few unuseful code lines.
Chunyan Zhang (2):
Documentation: DT: Add bindings for Spreadtrum SoC Platform
tty/serial: Add Spreadtrum sc9836-uart driver support
Documentation/devicetree/bindings/arm/sprd.txt | 11 +
.../devicetree/bindings/serial/sprd-uart.txt | 7 +
.../devicetree/bindings/vendor-prefixes.txt | 1 +
drivers/tty/serial/Kconfig | 18 +
drivers/tty/serial/Makefile | 1 +
drivers/tty/serial/sprd_serial.c | 793 ++++++++++++++++++++
include/uapi/linux/serial_core.h | 3 +
7 files changed, 834 insertions(+)
create mode 100644 Documentation/devicetree/bindings/arm/sprd.txt
create mode 100644 Documentation/devicetree/bindings/serial/sprd-uart.txt
create mode 100644 drivers/tty/serial/sprd_serial.c
--
1.7.9.5
^ permalink raw reply
* [PATCH v10 1/2] Documentation: DT: Add bindings for Spreadtrum SoC Platform
From: Chunyan Zhang @ 2015-01-28 11:08 UTC (permalink / raw)
To: gregkh
Cc: mark.rutland, gnomes, heiko, andrew, jslaby, lanqing.liu, arnd,
zhang.lyra, zhizhou.zhang, geng.ren, antonynpavlov, linux-serial,
grant.likely, orsonzhai, florian.vaussard, devicetree, jason,
pawel.moll, ijc+devicetree, hytszk, robh+dt, wei.qiao,
linux-arm-kernel, peter, linux-api, linux-kernel, galak,
shawn.guo
In-Reply-To: <1422443324-25082-1-git-send-email-chunyan.zhang@spreadtrum.com>
Adds Spreadtrum's prefix "sprd" to vendor-prefixes file.
Adds the devicetree binding documentations for Spreadtrum's sc9836-uart
and SC9836 SoC based on the Sharkl64 Platform which is a 64-bit SoC
Platform of Spreadtrum.
Signed-off-by: Chunyan Zhang <chunyan.zhang@spreadtrum.com>
---
Documentation/devicetree/bindings/arm/sprd.txt | 11 +++++++++++
.../devicetree/bindings/serial/sprd-uart.txt | 7 +++++++
.../devicetree/bindings/vendor-prefixes.txt | 1 +
3 files changed, 19 insertions(+)
create mode 100644 Documentation/devicetree/bindings/arm/sprd.txt
create mode 100644 Documentation/devicetree/bindings/serial/sprd-uart.txt
diff --git a/Documentation/devicetree/bindings/arm/sprd.txt b/Documentation/devicetree/bindings/arm/sprd.txt
new file mode 100644
index 0000000..31a629d
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/sprd.txt
@@ -0,0 +1,11 @@
+Spreadtrum SoC Platforms Device Tree Bindings
+----------------------------------------------------
+
+Sharkl64 is a Spreadtrum's SoC Platform which is based
+on ARM 64-bit processor.
+
+SC9836 openphone board with SC9836 SoC based on the
+Sharkl64 Platform shall have the following properties.
+
+Required root node properties:
+ - compatible = "sprd,sc9836-openphone", "sprd,sc9836";
diff --git a/Documentation/devicetree/bindings/serial/sprd-uart.txt b/Documentation/devicetree/bindings/serial/sprd-uart.txt
new file mode 100644
index 0000000..2aff0f2
--- /dev/null
+++ b/Documentation/devicetree/bindings/serial/sprd-uart.txt
@@ -0,0 +1,7 @@
+* Spreadtrum serial UART
+
+Required properties:
+- compatible: must be "sprd,sc9836-uart"
+- reg: offset and length of the register set for the device
+- interrupts: exactly one interrupt specifier
+- clocks: phandles to input clocks.
diff --git a/Documentation/devicetree/bindings/vendor-prefixes.txt b/Documentation/devicetree/bindings/vendor-prefixes.txt
index b1df0ad..0a8384f 100644
--- a/Documentation/devicetree/bindings/vendor-prefixes.txt
+++ b/Documentation/devicetree/bindings/vendor-prefixes.txt
@@ -153,6 +153,7 @@ snps Synopsys, Inc.
solidrun SolidRun
sony Sony Corporation
spansion Spansion Inc.
+sprd Spreadtrum Communications Inc.
st STMicroelectronics
ste ST-Ericsson
stericsson ST-Ericsson
--
1.7.9.5
^ permalink raw reply related
* [PATCH v10 2/2] tty/serial: Add Spreadtrum sc9836-uart driver support
From: Chunyan Zhang @ 2015-01-28 11:08 UTC (permalink / raw)
To: gregkh
Cc: mark.rutland, gnomes, heiko, andrew, jslaby, lanqing.liu, arnd,
zhang.lyra, zhizhou.zhang, geng.ren, antonynpavlov, linux-serial,
grant.likely, orsonzhai, florian.vaussard, devicetree, jason,
pawel.moll, ijc+devicetree, hytszk, robh+dt, wei.qiao,
linux-arm-kernel, peter, linux-api, linux-kernel, galak,
shawn.guo
In-Reply-To: <1422443324-25082-1-git-send-email-chunyan.zhang@spreadtrum.com>
Add a full sc9836-uart driver for SC9836 SoC which is based on the
spreadtrum sharkl64 platform.
This driver also support earlycon.
Originally-by: Lanqing Liu <lanqing.liu@spreadtrum.com>
Signed-off-by: Orson Zhai <orson.zhai@spreadtrum.com>
Signed-off-by: Chunyan Zhang <chunyan.zhang@spreadtrum.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Peter Hurley <peter@hurleysoftware.com>
---
drivers/tty/serial/Kconfig | 18 +
drivers/tty/serial/Makefile | 1 +
drivers/tty/serial/sprd_serial.c | 793 ++++++++++++++++++++++++++++++++++++++
include/uapi/linux/serial_core.h | 3 +
4 files changed, 815 insertions(+)
create mode 100644 drivers/tty/serial/sprd_serial.c
diff --git a/drivers/tty/serial/Kconfig b/drivers/tty/serial/Kconfig
index c79b43c..13211f7 100644
--- a/drivers/tty/serial/Kconfig
+++ b/drivers/tty/serial/Kconfig
@@ -1577,6 +1577,24 @@ config SERIAL_MEN_Z135
This driver can also be build as a module. If so, the module will be called
men_z135_uart.ko
+config SERIAL_SPRD
+ tristate "Support for Spreadtrum serial"
+ depends on ARCH_SPRD
+ select SERIAL_CORE
+ help
+ This enables the driver for the Spreadtrum's serial.
+
+config SERIAL_SPRD_CONSOLE
+ bool "Spreadtrum UART console support"
+ depends on SERIAL_SPRD=y
+ select SERIAL_CORE_CONSOLE
+ select SERIAL_EARLYCON
+ help
+ Support for early debug console using Spreadtrum's serial. This enables
+ the console before standard serial driver is probed. This is enabled
+ with "earlycon" on the kernel command line. The console is
+ enabled when early_param is processed.
+
endmenu
config SERIAL_MCTRL_GPIO
diff --git a/drivers/tty/serial/Makefile b/drivers/tty/serial/Makefile
index 9a548ac..4801aca 100644
--- a/drivers/tty/serial/Makefile
+++ b/drivers/tty/serial/Makefile
@@ -93,6 +93,7 @@ obj-$(CONFIG_SERIAL_ARC) += arc_uart.o
obj-$(CONFIG_SERIAL_RP2) += rp2.o
obj-$(CONFIG_SERIAL_FSL_LPUART) += fsl_lpuart.o
obj-$(CONFIG_SERIAL_MEN_Z135) += men_z135_uart.o
+obj-$(CONFIG_SERIAL_SPRD) += sprd_serial.o
# GPIOLIB helpers for modem control lines
obj-$(CONFIG_SERIAL_MCTRL_GPIO) += serial_mctrl_gpio.o
diff --git a/drivers/tty/serial/sprd_serial.c b/drivers/tty/serial/sprd_serial.c
new file mode 100644
index 0000000..594b633
--- /dev/null
+++ b/drivers/tty/serial/sprd_serial.c
@@ -0,0 +1,793 @@
+/*
+ * Copyright (C) 2012-2015 Spreadtrum Communications Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#if defined(CONFIG_SERIAL_SPRD_CONSOLE) && defined(CONFIG_MAGIC_SYSRQ)
+#define SUPPORT_SYSRQ
+#endif
+
+#include <linux/clk.h>
+#include <linux/console.h>
+#include <linux/delay.h>
+#include <linux/io.h>
+#include <linux/ioport.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/serial_core.h>
+#include <linux/serial.h>
+#include <linux/slab.h>
+#include <linux/tty.h>
+#include <linux/tty_flip.h>
+
+/* device name */
+#define UART_NR_MAX 8
+#define SPRD_TTY_NAME "ttyS"
+#define SPRD_FIFO_SIZE 128
+#define SPRD_DEF_RATE 26000000
+#define SPRD_BAUD_IO_LIMIT 3000000
+#define SPRD_TIMEOUT 256
+
+/* the offset of serial registers and BITs for them */
+/* data registers */
+#define SPRD_TXD 0x0000
+#define SPRD_RXD 0x0004
+
+/* line status register and its BITs */
+#define SPRD_LSR 0x0008
+#define SPRD_LSR_OE BIT(4)
+#define SPRD_LSR_FE BIT(3)
+#define SPRD_LSR_PE BIT(2)
+#define SPRD_LSR_BI BIT(7)
+#define SPRD_LSR_TX_OVER BIT(15)
+
+/* data number in TX and RX fifo */
+#define SPRD_STS1 0x000C
+
+/* interrupt enable register and its BITs */
+#define SPRD_IEN 0x0010
+#define SPRD_IEN_RX_FULL BIT(0)
+#define SPRD_IEN_TX_EMPTY BIT(1)
+#define SPRD_IEN_BREAK_DETECT BIT(7)
+#define SPRD_IEN_TIMEOUT BIT(13)
+
+/* interrupt clear register */
+#define SPRD_ICLR 0x0014
+
+/* line control register */
+#define SPRD_LCR 0x0018
+#define SPRD_LCR_STOP_1BIT 0x10
+#define SPRD_LCR_STOP_2BIT 0x30
+#define SPRD_LCR_DATA_LEN (BIT(2) | BIT(3))
+#define SPRD_LCR_DATA_LEN5 0x0
+#define SPRD_LCR_DATA_LEN6 0x4
+#define SPRD_LCR_DATA_LEN7 0x8
+#define SPRD_LCR_DATA_LEN8 0xc
+#define SPRD_LCR_PARITY (BIT(0) | BIT(1))
+#define SPRD_LCR_PARITY_EN 0x2
+#define SPRD_LCR_EVEN_PAR 0x0
+#define SPRD_LCR_ODD_PAR 0x1
+
+/* control register 1 */
+#define SPRD_CTL1 0x001C
+#define RX_HW_FLOW_CTL_THLD BIT(6)
+#define RX_HW_FLOW_CTL_EN BIT(7)
+#define TX_HW_FLOW_CTL_EN BIT(8)
+#define RX_TOUT_THLD_DEF 0x3E00
+#define RX_HFC_THLD_DEF 0x40
+
+/* fifo threshold register */
+#define SPRD_CTL2 0x0020
+#define THLD_TX_EMPTY 0x40
+#define THLD_RX_FULL 0x40
+
+/* config baud rate register */
+#define SPRD_CLKD0 0x0024
+#define SPRD_CLKD1 0x0028
+
+/* interrupt mask status register */
+#define SPRD_IMSR 0x002C
+#define SPRD_IMSR_RX_FIFO_FULL BIT(0)
+#define SPRD_IMSR_TX_FIFO_EMPTY BIT(1)
+#define SPRD_IMSR_BREAK_DETECT BIT(7)
+#define SPRD_IMSR_TIMEOUT BIT(13)
+
+struct reg_backup {
+ u32 ien;
+ u32 ctrl0;
+ u32 ctrl1;
+ u32 ctrl2;
+ u32 clkd0;
+ u32 clkd1;
+ u32 dspwait;
+};
+
+struct sprd_uart_port {
+ struct uart_port port;
+ struct reg_backup reg_bak;
+ char name[16];
+};
+
+static struct sprd_uart_port *sprd_port[UART_NR_MAX];
+static int sprd_ports_num;
+
+static inline unsigned int serial_in(struct uart_port *port, int offset)
+{
+ return readl_relaxed(port->membase + offset);
+}
+
+static inline void serial_out(struct uart_port *port, int offset, int value)
+{
+ writel_relaxed(value, port->membase + offset);
+}
+
+static unsigned int sprd_tx_empty(struct uart_port *port)
+{
+ if (serial_in(port, SPRD_STS1) & 0xff00)
+ return 0;
+ else
+ return TIOCSER_TEMT;
+}
+
+static unsigned int sprd_get_mctrl(struct uart_port *port)
+{
+ return TIOCM_DSR | TIOCM_CTS;
+}
+
+static void sprd_set_mctrl(struct uart_port *port, unsigned int mctrl)
+{
+ /* nothing to do */
+}
+
+static void sprd_stop_tx(struct uart_port *port)
+{
+ unsigned int ien, iclr;
+
+ iclr = serial_in(port, SPRD_ICLR);
+ ien = serial_in(port, SPRD_IEN);
+
+ iclr |= SPRD_IEN_TX_EMPTY;
+ ien &= ~SPRD_IEN_TX_EMPTY;
+
+ serial_out(port, SPRD_ICLR, iclr);
+ serial_out(port, SPRD_IEN, ien);
+}
+
+static void sprd_start_tx(struct uart_port *port)
+{
+ unsigned int ien;
+
+ ien = serial_in(port, SPRD_IEN);
+ if (!(ien & SPRD_IEN_TX_EMPTY)) {
+ ien |= SPRD_IEN_TX_EMPTY;
+ serial_out(port, SPRD_IEN, ien);
+ }
+}
+
+static void sprd_stop_rx(struct uart_port *port)
+{
+ unsigned int ien, iclr;
+
+ iclr = serial_in(port, SPRD_ICLR);
+ ien = serial_in(port, SPRD_IEN);
+
+ ien &= ~(SPRD_IEN_RX_FULL | SPRD_IEN_BREAK_DETECT);
+ iclr |= SPRD_IEN_RX_FULL | SPRD_IEN_BREAK_DETECT;
+
+ serial_out(port, SPRD_IEN, ien);
+ serial_out(port, SPRD_ICLR, iclr);
+}
+
+/* The Sprd serial does not support this function. */
+static void sprd_break_ctl(struct uart_port *port, int break_state)
+{
+ /* nothing to do */
+}
+
+static int handle_lsr_errors(struct uart_port *port,
+ unsigned int *flag,
+ unsigned int *lsr)
+{
+ int ret = 0;
+
+ /* statistics */
+ if (*lsr & SPRD_LSR_BI) {
+ *lsr &= ~(SPRD_LSR_FE | SPRD_LSR_PE);
+ port->icount.brk++;
+ ret = uart_handle_break(port);
+ if (ret)
+ return ret;
+ } else if (*lsr & SPRD_LSR_PE)
+ port->icount.parity++;
+ else if (*lsr & SPRD_LSR_FE)
+ port->icount.frame++;
+ if (*lsr & SPRD_LSR_OE)
+ port->icount.overrun++;
+
+ /* mask off conditions which should be ignored */
+ *lsr &= port->read_status_mask;
+ if (*lsr & SPRD_LSR_BI)
+ *flag = TTY_BREAK;
+ else if (*lsr & SPRD_LSR_PE)
+ *flag = TTY_PARITY;
+ else if (*lsr & SPRD_LSR_FE)
+ *flag = TTY_FRAME;
+
+ return ret;
+}
+
+static inline void sprd_rx(struct uart_port *port)
+{
+ struct tty_port *tty = &port->state->port;
+ unsigned int ch, flag, lsr, max_count = SPRD_TIMEOUT;
+
+ while ((serial_in(port, SPRD_STS1) & 0x00ff) && max_count--) {
+ lsr = serial_in(port, SPRD_LSR);
+ ch = serial_in(port, SPRD_RXD);
+ flag = TTY_NORMAL;
+ port->icount.rx++;
+
+ if (lsr & (SPRD_LSR_BI | SPRD_LSR_PE |
+ SPRD_LSR_FE | SPRD_LSR_OE))
+ if (handle_lsr_errors(port, &lsr, &flag))
+ continue;
+ if (uart_handle_sysrq_char(port, ch))
+ continue;
+
+ uart_insert_char(port, lsr, SPRD_LSR_OE, ch, flag);
+ }
+
+ tty_flip_buffer_push(tty);
+}
+
+static inline void sprd_tx(struct uart_port *port)
+{
+ struct circ_buf *xmit = &port->state->xmit;
+ int count;
+
+ if (port->x_char) {
+ serial_out(port, SPRD_TXD, port->x_char);
+ port->icount.tx++;
+ port->x_char = 0;
+ return;
+ }
+
+ if (uart_circ_empty(xmit) || uart_tx_stopped(port)) {
+ sprd_stop_tx(port);
+ return;
+ }
+
+ count = THLD_TX_EMPTY;
+ do {
+ serial_out(port, SPRD_TXD, xmit->buf[xmit->tail]);
+ xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE - 1);
+ port->icount.tx++;
+ if (uart_circ_empty(xmit))
+ break;
+ } while (--count > 0);
+
+ if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+ uart_write_wakeup(port);
+
+ if (uart_circ_empty(xmit))
+ sprd_stop_tx(port);
+}
+
+/* this handles the interrupt from one port */
+static irqreturn_t sprd_handle_irq(int irq, void *dev_id)
+{
+ struct uart_port *port = dev_id;
+ unsigned int ims;
+
+ spin_lock(&port->lock);
+
+ ims = serial_in(port, SPRD_IMSR);
+
+ if (!ims)
+ return IRQ_NONE;
+
+ serial_out(port, SPRD_ICLR, ~0);
+
+ if (ims & (SPRD_IMSR_RX_FIFO_FULL |
+ SPRD_IMSR_BREAK_DETECT | SPRD_IMSR_TIMEOUT))
+ sprd_rx(port);
+
+ if (ims & SPRD_IMSR_TX_FIFO_EMPTY)
+ sprd_tx(port);
+
+ spin_unlock(&port->lock);
+
+ return IRQ_HANDLED;
+}
+
+static int sprd_startup(struct uart_port *port)
+{
+ int ret = 0;
+ unsigned int ien, fc;
+ unsigned int timeout;
+ struct sprd_uart_port *sp;
+ unsigned long flags;
+
+ serial_out(port, SPRD_CTL2, ((THLD_TX_EMPTY << 8) | THLD_RX_FULL));
+
+ /* clear rx fifo */
+ timeout = SPRD_TIMEOUT;
+ while (timeout-- && serial_in(port, SPRD_STS1) & 0x00ff)
+ serial_in(port, SPRD_RXD);
+
+ /* clear tx fifo */
+ timeout = SPRD_TIMEOUT;
+ while (timeout-- && serial_in(port, SPRD_STS1) & 0xff00)
+ cpu_relax();
+
+ /* clear interrupt */
+ serial_out(port, SPRD_IEN, 0);
+ serial_out(port, SPRD_ICLR, ~0);
+
+ /* allocate irq */
+ sp = container_of(port, struct sprd_uart_port, port);
+ snprintf(sp->name, sizeof(sp->name), "sprd_serial%d", port->line);
+ ret = devm_request_irq(port->dev, port->irq, sprd_handle_irq,
+ IRQF_SHARED, sp->name, port);
+ if (ret) {
+ dev_err(port->dev, "fail to request serial irq %d, ret=%d\n",
+ port->irq, ret);
+ return ret;
+ }
+ fc = serial_in(port, SPRD_CTL1);
+ fc |= RX_TOUT_THLD_DEF | RX_HFC_THLD_DEF;
+ serial_out(port, SPRD_CTL1, fc);
+
+ /* enable interrupt */
+ spin_lock_irqsave(&port->lock, flags);
+ ien = serial_in(port, SPRD_IEN);
+ ien |= SPRD_IEN_RX_FULL | SPRD_IEN_BREAK_DETECT | SPRD_IEN_TIMEOUT;
+ serial_out(port, SPRD_IEN, ien);
+ spin_unlock_irqrestore(&port->lock, flags);
+
+ return 0;
+}
+
+static void sprd_shutdown(struct uart_port *port)
+{
+ serial_out(port, SPRD_IEN, 0);
+ serial_out(port, SPRD_ICLR, ~0);
+ devm_free_irq(port->dev, port->irq, port);
+}
+
+static void sprd_set_termios(struct uart_port *port,
+ struct ktermios *termios,
+ struct ktermios *old)
+{
+ unsigned int baud, quot;
+ unsigned int lcr = 0, fc;
+ unsigned long flags;
+
+ /* ask the core to calculate the divisor for us */
+ baud = uart_get_baud_rate(port, termios, old, 0, SPRD_BAUD_IO_LIMIT);
+
+ quot = (unsigned int)((port->uartclk + baud / 2) / baud);
+
+ /* set data length */
+ switch (termios->c_cflag & CSIZE) {
+ case CS5:
+ lcr |= SPRD_LCR_DATA_LEN5;
+ break;
+ case CS6:
+ lcr |= SPRD_LCR_DATA_LEN6;
+ break;
+ case CS7:
+ lcr |= SPRD_LCR_DATA_LEN7;
+ break;
+ case CS8:
+ default:
+ lcr |= SPRD_LCR_DATA_LEN8;
+ break;
+ }
+
+ /* calculate stop bits */
+ lcr &= ~(SPRD_LCR_STOP_1BIT | SPRD_LCR_STOP_2BIT);
+ if (termios->c_cflag & CSTOPB)
+ lcr |= SPRD_LCR_STOP_2BIT;
+ else
+ lcr |= SPRD_LCR_STOP_1BIT;
+
+ /* calculate parity */
+ lcr &= ~SPRD_LCR_PARITY;
+ termios->c_cflag &= ~CMSPAR; /* no support mark/space */
+ if (termios->c_cflag & PARENB) {
+ lcr |= SPRD_LCR_PARITY_EN;
+ if (termios->c_cflag & PARODD)
+ lcr |= SPRD_LCR_ODD_PAR;
+ else
+ lcr |= SPRD_LCR_EVEN_PAR;
+ }
+
+ spin_lock_irqsave(&port->lock, flags);
+
+ /* update the per-port timeout */
+ uart_update_timeout(port, termios->c_cflag, baud);
+
+ port->read_status_mask = SPRD_LSR_OE;
+ if (termios->c_iflag & INPCK)
+ port->read_status_mask |= SPRD_LSR_FE | SPRD_LSR_PE;
+ if (termios->c_iflag & (IGNBRK | BRKINT | PARMRK))
+ port->read_status_mask |= SPRD_LSR_BI;
+
+ /* characters to ignore */
+ port->ignore_status_mask = 0;
+ if (termios->c_iflag & IGNPAR)
+ port->ignore_status_mask |= SPRD_LSR_PE | SPRD_LSR_FE;
+ if (termios->c_iflag & IGNBRK) {
+ port->ignore_status_mask |= SPRD_LSR_BI;
+ /*
+ * If we're ignoring parity and break indicators,
+ * ignore overruns too (for real raw support).
+ */
+ if (termios->c_iflag & IGNPAR)
+ port->ignore_status_mask |= SPRD_LSR_OE;
+ }
+
+ /* flow control */
+ fc = serial_in(port, SPRD_CTL1);
+ fc &= ~(RX_HW_FLOW_CTL_THLD | RX_HW_FLOW_CTL_EN | TX_HW_FLOW_CTL_EN);
+ if (termios->c_cflag & CRTSCTS) {
+ fc |= RX_HW_FLOW_CTL_THLD;
+ fc |= RX_HW_FLOW_CTL_EN;
+ fc |= TX_HW_FLOW_CTL_EN;
+ }
+
+ /* clock divider bit0~bit15 */
+ serial_out(port, SPRD_CLKD0, quot & 0xffff);
+
+ /* clock divider bit16~bit20 */
+ serial_out(port, SPRD_CLKD1, (quot & 0x1f0000) >> 16);
+ serial_out(port, SPRD_LCR, lcr);
+ fc |= RX_TOUT_THLD_DEF | RX_HFC_THLD_DEF;
+ serial_out(port, SPRD_CTL1, fc);
+
+ spin_unlock_irqrestore(&port->lock, flags);
+
+ /* Don't rewrite B0 */
+ if (tty_termios_baud_rate(termios))
+ tty_termios_encode_baud_rate(termios, baud, baud);
+}
+
+static const char *sprd_type(struct uart_port *port)
+{
+ return "SPX";
+}
+
+static void sprd_release_port(struct uart_port *port)
+{
+ /* nothing to do */
+}
+
+static int sprd_request_port(struct uart_port *port)
+{
+ return 0;
+}
+
+static void sprd_config_port(struct uart_port *port, int flags)
+{
+ if (flags & UART_CONFIG_TYPE)
+ port->type = PORT_SPRD;
+}
+
+static int sprd_verify_port(struct uart_port *port,
+ struct serial_struct *ser)
+{
+ if (ser->type != PORT_SPRD)
+ return -EINVAL;
+ if (port->irq != ser->irq)
+ return -EINVAL;
+ return 0;
+}
+
+static struct uart_ops serial_sprd_ops = {
+ .tx_empty = sprd_tx_empty,
+ .get_mctrl = sprd_get_mctrl,
+ .set_mctrl = sprd_set_mctrl,
+ .stop_tx = sprd_stop_tx,
+ .start_tx = sprd_start_tx,
+ .stop_rx = sprd_stop_rx,
+ .break_ctl = sprd_break_ctl,
+ .startup = sprd_startup,
+ .shutdown = sprd_shutdown,
+ .set_termios = sprd_set_termios,
+ .type = sprd_type,
+ .release_port = sprd_release_port,
+ .request_port = sprd_request_port,
+ .config_port = sprd_config_port,
+ .verify_port = sprd_verify_port,
+};
+
+#ifdef CONFIG_SERIAL_SPRD_CONSOLE
+static inline void wait_for_xmitr(struct uart_port *port)
+{
+ unsigned int status, tmout = 10000;
+
+ /* wait up to 10ms for the character(s) to be sent */
+ do {
+ status = serial_in(port, SPRD_STS1);
+ if (--tmout == 0)
+ break;
+ udelay(1);
+ } while (status & 0xff00);
+}
+
+static void sprd_console_putchar(struct uart_port *port, int ch)
+{
+ wait_for_xmitr(port);
+ serial_out(port, SPRD_TXD, ch);
+}
+
+static void sprd_console_write(struct console *co, const char *s,
+ unsigned int count)
+{
+ struct uart_port *port = &sprd_port[co->index]->port;
+ int locked = 1;
+ unsigned long flags;
+
+ if (port->sysrq)
+ locked = 0;
+ else if (oops_in_progress)
+ locked = spin_trylock_irqsave(&port->lock, flags);
+ else
+ spin_lock_irqsave(&port->lock, flags);
+
+ uart_console_write(port, s, count, sprd_console_putchar);
+
+ /* wait for transmitter to become empty */
+ wait_for_xmitr(port);
+
+ if (locked)
+ spin_unlock_irqrestore(&port->lock, flags);
+}
+
+static int __init sprd_console_setup(struct console *co, char *options)
+{
+ struct uart_port *port;
+ int baud = 115200;
+ int bits = 8;
+ int parity = 'n';
+ int flow = 'n';
+
+ if (co->index >= UART_NR_MAX || co->index < 0)
+ co->index = 0;
+
+ port = &sprd_port[co->index]->port;
+ if (port == NULL) {
+ pr_info("serial port %d not yet initialized\n", co->index);
+ return -ENODEV;
+ }
+ if (options)
+ uart_parse_options(options, &baud, &parity, &bits, &flow);
+
+ return uart_set_options(port, co, baud, parity, bits, flow);
+}
+
+static struct uart_driver sprd_uart_driver;
+static struct console sprd_console = {
+ .name = SPRD_TTY_NAME,
+ .write = sprd_console_write,
+ .device = uart_console_device,
+ .setup = sprd_console_setup,
+ .flags = CON_PRINTBUFFER,
+ .index = -1,
+ .data = &sprd_uart_driver,
+};
+
+#define SPRD_CONSOLE (&sprd_console)
+
+/* Support for earlycon */
+static void sprd_putc(struct uart_port *port, int c)
+{
+ unsigned int timeout = SPRD_TIMEOUT;
+
+ while (timeout-- &&
+ !(readl(port->membase + SPRD_LSR) & SPRD_LSR_TX_OVER))
+ cpu_relax();
+
+ writeb(c, port->membase + SPRD_TXD);
+}
+
+static void sprd_early_write(struct console *con, const char *s,
+ unsigned n)
+{
+ struct earlycon_device *dev = con->data;
+
+ uart_console_write(&dev->port, s, n, sprd_putc);
+}
+
+static int __init sprd_early_console_setup(
+ struct earlycon_device *device,
+ const char *opt)
+{
+ if (!device->port.membase)
+ return -ENODEV;
+
+ device->con->write = sprd_early_write;
+ return 0;
+}
+
+EARLYCON_DECLARE(sprd_serial, sprd_early_console_setup);
+OF_EARLYCON_DECLARE(sprd_serial, "sprd,sc9836-uart",
+ sprd_early_console_setup);
+
+#else /* !CONFIG_SERIAL_SPRD_CONSOLE */
+#define SPRD_CONSOLE NULL
+#endif
+
+static struct uart_driver sprd_uart_driver = {
+ .owner = THIS_MODULE,
+ .driver_name = "sprd_serial",
+ .dev_name = SPRD_TTY_NAME,
+ .major = 0,
+ .minor = 0,
+ .nr = UART_NR_MAX,
+ .cons = SPRD_CONSOLE,
+};
+
+static int sprd_probe_dt_alias(int index, struct device *dev)
+{
+ struct device_node *np;
+ int ret = index;
+
+ if (!IS_ENABLED(CONFIG_OF))
+ return ret;
+
+ np = dev->of_node;
+ if (!np)
+ return ret;
+
+ ret = of_alias_get_id(np, "serial");
+ if (IS_ERR_VALUE(ret))
+ ret = index;
+ else if (ret >= ARRAY_SIZE(sprd_port) || sprd_port[ret] != NULL) {
+ dev_warn(dev, "requested serial port %d not available.\n", ret);
+ ret = index;
+ }
+
+ return ret;
+}
+
+static int sprd_remove(struct platform_device *dev)
+{
+ struct sprd_uart_port *sup = platform_get_drvdata(dev);
+
+ if (sup) {
+ uart_remove_one_port(&sprd_uart_driver, &sup->port);
+ sprd_port[sup->port.line] = NULL;
+ sprd_ports_num--;
+ }
+
+ if (!sprd_ports_num)
+ uart_unregister_driver(&sprd_uart_driver);
+
+ return 0;
+}
+
+static int sprd_probe(struct platform_device *pdev)
+{
+ struct resource *res;
+ struct uart_port *up;
+ struct clk *clk;
+ int irq;
+ int index;
+ int ret;
+
+ for (index = 0; index < ARRAY_SIZE(sprd_port); index++)
+ if (sprd_port[index] == NULL)
+ break;
+
+ if (index == ARRAY_SIZE(sprd_port))
+ return -EBUSY;
+
+ index = sprd_probe_dt_alias(index, &pdev->dev);
+
+ sprd_port[index] = devm_kzalloc(&pdev->dev,
+ sizeof(*sprd_port[index]), GFP_KERNEL);
+ if (!sprd_port[index])
+ return -ENOMEM;
+
+ up = &sprd_port[index]->port;
+ up->dev = &pdev->dev;
+ up->line = index;
+ up->type = PORT_SPRD;
+ up->iotype = SERIAL_IO_PORT;
+ up->uartclk = SPRD_DEF_RATE;
+ up->fifosize = SPRD_FIFO_SIZE;
+ up->ops = &serial_sprd_ops;
+ up->flags = UPF_BOOT_AUTOCONF;
+
+ clk = devm_clk_get(&pdev->dev, NULL);
+ if (!IS_ERR(clk))
+ up->uartclk = clk_get_rate(clk);
+
+ res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+ if (!res) {
+ dev_err(&pdev->dev, "not provide mem resource\n");
+ return -ENODEV;
+ }
+ up->mapbase = res->start;
+ up->membase = devm_ioremap_resource(&pdev->dev, res);
+ if (IS_ERR(up->membase))
+ return PTR_ERR(up->membase);
+
+ irq = platform_get_irq(pdev, 0);
+ if (irq < 0) {
+ dev_err(&pdev->dev, "not provide irq resource\n");
+ return -ENODEV;
+ }
+ up->irq = irq;
+
+ if (!sprd_ports_num) {
+ ret = uart_register_driver(&sprd_uart_driver);
+ if (ret < 0) {
+ pr_err("Failed to register SPRD-UART driver\n");
+ return ret;
+ }
+ }
+ sprd_ports_num++;
+
+ ret = uart_add_one_port(&sprd_uart_driver, up);
+ if (ret) {
+ sprd_port[index] = NULL;
+ sprd_remove(pdev);
+ }
+
+ platform_set_drvdata(pdev, up);
+
+ return ret;
+}
+
+static int sprd_suspend(struct device *dev)
+{
+ struct sprd_uart_port *sup = dev_get_drvdata(dev);
+
+ uart_suspend_port(&sprd_uart_driver, &sup->port);
+
+ return 0;
+}
+
+static int sprd_resume(struct device *dev)
+{
+ struct sprd_uart_port *sup = dev_get_drvdata(dev);
+
+ uart_resume_port(&sprd_uart_driver, &sup->port);
+
+ return 0;
+}
+
+static SIMPLE_DEV_PM_OPS(sprd_pm_ops, sprd_suspend, sprd_resume);
+
+static const struct of_device_id serial_ids[] = {
+ {.compatible = "sprd,sc9836-uart",},
+ {}
+};
+
+static struct platform_driver sprd_platform_driver = {
+ .probe = sprd_probe,
+ .remove = sprd_remove,
+ .driver = {
+ .name = "sprd_serial",
+ .of_match_table = of_match_ptr(serial_ids),
+ .pm = &sprd_pm_ops,
+ },
+};
+
+module_platform_driver(sprd_platform_driver);
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("Spreadtrum SoC serial driver series");
diff --git a/include/uapi/linux/serial_core.h b/include/uapi/linux/serial_core.h
index c172180..7e6eb39 100644
--- a/include/uapi/linux/serial_core.h
+++ b/include/uapi/linux/serial_core.h
@@ -248,4 +248,7 @@
/* MESON */
#define PORT_MESON 109
+/* SPRD SERIAL */
+#define PORT_SPRD 110
+
#endif /* _UAPILINUX_SERIAL_CORE_H */
--
1.7.9.5
^ permalink raw reply related
* Re: [PATCH 3/4] IB/uverbs: ex_query_device: check request's comp_mask
From: Yann Droneaud @ 2015-01-28 13:19 UTC (permalink / raw)
To: Haggai Eran
Cc: Sagi Grimberg, Shachar Raindel, Eli Cohen, Roland Dreier,
linux-rdma-u79uwXL29TY76Z2rM5mHXA,
linux-api-u79uwXL29TY76Z2rM5mHXA, Tzahi Oved, Or Gerlitz,
Matan Barak
In-Reply-To: <54C73549.5000003-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Hi,
Le mardi 27 janvier 2015 à 08:50 +0200, Haggai Eran a écrit :
> On 26/01/2015 13:17, Yann Droneaud wrote:
> > ...
> > Le dimanche 25 janvier 2015 à 17:23 +0200, Haggai Eran a écrit :
> >> On 22/01/2015 15:28, Yann Droneaud wrote:
> >>> This patch ensures the extended QUERY_DEVICE uverbs request's
> >>> comp_mask has only known values. If userspace returns unknown
> >>> features, -EINVAL will be returned, allowing to probe/discover
> >>> which features are currently supported by the kernel.
> >>
> >> This probing process will be much more cumbersome than it needs to be
> >> because userspace will have to call QUERY_DEVICE repetitively with
> >> different comp_mask fields until it finds out the exact set of supported
> >> bits.
> >>
> >
> > O(log2(N))
>
> I don't think user space developers will be happy having to do trial and
> error to determine what features the kernel driver supports. It might be
> even more then O(log2(N)). If my understanding of comp_mask bits usage is
> correct it would O(N). But it's not the time complexity I'm worried about,
> it's the fact that it requires user-space developers to go through hoops in
> order to get information that can be much more easily exported.
>
But there's currently *NO* such mean that could give a hint to userspace
developer whether one bit of request's comp_mask is currently effective
in extended QUERY_DEVICE uverbs. While we have such mean in CREATE_FLOW
and DESTROY_FLOW: unsupported bits trigger -EINVAL.
In QUERY_DEVICE, userspace developer is allowed to set request's
comp_mask to -1: it won't hurt it on current kernel, so why not using
this value or any other random value ? The program will run: it's "Works
For Me".
But the same program (either binary or source code) might fail on
newer kernel where some bits in comp_mask gain a meaning not supported
by the program.
> > Or you had to add a different interface, dedicated to retrieve the exact
> > supported feature mask.
> >
> >>> Moreover, it also ensure the requested features set in comp_mask
> >>> are sequentially set, not skipping intermediate features, eg. the
> >>> "highest" requested feature also request all the "lower" ones.
> >>> This way each new feature will have to be stacked on top of the
> >>> existing ones: this is especially important for the request and
> >>> response data structures where fields are added after the
> >>> current ones when expanded to support a new feature.
> >>
> >> I think it is perfectly acceptable that not all drivers will implement
> >> all available features, and so you shouldn't enforce this requirement.
> >
> > With regard to QUERY_DEVICE: the data structure layout depends on the
> > comp_mask value. So either you propose a way to express multipart data
> > structure (see CMSG or NETLINK), or we have to ensure the data structure
> > is ever-growing, with each new chunck stacked over the existing ones:
> > that's the purpose of :
> >
> > if (cmd.comp_mask & (cmd.comp_mask + 1))
> > return -EINVAL;
> >
> >> Also, it makes the comp_mask nothing more than a wasteful version number
> >> between 0 and 63.
> >
> > That's what I've already explained earlier in "Re: [PATCH v3 06/17]
> > IB/core: Add support for extended query device caps":
> >
> > http://mid.gmane.org/1421844612.13543.40.camel-RlY5vtjFyJ3QT0dZR+AlfA@public.gmane.org
>
> Yes, you wrote there:
> > Regarding comp_mask (not for this particular verb):
> >
> > It's not clear whether request's comp_mask describe the request or the
> > response, as such I'm puzzled.
> >
> > How would the kernel and the userspace be able to parse the request and
> > the response if they ignore unknown bits ?
> >
> > How would they be able to skip the unrecognized chunk of the request and
> > response buffer ?
> >
> > How one bit in a comp_mask is related to a chunk in the request or
> > response ?
> >
> > It's likely the kernel or userspace would have to skip the remaining
> > comp_mask's bits after encountering an unknown bit as the size of the
> > corresponding chunk in request or response would be unknown, making
> > impossible to locate the corresponding chunk for the next bit set in
> > comp_mask. Having said that, comp_mask is just a complicated way of
> > expressing a version, which is turn describe a size (ever growing).
>
> It is my understanding that each comp_mask bit marks a set of fields in
> the command or in the response struct as valid, so the struct format
> remains the same and the kernel and userspace don't need to make
> difficult calculation as to where each field is, but you can still pass
> a high bit set in comp_mask with one of the lower bits cleared.
>
How can the struct format remain the same, if some fields are added
to implement newer feature ?
> I couldn't find this explicit detail in the mailing list, but I did found
> a presentation that was presented in OFA International Developer
> Workshop 2013 [1], that gave an example of of an verb where each
> comp_mask bit marked a different field as valid.
>
Thanks for the link to the presentation.
As I read it the presentation:
- in request, comp_mask give hint to the kernel of additional fields in
the request.
- in response, comp_mask give hint to userspace regarding the presence
of additional fields in the response.
But commit 860f10a799c8 ("IB/core: Add flags for on demand paging
support") on top of commit 5a77abf9a97a ("IB/core: Add support for
extended query device caps") is not doing it the expected way
as one bit set in request's comp_mask has direct effect on
the response's fields.
To be conformant with the "Extending Verbs API" presentation,
no check should be done on request's comp_mask, and on-demand paging
information should be returned to userspace in all case, provided
there's enough room in the response buffer.
Anyway, allowing userspace to set any "hint" in the request's comp_mask
is opening a pandora box: being allowed to set any value, userspace
program will use any random value, as it will work with current kernel.
But the same program used on newer kernel is going to trigger some
behavior unknown to the application or return an error the application
is not ready to handle ... breaking any kind of forward compatibility
promise.
It's likely the kernel will have to use the size of the request to
guess the hints in comp_mask are corrects to handle such. In such case,
relying only on the size of the request might be enough to detect
extended request, without the need for comp_mask.
Regarding the answer, if the response buffer is smaller than the size
of the extended response, will the kernel keep setting the response's
comp_mask with all the bits it supports or will it unset the bits for
the fields it wasn't able to fit in the response buffer ?
It's likely the kernel will have to use the size of the response buffer
to set the response's comp_mask.
Instead commit 860f10a799c8 ("IB/core: Add flags for on demand paging
support") on top of commit 5a77abf9a97a ("IB/core: Add support for
extended query device caps") make it possible for the kernel to return
truncated response with full comp_mask. Such is going to break silently
when one will introduce a data structure with an ABI mismatch between
userspace and kernel (for example x86 vs amd64 ... we have some recent
exemples).
> >
> >>
> >> In the specific case of QUERY_DEVICE you might argue that there isn't
> >> any need for input comp_mask, only for output, and then you may enforce
> >> the input comp_mask will always be zero.
> >
> > The extended QUERY_DEVICE uverbs as currently merged is using comp_mask
> > from input to choose to report on-demand-paging related value. So it
> > seems it's needed.
> >
> > http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/drivers/infiniband/core/uverbs_cmd.c?id=v3.19-rc6#n3297
> >
> >> However, you will in any case need to be able to extended the size of the response in the future.
> >>
> >
> > That's already the case for on demand paging.
> >
> >>>
> >>> Link: http://mid.gmane.org/cover.1421931555.git.ydroneaud@opteya.com
> >>> Cc: Sagi Grimberg <sagig-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
> >>> Cc: Shachar Raindel <raindel-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
> >>> Cc: Eli Cohen <eli-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
> >>> Cc: Haggai Eran <haggaie-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
> >>> Signed-off-by: Yann Droneaud <ydroneaud-RlY5vtjFyJ3QT0dZR+AlfA@public.gmane.org>
> >>> ---
> >>> drivers/infiniband/core/uverbs_cmd.c | 6 ++++++
> >>> 1 file changed, 6 insertions(+)
> >>>
> >>> diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
> >>> index 8668b328b7e6..80a1c90f1dbf 100644
> >>> --- a/drivers/infiniband/core/uverbs_cmd.c
> >>> +++ b/drivers/infiniband/core/uverbs_cmd.c
> >>> @@ -3313,6 +3313,12 @@ int ib_uverbs_ex_query_device(struct ib_uverbs_file *file,
> >>> if (err)
> >>> return err;
> >>>
> >>> + if (cmd.comp_mask & (cmd.comp_mask + 1))
> >>> + return -EINVAL;
> >>> +
> >>> + if (cmd.comp_mask & ~(__u32)IB_USER_VERBS_EX_QUERY_DEVICE_ODP)
> >>> + return -EINVAL;
> >>> +
> >
> > If we keep the checks on output buffer size from patch 1, returning
> > -ENOSPC in case of size mismatch, and if we are sure that no bit in
> > input comp_mask will ever trigger a call to a kernel function that can
> > make the uverb fail, the latter check on known value could be dropped,
> > allowing the userspace to set the highest mask (-1): userspace
> > will use -ENOSPC to probe the expected size of the response buffer
> > to match the highest supported comp_mask. But it's going to hurt
> > userspace application not ready to receive -ENOSPC on newer kernel
> > with extended QUERY_DEVICE ABI ... Oops.
> >
> > So in this end, the safest way to ensure userspace is doing the correct
> > thing so that we have backward and forward compatibility is to check
> > for known values in comp_mask, check for response buffer size and ensure
> > that data structure chunk are stacked.
> >
> > The tighter are the checks now, the easier the interface could be
> > extended latter.
>
> I understand this position, and I generally agree, but I think that
> specifically for a verb like QUERY_DEVICE that only reads information from
> the kernel driver to user-space, there is no harm in the kernel just
> providing all the information it can fit in the response buffer provided
> by user-space.
>
Returning as much as possible information to userspace is OK,
but it's doing it the wrong way.
I'm not specifically discussing QUERY_DEVICE, as I'm concerned with
every extended uverbs to be added to the kernel.
> Let me explain: newer fields are added to the kernel response struct at the
> end, together with a new comp_mask bit.
>
> Older user-space with newer kernels will simply ask only for the buffer
> size they care about. The fact that the struct is truncated doesn't affect
> these programs because the truncated struct is a valid struct that was
> presented by the kernel in an older version.
>
You cannot ensure the userspace being correct when specifying a request.
It's a wrong assumption (perhaps not so wrong in the case of
InfiniBand/RDMA, as most userspace program are using libibverbs,
librdmacm and provider's libraries).
That's why we must not be liberal with the request and check any bit of
it for being something valid, so that erroneous values are catch now,
ensuring userspace is not trying to request things we don't know yet
and it's not aware of it too.
> They may or may not receive a comp_mask bit they don't recognize, but that
> only tells them that the kernel supports new features they don't know about.
>
> Newer user-space with older kernel will give a larger buffer then the
> kernel can fill. The kernel only fills in the beginning of the user-space
> buffer, and provides user-space with the comp_mask bits that mark which
> fields are valid. So user-space can tell that the end of the buffer isn't
> valid.
>
But older userspace programs with newer kernel might break, as request's
comp_mask was allowed to contains invalid values in older kernel.
> In my implementation I also left the ending uninitialized, but the
> kernel can zero it if you think it is important.
>
Why not, but it's a minor point.
> So I hope you agree this scheme is extendible and allows keeping backward
> and forward compatibility. If you can think of another scheme that will be
> more strict with the buffer sizes, but doesn't require user-space to do
> extra work, I'll be happy to hear about it.
>
I'm sorry about that but I cannot agree with the current scheme.
> [1] Extending Verbs API, Tzahi Oved
> https://www.openfabrics.org/images/docs/2013_Dev_Workshop/Tues_0423/2013_Workshop_Tues_0830_Tzahi_Oved-verbs_extensions_ofa_2013-tzahio.pdf
Regards.
--
Yann Droneaud
OPTEYA
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
* Re: [PATCH 3/4] IB/uverbs: ex_query_device: check request's comp_mask
From: Haggai Eran @ 2015-01-28 15:40 UTC (permalink / raw)
To: Yann Droneaud
Cc: Sagi Grimberg, Shachar Raindel, Eli Cohen, Roland Dreier,
linux-rdma-u79uwXL29TY76Z2rM5mHXA,
linux-api-u79uwXL29TY76Z2rM5mHXA, Tzahi Oved, Or Gerlitz,
Matan Barak
In-Reply-To: <1422451151.3133.130.camel-RlY5vtjFyJ3QT0dZR+AlfA@public.gmane.org>
On 28/01/2015 15:19, Yann Droneaud wrote:
> Hi,
>
> Le mardi 27 janvier 2015 à 08:50 +0200, Haggai Eran a écrit :
>> On 26/01/2015 13:17, Yann Droneaud wrote:
>>> ...
>>> Le dimanche 25 janvier 2015 à 17:23 +0200, Haggai Eran a écrit :
>>>> On 22/01/2015 15:28, Yann Droneaud wrote:
>>>>> This patch ensures the extended QUERY_DEVICE uverbs request's
>>>>> comp_mask has only known values. If userspace returns unknown
>>>>> features, -EINVAL will be returned, allowing to probe/discover
>>>>> which features are currently supported by the kernel.
>>>>
>>>> This probing process will be much more cumbersome than it needs to be
>>>> because userspace will have to call QUERY_DEVICE repetitively with
>>>> different comp_mask fields until it finds out the exact set of supported
>>>> bits.
>>>>
>>>
>>> O(log2(N))
>>
>> I don't think user space developers will be happy having to do trial and
>> error to determine what features the kernel driver supports. It might be
>> even more then O(log2(N)). If my understanding of comp_mask bits usage is
>> correct it would O(N). But it's not the time complexity I'm worried about,
>> it's the fact that it requires user-space developers to go through hoops in
>> order to get information that can be much more easily exported.
>>
>
> But there's currently *NO* such mean that could give a hint to userspace
> developer whether one bit of request's comp_mask is currently effective
> in extended QUERY_DEVICE uverbs. While we have such mean in CREATE_FLOW
> and DESTROY_FLOW: unsupported bits trigger -EINVAL.
>
> In QUERY_DEVICE, userspace developer is allowed to set request's
> comp_mask to -1: it won't hurt it on current kernel, so why not using
> this value or any other random value ? The program will run: it's "Works
> For Me".
>
> But the same program (either binary or source code) might fail on
> newer kernel where some bits in comp_mask gain a meaning not supported
> by the program.
Why don't we make the command comp_mask in QUERY_DEVICE zero in both
versions. The behavior of the command with comp_mask = 0 will be to
return the maximum amount of data that fits in the response buffer. The
kernel will return -EINVAL if the input command comp_mask is not zero in
the current version.
If in the future we want to alter the behavior or add more input fields
to QUERY_DEVICE, we would use new bits.
>>> Or you had to add a different interface, dedicated to retrieve the exact
>>> supported feature mask.
>>>
>>>>> Moreover, it also ensure the requested features set in comp_mask
>>>>> are sequentially set, not skipping intermediate features, eg. the
>>>>> "highest" requested feature also request all the "lower" ones.
>>>>> This way each new feature will have to be stacked on top of the
>>>>> existing ones: this is especially important for the request and
>>>>> response data structures where fields are added after the
>>>>> current ones when expanded to support a new feature.
>>>>
>>>> I think it is perfectly acceptable that not all drivers will implement
>>>> all available features, and so you shouldn't enforce this requirement.
>>>
>>> With regard to QUERY_DEVICE: the data structure layout depends on the
>>> comp_mask value. So either you propose a way to express multipart data
>>> structure (see CMSG or NETLINK), or we have to ensure the data structure
>>> is ever-growing, with each new chunck stacked over the existing ones:
>>> that's the purpose of :
>>>
>>> if (cmd.comp_mask & (cmd.comp_mask + 1))
>>> return -EINVAL;
>>>
>>>> Also, it makes the comp_mask nothing more than a wasteful version number
>>>> between 0 and 63.
>>>
>>> That's what I've already explained earlier in "Re: [PATCH v3 06/17]
>>> IB/core: Add support for extended query device caps":
>>>
>>> http://mid.gmane.org/1421844612.13543.40.camel-RlY5vtjFyJ3QT0dZR+AlfA@public.gmane.org
>>
>> Yes, you wrote there:
>>> Regarding comp_mask (not for this particular verb):
>>>
>>> It's not clear whether request's comp_mask describe the request or the
>>> response, as such I'm puzzled.
>>>
>>> How would the kernel and the userspace be able to parse the request and
>>> the response if they ignore unknown bits ?
>>>
>>> How would they be able to skip the unrecognized chunk of the request and
>>> response buffer ?
>>>
>>> How one bit in a comp_mask is related to a chunk in the request or
>>> response ?
>>>
>>> It's likely the kernel or userspace would have to skip the remaining
>>> comp_mask's bits after encountering an unknown bit as the size of the
>>> corresponding chunk in request or response would be unknown, making
>>> impossible to locate the corresponding chunk for the next bit set in
>>> comp_mask. Having said that, comp_mask is just a complicated way of
>>> expressing a version, which is turn describe a size (ever growing).
>>
>> It is my understanding that each comp_mask bit marks a set of fields in
>> the command or in the response struct as valid, so the struct format
>> remains the same and the kernel and userspace don't need to make
>> difficult calculation as to where each field is, but you can still pass
>> a high bit set in comp_mask with one of the lower bits cleared.
>>
>
> How can the struct format remain the same, if some fields are added
> to implement newer feature ?
I meant that the binary format for an older version is the prefix of the
binary format of the newer version.
>> I couldn't find this explicit detail in the mailing list, but I did found
>> a presentation that was presented in OFA International Developer
>> Workshop 2013 [1], that gave an example of of an verb where each
>> comp_mask bit marked a different field as valid.
>>
>
> Thanks for the link to the presentation.
>
> As I read it the presentation:
>
> - in request, comp_mask give hint to the kernel of additional fields in
> the request.
>
> - in response, comp_mask give hint to userspace regarding the presence
> of additional fields in the response.
I'm not sure that in request you can regard the comp_mask as a hint. I
agree that you need to enforce it in general and reject unknown bits
there (although I thought QUERY_DEVICE would be an exception).
> But commit 860f10a799c8 ("IB/core: Add flags for on demand paging
> support") on top of commit 5a77abf9a97a ("IB/core: Add support for
> extended query device caps") is not doing it the expected way
> as one bit set in request's comp_mask has direct effect on
> the response's fields.
>
> To be conformant with the "Extending Verbs API" presentation,
> no check should be done on request's comp_mask, and on-demand paging
> information should be returned to userspace in all case, provided
> there's enough room in the response buffer.
>
> Anyway, allowing userspace to set any "hint" in the request's comp_mask
> is opening a pandora box: being allowed to set any value, userspace
> program will use any random value, as it will work with current kernel.
>
> But the same program used on newer kernel is going to trigger some
> behavior unknown to the application or return an error the application
> is not ready to handle ... breaking any kind of forward compatibility
> promise.
I don't think that the application should handle unknown response bits
as an error. As you wrote, I see these as hints about more data in the
response that the application is free to ignore.
> It's likely the kernel will have to use the size of the request to
> guess the hints in comp_mask are corrects to handle such. In such case,
> relying only on the size of the request might be enough to detect
> extended request, without the need for comp_mask.
>
> Regarding the answer, if the response buffer is smaller than the size
> of the extended response, will the kernel keep setting the response's
> comp_mask with all the bits it supports or will it unset the bits for
> the fields it wasn't able to fit in the response buffer ?
>
> It's likely the kernel will have to use the size of the response buffer
> to set the response's comp_mask.
>
> Instead commit 860f10a799c8 ("IB/core: Add flags for on demand paging
> support") on top of commit 5a77abf9a97a ("IB/core: Add support for
> extended query device caps") make it possible for the kernel to return
> truncated response with full comp_mask. Such is going to break silently
> when one will introduce a data structure with an ABI mismatch between
> userspace and kernel (for example x86 vs amd64 ... we have some recent
> exemples).
As I said, I think unknown bits in the comp_mask are hints about unknown
fields in the response that can be ignored by the application. However,
I can agree to having the kernel checking the response buffer size as
you wrote, and only setting the valid comp_mask bits.
>
>>>
>>>>
>>>> In the specific case of QUERY_DEVICE you might argue that there isn't
>>>> any need for input comp_mask, only for output, and then you may enforce
>>>> the input comp_mask will always be zero.
>>>
>>> The extended QUERY_DEVICE uverbs as currently merged is using comp_mask
>>> from input to choose to report on-demand-paging related value. So it
>>> seems it's needed.
>>>
>>> http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/drivers/infiniband/core/uverbs_cmd.c?id=v3.19-rc6#n3297
>>>
>>>> However, you will in any case need to be able to extended the size of the response in the future.
>>>>
>>>
>>> That's already the case for on demand paging.
>>>
>>>>>
>>>>> Link: http://mid.gmane.org/cover.1421931555.git.ydroneaud@opteya.com
>>>>> Cc: Sagi Grimberg <sagig-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
>>>>> Cc: Shachar Raindel <raindel-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
>>>>> Cc: Eli Cohen <eli-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
>>>>> Cc: Haggai Eran <haggaie-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
>>>>> Signed-off-by: Yann Droneaud <ydroneaud-RlY5vtjFyJ3QT0dZR+AlfA@public.gmane.org>
>>>>> ---
>>>>> drivers/infiniband/core/uverbs_cmd.c | 6 ++++++
>>>>> 1 file changed, 6 insertions(+)
>>>>>
>>>>> diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
>>>>> index 8668b328b7e6..80a1c90f1dbf 100644
>>>>> --- a/drivers/infiniband/core/uverbs_cmd.c
>>>>> +++ b/drivers/infiniband/core/uverbs_cmd.c
>>>>> @@ -3313,6 +3313,12 @@ int ib_uverbs_ex_query_device(struct ib_uverbs_file *file,
>>>>> if (err)
>>>>> return err;
>>>>>
>>>>> + if (cmd.comp_mask & (cmd.comp_mask + 1))
>>>>> + return -EINVAL;
>>>>> +
>>>>> + if (cmd.comp_mask & ~(__u32)IB_USER_VERBS_EX_QUERY_DEVICE_ODP)
>>>>> + return -EINVAL;
>>>>> +
>>>
>>> If we keep the checks on output buffer size from patch 1, returning
>>> -ENOSPC in case of size mismatch, and if we are sure that no bit in
>>> input comp_mask will ever trigger a call to a kernel function that can
>>> make the uverb fail, the latter check on known value could be dropped,
>>> allowing the userspace to set the highest mask (-1): userspace
>>> will use -ENOSPC to probe the expected size of the response buffer
>>> to match the highest supported comp_mask. But it's going to hurt
>>> userspace application not ready to receive -ENOSPC on newer kernel
>>> with extended QUERY_DEVICE ABI ... Oops.
>>>
>>> So in this end, the safest way to ensure userspace is doing the correct
>>> thing so that we have backward and forward compatibility is to check
>>> for known values in comp_mask, check for response buffer size and ensure
>>> that data structure chunk are stacked.
>>>
>>> The tighter are the checks now, the easier the interface could be
>>> extended latter.
>>
>> I understand this position, and I generally agree, but I think that
>> specifically for a verb like QUERY_DEVICE that only reads information from
>> the kernel driver to user-space, there is no harm in the kernel just
>> providing all the information it can fit in the response buffer provided
>> by user-space.
>>
>
> Returning as much as possible information to userspace is OK,
> but it's doing it the wrong way.
>
> I'm not specifically discussing QUERY_DEVICE, as I'm concerned with
> every extended uverbs to be added to the kernel.
>
>> Let me explain: newer fields are added to the kernel response struct at the
>> end, together with a new comp_mask bit.
>>
>> Older user-space with newer kernels will simply ask only for the buffer
>> size they care about. The fact that the struct is truncated doesn't affect
>> these programs because the truncated struct is a valid struct that was
>> presented by the kernel in an older version.
>>
>
> You cannot ensure the userspace being correct when specifying a request.
> It's a wrong assumption (perhaps not so wrong in the case of
> InfiniBand/RDMA, as most userspace program are using libibverbs,
> librdmacm and provider's libraries).
>
> That's why we must not be liberal with the request and check any bit of
> it for being something valid, so that erroneous values are catch now,
> ensuring userspace is not trying to request things we don't know yet
> and it's not aware of it too.
Does the solution I proposed above satisfy this requirement?
- The kernel validates input size == command struct size and
cmd.comp_mask == 0.
- The kernel fills as much information as it can fit in the buffer
provided by userspace.
- The kernel marks which fields it was able to fill in the response's
comp_mask.
Regards,
Haggai
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
* Re: [dmidecode] [Patch v3] firmware: dmi-sysfs: add SMBIOS entry point area raw attribute
From: Ivan Khoronzhuk @ 2015-01-28 15:56 UTC (permalink / raw)
To: linux-kernel, ard.biesheuvel, grant.likely, matt.fleming,
linux-api, linux-doc@vger.kernel.org
Cc: dmidecode-devel, leif.lindholm
In-Reply-To: <1422448763-17583-1-git-send-email-ivan.khoronzhuk@linaro.org>
+ linux-api@vger.kernel.org
+ linux-doc@vger.kernel.org
On 01/28/2015 02:39 PM, Ivan Khoronzhuk wrote:
> Some utils, like dmidecode and smbios, needs to access SMBIOS entry
> table area in order to get information like SMBIOS version, size, etc.
> Currently it's done via /dev/mem. But for situation when /dev/mem
> usage is disabled, the utils have to use dmi sysfs instead, which
> doesn't represent SMBIOS entry. So this patch adds SMBIOS area to
> dmi-sysfs in order to allow utils in question to work correctly with
> dmi sysfs interface.
>
> Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
> Signed-off-by: Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org>
> ---
>
> v1: https://lkml.org/lkml/2015/1/23/643
> v2: https://lkml.org/lkml/2015/1/26/345
>
> v3..v2:
> firmware: dmi_scan: add symbol to get SMBIOS entry area
> firmware: dmi-sysfs: add SMBIOS entry point area attribute
> combined in one patch
> added SMBIOS information to ABI sysfs-dmi documentaton
>
> v2..v1:
> firmware: dmi_scan: add symbol to get SMBIOS entry area
> - used additional static var to hold SMBIOS raw table size
> - changed format of get_smbios_entry_area symbol
> returned pointer on const smbios table
>
> firmware: dmi-sysfs: add SMBIOS entry point area attribute
> - adopted to updated get_smbios_entry_area symbol
> - removed redundant array to save smbios table
>
>
> Documentation/ABI/testing/sysfs-firmware-dmi | 10 +++++++
> drivers/firmware/dmi-sysfs.c | 42 ++++++++++++++++++++++++++++
> drivers/firmware/dmi_scan.c | 26 +++++++++++++++++
> include/linux/dmi.h | 3 ++
> 4 files changed, 81 insertions(+)
>
> diff --git a/Documentation/ABI/testing/sysfs-firmware-dmi b/Documentation/ABI/testing/sysfs-firmware-dmi
> index c78f9ab..3a9ffe8 100644
> --- a/Documentation/ABI/testing/sysfs-firmware-dmi
> +++ b/Documentation/ABI/testing/sysfs-firmware-dmi
> @@ -12,6 +12,16 @@ Description:
> cannot ensure that the data as exported to userland is
> without error either.
>
> + The firmware provides DMI structures as a packed list of
> + data referenced by a SMBIOS table entry point. The SMBIOS
> + entry point contains general information, like SMBIOS
> + version, DMI table size, etc. The structure, content and
> + size of SMBIOS entry point is dependent on SMBIOS version.
> + That's why SMBIOS entry point is represented in dmi sysfs
> + like a raw attribute and is accessible via
> + /sys/firmware/dmi/smbios_raw_header. The format of SMBIOS
> + entry point header can be read in SMBIOS specification.
> +
> DMI is structured as a large table of entries, where
> each entry has a common header indicating the type and
> length of the entry, as well as a firmware-provided
> diff --git a/drivers/firmware/dmi-sysfs.c b/drivers/firmware/dmi-sysfs.c
> index e0f1cb3..61b6a38 100644
> --- a/drivers/firmware/dmi-sysfs.c
> +++ b/drivers/firmware/dmi-sysfs.c
> @@ -29,6 +29,8 @@
> #define MAX_ENTRY_TYPE 255 /* Most of these aren't used, but we consider
> the top entry type is only 8 bits */
>
> +static const u8 *smbios_raw_header;
> +
> struct dmi_sysfs_entry {
> struct dmi_header dh;
> struct kobject kobj;
> @@ -646,9 +648,37 @@ static void cleanup_entry_list(void)
> }
> }
>
> +static ssize_t smbios_entry_area_raw_read(struct file *filp,
> + struct kobject *kobj,
> + struct bin_attribute *bin_attr,
> + char *buf, loff_t pos, size_t count)
> +{
> + ssize_t size;
> +
> + size = bin_attr->size;
> +
> + if (size > pos)
> + size -= pos;
> + else
> + return 0;
> +
> + if (count < size)
> + size = count;
> +
> + memcpy(buf, &smbios_raw_header[pos], size);
> +
> + return size;
> +}
> +
> +static struct bin_attribute smbios_raw_area_attr = {
> + .read = smbios_entry_area_raw_read,
> + .attr = {.name = "smbios_raw_header", .mode = 0400},
> +};
> +
> static int __init dmi_sysfs_init(void)
> {
> int error = -ENOMEM;
> + int size;
> int val;
>
> /* Set up our directory */
> @@ -669,6 +699,18 @@ static int __init dmi_sysfs_init(void)
> goto err;
> }
>
> + smbios_raw_header = dmi_get_smbios_entry_area(&size);
> + if (!smbios_raw_header) {
> + pr_debug("dmi-sysfs: SMBIOS raw data is not available.\n");
> + error = -ENODATA;
> + goto err;
> + }
> +
> + /* Create the raw binary file to access the entry area */
> + smbios_raw_area_attr.size = size;
> + if (sysfs_create_bin_file(dmi_kobj, &smbios_raw_area_attr))
> + goto err;
> +
> pr_debug("dmi-sysfs: loaded.\n");
>
> return 0;
> diff --git a/drivers/firmware/dmi_scan.c b/drivers/firmware/dmi_scan.c
> index 420c8d8..d94c6b7 100644
> --- a/drivers/firmware/dmi_scan.c
> +++ b/drivers/firmware/dmi_scan.c
> @@ -113,6 +113,8 @@ static void dmi_table(u8 *buf, int len, int num,
> }
> }
>
> +static unsigned char smbios_header[32];
> +static int smbios_header_size;
> static phys_addr_t dmi_base;
> static u16 dmi_len;
> static u16 dmi_num;
> @@ -474,6 +476,8 @@ static int __init dmi_present(const u8 *buf)
> if (memcmp(buf, "_SM_", 4) == 0 &&
> buf[5] < 32 && dmi_checksum(buf, buf[5])) {
> smbios_ver = get_unaligned_be16(buf + 6);
> + smbios_header_size = buf[5];
> + memcpy(smbios_header, buf, smbios_header_size);
>
> /* Some BIOS report weird SMBIOS version, fix that up */
> switch (smbios_ver) {
> @@ -505,6 +509,8 @@ static int __init dmi_present(const u8 *buf)
> pr_info("SMBIOS %d.%d present.\n",
> dmi_ver >> 8, dmi_ver & 0xFF);
> } else {
> + smbios_header_size = 15;
> + memcpy(smbios_header, buf, smbios_header_size);
> dmi_ver = (buf[14] & 0xF0) << 4 |
> (buf[14] & 0x0F);
> pr_info("Legacy DMI %d.%d present.\n",
> @@ -531,6 +537,8 @@ static int __init dmi_smbios3_present(const u8 *buf)
> dmi_ver &= 0xFFFFFF;
> dmi_len = get_unaligned_le32(buf + 12);
> dmi_base = get_unaligned_le64(buf + 16);
> + smbios_header_size = buf[6];
> + memcpy(smbios_header, buf, smbios_header_size);
>
> /*
> * The 64-bit SMBIOS 3.0 entry point no longer has a field
> @@ -944,3 +952,21 @@ void dmi_memdev_name(u16 handle, const char **bank, const char **device)
> }
> }
> EXPORT_SYMBOL_GPL(dmi_memdev_name);
> +
> +/**
> + * dmi_get_smbios_entry_area - copy SMBIOS entry point area to array.
> + * @size - pointer to assign actual size of SMBIOS entry point area.
> + *
> + * returns NULL if table is not available, otherwise returns pointer on
> + * SMBIOS entry point area array.
> + */
> +const u8 *dmi_get_smbios_entry_area(int *size)
> +{
> + if (!smbios_header_size || !dmi_available)
> + return NULL;
> +
> + *size = smbios_header_size;
> +
> + return smbios_header;
> +}
> +EXPORT_SYMBOL_GPL(dmi_get_smbios_entry_area);
> diff --git a/include/linux/dmi.h b/include/linux/dmi.h
> index f820f0a..8e1a28d 100644
> --- a/include/linux/dmi.h
> +++ b/include/linux/dmi.h
> @@ -109,6 +109,7 @@ extern int dmi_walk(void (*decode)(const struct dmi_header *, void *),
> void *private_data);
> extern bool dmi_match(enum dmi_field f, const char *str);
> extern void dmi_memdev_name(u16 handle, const char **bank, const char **device);
> +const u8 *dmi_get_smbios_entry_area(int *size);
>
> #else
>
> @@ -140,6 +141,8 @@ static inline void dmi_memdev_name(u16 handle, const char **bank,
> const char **device) { }
> static inline const struct dmi_system_id *
> dmi_first_match(const struct dmi_system_id *list) { return NULL; }
> +static inline const u8 *dmi_get_smbios_entry_area(int *size)
> + { return NULL; }
>
> #endif
>
_______________________________________________
https://lists.nongnu.org/mailman/listinfo/dmidecode-devel
^ permalink raw reply
* Re: [PATCH v2 linux-trace 4/8] samples: bpf: simple tracing example in C
From: Arnaldo Carvalho de Melo @ 2015-01-28 16:24 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Steven Rostedt, Ingo Molnar, Namhyung Kim, Jiri Olsa,
Masami Hiramatsu, linux-api, netdev, linux-kernel
In-Reply-To: <1422417973-10195-5-git-send-email-ast@plumgrid.com>
Em Tue, Jan 27, 2015 at 08:06:09PM -0800, Alexei Starovoitov escreveu:
> diff --git a/samples/bpf/tracex1_kern.c b/samples/bpf/tracex1_kern.c
> new file mode 100644
> index 000000000000..7849ceb4bce6
> --- /dev/null
> +++ b/samples/bpf/tracex1_kern.c
> @@ -0,0 +1,28 @@
> +#include <linux/skbuff.h>
> +#include <linux/netdevice.h>
> +#include <uapi/linux/bpf.h>
> +#include <trace/bpf_trace.h>
> +#include "bpf_helpers.h"
> +
> +SEC("events/net/netif_receive_skb")
> +int bpf_prog1(struct bpf_context *ctx)
> +{
> + /*
> + * attaches to /sys/kernel/debug/tracing/events/net/netif_receive_skb
> + * prints events for loobpack device only
> + */
> + char devname[] = "lo";
> + struct net_device *dev;
> + struct sk_buff *skb = 0;
> +
> + skb = (struct sk_buff *) ctx->arg1;
> + dev = bpf_fetch_ptr(&skb->dev);
> + if (bpf_memcmp(dev->name, devname, 2) == 0)
I'm only starting to look at all this, so bear with me... But why do we
need to have it as "bpf_memcmp"? Can't we simply use it as "memcmp" and
have it use the right function?
Less typing, perhaps we would need to have a:
#define memcmp bpf_memcmp(s1, s2, n) bpf_memcmp(s1, s2, n)
in bpf_helpers.h to have it work?
- Arnaldo
> + /* print event using default tracepoint format */
> + return 1;
> +
> + /* drop event */
> + return 0;
> +}
^ permalink raw reply
* Re: [PATCH v2 linux-trace 4/8] samples: bpf: simple tracing example in C
From: Arnaldo Carvalho de Melo @ 2015-01-28 16:25 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Steven Rostedt, Ingo Molnar, Namhyung Kim, Jiri Olsa,
Masami Hiramatsu, linux-api, netdev, linux-kernel
In-Reply-To: <20150128162415.GO7220@kernel.org>
Em Wed, Jan 28, 2015 at 01:24:15PM -0300, Arnaldo Carvalho de Melo escreveu:
> Em Tue, Jan 27, 2015 at 08:06:09PM -0800, Alexei Starovoitov escreveu:
> > diff --git a/samples/bpf/tracex1_kern.c b/samples/bpf/tracex1_kern.c
> > new file mode 100644
> > index 000000000000..7849ceb4bce6
> > --- /dev/null
> > +++ b/samples/bpf/tracex1_kern.c
> > @@ -0,0 +1,28 @@
> > +#include <linux/skbuff.h>
> > +#include <linux/netdevice.h>
> > +#include <uapi/linux/bpf.h>
> > +#include <trace/bpf_trace.h>
> > +#include "bpf_helpers.h"
> > +
> > +SEC("events/net/netif_receive_skb")
> > +int bpf_prog1(struct bpf_context *ctx)
> > +{
> > + /*
> > + * attaches to /sys/kernel/debug/tracing/events/net/netif_receive_skb
> > + * prints events for loobpack device only
> > + */
> > + char devname[] = "lo";
> > + struct net_device *dev;
> > + struct sk_buff *skb = 0;
> > +
> > + skb = (struct sk_buff *) ctx->arg1;
> > + dev = bpf_fetch_ptr(&skb->dev);
> > + if (bpf_memcmp(dev->name, devname, 2) == 0)
>
> I'm only starting to look at all this, so bear with me... But why do we
> need to have it as "bpf_memcmp"? Can't we simply use it as "memcmp" and
> have it use the right function?
>
> Less typing, perhaps we would need to have a:
>
> #define memcmp bpf_memcmp(s1, s2, n) bpf_memcmp(s1, s2, n)
Argh, like this:
#define memcmp(s1, s2, n) bpf_memcmp(s1, s2, n)
> in bpf_helpers.h to have it work?
>
> - Arnaldo
>
> > + /* print event using default tracepoint format */
> > + return 1;
> > +
> > + /* drop event */
> > + return 0;
> > +}
^ permalink raw reply
* Re: [PATCH net-next v1 01/18] net: ethtool: propagate get_settings error
From: Ben Hutchings @ 2015-01-28 16:30 UTC (permalink / raw)
To: David Decotigny
Cc: David S. Miller, Amir Vadai, linux-kernel, netdev, linux-api,
Eric Dumazet, Eugenia Emantayev, Or Gerlitz, Ido Shamay,
Joe Perches, Saeed Mahameed, Govindarajulu Varadarajan,
Venkata Duvvuru, Jeff Kirsher, Eyal Perry, Pravin B Shelar,
Ed Swierk, David Decotigny
In-Reply-To: <1422322574-6188-2-git-send-email-ddecotig@gmail.com>
[-- Attachment #1: Type: text/plain, Size: 942 bytes --]
On Mon, 2015-01-26 at 17:35 -0800, David Decotigny wrote:
> From: David Decotigny <decot@googlers.com>
>
> Signed-off-by: David Decotigny <decot@googlers.com>
> ---
> net/core/ethtool.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/net/core/ethtool.c b/net/core/ethtool.c
> index 91f74f3..52efb7e 100644
> --- a/net/core/ethtool.c
> +++ b/net/core/ethtool.c
> @@ -364,7 +364,7 @@ static int ethtool_get_settings(struct net_device *dev, void __user *useraddr)
>
> if (copy_to_user(useraddr, &cmd, sizeof(cmd)))
> return -EFAULT;
> - return 0;
> + return err;
err cannnot be negative at this point (and if it's positive, that's a
bug in the get_settings implementation which we happen to fix up).
Ben.
> }
>
> static int ethtool_set_settings(struct net_device *dev, void __user *useraddr)
--
Ben Hutchings
Teamwork is essential - it allows you to blame someone else.
[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 811 bytes --]
^ permalink raw reply
* Re: [PATCH v2 linux-trace 4/8] samples: bpf: simple tracing example in C
From: Alexei Starovoitov @ 2015-01-28 16:42 UTC (permalink / raw)
To: Arnaldo Carvalho de Melo
Cc: Steven Rostedt, Ingo Molnar, Namhyung Kim, Jiri Olsa,
Masami Hiramatsu, Linux API, Network Development, LKML
In-Reply-To: <20150128162557.GP7220-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
On Wed, Jan 28, 2015 at 8:25 AM, Arnaldo Carvalho de Melo
<acme-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org> wrote:
> Em Wed, Jan 28, 2015 at 01:24:15PM -0300, Arnaldo Carvalho de Melo escreveu:
>> Em Tue, Jan 27, 2015 at 08:06:09PM -0800, Alexei Starovoitov escreveu:
>> > diff --git a/samples/bpf/tracex1_kern.c b/samples/bpf/tracex1_kern.c
>> > new file mode 100644
>> > index 000000000000..7849ceb4bce6
>> > --- /dev/null
>> > +++ b/samples/bpf/tracex1_kern.c
>> > @@ -0,0 +1,28 @@
>> > +#include <linux/skbuff.h>
>> > +#include <linux/netdevice.h>
>> > +#include <uapi/linux/bpf.h>
>> > +#include <trace/bpf_trace.h>
>> > +#include "bpf_helpers.h"
>> > +
>> > +SEC("events/net/netif_receive_skb")
>> > +int bpf_prog1(struct bpf_context *ctx)
>> > +{
>> > + /*
>> > + * attaches to /sys/kernel/debug/tracing/events/net/netif_receive_skb
>> > + * prints events for loobpack device only
>> > + */
>> > + char devname[] = "lo";
>> > + struct net_device *dev;
>> > + struct sk_buff *skb = 0;
>> > +
>> > + skb = (struct sk_buff *) ctx->arg1;
>> > + dev = bpf_fetch_ptr(&skb->dev);
>> > + if (bpf_memcmp(dev->name, devname, 2) == 0)
>>
>> I'm only starting to look at all this, so bear with me... But why do we
>> need to have it as "bpf_memcmp"? Can't we simply use it as "memcmp" and
>> have it use the right function?
>>
>> Less typing, perhaps we would need to have a:
>>
>> #define memcmp bpf_memcmp(s1, s2, n) bpf_memcmp(s1, s2, n)
>
> Argh, like this:
>
> #define memcmp(s1, s2, n) bpf_memcmp(s1, s2, n)
>
>> in bpf_helpers.h to have it work?
yes, that will work just fine.
Since it's an example I made it explicit that bpf_memcmp()
has memcmp() semantics, but little bit different:
int bpf_memcmp(void *unsafe_ptr, void *safe_ptr, int size)
meaning that one of the pointers can point anywhere and
the function will be doing probe_kernel_read() underneath
similar to bpf_fetch_*() helpers.
If it was plain memcmp() it would give a wrong impression
that vanilla memcmp() can be used.
In general the programs cannot use any library functions
outside of helpers defined in uapi/linux/bpf.h
bpf_fetch_*() helpers are also explicit in examples.
If one need to do a lot of pointer walking, then macro like
#define D(P) ((typeof(P))bpf_fetch_ptr(&P))
would be easier to use: p = D(D(skb->dev)->ifalias)
multiple pointer derefs would look more natural...
^ permalink raw reply
* Re: [PATCH v2 linux-trace 4/8] samples: bpf: simple tracing example in C
From: Arnaldo Carvalho de Melo @ 2015-01-28 20:44 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Steven Rostedt, Ingo Molnar, Namhyung Kim, Jiri Olsa,
Masami Hiramatsu, Linux API, Network Development, LKML
In-Reply-To: <CAMEtUuxRXEPOaX3rksqQKdz4THMmtLR3=uHymEKaMSNkjMBhxw@mail.gmail.com>
Em Wed, Jan 28, 2015 at 08:42:29AM -0800, Alexei Starovoitov escreveu:
> On Wed, Jan 28, 2015 at 8:25 AM, Arnaldo Carvalho de Melo
> <acme@kernel.org> wrote:
> > Em Wed, Jan 28, 2015 at 01:24:15PM -0300, Arnaldo Carvalho de Melo escreveu:
> >> Em Tue, Jan 27, 2015 at 08:06:09PM -0800, Alexei Starovoitov escreveu:
> >> > + if (bpf_memcmp(dev->name, devname, 2) == 0)
> >> I'm only starting to look at all this, so bear with me... But why do we
> >> need to have it as "bpf_memcmp"? Can't we simply use it as "memcmp" and
> >> have it use the right function?
> >> Less typing, perhaps we would need to have a:
> >> #define memcmp bpf_memcmp(s1, s2, n) bpf_memcmp(s1, s2, n)
> > Argh, like this:
> > #define memcmp(s1, s2, n) bpf_memcmp(s1, s2, n)
> >> in bpf_helpers.h to have it work?
> yes, that will work just fine.
> Since it's an example I made it explicit that bpf_memcmp()
> has memcmp() semantics, but little bit different:
> int bpf_memcmp(void *unsafe_ptr, void *safe_ptr, int size)
Not knowing about the safe/unsafe pointers (at this point in my
conceptual eBPF learning process), I would think that it would be easier
to understand if it would reuse another well known idiom:
#define memcmp_from_user(kernel, user, n) bpf_memcmp(user, kernel, n)
That would be similar to:
copy_from_user(void *to, const void __user *from, unsigned long n)
But here, again bear with me, I'm just brainstorming, as from just
looking at:
bpf_memcmp(a, b, n)
I don't reuse anything I've learned before trying to understand eBPF,
not I see any well known marker (__user) that would help me understand
that that pointer needs special treatment/belongs to a different "domain".
> meaning that one of the pointers can point anywhere and
> the function will be doing probe_kernel_read() underneath
> similar to bpf_fetch_*() helpers.
> If it was plain memcmp() it would give a wrong impression
> that vanilla memcmp() can be used.
Since that is not the case, I agree that the 'memcmp' semantic can't be
used, as the two pointers are not on the same "domain", so to say.
> In general the programs cannot use any library functions
> outside of helpers defined in uapi/linux/bpf.h
>
> bpf_fetch_*() helpers are also explicit in examples.
> If one need to do a lot of pointer walking, then macro like
> #define D(P) ((typeof(P))bpf_fetch_ptr(&P))
> would be easier to use: p = D(D(skb->dev)->ifalias)
> multiple pointer derefs would look more natural...
And if possible, i.e. if the eBPF compiler would take care of that
somehow, would indeed be preferred as it looks more natural :-)
- Arnaldo
^ permalink raw reply
* Re: [PATCH v2 linux-trace 1/8] tracing: attach eBPF programs to tracepoints and syscalls
From: Namhyung Kim @ 2015-01-29 0:46 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Steven Rostedt, Ingo Molnar, Arnaldo Carvalho de Melo, Jiri Olsa,
Masami Hiramatsu, linux-api, netdev, linux-kernel
In-Reply-To: <1422417973-10195-2-git-send-email-ast@plumgrid.com>
Hi Alexei,
On Tue, Jan 27, 2015 at 08:06:06PM -0800, Alexei Starovoitov wrote:
> User interface:
> fd = open("/sys/kernel/debug/tracing/__event__/filter")
>
> write(fd, "bpf_123")
>
> where 123 is process local FD associated with eBPF program previously loaded.
> __event__ is static tracepoint event or syscall.
> (kprobe support is in next patch)
> Once program is successfully attached to tracepoint event, the tracepoint
> will be auto-enabled
>
> close(fd)
> auto-disables tracepoint event and detaches eBPF program from it
>
> eBPF programs can call in-kernel helper functions to:
> - lookup/update/delete elements in maps
> - memcmp
> - fetch_ptr/u64/u32/u16/u8 values from unsafe address via probe_kernel_read(),
> so that eBPF program can walk any kernel data structures
>
> Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
> ---
[SNIP]
> diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
> index b03a0ea77b99..70482817231a 100644
> --- a/kernel/trace/trace_events.c
> +++ b/kernel/trace/trace_events.c
> @@ -1084,6 +1084,26 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
> return r;
> }
>
> +static int event_filter_release(struct inode *inode, struct file *filp)
> +{
> + struct ftrace_event_file *file;
> + char buf[2] = "0";
> +
> + mutex_lock(&event_mutex);
> + file = event_file_data(filp);
> + if (file) {
> + if (file->flags & TRACE_EVENT_FL_BPF) {
> + /* auto-disable the filter */
> + ftrace_event_enable_disable(file, 0);
Hmm.. what if user already enabled an event, attached a bpf filter and
then detached the filter - I'm not sure we can always auto-disable
it..
> +
> + /* if BPF filter was used, clear it on fd close */
> + apply_event_filter(file, buf);
> + }
> + }
> + mutex_unlock(&event_mutex);
> + return 0;
> +}
> +
> static ssize_t
> event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
> loff_t *ppos)
> @@ -1107,8 +1127,18 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
>
> mutex_lock(&event_mutex);
> file = event_file_data(filp);
> - if (file)
> + if (file) {
> + /*
> + * note to user space tools:
> + * write() into debugfs/tracing/events/xxx/filter file
> + * must be done with the same privilege level as open()
> + */
> err = apply_event_filter(file, buf);
> + if (!err && file->flags & TRACE_EVENT_FL_BPF)
> + /* once filter is applied, auto-enable it */
> + ftrace_event_enable_disable(file, 1);
> + }
> +
> mutex_unlock(&event_mutex);
>
> free_page((unsigned long) buf);
> @@ -1363,6 +1393,7 @@ static const struct file_operations ftrace_event_filter_fops = {
> .open = tracing_open_generic,
> .read = event_filter_read,
> .write = event_filter_write,
> + .release = event_filter_release,
> .llseek = default_llseek,
> };
>
> diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
> index ced69da0ff55..e0303b3cc9fb 100644
> --- a/kernel/trace/trace_events_filter.c
> +++ b/kernel/trace/trace_events_filter.c
> @@ -23,6 +23,9 @@
> #include <linux/mutex.h>
> #include <linux/perf_event.h>
> #include <linux/slab.h>
> +#include <linux/bpf.h>
> +#include <trace/bpf_trace.h>
> +#include <linux/filter.h>
>
> #include "trace.h"
> #include "trace_output.h"
> @@ -541,6 +544,21 @@ static int filter_match_preds_cb(enum move_type move, struct filter_pred *pred,
> return WALK_PRED_DEFAULT;
> }
>
> +unsigned int trace_filter_call_bpf(struct event_filter *filter, void *ctx)
> +{
> + unsigned int ret;
> +
> + if (in_nmi()) /* not supported yet */
> + return 0;
But doesn't this mean to auto-disable all attached events during NMI
as returning 0 will prevent the event going to ring buffer?
I think it'd be better to keep an attached event in a soft-disabled
state like event trigger and give control of enabling to users..
Thanks,
Namhyung
> +
> + rcu_read_lock();
> + ret = BPF_PROG_RUN(filter->prog, ctx);
> + rcu_read_unlock();
> +
> + return ret;
> +}
> +EXPORT_SYMBOL_GPL(trace_filter_call_bpf);
^ permalink raw reply
* [RESEND] [PATCH] block: create ioctl to discard-or-zeroout a range of blocks
From: Darrick J. Wong @ 2015-01-29 2:00 UTC (permalink / raw)
To: Jens Axboe, Christoph Hellwig
Cc: linux-kernel, linux-fsdevel, linux-api, Jeff Layton,
J. Bruce Fields
Create a new ioctl to expose the block layer's newfound ability to
issue either a zeroing discard, a WRITE SAME with a zero page, or a
regular write with the zero page. This BLKZEROOUT2 ioctl takes
{start, length, flags} as parameters. So far, the only flag available
is to enable the zeroing discard part -- without it, the call invokes
the old BLKZEROOUT behavior. start and length have the same meaning
as in BLKZEROOUT.
Furthermore, because BLKZEROOUT2 issues commands directly to the
storage device, we must invalidate the page cache (as a regular
O_DIRECT write would do) to avoid returning stale cache contents at a
later time.
This patch depends on "block: Add discard flag to
blkdev_issue_zeroout() function" in Jens' for-3.20/core branch.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
block/ioctl.c | 45 ++++++++++++++++++++++++++++++++++++++-------
include/uapi/linux/fs.h | 7 +++++++
2 files changed, 45 insertions(+), 7 deletions(-)
diff --git a/block/ioctl.c b/block/ioctl.c
index 7d8befd..ff623d5 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -186,19 +186,39 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
}
static int blk_ioctl_zeroout(struct block_device *bdev, uint64_t start,
- uint64_t len)
+ uint64_t len, uint32_t flags)
{
+ int ret;
+ struct address_space *mapping;
+ uint64_t end = start + len - 1;
+
+ if (flags & ~BLKZEROOUT2_DISCARD_OK)
+ return -EINVAL;
if (start & 511)
return -EINVAL;
if (len & 511)
return -EINVAL;
- start >>= 9;
- len >>= 9;
-
- if (start + len > (i_size_read(bdev->bd_inode) >> 9))
+ if (end >= i_size_read(bdev->bd_inode))
return -EINVAL;
- return blkdev_issue_zeroout(bdev, start, len, GFP_KERNEL, false);
+ /* Invalidate the page cache, including dirty pages */
+ mapping = bdev->bd_inode->i_mapping;
+ truncate_inode_pages_range(mapping, start, end);
+
+ ret = blkdev_issue_zeroout(bdev, start >> 9, len >> 9, GFP_KERNEL,
+ flags & BLKZEROOUT2_DISCARD_OK);
+ if (ret)
+ goto out;
+
+ /*
+ * Invalidate again; if someone wandered in and dirtied a page,
+ * the caller will be given -EBUSY.
+ */
+ ret = invalidate_inode_pages2_range(mapping,
+ start >> PAGE_CACHE_SHIFT,
+ end >> PAGE_CACHE_SHIFT);
+out:
+ return ret;
}
static int put_ushort(unsigned long arg, unsigned short val)
@@ -326,7 +346,18 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
if (copy_from_user(range, (void __user *)arg, sizeof(range)))
return -EFAULT;
- return blk_ioctl_zeroout(bdev, range[0], range[1]);
+ return blk_ioctl_zeroout(bdev, range[0], range[1], 0);
+ }
+ case BLKZEROOUT2: {
+ struct blkzeroout2 p;
+
+ if (!(mode & FMODE_WRITE))
+ return -EBADF;
+
+ if (copy_from_user(&p, (void __user *)arg, sizeof(p)))
+ return -EFAULT;
+
+ return blk_ioctl_zeroout(bdev, p.start, p.length, p.flags);
}
case HDIO_GETGEO: {
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 3735fa0..54d24ea 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -150,6 +150,13 @@ struct inodes_stat_t {
#define BLKSECDISCARD _IO(0x12,125)
#define BLKROTATIONAL _IO(0x12,126)
#define BLKZEROOUT _IO(0x12,127)
+struct blkzeroout2 {
+ __u64 start;
+ __u64 length;
+ __u32 flags;
+};
+#define BLKZEROOUT2_DISCARD_OK 1
+#define BLKZEROOUT2 _IOR(0x12, 127, struct blkzeroout2)
#define BMAP_IOCTL 1 /* obsolete - kept for compatibility */
#define FIBMAP _IO(0x00,1) /* bmap access */
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox