* [PATCH v7 tip 5/8] samples: bpf: simple non-portable kprobe filter example
From: Alexei Starovoitov @ 2015-03-16 21:49 UTC (permalink / raw)
To: Ingo Molnar
Cc: Steven Rostedt, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
Masami Hiramatsu, David S. Miller, Daniel Borkmann,
Peter Zijlstra, linux-api-u79uwXL29TY76Z2rM5mHXA,
netdev-u79uwXL29TY76Z2rM5mHXA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1426542584-9406-1-git-send-email-ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>
tracex1_kern.c - C program compiled into BPF.
It attaches to kprobe:netif_receive_skb
When skb->dev->name == "lo", it prints sample debug message into trace_pipe
via bpf_trace_printk() helper function.
tracex1_user.c - corresponding user space component that:
- loads bpf program via bpf() syscall
- opens kprobes:netif_receive_skb event via perf_event_open() syscall
- attaches the program to event via ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
- prints from trace_pipe
Note, this bpf program is completely non-portable. It must be recompiled
with current kernel headers. kprobe is not a stable ABI and bpf+kprobe scripts
may stop working any time.
bpf verifier will detect that it's using bpf_trace_printk() and kernel will
print warning banner:
** trace_printk() being used. Allocating extra memory. **
** **
** This means that this is a DEBUG kernel and it is **
** unsafe for production use. **
bpf_trace_printk() should be used for debugging of bpf program only.
Usage:
$ sudo tracex1
ping-19826 [000] d.s2 63103.382648: : skb ffff880466b1ca00 len 84
ping-19826 [000] d.s2 63103.382684: : skb ffff880466b1d300 len 84
ping-19826 [000] d.s2 63104.382533: : skb ffff880466b1ca00 len 84
ping-19826 [000] d.s2 63104.382594: : skb ffff880466b1d300 len 84
Signed-off-by: Alexei Starovoitov <ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>
---
samples/bpf/Makefile | 4 ++
samples/bpf/bpf_helpers.h | 6 +++
samples/bpf/bpf_load.c | 125 ++++++++++++++++++++++++++++++++++++++++---
samples/bpf/bpf_load.h | 3 ++
samples/bpf/libbpf.c | 14 ++++-
samples/bpf/libbpf.h | 5 +-
samples/bpf/sock_example.c | 2 +-
samples/bpf/test_verifier.c | 2 +-
samples/bpf/tracex1_kern.c | 50 +++++++++++++++++
samples/bpf/tracex1_user.c | 25 +++++++++
10 files changed, 224 insertions(+), 12 deletions(-)
create mode 100644 samples/bpf/tracex1_kern.c
create mode 100644 samples/bpf/tracex1_user.c
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index b5b3600dcdf5..51f6f01e5a3a 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -6,23 +6,27 @@ hostprogs-y := test_verifier test_maps
hostprogs-y += sock_example
hostprogs-y += sockex1
hostprogs-y += sockex2
+hostprogs-y += tracex1
test_verifier-objs := test_verifier.o libbpf.o
test_maps-objs := test_maps.o libbpf.o
sock_example-objs := sock_example.o libbpf.o
sockex1-objs := bpf_load.o libbpf.o sockex1_user.o
sockex2-objs := bpf_load.o libbpf.o sockex2_user.o
+tracex1-objs := bpf_load.o libbpf.o tracex1_user.o
# Tell kbuild to always build the programs
always := $(hostprogs-y)
always += sockex1_kern.o
always += sockex2_kern.o
+always += tracex1_kern.o
HOSTCFLAGS += -I$(objtree)/usr/include
HOSTCFLAGS_bpf_load.o += -I$(objtree)/usr/include -Wno-unused-variable
HOSTLOADLIBES_sockex1 += -lelf
HOSTLOADLIBES_sockex2 += -lelf
+HOSTLOADLIBES_tracex1 += -lelf
# point this to your LLVM backend with bpf support
LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h
index ca0333146006..1c872bcf5a80 100644
--- a/samples/bpf/bpf_helpers.h
+++ b/samples/bpf/bpf_helpers.h
@@ -15,6 +15,12 @@ static int (*bpf_map_update_elem)(void *map, void *key, void *value,
(void *) BPF_FUNC_map_update_elem;
static int (*bpf_map_delete_elem)(void *map, void *key) =
(void *) BPF_FUNC_map_delete_elem;
+static int (*bpf_probe_read)(void *dst, int size, void *unsafe_ptr) =
+ (void *) BPF_FUNC_probe_read;
+static unsigned long long (*bpf_ktime_get_ns)(void) =
+ (void *) BPF_FUNC_ktime_get_ns;
+static int (*bpf_trace_printk)(const char *fmt, int fmt_size, ...) =
+ (void *) BPF_FUNC_trace_printk;
/* llvm builtin functions that eBPF C program may use to
* emit BPF_LD_ABS and BPF_LD_IND instructions
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index 1831d236382b..95c106e4bcdb 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -8,29 +8,70 @@
#include <unistd.h>
#include <string.h>
#include <stdbool.h>
+#include <stdlib.h>
#include <linux/bpf.h>
#include <linux/filter.h>
+#include <linux/perf_event.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <poll.h>
#include "libbpf.h"
#include "bpf_helpers.h"
#include "bpf_load.h"
+#define DEBUGFS "/sys/kernel/debug/tracing/"
+
static char license[128];
+static int kern_version;
static bool processed_sec[128];
int map_fd[MAX_MAPS];
int prog_fd[MAX_PROGS];
+int event_fd[MAX_PROGS];
int prog_cnt;
static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
{
- int fd;
bool is_socket = strncmp(event, "socket", 6) == 0;
-
- if (!is_socket)
- /* tracing events tbd */
+ bool is_kprobe = strncmp(event, "kprobe/", 7) == 0;
+ bool is_kretprobe = strncmp(event, "kretprobe/", 10) == 0;
+ enum bpf_prog_type prog_type;
+ char buf[256];
+ int fd, efd, err, id;
+ struct perf_event_attr attr = {};
+
+ attr.type = PERF_TYPE_TRACEPOINT;
+ attr.sample_type = PERF_SAMPLE_RAW;
+ attr.sample_period = 1;
+ attr.wakeup_events = 1;
+
+ if (is_socket) {
+ prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
+ } else if (is_kprobe || is_kretprobe) {
+ prog_type = BPF_PROG_TYPE_KPROBE;
+ } else {
+ printf("Unknown event '%s'\n", event);
return -1;
+ }
+
+ if (is_kprobe || is_kretprobe) {
+ if (is_kprobe)
+ event += 7;
+ else
+ event += 10;
+
+ snprintf(buf, sizeof(buf),
+ "echo '%c:%s %s' >> /sys/kernel/debug/tracing/kprobe_events",
+ is_kprobe ? 'p' : 'r', event, event);
+ err = system(buf);
+ if (err < 0) {
+ printf("failed to create kprobe '%s' error '%s'\n",
+ event, strerror(errno));
+ return -1;
+ }
+ }
- fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER,
- prog, size, license);
+ fd = bpf_prog_load(prog_type, prog, size, license, kern_version);
if (fd < 0) {
printf("bpf_prog_load() err=%d\n%s", errno, bpf_log_buf);
@@ -39,6 +80,41 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
prog_fd[prog_cnt++] = fd;
+ if (is_socket)
+ return 0;
+
+ strcpy(buf, DEBUGFS);
+ strcat(buf, "events/kprobes/");
+ strcat(buf, event);
+ strcat(buf, "/id");
+
+ efd = open(buf, O_RDONLY, 0);
+ if (efd < 0) {
+ printf("failed to open event %s\n", event);
+ return -1;
+ }
+
+ err = read(efd, buf, sizeof(buf));
+ if (err < 0 || err >= sizeof(buf)) {
+ printf("read from '%s' failed '%s'\n", event, strerror(errno));
+ return -1;
+ }
+
+ close(efd);
+
+ buf[err] = 0;
+ id = atoi(buf);
+ attr.config = id;
+
+ efd = perf_event_open(&attr, -1/*pid*/, 0/*cpu*/, -1/*group_fd*/, 0);
+ if (efd < 0) {
+ printf("event %d fd %d err %s\n", id, efd, strerror(errno));
+ return -1;
+ }
+ event_fd[prog_cnt - 1] = efd;
+ ioctl(efd, PERF_EVENT_IOC_ENABLE, 0);
+ ioctl(efd, PERF_EVENT_IOC_SET_BPF, fd);
+
return 0;
}
@@ -135,6 +211,9 @@ int load_bpf_file(char *path)
if (gelf_getehdr(elf, &ehdr) != &ehdr)
return 1;
+ /* clear all kprobes */
+ i = system("echo \"\" > /sys/kernel/debug/tracing/kprobe_events");
+
/* scan over all elf sections to get license and map info */
for (i = 1; i < ehdr.e_shnum; i++) {
@@ -149,6 +228,14 @@ int load_bpf_file(char *path)
if (strcmp(shname, "license") == 0) {
processed_sec[i] = true;
memcpy(license, data->d_buf, data->d_size);
+ } else if (strcmp(shname, "version") == 0) {
+ processed_sec[i] = true;
+ if (data->d_size != sizeof(int)) {
+ printf("invalid size of version section %zd\n",
+ data->d_size);
+ return 1;
+ }
+ memcpy(&kern_version, data->d_buf, sizeof(int));
} else if (strcmp(shname, "maps") == 0) {
processed_sec[i] = true;
if (load_maps(data->d_buf, data->d_size))
@@ -178,7 +265,8 @@ int load_bpf_file(char *path)
if (parse_relo_and_apply(data, symbols, &shdr, insns))
continue;
- if (memcmp(shname_prog, "events/", 7) == 0 ||
+ if (memcmp(shname_prog, "kprobe/", 7) == 0 ||
+ memcmp(shname_prog, "kretprobe/", 10) == 0 ||
memcmp(shname_prog, "socket", 6) == 0)
load_and_attach(shname_prog, insns, data_prog->d_size);
}
@@ -193,7 +281,8 @@ int load_bpf_file(char *path)
if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
continue;
- if (memcmp(shname, "events/", 7) == 0 ||
+ if (memcmp(shname, "kprobe/", 7) == 0 ||
+ memcmp(shname, "kretprobe/", 10) == 0 ||
memcmp(shname, "socket", 6) == 0)
load_and_attach(shname, data->d_buf, data->d_size);
}
@@ -201,3 +290,23 @@ int load_bpf_file(char *path)
close(fd);
return 0;
}
+
+void read_trace_pipe(void)
+{
+ int trace_fd;
+
+ trace_fd = open(DEBUGFS "trace_pipe", O_RDONLY, 0);
+ if (trace_fd < 0)
+ return;
+
+ while (1) {
+ static char buf[4096];
+ ssize_t sz;
+
+ sz = read(trace_fd, buf, sizeof(buf));
+ if (sz) {
+ buf[sz] = 0;
+ puts(buf);
+ }
+ }
+}
diff --git a/samples/bpf/bpf_load.h b/samples/bpf/bpf_load.h
index 27789a34f5e6..cbd7c2b532b9 100644
--- a/samples/bpf/bpf_load.h
+++ b/samples/bpf/bpf_load.h
@@ -6,6 +6,7 @@
extern int map_fd[MAX_MAPS];
extern int prog_fd[MAX_PROGS];
+extern int event_fd[MAX_PROGS];
/* parses elf file compiled by llvm .c->.o
* . parses 'maps' section and creates maps via BPF syscall
@@ -21,4 +22,6 @@ extern int prog_fd[MAX_PROGS];
*/
int load_bpf_file(char *path);
+void read_trace_pipe(void);
+
#endif
diff --git a/samples/bpf/libbpf.c b/samples/bpf/libbpf.c
index 46d50b7ddf79..7e1efa7e2ed7 100644
--- a/samples/bpf/libbpf.c
+++ b/samples/bpf/libbpf.c
@@ -81,7 +81,7 @@ char bpf_log_buf[LOG_BUF_SIZE];
int bpf_prog_load(enum bpf_prog_type prog_type,
const struct bpf_insn *insns, int prog_len,
- const char *license)
+ const char *license, int kern_version)
{
union bpf_attr attr = {
.prog_type = prog_type,
@@ -93,6 +93,11 @@ int bpf_prog_load(enum bpf_prog_type prog_type,
.log_level = 1,
};
+ /* assign one field outside of struct init to make sure any
+ * padding is zero initialized
+ */
+ attr.kern_version = kern_version;
+
bpf_log_buf[0] = 0;
return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
@@ -121,3 +126,10 @@ int open_raw_sock(const char *name)
return sock;
}
+
+int perf_event_open(struct perf_event_attr *attr, int pid, int cpu,
+ int group_fd, unsigned long flags)
+{
+ return syscall(__NR_perf_event_open, attr, pid, cpu,
+ group_fd, flags);
+}
diff --git a/samples/bpf/libbpf.h b/samples/bpf/libbpf.h
index 58c5fe1bdba1..ac7b09672b46 100644
--- a/samples/bpf/libbpf.h
+++ b/samples/bpf/libbpf.h
@@ -13,7 +13,7 @@ int bpf_get_next_key(int fd, void *key, void *next_key);
int bpf_prog_load(enum bpf_prog_type prog_type,
const struct bpf_insn *insns, int insn_len,
- const char *license);
+ const char *license, int kern_version);
#define LOG_BUF_SIZE 65536
extern char bpf_log_buf[LOG_BUF_SIZE];
@@ -182,4 +182,7 @@ extern char bpf_log_buf[LOG_BUF_SIZE];
/* create RAW socket and bind to interface 'name' */
int open_raw_sock(const char *name);
+struct perf_event_attr;
+int perf_event_open(struct perf_event_attr *attr, int pid, int cpu,
+ int group_fd, unsigned long flags);
#endif
diff --git a/samples/bpf/sock_example.c b/samples/bpf/sock_example.c
index c8ad0404416f..a0ce251c5390 100644
--- a/samples/bpf/sock_example.c
+++ b/samples/bpf/sock_example.c
@@ -56,7 +56,7 @@ static int test_sock(void)
};
prog_fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, prog, sizeof(prog),
- "GPL");
+ "GPL", 0);
if (prog_fd < 0) {
printf("failed to load prog '%s'\n", strerror(errno));
goto cleanup;
diff --git a/samples/bpf/test_verifier.c b/samples/bpf/test_verifier.c
index b96175e90363..740ce97cda5e 100644
--- a/samples/bpf/test_verifier.c
+++ b/samples/bpf/test_verifier.c
@@ -689,7 +689,7 @@ static int test(void)
prog_fd = bpf_prog_load(BPF_PROG_TYPE_UNSPEC, prog,
prog_len * sizeof(struct bpf_insn),
- "GPL");
+ "GPL", 0);
if (tests[i].result == ACCEPT) {
if (prog_fd < 0) {
diff --git a/samples/bpf/tracex1_kern.c b/samples/bpf/tracex1_kern.c
new file mode 100644
index 000000000000..42176fce4847
--- /dev/null
+++ b/samples/bpf/tracex1_kern.c
@@ -0,0 +1,50 @@
+/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <uapi/linux/bpf.h>
+#include <linux/version.h>
+#include "bpf_helpers.h"
+
+#define _(P) ({typeof(P) val = 0; bpf_probe_read(&val, sizeof(val), &P); val;})
+
+/* kprobe is NOT a stable ABI
+ * kernel functions can be removed, renamed or completely change semantics.
+ * Number of argumnets and their posistions can change, etc.
+ * This bpf+kprobe example can stop working any time.
+ */
+SEC("kprobe/__netif_receive_skb_core")
+int bpf_prog1(struct pt_regs *ctx)
+{
+ /* attaches to kprobe netif_receive_skb,
+ * looks for packets on loobpack device and prints them
+ */
+ char devname[IFNAMSIZ] = {};
+ struct net_device *dev;
+ struct sk_buff *skb;
+ int len;
+
+ /* non-portable! works for the given kernel only */
+ skb = (struct sk_buff *) ctx->di;
+
+ dev = _(skb->dev);
+
+ len = _(skb->len);
+
+ bpf_probe_read(devname, sizeof(devname), dev->name);
+
+ if (devname[0] == 'l' && devname[1] == 'o') {
+ char fmt[] = "skb %p len %d\n";
+ /* using bpf_trace_printk() for DEBUG ONLY */
+ bpf_trace_printk(fmt, sizeof(fmt), skb, len);
+ }
+
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex1_user.c b/samples/bpf/tracex1_user.c
new file mode 100644
index 000000000000..31a48183beea
--- /dev/null
+++ b/samples/bpf/tracex1_user.c
@@ -0,0 +1,25 @@
+#include <stdio.h>
+#include <linux/bpf.h>
+#include <unistd.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+int main(int ac, char **argv)
+{
+ FILE *f;
+ char filename[256];
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+
+ f = popen("taskset 1 ping -c5 localhost", "r");
+ (void) f;
+
+ read_trace_pipe();
+
+ return 0;
+}
--
1.7.9.5
^ permalink raw reply related
* [PATCH v7 tip 6/8] samples: bpf: counting example for kfree_skb and write syscall
From: Alexei Starovoitov @ 2015-03-16 21:49 UTC (permalink / raw)
To: Ingo Molnar
Cc: Steven Rostedt, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
Masami Hiramatsu, David S. Miller, Daniel Borkmann,
Peter Zijlstra, linux-api, netdev, linux-kernel
In-Reply-To: <1426542584-9406-1-git-send-email-ast@plumgrid.com>
this example has two probes in one C file that attach to different kprove events
and use two different maps.
1st probe is x64 specific equivalent of dropmon. It attaches to kfree_skb,
retrevies 'ip' address of kfree_skb() caller and counts number of packet drops
at that 'ip' address. User space prints 'location - count' map every second.
2nd probe attaches to kprobe:sys_write and computes a histogram of different
write sizes
Usage:
$ sudo tracex2
location 0xffffffff81695995 count 1
location 0xffffffff816d0da9 count 2
location 0xffffffff81695995 count 2
location 0xffffffff816d0da9 count 2
location 0xffffffff81695995 count 3
location 0xffffffff816d0da9 count 2
557145+0 records in
557145+0 records out
285258240 bytes (285 MB) copied, 1.02379 s, 279 MB/s
syscall write() stats
byte_size : count distribution
1 -> 1 : 3 | |
2 -> 3 : 0 | |
4 -> 7 : 0 | |
8 -> 15 : 0 | |
16 -> 31 : 2 | |
32 -> 63 : 3 | |
64 -> 127 : 1 | |
128 -> 255 : 1 | |
256 -> 511 : 0 | |
512 -> 1023 : 1118968 |************************************* |
Ctrl-C at any time. Kernel will auto cleanup maps and programs
$ addr2line -ape ./bld_x64/vmlinux 0xffffffff81695995 0xffffffff816d0da9
0xffffffff81695995: ./bld_x64/../net/ipv4/icmp.c:1038
0xffffffff816d0da9: ./bld_x64/../net/unix/af_unix.c:1231
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
samples/bpf/Makefile | 4 ++
samples/bpf/tracex2_kern.c | 86 +++++++++++++++++++++++++++++++++++++++
samples/bpf/tracex2_user.c | 95 ++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 185 insertions(+)
create mode 100644 samples/bpf/tracex2_kern.c
create mode 100644 samples/bpf/tracex2_user.c
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 51f6f01e5a3a..6dd272143733 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -7,6 +7,7 @@ hostprogs-y += sock_example
hostprogs-y += sockex1
hostprogs-y += sockex2
hostprogs-y += tracex1
+hostprogs-y += tracex2
test_verifier-objs := test_verifier.o libbpf.o
test_maps-objs := test_maps.o libbpf.o
@@ -14,12 +15,14 @@ sock_example-objs := sock_example.o libbpf.o
sockex1-objs := bpf_load.o libbpf.o sockex1_user.o
sockex2-objs := bpf_load.o libbpf.o sockex2_user.o
tracex1-objs := bpf_load.o libbpf.o tracex1_user.o
+tracex2-objs := bpf_load.o libbpf.o tracex2_user.o
# Tell kbuild to always build the programs
always := $(hostprogs-y)
always += sockex1_kern.o
always += sockex2_kern.o
always += tracex1_kern.o
+always += tracex2_kern.o
HOSTCFLAGS += -I$(objtree)/usr/include
@@ -27,6 +30,7 @@ HOSTCFLAGS_bpf_load.o += -I$(objtree)/usr/include -Wno-unused-variable
HOSTLOADLIBES_sockex1 += -lelf
HOSTLOADLIBES_sockex2 += -lelf
HOSTLOADLIBES_tracex1 += -lelf
+HOSTLOADLIBES_tracex2 += -lelf
# point this to your LLVM backend with bpf support
LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
diff --git a/samples/bpf/tracex2_kern.c b/samples/bpf/tracex2_kern.c
new file mode 100644
index 000000000000..334348335795
--- /dev/null
+++ b/samples/bpf/tracex2_kern.c
@@ -0,0 +1,86 @@
+/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/version.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") my_map = {
+ .type = BPF_MAP_TYPE_HASH,
+ .key_size = sizeof(long),
+ .value_size = sizeof(long),
+ .max_entries = 1024,
+};
+
+/* kprobe is NOT a stable ABI
+ * This bpf+kprobe example can stop working any time.
+ */
+SEC("kprobe/kfree_skb")
+int bpf_prog2(struct pt_regs *ctx)
+{
+ long loc = 0;
+ long init_val = 1;
+ long *value;
+
+ /* x64 specific: read ip of kfree_skb caller.
+ * non-portable version of __builtin_return_address(0)
+ */
+ bpf_probe_read(&loc, sizeof(loc), (void *)ctx->sp);
+
+ value = bpf_map_lookup_elem(&my_map, &loc);
+ if (value)
+ *value += 1;
+ else
+ bpf_map_update_elem(&my_map, &loc, &init_val, BPF_ANY);
+ return 0;
+}
+
+static unsigned int log2(unsigned int v)
+{
+ unsigned int r;
+ unsigned int shift;
+
+ r = (v > 0xFFFF) << 4; v >>= r;
+ shift = (v > 0xFF) << 3; v >>= shift; r |= shift;
+ shift = (v > 0xF) << 2; v >>= shift; r |= shift;
+ shift = (v > 0x3) << 1; v >>= shift; r |= shift;
+ r |= (v >> 1);
+ return r;
+}
+
+static unsigned int log2l(unsigned long v)
+{
+ unsigned int hi = v >> 32;
+ if (hi)
+ return log2(hi) + 32;
+ else
+ return log2(v);
+}
+
+struct bpf_map_def SEC("maps") my_hist_map = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(long),
+ .max_entries = 64,
+};
+
+SEC("kprobe/sys_write")
+int bpf_prog3(struct pt_regs *ctx)
+{
+ long write_size = ctx->dx; /* arg3 */
+ long init_val = 1;
+ long *value;
+ u32 index = log2l(write_size);
+
+ value = bpf_map_lookup_elem(&my_hist_map, &index);
+ if (value)
+ __sync_fetch_and_add(value, 1);
+ return 0;
+}
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex2_user.c b/samples/bpf/tracex2_user.c
new file mode 100644
index 000000000000..91b8d0896fbb
--- /dev/null
+++ b/samples/bpf/tracex2_user.c
@@ -0,0 +1,95 @@
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <linux/bpf.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+#define MAX_INDEX 64
+#define MAX_STARS 38
+
+static void stars(char *str, long val, long max, int width)
+{
+ int i;
+
+ for (i = 0; i < (width * val / max) - 1 && i < width - 1; i++)
+ str[i] = '*';
+ if (val > max)
+ str[i - 1] = '+';
+ str[i] = '\0';
+}
+
+static void print_hist(int fd)
+{
+ int key;
+ long value;
+ long data[MAX_INDEX] = {};
+ char starstr[MAX_STARS];
+ int i;
+ int max_ind = -1;
+ long max_value = 0;
+
+ for (key = 0; key < MAX_INDEX; key++) {
+ bpf_lookup_elem(fd, &key, &value);
+ data[key] = value;
+ if (value && key > max_ind)
+ max_ind = key;
+ if (value > max_value)
+ max_value = value;
+ }
+
+ printf(" syscall write() stats\n");
+ printf(" byte_size : count distribution\n");
+ for (i = 1; i <= max_ind + 1; i++) {
+ stars(starstr, data[i - 1], max_value, MAX_STARS);
+ printf("%8ld -> %-8ld : %-8ld |%-*s|\n",
+ (1l << i) >> 1, (1l << i) - 1, data[i - 1],
+ MAX_STARS, starstr);
+ }
+}
+static void int_exit(int sig)
+{
+ print_hist(map_fd[1]);
+ exit(0);
+}
+
+int main(int ac, char **argv)
+{
+ char filename[256];
+ long key, next_key, value;
+ FILE *f;
+ int i;
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+ signal(SIGINT, int_exit);
+
+ /* start 'ping' in the background to have some kfree_skb events */
+ f = popen("ping -c5 localhost", "r");
+ (void) f;
+
+ /* start 'dd' in the background to have plenty of 'write' syscalls */
+ f = popen("dd if=/dev/zero of=/dev/null count=5000000", "r");
+ (void) f;
+
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+
+ for (i = 0; i < 5; i++) {
+ key = 0;
+ while (bpf_get_next_key(map_fd[0], &key, &next_key) == 0) {
+ bpf_lookup_elem(map_fd[0], &next_key, &value);
+ printf("location 0x%lx count %ld\n", next_key, value);
+ key = next_key;
+ }
+ if (key)
+ printf("\n");
+ sleep(1);
+ }
+ print_hist(map_fd[1]);
+
+ return 0;
+}
--
1.7.9.5
^ permalink raw reply related
* [PATCH v7 tip 7/8] samples: bpf: IO latency analysis (iosnoop/heatmap)
From: Alexei Starovoitov @ 2015-03-16 21:49 UTC (permalink / raw)
To: Ingo Molnar
Cc: Steven Rostedt, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
Masami Hiramatsu, David S. Miller, Daniel Borkmann,
Peter Zijlstra, linux-api, netdev, linux-kernel
In-Reply-To: <1426542584-9406-1-git-send-email-ast@plumgrid.com>
BPF C program attaches to blk_mq_start_request/blk_update_request kprobe events
to calculate IO latency.
For every completed block IO event it computes the time delta in nsec
and records in a histogram map: map[log10(delta)*10]++
User space reads this histogram map every 2 seconds and prints it as a 'heatmap'
using gray shades of text terminal. Black spaces have many events and white
spaces have very few events. Left most space is the smallest latency, right most
space is the largest latency in the range.
Usage:
$ sudo ./tracex3
and do 'sudo dd if=/dev/sda of=/dev/null' in other terminal.
Observe IO latencies and how different activity (like 'make kernel') affects it.
Similar experiments can be done for network transmit latencies, syscalls, etc
'-t' flag prints the heatmap using normal ascii characters:
$ sudo ./tracex3 -t
heatmap of IO latency
# - many events with this latency
- few events
|1us |10us |100us |1ms |10ms |100ms |1s |10s
*ooo. *O.#. # 221
. *# . # 125
.. .o#*.. # 55
. . . . .#O # 37
.# # 175
.#*. # 37
# # 199
. . *#*. # 55
*#..* # 42
# # 266
...***Oo#*OO**o#* . # 629
# # 271
. .#o* o.*o* # 221
. . o* *#O.. # 50
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
samples/bpf/Makefile | 4 ++
samples/bpf/tracex3_kern.c | 89 ++++++++++++++++++++++++++
samples/bpf/tracex3_user.c | 150 ++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 243 insertions(+)
create mode 100644 samples/bpf/tracex3_kern.c
create mode 100644 samples/bpf/tracex3_user.c
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 6dd272143733..dcd850546d52 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -8,6 +8,7 @@ hostprogs-y += sockex1
hostprogs-y += sockex2
hostprogs-y += tracex1
hostprogs-y += tracex2
+hostprogs-y += tracex3
test_verifier-objs := test_verifier.o libbpf.o
test_maps-objs := test_maps.o libbpf.o
@@ -16,6 +17,7 @@ sockex1-objs := bpf_load.o libbpf.o sockex1_user.o
sockex2-objs := bpf_load.o libbpf.o sockex2_user.o
tracex1-objs := bpf_load.o libbpf.o tracex1_user.o
tracex2-objs := bpf_load.o libbpf.o tracex2_user.o
+tracex3-objs := bpf_load.o libbpf.o tracex3_user.o
# Tell kbuild to always build the programs
always := $(hostprogs-y)
@@ -23,6 +25,7 @@ always += sockex1_kern.o
always += sockex2_kern.o
always += tracex1_kern.o
always += tracex2_kern.o
+always += tracex3_kern.o
HOSTCFLAGS += -I$(objtree)/usr/include
@@ -31,6 +34,7 @@ HOSTLOADLIBES_sockex1 += -lelf
HOSTLOADLIBES_sockex2 += -lelf
HOSTLOADLIBES_tracex1 += -lelf
HOSTLOADLIBES_tracex2 += -lelf
+HOSTLOADLIBES_tracex3 += -lelf
# point this to your LLVM backend with bpf support
LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
diff --git a/samples/bpf/tracex3_kern.c b/samples/bpf/tracex3_kern.c
new file mode 100644
index 000000000000..b9d66766f07a
--- /dev/null
+++ b/samples/bpf/tracex3_kern.c
@@ -0,0 +1,89 @@
+/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/version.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") my_map = {
+ .type = BPF_MAP_TYPE_HASH,
+ .key_size = sizeof(long),
+ .value_size = sizeof(u64),
+ .max_entries = 4096,
+};
+
+/* kprobe is NOT a stable ABI
+ * This bpf+kprobe example can stop working any time.
+ */
+SEC("kprobe/blk_mq_start_request")
+int bpf_prog1(struct pt_regs *ctx)
+{
+ long rq = ctx->di;
+ u64 val = bpf_ktime_get_ns();
+
+ bpf_map_update_elem(&my_map, &rq, &val, BPF_ANY);
+ return 0;
+}
+
+static unsigned int log2l(unsigned long long n)
+{
+#define S(k) if (n >= (1ull << k)) { i += k; n >>= k; }
+ int i = -(n == 0);
+ S(32); S(16); S(8); S(4); S(2); S(1);
+ return i;
+#undef S
+}
+
+#define SLOTS 100
+
+struct bpf_map_def SEC("maps") lat_map = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(u64),
+ .max_entries = SLOTS,
+};
+
+SEC("kprobe/blk_update_request")
+int bpf_prog2(struct pt_regs *ctx)
+{
+ long rq = ctx->di;
+ u64 *value, l, base;
+ u32 index;
+
+ value = bpf_map_lookup_elem(&my_map, &rq);
+ if (!value)
+ return 0;
+
+ u64 cur_time = bpf_ktime_get_ns();
+ u64 delta = cur_time - *value;
+
+ bpf_map_delete_elem(&my_map, &rq);
+
+ /* the lines below are computing index = log10(delta)*10
+ * using integer arithmetic
+ * index = 29 ~ 1 usec
+ * index = 59 ~ 1 msec
+ * index = 89 ~ 1 sec
+ * index = 99 ~ 10sec or more
+ * log10(x)*10 = log2(x)*10/log2(10) = log2(x)*3
+ */
+ l = log2l(delta);
+ base = 1ll << l;
+ index = (l * 64 + (delta - base) * 64 / base) * 3 / 64;
+
+ if (index >= SLOTS)
+ index = SLOTS - 1;
+
+ value = bpf_map_lookup_elem(&lat_map, &index);
+ if (value)
+ __sync_fetch_and_add((long *)value, 1);
+
+ return 0;
+}
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex3_user.c b/samples/bpf/tracex3_user.c
new file mode 100644
index 000000000000..0aaa933ab938
--- /dev/null
+++ b/samples/bpf/tracex3_user.c
@@ -0,0 +1,150 @@
+/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <unistd.h>
+#include <stdbool.h>
+#include <string.h>
+#include <linux/bpf.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
+
+#define SLOTS 100
+
+static void clear_stats(int fd)
+{
+ __u32 key;
+ __u64 value = 0;
+
+ for (key = 0; key < SLOTS; key++)
+ bpf_update_elem(fd, &key, &value, BPF_ANY);
+}
+
+const char *color[] = {
+ "\033[48;5;255m",
+ "\033[48;5;252m",
+ "\033[48;5;250m",
+ "\033[48;5;248m",
+ "\033[48;5;246m",
+ "\033[48;5;244m",
+ "\033[48;5;242m",
+ "\033[48;5;240m",
+ "\033[48;5;238m",
+ "\033[48;5;236m",
+ "\033[48;5;234m",
+ "\033[48;5;232m",
+};
+const int num_colors = ARRAY_SIZE(color);
+
+const char nocolor[] = "\033[00m";
+
+const char *sym[] = {
+ " ",
+ " ",
+ ".",
+ ".",
+ "*",
+ "*",
+ "o",
+ "o",
+ "O",
+ "O",
+ "#",
+ "#",
+};
+
+bool full_range = false;
+bool text_only = false;
+
+static void print_banner(void)
+{
+ if (full_range)
+ printf("|1ns |10ns |100ns |1us |10us |100us"
+ " |1ms |10ms |100ms |1s |10s\n");
+ else
+ printf("|1us |10us |100us |1ms |10ms "
+ "|100ms |1s |10s\n");
+}
+
+static void print_hist(int fd)
+{
+ __u32 key;
+ __u64 value;
+ __u64 cnt[SLOTS];
+ __u64 max_cnt = 0;
+ __u64 total_events = 0;
+
+ for (key = 0; key < SLOTS; key++) {
+ value = 0;
+ bpf_lookup_elem(fd, &key, &value);
+ cnt[key] = value;
+ total_events += value;
+ if (value > max_cnt)
+ max_cnt = value;
+ }
+ clear_stats(fd);
+ for (key = full_range ? 0 : 29; key < SLOTS; key++) {
+ int c = num_colors * cnt[key] / (max_cnt + 1);
+
+ if (text_only)
+ printf("%s", sym[c]);
+ else
+ printf("%s %s", color[c], nocolor);
+ }
+ printf(" # %lld\n", total_events);
+}
+
+int main(int ac, char **argv)
+{
+ char filename[256];
+ int i;
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+
+ for (i = 1; i < ac; i++) {
+ if (strcmp(argv[i], "-a") == 0) {
+ full_range = true;
+ } else if (strcmp(argv[i], "-t") == 0) {
+ text_only = true;
+ } else if (strcmp(argv[i], "-h") == 0) {
+ printf("Usage:\n"
+ " -a display wider latency range\n"
+ " -t text only\n");
+ return 1;
+ }
+ }
+
+ printf(" heatmap of IO latency\n");
+ if (text_only)
+ printf(" %s", sym[num_colors - 1]);
+ else
+ printf(" %s %s", color[num_colors - 1], nocolor);
+ printf(" - many events with this latency\n");
+
+ if (text_only)
+ printf(" %s", sym[0]);
+ else
+ printf(" %s %s", color[0], nocolor);
+ printf(" - few events\n");
+
+ for (i = 0; ; i++) {
+ if (i % 20 == 0)
+ print_banner();
+ print_hist(map_fd[1]);
+ sleep(2);
+ }
+
+ return 0;
+}
--
1.7.9.5
^ permalink raw reply related
* [PATCH v7 tip 8/8] samples: bpf: kmem_alloc/free tracker
From: Alexei Starovoitov @ 2015-03-16 21:49 UTC (permalink / raw)
To: Ingo Molnar
Cc: Steven Rostedt, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
Masami Hiramatsu, David S. Miller, Daniel Borkmann,
Peter Zijlstra, linux-api, netdev, linux-kernel
In-Reply-To: <1426542584-9406-1-git-send-email-ast@plumgrid.com>
One bpf program attaches to kmem_cache_alloc_node() and remembers all allocated
objects in the map.
Another program attaches to kmem_cache_free() and deletes corresponding
object from the map.
User space walks the map every second and prints any objects which are
older than 1 second.
Usage:
$ sudo tracex4
Then start few long living processes. The tracex4 will print:
obj 0xffff880465928000 is 13sec old was allocated at ip ffffffff8105dc32
obj 0xffff88043181c280 is 13sec old was allocated at ip ffffffff8105dc32
obj 0xffff880465848000 is 8sec old was allocated at ip ffffffff8105dc32
obj 0xffff8804338bc280 is 15sec old was allocated at ip ffffffff8105dc32
$ addr2line -fispe vmlinux ffffffff8105dc32
do_fork at fork.c:1665
As soon as processes exit the memory is reclaimed and tracex4 prints nothing.
Similar experiment can be done with __kmalloc/kfree pair.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
samples/bpf/Makefile | 4 +++
samples/bpf/tracex4_kern.c | 54 ++++++++++++++++++++++++++++++++++
samples/bpf/tracex4_user.c | 69 ++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 127 insertions(+)
create mode 100644 samples/bpf/tracex4_kern.c
create mode 100644 samples/bpf/tracex4_user.c
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index dcd850546d52..fe98fb226e6e 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -9,6 +9,7 @@ hostprogs-y += sockex2
hostprogs-y += tracex1
hostprogs-y += tracex2
hostprogs-y += tracex3
+hostprogs-y += tracex4
test_verifier-objs := test_verifier.o libbpf.o
test_maps-objs := test_maps.o libbpf.o
@@ -18,6 +19,7 @@ sockex2-objs := bpf_load.o libbpf.o sockex2_user.o
tracex1-objs := bpf_load.o libbpf.o tracex1_user.o
tracex2-objs := bpf_load.o libbpf.o tracex2_user.o
tracex3-objs := bpf_load.o libbpf.o tracex3_user.o
+tracex4-objs := bpf_load.o libbpf.o tracex4_user.o
# Tell kbuild to always build the programs
always := $(hostprogs-y)
@@ -26,6 +28,7 @@ always += sockex2_kern.o
always += tracex1_kern.o
always += tracex2_kern.o
always += tracex3_kern.o
+always += tracex4_kern.o
HOSTCFLAGS += -I$(objtree)/usr/include
@@ -35,6 +38,7 @@ HOSTLOADLIBES_sockex2 += -lelf
HOSTLOADLIBES_tracex1 += -lelf
HOSTLOADLIBES_tracex2 += -lelf
HOSTLOADLIBES_tracex3 += -lelf
+HOSTLOADLIBES_tracex4 += -lelf -lrt
# point this to your LLVM backend with bpf support
LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
diff --git a/samples/bpf/tracex4_kern.c b/samples/bpf/tracex4_kern.c
new file mode 100644
index 000000000000..5754e21caf3b
--- /dev/null
+++ b/samples/bpf/tracex4_kern.c
@@ -0,0 +1,54 @@
+/* Copyright (c) 2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/ptrace.h>
+#include <linux/version.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+struct pair {
+ u64 val;
+ u64 ip;
+};
+
+struct bpf_map_def SEC("maps") my_map = {
+ .type = BPF_MAP_TYPE_HASH,
+ .key_size = sizeof(long),
+ .value_size = sizeof(struct pair),
+ .max_entries = 1000000,
+};
+
+/* kprobe is NOT a stable ABI
+ * This bpf+kprobe example can stop working any time.
+ */
+SEC("kprobe/kmem_cache_free")
+int bpf_prog1(struct pt_regs *ctx)
+{
+ long ptr = ctx->si;
+
+ bpf_map_delete_elem(&my_map, &ptr);
+ return 0;
+}
+
+SEC("kretprobe/kmem_cache_alloc_node")
+int bpf_prog2(struct pt_regs *ctx)
+{
+ long ptr = ctx->ax;
+ long ip = 0;
+
+ /* get ip address of kmem_cache_alloc_node() caller */
+ bpf_probe_read(&ip, sizeof(ip), (void *)(ctx->bp + sizeof(ip)));
+
+ struct pair v = {
+ .val = bpf_ktime_get_ns(),
+ .ip = ip,
+ };
+
+ bpf_map_update_elem(&my_map, &ptr, &v, BPF_ANY);
+ return 0;
+}
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex4_user.c b/samples/bpf/tracex4_user.c
new file mode 100644
index 000000000000..bc4a3bdea6ed
--- /dev/null
+++ b/samples/bpf/tracex4_user.c
@@ -0,0 +1,69 @@
+/* Copyright (c) 2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <unistd.h>
+#include <stdbool.h>
+#include <string.h>
+#include <time.h>
+#include <linux/bpf.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+struct pair {
+ long long val;
+ __u64 ip;
+};
+
+static __u64 time_get_ns(void)
+{
+ struct timespec ts;
+
+ clock_gettime(CLOCK_MONOTONIC, &ts);
+ return ts.tv_sec * 1000000000ull + ts.tv_nsec;
+}
+
+static void print_old_objects(int fd)
+{
+ long long val = time_get_ns();
+ __u64 key, next_key;
+ struct pair v;
+
+ key = write(1, "\e[1;1H\e[2J", 12); /* clear screen */
+
+ key = -1;
+ while (bpf_get_next_key(map_fd[0], &key, &next_key) == 0) {
+ bpf_lookup_elem(map_fd[0], &next_key, &v);
+ key = next_key;
+ if (val - v.val < 1000000000ll)
+ /* object was allocated more then 1 sec ago */
+ continue;
+ printf("obj 0x%llx is %2lldsec old was allocated at ip %llx\n",
+ next_key, (val - v.val) / 1000000000ll, v.ip);
+ }
+}
+
+int main(int ac, char **argv)
+{
+ char filename[256];
+ int i;
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+
+ for (i = 0; ; i++) {
+ print_old_objects(map_fd[1]);
+ sleep(1);
+ }
+
+ return 0;
+}
--
1.7.9.5
^ permalink raw reply related
* Re: [PATCH] fstests: generic test for preadv2 behavior on linux
From: Dave Chinner @ 2015-03-16 22:02 UTC (permalink / raw)
To: Milosz Tanski
Cc: linux-kernel, Christoph Hellwig, linux-fsdevel, linux-aio,
Mel Gorman, Volker Lendecke, Tejun Heo, Jeff Moyer,
Theodore Ts'o, Al Viro, linux-api, Michael Kerrisk,
linux-arch, Andrew Morton
In-Reply-To: <1426530862-32276-1-git-send-email-milosz@adfin.com>
On Mon, Mar 16, 2015 at 02:34:22PM -0400, Milosz Tanski wrote:
> preadv2 is a new syscall introduced that is like preadv2 but with flag
> argument. The first use case of this is to let us add a flag to perform a
> non-blocking file using the page cache.
> ---
> src/Makefile | 2 +-
> src/preadv2-pwritev2.h | 52 +++++++++++++++++
> src/preadv2.c | 150 +++++++++++++++++++++++++++++++++++++++++++++++++
You should add this syscall to support to xfs_io (in the xfsprogs
package) rather than write a new helper for it. Mainly because:
> +void
> +usage(char *prog)
> +{
> + fprintf(stderr, "Usage: %s [-v] [-ctdw] [-n] -p POS -l LEN <filename>\n\n", prog);
> + fprintf(stderr, "General arguments:\n");
> + fprintf(stderr, " -v Verify that the syscall is supported and quit:\n");
> + fprintf(stderr, "\n");
> + fprintf(stderr, "Open arguments:\n");
> + fprintf(stderr, " -c Open file with O_CREAT flag\n");
> + fprintf(stderr, " -t Open file with O_TRUNC flag\n");
> + fprintf(stderr, " -d Open file with O_DIRECT flag\n");
> + fprintf(stderr, " -w Open file with O_RDWR flag vs O_RDONLY (default)\n");
> + fprintf(stderr, "\n");
> + fprintf(stderr, "preadv2 arguments:\n");
> + fprintf(stderr, " -n use RWF_NONBLOCK when performing read\n");
> + fprintf(stderr, " -p POS offset file to read at\n");
> + fprintf(stderr, " -l LEN length of file data to read\n");
The xfs_io pread command already supports all of these functions
except for the RWF_NONBLOCK flag, and anyone testing bleeding edge
functionality is also using a bleeding edge xfs_io binary.
Then you test for whether the functionality is available via
_require_xfs_io_command "preadv -n"
.....
> +# test file we'll be using
> +file=$SCRATCH_MNT/067.preadv2.$$
> +
> +# Create a file:
> +# two regions of data and a hole in the middle
> +# use O_DIRECT so it's not in the page cache
> +echo "create file"
> +$XFS_IO_PROG -t -f -d \
> + -c "pwrite 0 1024" \
> + -c "pwrite 2048 1024" \
> + $file > /dev/null
This does not create holes on most filesystems. You'll need to leave
holes of up 64k so that 64k block size filesystem end up with single
block holes in them.
> +# Make sure it returns EAGAIN on uncached data
> +echo "uncached"
> +$here/src/preadv2 -n -p 0 -l 1024 $file
$XFS_IO_PROG -c "pread -n 0 1024" $file | _filter_xfs_io
> +
> +# Make sure we read in the whole file, after that RWF_NONBLOCK should return us all the data
> +echo "cached"
> +$XFS_IO_PROG -f $file -c "pread 0 4096" $file > /dev/null
> +$here/src/preadv2 -n -p 0 -l 1024 $file
$XFS_IO_PROG -c "pread 0 4096" -c "pread -n 0 1024" $file | _filter_xfs_io
> +
> +# O_DIRECT and RWF_NONBLOCK should return EAGAIN always
> +echo "O_DIRECT"
> +$here/src/preadv2 -d -n -p 0 -l 1024 $file
$XFS_IO_PROG -d -c "pread -n 0 1024" $file | _filter_xfs_io
And so on....
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org. For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>
^ permalink raw reply
* Re: [PATCH] fstests: generic test for preadv2 behavior on linux
From: Milosz Tanski @ 2015-03-16 22:03 UTC (permalink / raw)
To: Andreas Dilger
Cc: LKML, Christoph Hellwig, linux-fsdevel@vger.kernel.org,
linux-aio@kvack.org, Mel Gorman, Volker Lendecke, Tejun Heo,
Jeff Moyer, Theodore Ts'o, Al Viro, Linux API,
Michael Kerrisk, linux-arch, Dave Chinner, Andrew Morton
In-Reply-To: <BF78A586-3F25-4434-B646-EEA9DB8E62B6@dilger.ca>
On Mon, Mar 16, 2015 at 5:07 PM, Andreas Dilger <adilger@dilger.ca> wrote:
>
>> On Mar 16, 2015, at 12:34 PM, Milosz Tanski <milosz@adfin.com> wrote:
>>
>> preadv2 is a new syscall introduced that is like preadv2 but with flag
>
> Sorry, "preadv2 ... is like preadv2"?
I already have a fix for in my branch. Robert Elliott was the first
one to notice that (via private email).
>
>> argument. The first use case of this is to let us add a flag to perform a
>> non-blocking file using the page cache.
>
> This is also missing a Signed-off-by: line.
Good catch. I'm going to fix the above to issues, add a pre-test check
for preadv2 (I just noticed it's missing) and I'm going to resend this
patch.
>
> Cheers, Andreas
>> ---
>> src/Makefile | 2 +-
>> src/preadv2-pwritev2.h | 52 +++++++++++++++++
>> src/preadv2.c | 150 +++++++++++++++++++++++++++++++++++++++++++++++++
>> tests/generic/067 | 85 ++++++++++++++++++++++++++++
>> tests/generic/067.out | 9 +++
>> tests/generic/group | 1 +
>> 6 files changed, 298 insertions(+), 1 deletion(-)
>> create mode 100644 src/preadv2-pwritev2.h
>> create mode 100644 src/preadv2.c
>> create mode 100755 tests/generic/067
>> create mode 100644 tests/generic/067.out
>>
>> diff --git a/src/Makefile b/src/Makefile
>> index 4781736..f7d3681 100644
>> --- a/src/Makefile
>> +++ b/src/Makefile
>> @@ -19,7 +19,7 @@ LINUX_TARGETS = xfsctl bstat t_mtab getdevicesize preallo_rw_pattern_reader \
>> bulkstat_unlink_test_modified t_dir_offset t_futimens t_immutable \
>> stale_handle pwrite_mmap_blocked t_dir_offset2 seek_sanity_test \
>> seek_copy_test t_readdir_1 t_readdir_2 fsync-tester nsexec cloner \
>> - renameat2 t_getcwd e4compact
>> + renameat2 t_getcwd e4compact preadv2
>>
>> SUBDIRS =
>>
>> diff --git a/src/preadv2-pwritev2.h b/src/preadv2-pwritev2.h
>> new file mode 100644
>> index 0000000..786e524
>> --- /dev/null
>> +++ b/src/preadv2-pwritev2.h
>> @@ -0,0 +1,52 @@
>> +#ifndef PREADV2_PWRITEV2_H
>> +#define PREADV2_PWRITEV2_H
>> +
>> +#include "global.h"
>> +
>> +#ifndef HAVE_PREADV2
>> +#include <sys/syscall.h>
>> +
>> +#if !defined(SYS_preadv2) && defined(__x86_64__)
>> +#define SYS_preadv2 323
>> +#define SYS_pwritev2 324
>> +#endif
>> +
>> +#if !defined (SYS_preadv2) && defined(__i386__)
>> +#define SYS_preadv2 359
>> +#define SYS_pwritev2 360
>> +#endif
>> +
>> +/* LO_HI_LONG taken from glibc */
>> +#define LO_HI_LONG(val) \
>> + (off_t) val, \
>> + (off_t) ((((uint64_t) (val)) >> (sizeof (long) * 4)) >> (sizeof (long) * 4))
>> +
>> +static inline ssize_t
>> +preadv2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags)
>> +{
>> +#ifdef SYS_preadv2
>> + return syscall(SYS_preadv2, fd, iov, iovcnt, LO_HI_LONG(offset),
>> + flags);
>> +#else
>> + errno = ENOSYS;
>> + return -1;
>> +#endif
>> +}
>> +
>> +static inline ssize_t
>> +pwritev2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags)
>> +{
>> +#ifdef SYS_pwritev2
>> + return syscall(SYS_pwritev2, fd, iov, iovcnt, LO_HI_LONG(offset),
>> + flags);
>> +#else
>> + errno = ENOSYS;
>> + return -1;
>> +#endif
>> +}
>> +
>> +#define RWF_NONBLOCK 0x00000001
>> +#define RWF_DSYNC 0x00000002
>> +
>> +#endif /* HAVE_PREADV2 */
>> +#endif /* PREADV2_PWRITEV2_H */
>> diff --git a/src/preadv2.c b/src/preadv2.c
>> new file mode 100644
>> index 0000000..a4f89b5
>> --- /dev/null
>> +++ b/src/preadv2.c
>> @@ -0,0 +1,150 @@
>> +/*
>> + * Copyright 2014 Red Hat, Inc. All rights reserved.
>> + * Copyright 2015 Milosz Tanski
>> + *
>> + * License: GPLv2
>> + *
>> + */
>> +#include <stdio.h>
>> +#include <stdlib.h>
>> +#include <getopt.h>
>> +#include <string.h>
>> +#include <unistd.h>
>> +#include <errno.h>
>> +#include <linux/fs.h> /* for RWF_NONBLOCK */
>> +
>> +/*
>> + * Once preadv2 is part of the upstream kernel and there is glibc support for
>> + * it. We'll add support for preadv2 to xfs_io and this will be unnecessary.
>> + */
>> +#include "preadv2-pwritev2.h"
>> +
>> +/*
>> + * Test to see if the system call is implemented. If -EINVAL or -ENOSYS
>> + * are returned, consider the call unimplemented. All other errors are
>> + * considered success.
>> + *
>> + * Returns: 0 if the system call is implemented, 1 if the system call
>> + * is not implemented.
>> + */
>> +int
>> +preadv2_check(int fd)
>> +{
>> + int ret;
>> + struct iovec iov[] = {};
>> +
>> + /* 0 length read; just check iof the syscall is there.
>> + *
>> + * - 0 length iovec
>> + * - Position is -1 (eg. use current position)
>> + */
>> + ret = preadv2(fd, iov, 0, -1, 0);
>> +
>> + if (ret < 0) {
>> + if (errno == ENOSYS || errno == EINVAL)
>> + return 1;
>> + }
>> +
>> + return 0;
>> +}
>> +
>> +void
>> +usage(char *prog)
>> +{
>> + fprintf(stderr, "Usage: %s [-v] [-ctdw] [-n] -p POS -l LEN <filename>\n\n", prog);
>> + fprintf(stderr, "General arguments:\n");
>> + fprintf(stderr, " -v Verify that the syscall is supported and quit:\n");
>> + fprintf(stderr, "\n");
>> + fprintf(stderr, "Open arguments:\n");
>> + fprintf(stderr, " -c Open file with O_CREAT flag\n");
>> + fprintf(stderr, " -t Open file with O_TRUNC flag\n");
>> + fprintf(stderr, " -d Open file with O_DIRECT flag\n");
>> + fprintf(stderr, " -w Open file with O_RDWR flag vs O_RDONLY (default)\n");
>> + fprintf(stderr, "\n");
>> + fprintf(stderr, "preadv2 arguments:\n");
>> + fprintf(stderr, " -n use RWF_NONBLOCK when performing read\n");
>> + fprintf(stderr, " -p POS offset file to read at\n");
>> + fprintf(stderr, " -l LEN length of file data to read\n");
>> + fprintf(stderr, "\n");
>> + fflush(stderr);
>> +}
>> +
>> +int
>> +main(int argc, char **argv)
>> +{
>> + int fd;
>> + int ret;
>> + int opt;
>> + off_t pos = -1;
>> + struct iovec iov = { NULL, 0 };
>> + int o_flags = 0;
>> + int r_flags = 0;
>> + char *filename;
>> +
>> + while ((opt = getopt(argc, argv, "vctdwnp:l:")) != -1) {
>> + switch (opt) {
>> + case 'v':
>> + /*
>> + * See if we were called to check for availability of
>> + * sys_preadv2. STDIN is okay, since we do a zero
>> + * length read (see man 2 read).
>> + */
>> + ret = preadv2_check(STDIN_FILENO);
>> + exit(ret);
>> + case 'c':
>> + o_flags |= O_CREAT;
>> + break;
>> + case 't':
>> + o_flags |= O_TRUNC;
>> + break;
>> + case 'd':
>> + o_flags |= O_DIRECT;
>> + break;
>> + case 'w':
>> + o_flags |= O_RDWR;
>> + break;
>> + case 'n':
>> + r_flags |= RWF_NONBLOCK;
>> + break;
>> + case 'p':
>> + pos = atoll(optarg);
>> + break;
>> + case 'l':
>> + iov.iov_len = atoll(optarg);
>> + break;
>> + default:
>> + fprintf(stderr, "invalid option: %c\n", opt);
>> + usage(argv[0]);
>> + exit(1);
>> + }
>> + }
>> +
>> + if (optind >= argc) {
>> + usage(argv[0]);
>> + exit(1);
>> + }
>> +
>> + if ((o_flags & O_RDWR) != O_RDWR)
>> + o_flags |= O_RDONLY;
>> +
>> + if ((iov.iov_base = malloc(iov.iov_len)) == NULL) {
>> + perror("malloc");
>> + exit(1);
>> + }
>> +
>> + filename = argv[optind];
>> + fd = open(filename, o_flags);
>> +
>> + if (fd < 0) {
>> + perror("open");
>> + exit(1);
>> + }
>> +
>> + if ((ret = preadv2(fd, &iov, 1, pos, r_flags)) == -1) {
>> + perror("preadv2");
>> + exit(ret);
>> + }
>> +
>> + free(iov.iov_base);
>> + exit(0);
>> +}
>> diff --git a/tests/generic/067 b/tests/generic/067
>> new file mode 100755
>> index 0000000..4cc58f8
>> --- /dev/null
>> +++ b/tests/generic/067
>> @@ -0,0 +1,85 @@
>> +#! /bin/bash
>> +# FS QA Test No. 067
>> +#
>> +# Test for the preadv2 syscall
>> +#
>> +#-----------------------------------------------------------------------
>> +# Copyright (c) 2015 Milosz Tanski <mtanski@gmail.com>. All Rights Reserved.
>> +#
>> +# This program is free software; you can redistribute it and/or
>> +# modify it under the terms of the GNU General Public License as
>> +# published by the Free Software Foundation.
>> +#
>> +# This program is distributed in the hope that it would be useful,
>> +# but WITHOUT ANY WARRANTY; without even the implied warranty of
>> +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
>> +# GNU General Public License for more details.
>> +#
>> +# You should have received a copy of the GNU General Public License
>> +# along with this program; if not, write the Free Software Foundation,
>> +# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
>> +#-----------------------------------------------------------------------
>> +#
>> +
>> +seq=`basename $0`
>> +seqres=$RESULT_DIR/$seq
>> +echo "QA output created by $seq"
>> +
>> +here=`pwd`
>> +tmp=/tmp/$$
>> +status=1 # failure is the default!
>> +trap "_cleanup; exit \$status" 0 1 2 3 15
>> +
>> +_cleanup()
>> +{
>> + cd /
>> + rm -f $tmp.*
>> +}
>> +
>> +# get standard environment, filters and checks
>> +. ./common/rc
>> +. ./common/filter
>> +
>> +# real QA test starts here
>> +
>> +# Modify as appropriate.
>> +_supported_fs generic
>> +_supported_os Linux
>> +_require_test
>> +
>> +# test file we'll be using
>> +file=$SCRATCH_MNT/067.preadv2.$$
>> +
>> +# Create a file:
>> +# two regions of data and a hole in the middle
>> +# use O_DIRECT so it's not in the page cache
>> +echo "create file"
>> +$XFS_IO_PROG -t -f -d \
>> + -c "pwrite 0 1024" \
>> + -c "pwrite 2048 1024" \
>> + $file > /dev/null
>> +
>> +# Make sure it returns EAGAIN on uncached data
>> +echo "uncached"
>> +$here/src/preadv2 -n -p 0 -l 1024 $file
>> +
>> +# Make sure we read in the whole file, after that RWF_NONBLOCK should return us all the data
>> +echo "cached"
>> +$XFS_IO_PROG -f $file -c "pread 0 4096" $file > /dev/null
>> +$here/src/preadv2 -n -p 0 -l 1024 $file
>> +
>> +# O_DIRECT and RWF_NONBLOCK should return EAGAIN always
>> +echo "O_DIRECT"
>> +$here/src/preadv2 -d -n -p 0 -l 1024 $file
>> +
>> +# Holes do not block
>> +echo "holes"
>> +$here/src/preadv2 -n -p 2048 -l 1024 $file
>> +
>> +# EOF behavior (no EAGAIN)
>> +echo "EOF"
>> +$here/src/preadv2 -n -p 3072 -l 1 $file
>> +
>> +# success, all done
>> +status=0
>> +exit
>> diff --git a/tests/generic/067.out b/tests/generic/067.out
>> new file mode 100644
>> index 0000000..6e3740f
>> --- /dev/null
>> +++ b/tests/generic/067.out
>> @@ -0,0 +1,9 @@
>> +QA output created by 067
>> +create file
>> +uncached
>> +preadv2: Resource temporarily unavailable
>> +cached
>> +O_DIRECT
>> +preadv2: Resource temporarily unavailable
>> +holes
>> +EOF
>> diff --git a/tests/generic/group b/tests/generic/group
>> index e5db772..91c5870 100644
>> --- a/tests/generic/group
>> +++ b/tests/generic/group
>> @@ -69,6 +69,7 @@
>> 064 auto quick prealloc
>> 065 metadata auto quick
>> 066 metadata auto quick
>> +067 auto quick rw
>> 068 other auto freeze dangerous stress
>> 069 rw udf auto quick
>> 070 attr udf auto quick stress
>> --
>> 1.9.1
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
>
> Cheers, Andreas
>
>
>
>
>
--
Milosz Tanski
CTO
16 East 34th Street, 15th floor
New York, NY 10016
p: 646-253-9055
e: milosz@adfin.com
^ permalink raw reply
* Re: [PATCH v7 5/5] xfs: add RWF_NONBLOCK support
From: Dave Chinner @ 2015-03-16 22:04 UTC (permalink / raw)
To: Milosz Tanski
Cc: linux-kernel, Christoph Hellwig, Christoph Hellwig, linux-fsdevel,
linux-aio, Mel Gorman, Volker Lendecke, Tejun Heo, Jeff Moyer,
Theodore Ts'o, Al Viro, linux-api, Michael Kerrisk,
linux-arch, Andrew Morton
In-Reply-To: <0a6f2c2297bfbfd7220e3c4d6bce34fc42bc22fb.1426528417.git.milosz@adfin.com>
On Mon, Mar 16, 2015 at 02:27:15PM -0400, Milosz Tanski wrote:
> From: Christoph Hellwig <hch@lst.de>
>
> Add support for non-blocking reads. The guts are handled by the generic
> code, the only addition is a non-blocking variant of xfs_rw_ilock.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
Change looks good, but I haven't tested it, so:
Acked-by: Dave Chinner <david@fromorbit.com>
--
Dave Chinner
david@fromorbit.com
^ permalink raw reply
* Re: [PATCH] fstests: generic test for preadv2 behavior on linux
From: Milosz Tanski @ 2015-03-16 22:11 UTC (permalink / raw)
To: Dave Chinner
Cc: LKML, Christoph Hellwig, linux-fsdevel@vger.kernel.org,
linux-aio@kvack.org, Mel Gorman, Volker Lendecke, Tejun Heo,
Jeff Moyer, Theodore Ts'o, Al Viro, Linux API,
Michael Kerrisk, linux-arch, Andrew Morton
In-Reply-To: <20150316220253.GF28557@dastard>
On Mon, Mar 16, 2015 at 6:02 PM, Dave Chinner <david@fromorbit.com> wrote:
> On Mon, Mar 16, 2015 at 02:34:22PM -0400, Milosz Tanski wrote:
>> preadv2 is a new syscall introduced that is like preadv2 but with flag
>> argument. The first use case of this is to let us add a flag to perform a
>> non-blocking file using the page cache.
>> ---
>> src/Makefile | 2 +-
>> src/preadv2-pwritev2.h | 52 +++++++++++++++++
>> src/preadv2.c | 150 +++++++++++++++++++++++++++++++++++++++++++++++++
>
> You should add this syscall to support to xfs_io (in the xfsprogs
> package) rather than write a new helper for it. Mainly because:
>
>> +void
>> +usage(char *prog)
>> +{
>> + fprintf(stderr, "Usage: %s [-v] [-ctdw] [-n] -p POS -l LEN <filename>\n\n", prog);
>> + fprintf(stderr, "General arguments:\n");
>> + fprintf(stderr, " -v Verify that the syscall is supported and quit:\n");
>> + fprintf(stderr, "\n");
>> + fprintf(stderr, "Open arguments:\n");
>> + fprintf(stderr, " -c Open file with O_CREAT flag\n");
>> + fprintf(stderr, " -t Open file with O_TRUNC flag\n");
>> + fprintf(stderr, " -d Open file with O_DIRECT flag\n");
>> + fprintf(stderr, " -w Open file with O_RDWR flag vs O_RDONLY (default)\n");
>> + fprintf(stderr, "\n");
>> + fprintf(stderr, "preadv2 arguments:\n");
>> + fprintf(stderr, " -n use RWF_NONBLOCK when performing read\n");
>> + fprintf(stderr, " -p POS offset file to read at\n");
>> + fprintf(stderr, " -l LEN length of file data to read\n");
>
> The xfs_io pread command already supports all of these functions
> except for the RWF_NONBLOCK flag, and anyone testing bleeding edge
> functionality is also using a bleeding edge xfs_io binary.
>
> Then you test for whether the functionality is available via
> _require_xfs_io_command "preadv -n"
>
> .....
>> +# test file we'll be using
>> +file=$SCRATCH_MNT/067.preadv2.$$
>> +
>> +# Create a file:
>> +# two regions of data and a hole in the middle
>> +# use O_DIRECT so it's not in the page cache
>> +echo "create file"
>> +$XFS_IO_PROG -t -f -d \
>> + -c "pwrite 0 1024" \
>> + -c "pwrite 2048 1024" \
>> + $file > /dev/null
>
> This does not create holes on most filesystems. You'll need to leave
> holes of up 64k so that 64k block size filesystem end up with single
> block holes in them.
Noted and I shall fix this in the next round.
>
>> +# Make sure it returns EAGAIN on uncached data
>> +echo "uncached"
>> +$here/src/preadv2 -n -p 0 -l 1024 $file
>
> $XFS_IO_PROG -c "pread -n 0 1024" $file | _filter_xfs_io
>
>> +
>> +# Make sure we read in the whole file, after that RWF_NONBLOCK should return us all the data
>> +echo "cached"
>> +$XFS_IO_PROG -f $file -c "pread 0 4096" $file > /dev/null
>> +$here/src/preadv2 -n -p 0 -l 1024 $file
>
> $XFS_IO_PROG -c "pread 0 4096" -c "pread -n 0 1024" $file | _filter_xfs_io
>
>> +
>> +# O_DIRECT and RWF_NONBLOCK should return EAGAIN always
>> +echo "O_DIRECT"
>> +$here/src/preadv2 -d -n -p 0 -l 1024 $file
>
> $XFS_IO_PROG -d -c "pread -n 0 1024" $file | _filter_xfs_io
>
> And so on....
>
> Cheers,
>
> Dave.
> --
> Dave Chinner
> david@fromorbit.com
Dave,
My plan is/was to wait till the main patch makes it into the upstream
linux kernel with the syscall numbers are set in stone. Possibly after
till glibc adds support for them. After that I was going remove my
preadv2 application from xfs_tests and add that functionality to
xfs_io.
With xfs_io living in separate repository I wanted to the case when/if
syscall numbers change (there's a bunch of new syscalls queued around
epoll) of having somebody test against xfs_io that has preadv2 but bad
ids.
As a side note, I did add mlock / munlock support to xfs_io that I'll
send in another patch.
--
Milosz Tanski
CTO
16 East 34th Street, 15th floor
New York, NY 10016
p: 646-253-9055
e: milosz@adfin.com
^ permalink raw reply
* Re: [PATCH v2 0/7] CLONE_FD: Task exit notification via file descriptor
From: Thiago Macieira @ 2015-03-16 22:14 UTC (permalink / raw)
To: Kees Cook
Cc: Josh Triplett, Al Viro, Andrew Morton, Andy Lutomirski,
Ingo Molnar, Oleg Nesterov, Paul E. McKenney, H. Peter Anvin,
Rik van Riel, Thomas Gleixner, Michael Kerrisk, LKML, Linux API,
linux-fsdevel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
x86-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org
In-Reply-To: <CAGXu5jLmLLgVNCP=cyUiXuiszVXsS88SNUK2iBqArbA2V6vdHA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
On Monday 16 March 2015 14:44:20 Kees Cook wrote:
> > O_CLOEXEC
> > Set the close-on-exec flag on the new file
> >descriptor. See the description of the O_CLOEXEC flag in open(2) for
> >reasons why this may be useful.
>
> This begs the question: what happens when all CLONE_FD fds for a
> process are closed? Will the parent get SIGCHLD instead, will it
> auto-reap, or will it be un-wait-able (I assume not this...)
Depends on CLONE_AUTOREAP. If it's on, then no one gets SIGCHLD, no one can
wait() on it and the process autoreaps itself.
If it's no active, then the old rules apply: parent gets SIGCHILD and can
wait(). If the parent exited first, then the child gets reparented to init,
which can do the wait().
A child without CLONE_AUTOREAP should be wait()able. If it gets wait()ed
before the clonefd is read, the clonefd() will return a 0 read. If it gets
read before wait, then wait() reaps another child or returns -ECHILD. That's
no different than two threads doing simultaneous wait() on the same child.
--
Thiago Macieira - thiago.macieira (AT) intel.com
Software Architect - Intel Open Source Technology Center
^ permalink raw reply
* Re: [PATCH v2 0/7] CLONE_FD: Task exit notification via file descriptor
From: Kees Cook @ 2015-03-16 22:36 UTC (permalink / raw)
To: Thiago Macieira
Cc: Josh Triplett, Al Viro, Andrew Morton, Andy Lutomirski,
Ingo Molnar, Oleg Nesterov, Paul E. McKenney, H. Peter Anvin,
Rik van Riel, Thomas Gleixner, Michael Kerrisk, LKML, Linux API,
linux-fsdevel@vger.kernel.org, x86@kernel.org
In-Reply-To: <2381173.VxaIO6vGG3@tjmaciei-mobl4>
On Mon, Mar 16, 2015 at 3:14 PM, Thiago Macieira
<thiago.macieira@intel.com> wrote:
> On Monday 16 March 2015 14:44:20 Kees Cook wrote:
>> > O_CLOEXEC
>> > Set the close-on-exec flag on the new file
>> >descriptor. See the description of the O_CLOEXEC flag in open(2) for
>> >reasons why this may be useful.
>>
>> This begs the question: what happens when all CLONE_FD fds for a
>> process are closed? Will the parent get SIGCHLD instead, will it
>> auto-reap, or will it be un-wait-able (I assume not this...)
>
> Depends on CLONE_AUTOREAP. If it's on, then no one gets SIGCHLD, no one can
> wait() on it and the process autoreaps itself.
>
> If it's no active, then the old rules apply: parent gets SIGCHILD and can
> wait(). If the parent exited first, then the child gets reparented to init,
> which can do the wait().
>
> A child without CLONE_AUTOREAP should be wait()able. If it gets wait()ed
> before the clonefd is read, the clonefd() will return a 0 read. If it gets
> read before wait, then wait() reaps another child or returns -ECHILD. That's
> no different than two threads doing simultaneous wait() on the same child.
Cool. I think detailing this in the manpage would be helpful.
And just so I understand the races here, what happens in CLONE_FD
(without CLONE_AUTOREAP) case where the child dies, but the parent
never reads from the CLONE_FD fd, and closes it (or dies)? Will the
modes switch that late in the child's lifetime? (i.e. even though the
details were written to the fd, they were never read, yet it'll still
switch and generate a SIGCHLD, etc?)
Thanks!
-Kees
--
Kees Cook
Chrome OS Security
^ permalink raw reply
* Re: [PATCH 0/2] Move away from non-failing small allocations
From: Andrew Morton @ 2015-03-16 22:38 UTC (permalink / raw)
To: Michal Hocko
Cc: Johannes Weiner, Dave Chinner, Mel Gorman, Rik van Riel,
Wu Fengguang, linux-mm-Bw31MaZKKs3YtjvyW6yDsg, LKML, Linux API
In-Reply-To: <1426107294-21551-1-git-send-email-mhocko-AlSwsSmVLrQ@public.gmane.org>
On Wed, 11 Mar 2015 16:54:52 -0400 Michal Hocko <mhocko-AlSwsSmVLrQ@public.gmane.org> wrote:
> as per discussion at LSF/MM summit few days back it seems there is a
> general agreement on moving away from "small allocations do not fail"
> concept.
Such a change affects basically every part of the kernel and every
kernel developer. I expect most developers will say "it works well
enough and I'm not getting any bug reports so why should I spend time
on this?". It would help if we were to explain the justification very
clearly. https://lwn.net/Articles/636017/ is Jon's writeup of the
conference discussion.
Realistically, I don't think this overall effort will be successful -
we'll add the knob, it won't get enough testing and any attempt to
alter the default will be us deliberately destabilizing the kernel
without knowing how badly :(
I wonder if we can alter the behaviour only for filesystem code, so we
constrain the new behaviour just to that code where we're having
problems. Most/all fs code goes via vfs methods so there's a reasonably
small set of places where we can call
static inline void enter_fs_code(struct super_block *sb)
{
if (sb->my_small_allocations_can_fail)
current->small_allocations_can_fail++;
}
that way (or something similar) we can select the behaviour on a per-fs
basis and the rest of the kernel remains unaffected. Other subsystems
can opt in as well.
^ permalink raw reply
* Re: [PATCH v2 0/7] CLONE_FD: Task exit notification via file descriptor
From: Thiago Macieira @ 2015-03-16 22:50 UTC (permalink / raw)
To: Kees Cook
Cc: Josh Triplett, Al Viro, Andrew Morton, Andy Lutomirski,
Ingo Molnar, Oleg Nesterov, Paul E. McKenney, H. Peter Anvin,
Rik van Riel, Thomas Gleixner, Michael Kerrisk, LKML, Linux API,
linux-fsdevel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
x86-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org
In-Reply-To: <CAGXu5jJr_DieZ281=H0ZNtNvOagO0pm8PRfNvRWKp4YwS=55GA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
On Monday 16 March 2015 15:36:16 Kees Cook wrote:
> And just so I understand the races here, what happens in CLONE_FD
> (without CLONE_AUTOREAP) case where the child dies, but the parent
> never reads from the CLONE_FD fd, and closes it (or dies)? Will the
> modes switch that late in the child's lifetime? (i.e. even though the
> details were written to the fd, they were never read, yet it'll still
> switch and generate a SIGCHLD, etc?)
What happens to a child that dies during the parent's lifetime but the parent
exits without reaping the child?
The same should happen, whatever that behaviour is.
--
Thiago Macieira - thiago.macieira (AT) intel.com
Software Architect - Intel Open Source Technology Center
^ permalink raw reply
* Re: [PATCH] fstests: generic test for preadv2 behavior on linux
From: Dave Chinner @ 2015-03-16 22:56 UTC (permalink / raw)
To: Milosz Tanski
Cc: LKML, Christoph Hellwig,
linux-fsdevel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
linux-aio-Bw31MaZKKs3YtjvyW6yDsg@public.gmane.org, Mel Gorman,
Volker Lendecke, Tejun Heo, Jeff Moyer, Theodore Ts'o,
Al Viro, Linux API, Michael Kerrisk,
linux-arch-u79uwXL29TY76Z2rM5mHXA, Andrew Morton
In-Reply-To: <CANP1eJEj2buvwaU-jum=GROowY6DrysQ0NU+weXstn=83yVspQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
On Mon, Mar 16, 2015 at 06:11:19PM -0400, Milosz Tanski wrote:
> On Mon, Mar 16, 2015 at 6:02 PM, Dave Chinner <david-FqsqvQoI3Ljby3iVrkZq2A@public.gmane.org> wrote:
> > On Mon, Mar 16, 2015 at 02:34:22PM -0400, Milosz Tanski wrote:
> >> preadv2 is a new syscall introduced that is like preadv2 but with flag
> >> argument. The first use case of this is to let us add a flag to perform a
> >> non-blocking file using the page cache.
> >> ---
> >> src/Makefile | 2 +-
> >> src/preadv2-pwritev2.h | 52 +++++++++++++++++
> >> src/preadv2.c | 150 +++++++++++++++++++++++++++++++++++++++++++++++++
> >
> > You should add this syscall to support to xfs_io (in the xfsprogs
> > package) rather than write a new helper for it. Mainly because:
> >
> >> +void
> >> +usage(char *prog)
> >> +{
> >> + fprintf(stderr, "Usage: %s [-v] [-ctdw] [-n] -p POS -l LEN <filename>\n\n", prog);
> >> + fprintf(stderr, "General arguments:\n");
> >> + fprintf(stderr, " -v Verify that the syscall is supported and quit:\n");
> >> + fprintf(stderr, "\n");
> >> + fprintf(stderr, "Open arguments:\n");
> >> + fprintf(stderr, " -c Open file with O_CREAT flag\n");
> >> + fprintf(stderr, " -t Open file with O_TRUNC flag\n");
> >> + fprintf(stderr, " -d Open file with O_DIRECT flag\n");
> >> + fprintf(stderr, " -w Open file with O_RDWR flag vs O_RDONLY (default)\n");
> >> + fprintf(stderr, "\n");
> >> + fprintf(stderr, "preadv2 arguments:\n");
> >> + fprintf(stderr, " -n use RWF_NONBLOCK when performing read\n");
> >> + fprintf(stderr, " -p POS offset file to read at\n");
> >> + fprintf(stderr, " -l LEN length of file data to read\n");
> >
> > The xfs_io pread command already supports all of these functions
> > except for the RWF_NONBLOCK flag, and anyone testing bleeding edge
> > functionality is also using a bleeding edge xfs_io binary.
> >
> > Then you test for whether the functionality is available via
> > _require_xfs_io_command "preadv -n"
> >
> > .....
> >> +# test file we'll be using
> >> +file=$SCRATCH_MNT/067.preadv2.$$
> >> +
> >> +# Create a file:
> >> +# two regions of data and a hole in the middle
> >> +# use O_DIRECT so it's not in the page cache
> >> +echo "create file"
> >> +$XFS_IO_PROG -t -f -d \
> >> + -c "pwrite 0 1024" \
> >> + -c "pwrite 2048 1024" \
> >> + $file > /dev/null
> >
> > This does not create holes on most filesystems. You'll need to leave
> > holes of up 64k so that 64k block size filesystem end up with single
> > block holes in them.
>
> Noted and I shall fix this in the next round.
>
> >
> >> +# Make sure it returns EAGAIN on uncached data
> >> +echo "uncached"
> >> +$here/src/preadv2 -n -p 0 -l 1024 $file
> >
> > $XFS_IO_PROG -c "pread -n 0 1024" $file | _filter_xfs_io
> >
> >> +
> >> +# Make sure we read in the whole file, after that RWF_NONBLOCK should return us all the data
> >> +echo "cached"
> >> +$XFS_IO_PROG -f $file -c "pread 0 4096" $file > /dev/null
> >> +$here/src/preadv2 -n -p 0 -l 1024 $file
> >
> > $XFS_IO_PROG -c "pread 0 4096" -c "pread -n 0 1024" $file | _filter_xfs_io
> >
> >> +
> >> +# O_DIRECT and RWF_NONBLOCK should return EAGAIN always
> >> +echo "O_DIRECT"
> >> +$here/src/preadv2 -d -n -p 0 -l 1024 $file
> >
> > $XFS_IO_PROG -d -c "pread -n 0 1024" $file | _filter_xfs_io
> >
> > And so on....
> >
> > Cheers,
> >
> > Dave.
> > --
> > Dave Chinner
> > david-FqsqvQoI3Ljby3iVrkZq2A@public.gmane.org
>
> Dave,
>
> My plan is/was to wait till the main patch makes it into the upstream
> linux kernel with the syscall numbers are set in stone. Possibly after
> till glibc adds support for them. After that I was going remove my
> preadv2 application from xfs_tests and add that functionality to
> xfs_io.
I wouldn't worry too much about glibc - once the syscall numbers are
defined we can test for them directly, I think, like we do for
supporting various ioctls.
> With xfs_io living in separate repository I wanted to the case when/if
> syscall numbers change (there's a bunch of new syscalls queued around
> epoll) of having somebody test against xfs_io that has preadv2 but bad
> ids.
If the syscall numbers change, it's simple to patch, and as long as
we have set the syscall numbers in stone before a xfsprogs release
is done then there won't be any problems...
Cheers,
Dave.
--
Dave Chinner
david-FqsqvQoI3Ljby3iVrkZq2A@public.gmane.org
^ permalink raw reply
* Re: [PATCHv3 xfstests 2/3] generic: test openat and new O_BENEATH flag
From: Dave Chinner @ 2015-03-16 23:24 UTC (permalink / raw)
To: David Drysdale
Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA, Alexander Viro, Kees Cook,
Eric W. Biederman, Greg Kroah-Hartman, Meredydd Luff, Will Drewry,
Jorge Lucangeli Obes, Ricky Zhou, Lee Campbell, Julien Tinnes,
Mike Depinet, James Morris, Andy Lutomirski, Paolo Bonzini,
Paul Moore, Christoph Hellwig, Michael Kerrisk,
linux-api-u79uwXL29TY76Z2rM5mHXA,
linux-security-module-u79uwXL29TY76Z2rM5mHXA,
fstests-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1425909612-28034-3-git-send-email-drysdale-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
On Mon, Mar 09, 2015 at 02:00:11PM +0000, David Drysdale wrote:
> Test basic openat(2) behaviour.
>
> Test that if O_BENEATH flag is set, openat() will only
> open paths that have no .. component and do not start
> with /. Symlinks are also checked for the same restrictions.
>
> Signed-off-by: David Drysdale <drysdale-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
> ---
> .gitignore | 1 +
> common/openat | 61 ++++++++++++++++++++++++++++++
> src/Makefile | 3 +-
> src/openat.c | 100 +++++++++++++++++++++++++++++++++++++++++++++++++
This strikes me as something that shoul dbe added to xfs_io for
testing, as it already supports a heap of other open flags and
xfstests is already dependent on it.
> tests/generic/151 | 89 ++++++++++++++++++++++++++++++++++++++++++++
> tests/generic/151.out | 9 +++++
> tests/generic/152 | 101 ++++++++++++++++++++++++++++++++++++++++++++++++++
> tests/generic/152.out | 23 ++++++++++++
> tests/generic/group | 2 +
I'd also prefer one patch per new test - it's easier to review...
> +_openat_setup()
> +{
> + local dir=$1
> +
> + mkdir -p $dir/subdir
> + echo 0123456789 > $dir/topfile
> + echo 0123456789 > $dir/subdir/bottomfile
> +
> + ln -s subdir/bottomfile $dir/symlinkdown
> + ln -s ../topfile $dir/subdir/symlinkup
> + ln -s $dir/topfile $dir/subdir/symlinkout
> + ln -s bottomfile $dir/subdir/symlinkin
> +}
> +
> +#
> +# Check whether the openat wrapper program is available
> +#
> +_requires_openat()
> +{
> + OPENAT_PROG=$here/src/openat
> + _require_command $OPENAT_PROG
> +}
if this is part of xfs_io, then _requires_xfs_io_command "open -b"
could be used to test if the command is supported, and no need for
this function at all.
> +#
> +# This checks whether the O_BENEATH flag is supported by the openat syscall
> +#
> +_requires_o_beneath()
> +{
> + # Kernels that don't support O_BENEATH will silently accept it, so
> + # check for O_BENEATH behavior: attempting to open an absolute
> + # path should fail with EPERM.
> + $OPENAT_PROG -t -b $TEST_DIR
> + if [ $? -ne 0 ]; then
> + _notrun "kernel doesn't support O_BENEATH flag in openat syscall"
> + fi
> +}
as running the command would tell us if the kernel supports it, too.
> +#endif
> +#endif
> +
> +void usage(const char *progname)
> +{
> + fprintf(stderr, "Usage: %s [-f dirname] [-b] [-n] [-t] <file>\n",
> + progname);
> + fprintf(stderr," -f dirname : use this dir for dfd\n");
> + fprintf(stderr," -b : open with O_BENEATH\n");
> + fprintf(stderr," -n : open with O_NOFOLLOW\n");
> + fprintf(stderr," -t : test for expected EPERM failure\n");
> + fprintf(stderr," -h : show this usage message\n");
> + exit(1);
Hmm - you're also testing O_NOFOLLOW behaviour too? Perhaps that
should be mentioned/added to xfs_io, too?
The reason I suggest this, even though it's a little more work, is
tht we can then re-use the new flags in other tests easily without
needing to write new helper functions...
Cheers,
Dave.
--
Dave Chinner
david-FqsqvQoI3Ljby3iVrkZq2A@public.gmane.org
^ permalink raw reply
* Re: [PATCH v2 0/7] CLONE_FD: Task exit notification via file descriptor
From: josh-iaAMLnmF4UmaiuxdJuQwMA @ 2015-03-16 23:25 UTC (permalink / raw)
To: Kees Cook
Cc: Al Viro, Andrew Morton, Andy Lutomirski, Ingo Molnar,
Oleg Nesterov, Paul E. McKenney, H. Peter Anvin, Rik van Riel,
Thomas Gleixner, Michael Kerrisk, Thiago Macieira, LKML,
Linux API, linux-fsdevel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
x86-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org
In-Reply-To: <CAGXu5jLmLLgVNCP=cyUiXuiszVXsS88SNUK2iBqArbA2V6vdHA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
On Mon, Mar 16, 2015 at 02:44:20PM -0700, Kees Cook wrote:
> On Sun, Mar 15, 2015 at 12:59 AM, Josh Triplett <josh-iaAMLnmF4Un9QnwtaJ6UQQ@public.gmane.orgg> wrote:
> > - Make poll on a CLONE_FD for an exited task also return POLLHUP, for
> > compatibility with FreeBSD's pdfork. Thanks to David Drysdale for calling
> > attention to pdfork.
>
> I think POLLHUP should be mentioned in the manpage (now it only
> mentions POLLIN).
Added for v3.
> > CLONE_FD
> > Return a file descriptor associated with the new process, stor‐
> > ing it in location clonefd in the parent's address space. When
> > the new process exits, the file descriptor will become available
> > for reading.
> >
> > Unlike using signalfd(2) for the SIGCHLD signal, the file
> > descriptor returned by clone4() with the CLONE_FD flag works
> > even with SIGCHLD unblocked in one or more threads of the parent
> > process, allowing the process to have different handlers for
> > different child processes, such as those created by a library,
> > without introducing race conditions around process-wide signal
> > handling.
> >
> > clonefd_flags may contain the following additional flags for use
> > with CLONE_FD:
> >
> >
> > O_CLOEXEC
> > Set the close-on-exec flag on the new file descriptor.
> > See the description of the O_CLOEXEC flag in open(2) for
> > reasons why this may be useful.
>
> This begs the question: what happens when all CLONE_FD fds for a
> process are closed? Will the parent get SIGCHLD instead, will it
> auto-reap, or will it be un-wait-able (I assume not this...)
Whether the parent gets SIGCHLD is determined only by what signal you
request in clone; if you clone with CLONE_FD | SIGCHLD (or
CLONE_AUTOREAP | CLONE_FD | SIGCHLD), you'll get notification via both
clonefd (if you have one) and signal (if you have a handler). If you
pass a 0 signal (just CLONE_FD or CLONE_AUTOREAP | CLONE_FD), you'll
receive no signal, only the notification via clonefd. Independently, if
you have CLONE_AUTOREAP set, the process will autoreap.
Those are all orthogonal now.
If you close the clonefd, nothing special happens other than a
put_task_struct. While this is conceptually somewhat like a pipe, the
data is actually generated at read time, so the task exit doesn't care
whether there's a live clonefd or not. (Or, in the future, if there are
multiple live clonefds for the same process.)
> Looks promising!
Thanks!
And thanks for catching the manpage issue. I'd definitely welcome any
comments you have on the implementation as well.
- Josh Triplett
^ permalink raw reply
* Re: [PATCH v2 0/7] CLONE_FD: Task exit notification via file descriptor
From: Kees Cook @ 2015-03-16 23:26 UTC (permalink / raw)
To: Thiago Macieira
Cc: Josh Triplett, Al Viro, Andrew Morton, Andy Lutomirski,
Ingo Molnar, Oleg Nesterov, Paul E. McKenney, H. Peter Anvin,
Rik van Riel, Thomas Gleixner, Michael Kerrisk, LKML, Linux API,
linux-fsdevel@vger.kernel.org, x86@kernel.org
In-Reply-To: <2152593.rzz5GT2AaC@tjmaciei-mobl4>
On Mon, Mar 16, 2015 at 3:50 PM, Thiago Macieira
<thiago.macieira@intel.com> wrote:
> On Monday 16 March 2015 15:36:16 Kees Cook wrote:
>> And just so I understand the races here, what happens in CLONE_FD
>> (without CLONE_AUTOREAP) case where the child dies, but the parent
>> never reads from the CLONE_FD fd, and closes it (or dies)? Will the
>> modes switch that late in the child's lifetime? (i.e. even though the
>> details were written to the fd, they were never read, yet it'll still
>> switch and generate a SIGCHLD, etc?)
>
> What happens to a child that dies during the parent's lifetime but the parent
> exits without reaping the child?
>
> The same should happen, whatever that behaviour is.
Okay, sounds like the waitpid internals are tied to the read, not the
child death. Perfect, that means it'll still be waitpid-able by
whatever process inherits the zombie. Sounds good! Thanks for
clarifying. :)
-Kees
--
Kees Cook
Chrome OS Security
^ permalink raw reply
* Re: [PATCH v2 0/7] CLONE_FD: Task exit notification via file descriptor
From: josh-iaAMLnmF4UmaiuxdJuQwMA @ 2015-03-16 23:29 UTC (permalink / raw)
To: Thiago Macieira
Cc: Kees Cook, Al Viro, Andrew Morton, Andy Lutomirski, Ingo Molnar,
Oleg Nesterov, Paul E. McKenney, H. Peter Anvin, Rik van Riel,
Thomas Gleixner, Michael Kerrisk, LKML, Linux API,
linux-fsdevel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
x86-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org
In-Reply-To: <2381173.VxaIO6vGG3@tjmaciei-mobl4>
On Mon, Mar 16, 2015 at 03:14:14PM -0700, Thiago Macieira wrote:
> On Monday 16 March 2015 14:44:20 Kees Cook wrote:
> > > O_CLOEXEC
> > > Set the close-on-exec flag on the new file
> > >descriptor. See the description of the O_CLOEXEC flag in open(2) for
> > >reasons why this may be useful.
> >
> > This begs the question: what happens when all CLONE_FD fds for a
> > process are closed? Will the parent get SIGCHLD instead, will it
> > auto-reap, or will it be un-wait-able (I assume not this...)
>
> Depends on CLONE_AUTOREAP. If it's on, then no one gets SIGCHLD, no one can
> wait() on it and the process autoreaps itself.
Minor nit: CLONE_AUTOREAP makes the process autoreap and nobody can wait
on it, but if you pass SIGCHLD or some other exit signal to clone then
you'll still get that signal.
> If it's no active, then the old rules apply: parent gets SIGCHILD and can
> wait(). If the parent exited first, then the child gets reparented to init,
> which can do the wait().
Right.
> A child without CLONE_AUTOREAP should be wait()able. If it gets wait()ed
> before the clonefd is read, the clonefd() will return a 0 read. If it gets
> read before wait, then wait() reaps another child or returns -ECHILD. That's
> no different than two threads doing simultaneous wait() on the same child.
Hrm? That isn't the semantics we implemented; you'll *always* get an
exit notification via the clonefd if you have it open, with or without
autoreap and whether or not a wait has occurred yet. And reading from
the clonefd does not serve as a wait; if you don't pass CLONE_AUTOREAP,
you'll still need to wait on the process.
- Josh Triplett
^ permalink raw reply
* Re: [PATCH v2 0/7] CLONE_FD: Task exit notification via file descriptor
From: josh @ 2015-03-16 23:35 UTC (permalink / raw)
To: Kees Cook
Cc: Thiago Macieira, Al Viro, Andrew Morton, Andy Lutomirski,
Ingo Molnar, Oleg Nesterov, Paul E. McKenney, H. Peter Anvin,
Rik van Riel, Thomas Gleixner, Michael Kerrisk, LKML, Linux API,
linux-fsdevel@vger.kernel.org, x86@kernel.org
In-Reply-To: <CAGXu5jJr_DieZ281=H0ZNtNvOagO0pm8PRfNvRWKp4YwS=55GA@mail.gmail.com>
On Mon, Mar 16, 2015 at 03:36:16PM -0700, Kees Cook wrote:
> On Mon, Mar 16, 2015 at 3:14 PM, Thiago Macieira
> <thiago.macieira@intel.com> wrote:
> > On Monday 16 March 2015 14:44:20 Kees Cook wrote:
> >> > O_CLOEXEC
> >> > Set the close-on-exec flag on the new file
> >> >descriptor. See the description of the O_CLOEXEC flag in open(2) for
> >> >reasons why this may be useful.
> >>
> >> This begs the question: what happens when all CLONE_FD fds for a
> >> process are closed? Will the parent get SIGCHLD instead, will it
> >> auto-reap, or will it be un-wait-able (I assume not this...)
> >
> > Depends on CLONE_AUTOREAP. If it's on, then no one gets SIGCHLD, no one can
> > wait() on it and the process autoreaps itself.
> >
> > If it's no active, then the old rules apply: parent gets SIGCHILD and can
> > wait(). If the parent exited first, then the child gets reparented to init,
> > which can do the wait().
> >
> > A child without CLONE_AUTOREAP should be wait()able. If it gets wait()ed
> > before the clonefd is read, the clonefd() will return a 0 read. If it gets
> > read before wait, then wait() reaps another child or returns -ECHILD. That's
> > no different than two threads doing simultaneous wait() on the same child.
>
> Cool. I think detailing this in the manpage would be helpful.
>
> And just so I understand the races here, what happens in CLONE_FD
> (without CLONE_AUTOREAP) case where the child dies, but the parent
> never reads from the CLONE_FD fd, and closes it (or dies)? Will the
> modes switch that late in the child's lifetime? (i.e. even though the
> details were written to the fd, they were never read, yet it'll still
> switch and generate a SIGCHLD, etc?)
This doesn't actually work like a pipe; the details aren't "written" to
the fd. The data is generated at read time, and if you never read,
that's fine. There's no semantic meaning attached to reading from the
clonefd; you still have to wait on the process if you don't pass
CLONE_AUTOREAP. (Or you can block SIGCHLD or use SA_NOCLDWAIT, if you
control the calling process's signal handling; AUTOREAP just lets you
avoid interacting with the calling process's signal handling.)
See my previous response for the rest.
- Josh Triplett
^ permalink raw reply
* Re: [RFC/PATCH 0/5] Add live source objects to DRM
From: Laurent Pinchart @ 2015-03-17 0:07 UTC (permalink / raw)
To: Daniel Vetter
Cc: Laurent Pinchart, linux-sh, linux-api, Magnus Damm, dri-devel,
Daniel Vetter, linux-media
In-Reply-To: <20150316083522.GE21993@phenom.ffwll.local>
Hi Daniel,
On Monday 16 March 2015 09:35:22 Daniel Vetter wrote:
> On Sun, Mar 15, 2015 at 11:55:35PM +0200, Laurent Pinchart wrote:
> > Hello,
> >
> > I have a feeling that RFC/PATCH will work better than just RFC, so here's
> > a patch set that adds a new object named live source to DRM.
> >
> > The need comes from the Renesas R-Car SoCs in which a video processing
> > engine (named VSP1) that operates from memory to memory has one output
> > directly connected to a plane of the display engine (DU) without going
> > through memory.
> >
> > The VSP1 is supported by a V4L2 driver. While it could be argued that it
> > should instead be supported directly by the DRM rcar-du driver, this
> > wouldn't be a good idea for at least two reasons. First, the R-Car SoCs
> > contain several VSP1 instances, of which only a subset have a direct DU
> > connection. The only other instances operate solely in memory to memory
> > mode. Then, the VSP1 is a video processing engine and not a display
> > engine. Its features are easily supported by the V4L2 API, but don't map
> > to the DRM/KMS API. Significant changes to DRM/KMS would be required,
> > beyond what is in my opinion an acceptable scope for a display API.
> >
> > Now that the need to interface two separate devices supported by two
> > different drivers in two separate subsystems has been established, we
> > need an API to do so. It should be noted that while that API doesn't
> > exist in the mainline kernel, the need isn't limited to Renesas SoCs.
> >
> > This patch set proposes one possible solution for the problem in the form
> > of a new DRM object named live source. Live sources are created by
> > drivers to model hardware connections between a plane input and an
> > external source, and are attached to planes through the KMS userspace
> > API.
> >
> > Patch 1/5 adds live source objects to DRM, with an in-kernel API for
> > drivers to register the sources, and a userspace API to enumerate and
> > configure them. Configuring a live source sets the width, height and
> > pixel format of the video stream. This should ideally be queried directly
> > from the driver that supports the live source device, but I've decided
> > not to implement such communication between V4L2 and DRM/KMS at the
> > moment to keep the proposal simple.
> >
> > Patch 2/2 implements connection between live sources and planes. This
> > takes different forms depending on whether drivers use the setplane or the
> > atomic updates API:
> >
> > - For setplane, the fb field can now contain a live source ID in addition
> > to a framebuffer ID. As DRM allocates object IDs from a single ID space,
> > the type can be inferred from the ID. This makes specifying both a
> > framebuffer and a live source impossible, which isn't an issue given
> > that such a configuration would be invalid.
> >
> > The live source is looked up by the DRM core and passed as a pointer to
> > the .update_plane() operation. Unlike framebuffers live sources are not
> > refcounted as they're created statically at driver initialization time.
> >
> > - For atomic update, a new SRC_ID property has been added to planes. The
> > live source is looked up from the source ID and stored into the plane
> > state.
>
> What about directly treating live sources as (very) special framebuffers?
> A bunch of reasons:
> - All the abi fu above gets resolved naturally.
> - You have lot of duplication between fb and live source wrt size,
> possible/selected pixel format and other stuff.
> - The backing storage of framebuffers is fully opaque anyway ...
>
> I think we still need separate live source objects though for things like
> telling userspace what v4l thing it corresponds to, and for getting at the
> pixel format. But connecting the live source with the plane could still be
> done with a framebuffer and a special flag in the addfb2 ioctl to use
> live sources as backing storage ids instead of gem/ttm handles.
>
> That would also give you a good point to enforce pixel format
> compatibility: As soon as someone created a framebuffer for a live source
> you disallow pixel format changes in the v4l pipeline to make sure things
> will fit.
That's an interesting idea, I'll give it a try. This will have to wait a
little bit though, I'll be busy with other tasks this week, and I'll be
attending ELC next week.
Thank you for the quick review and the proposal, it's very appreciated.
> > Patches 3/5 to 5/5 then implement support for live sources in the R-Car DU
> > driver.
> >
> > Nothing here is set in stone. One point I'm not sure about is whether live
> > sources support should be enabled for setplane or only for atomic updates.
> > Other parts of the API can certainly be modified as well, and I'm open for
> > totally different implementations as well.
>
> Imo this should be irrespective of atomic or not really. And by using
> magic framebuffers as the link we'd get that for free.
>
> Cheers, Daniel
>
> > Laurent Pinchart (5):
> > drm: Add live source object
> > drm: Connect live source to plane
> > drm/rcar-du: Add VSP1 support to the planes allocator
> > drm/rcar-du: Add VSP1 live source support
> > drm/rcar-du: Restart the DU group when a plane source changes
> >
> > drivers/gpu/drm/armada/armada_overlay.c | 2 +-
> > drivers/gpu/drm/drm_atomic.c | 7 +
> > drivers/gpu/drm/drm_atomic_helper.c | 4 +
> > drivers/gpu/drm/drm_crtc.c | 365 +++++++++++++++++++++--
> > drivers/gpu/drm/drm_fops.c | 6 +-
> > drivers/gpu/drm/drm_ioctl.c | 3 +
> > drivers/gpu/drm/drm_plane_helper.c | 1 +
> > drivers/gpu/drm/exynos/exynos_drm_crtc.c | 4 +-
> > drivers/gpu/drm/exynos/exynos_drm_plane.c | 3 +-
> > drivers/gpu/drm/exynos/exynos_drm_plane.h | 3 +-
> > drivers/gpu/drm/i915/intel_display.c | 4 +-
> > drivers/gpu/drm/i915/intel_sprite.c | 2 +-
> > drivers/gpu/drm/imx/ipuv3-plane.c | 3 +-
> > drivers/gpu/drm/nouveau/dispnv04/overlay.c | 6 +-
> > drivers/gpu/drm/omapdrm/omap_plane.c | 1 +
> > drivers/gpu/drm/rcar-du/rcar_du_crtc.c | 10 +-
> > drivers/gpu/drm/rcar-du/rcar_du_drv.c | 6 +-
> > drivers/gpu/drm/rcar-du/rcar_du_drv.h | 3 +
> > drivers/gpu/drm/rcar-du/rcar_du_group.c | 21 +-
> > drivers/gpu/drm/rcar-du/rcar_du_group.h | 2 +
> > drivers/gpu/drm/rcar-du/rcar_du_kms.c | 62 ++++-
> > drivers/gpu/drm/rcar-du/rcar_du_plane.c | 196 ++++++++++++---
> > drivers/gpu/drm/rcar-du/rcar_du_plane.h | 11 +
> > drivers/gpu/drm/rcar-du/rcar_du_regs.h | 1 +
> > drivers/gpu/drm/rockchip/rockchip_drm_vop.c | 5 +-
> > drivers/gpu/drm/shmobile/shmob_drm_plane.c | 3 +-
> > drivers/gpu/drm/sti/sti_drm_plane.c | 3 +-
> > drivers/gpu/drm/sti/sti_hqvdp.c | 2 +-
> > include/drm/drmP.h | 3 +
> > include/drm/drm_atomic_helper.h | 1 +
> > include/drm/drm_crtc.h | 48 ++++
> > include/drm/drm_plane_helper.h | 1 +
> > include/uapi/drm/drm.h | 4 +
> > include/uapi/drm/drm_mode.h | 32 +++
> > 34 files changed, 728 insertions(+), 100 deletions(-)
--
Regards,
Laurent Pinchart
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel
^ permalink raw reply
* Re: [PATCH v2 0/7] CLONE_FD: Task exit notification via file descriptor
From: Thiago Macieira @ 2015-03-17 0:49 UTC (permalink / raw)
To: josh-iaAMLnmF4UmaiuxdJuQwMA
Cc: Kees Cook, Al Viro, Andrew Morton, Andy Lutomirski, Ingo Molnar,
Oleg Nesterov, Paul E. McKenney, H. Peter Anvin, Rik van Riel,
Thomas Gleixner, Michael Kerrisk, LKML, Linux API,
linux-fsdevel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
x86-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org
In-Reply-To: <20150316232949.GB31751@cloud>
On Monday 16 March 2015 16:29:49 josh-iaAMLnmF4UmaiuxdJuQwMA@public.gmane.org wrote:
> > A child without CLONE_AUTOREAP should be wait()able. If it gets wait()ed
> > before the clonefd is read, the clonefd() will return a 0 read. If it
> > gets
> > read before wait, then wait() reaps another child or returns -ECHILD.
> > That's no different than two threads doing simultaneous wait() on the
> > same child.
> Hrm? That isn't the semantics we implemented; you'll *always* get an
> exit notification via the clonefd if you have it open, with or without
> autoreap and whether or not a wait has occurred yet. And reading from
> the clonefd does not serve as a wait; if you don't pass CLONE_AUTOREAP,
> you'll still need to wait on the process.
Ah, I see what you're saying. Ok, I stand corrected: a child without
CLONE_AUTOREAP must be wait()ed on and whoever waits on it will get
information. In addition to that, the information is available on the clonefd
and it can happen at any time, before or after the wait().
In the case of an orphaned child, the file descriptor will close, that's all.
No modification is necessary to init.
--
Thiago Macieira - thiago.macieira (AT) intel.com
Software Architect - Intel Open Source Technology Center
^ permalink raw reply
* [PATCH net-next] bpf: allow BPF programs access 'protocol' and 'vlan_tci' fields
From: Alexei Starovoitov @ 2015-03-17 1:06 UTC (permalink / raw)
To: David S. Miller
Cc: Daniel Borkmann, Thomas Graf, linux-api-u79uwXL29TY76Z2rM5mHXA,
netdev-u79uwXL29TY76Z2rM5mHXA
as a follow on to patch 70006af95515 ("bpf: allow eBPF access skb fields")
this patch allows 'protocol' and 'vlan_tci' fields to be accessible
from extended BPF programs.
The usage of 'protocol', 'vlan_present' and 'vlan_tci' fields is the same as
corresponding SKF_AD_PROTOCOL, SKF_AD_VLAN_TAG_PRESENT and SKF_AD_VLAN_TAG
accesses in classic BPF.
Signed-off-by: Alexei Starovoitov <ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>
---
1.
I was thinking to drop ntohs() from 'protocol' field for extended BPF, since
the programs could do:
if (skb->protocol == htons(ETH_P_IP))
which would have saved one or two cpu cycles.
But having similar behavior between classic and extended seems to be better.
2.
'vlan_tci' name is picked to match real sk_buff->vlan_tci field
and matches tpacket's tp_vlan_tci field.
include/uapi/linux/bpf.h | 3 ++
net/core/filter.c | 72 ++++++++++++++++++++++++++++++-------------
samples/bpf/test_verifier.c | 9 ++++++
3 files changed, 62 insertions(+), 22 deletions(-)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 929545a27546..1623047af463 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -178,6 +178,9 @@ struct __sk_buff {
__u32 pkt_type;
__u32 mark;
__u32 queue_mapping;
+ __u32 protocol;
+ __u32 vlan_present;
+ __u32 vlan_tci;
};
#endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/net/core/filter.c b/net/core/filter.c
index 4e9dd0ad0d5b..b95ae7fe7e4f 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -177,6 +177,35 @@ static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg,
*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
offsetof(struct sk_buff, queue_mapping));
break;
+
+ case SKF_AD_PROTOCOL:
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2);
+
+ /* dst_reg = *(u16 *) (src_reg + offsetof(protocol)) */
+ *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
+ offsetof(struct sk_buff, protocol));
+ /* dst_reg = ntohs(dst_reg) [emitting a nop or swap16] */
+ *insn++ = BPF_ENDIAN(BPF_FROM_BE, dst_reg, 16);
+ break;
+
+ case SKF_AD_VLAN_TAG:
+ case SKF_AD_VLAN_TAG_PRESENT:
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2);
+ BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000);
+
+ /* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */
+ *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
+ offsetof(struct sk_buff, vlan_tci));
+ if (skb_field == SKF_AD_VLAN_TAG) {
+ *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg,
+ ~VLAN_TAG_PRESENT);
+ } else {
+ /* dst_reg >>= 12 */
+ *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 12);
+ /* dst_reg &= 1 */
+ *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, 1);
+ }
+ break;
}
return insn - insn_buf;
@@ -190,13 +219,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
switch (fp->k) {
case SKF_AD_OFF + SKF_AD_PROTOCOL:
- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2);
-
- /* A = *(u16 *) (CTX + offsetof(protocol)) */
- *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
- offsetof(struct sk_buff, protocol));
- /* A = ntohs(A) [emitting a nop or swap16] */
- *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16);
+ cnt = convert_skb_access(SKF_AD_PROTOCOL, BPF_REG_A, BPF_REG_CTX, insn);
+ insn += cnt - 1;
break;
case SKF_AD_OFF + SKF_AD_PKTTYPE:
@@ -242,22 +266,15 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
break;
case SKF_AD_OFF + SKF_AD_VLAN_TAG:
- case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT:
- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2);
- BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000);
+ cnt = convert_skb_access(SKF_AD_VLAN_TAG,
+ BPF_REG_A, BPF_REG_CTX, insn);
+ insn += cnt - 1;
+ break;
- /* A = *(u16 *) (CTX + offsetof(vlan_tci)) */
- *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
- offsetof(struct sk_buff, vlan_tci));
- if (fp->k == SKF_AD_OFF + SKF_AD_VLAN_TAG) {
- *insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A,
- ~VLAN_TAG_PRESENT);
- } else {
- /* A >>= 12 */
- *insn++ = BPF_ALU32_IMM(BPF_RSH, BPF_REG_A, 12);
- /* A &= 1 */
- *insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 1);
- }
+ case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT:
+ cnt = convert_skb_access(SKF_AD_VLAN_TAG_PRESENT,
+ BPF_REG_A, BPF_REG_CTX, insn);
+ insn += cnt - 1;
break;
case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
@@ -1215,6 +1232,17 @@ static u32 sk_filter_convert_ctx_access(int dst_reg, int src_reg, int ctx_off,
case offsetof(struct __sk_buff, queue_mapping):
return convert_skb_access(SKF_AD_QUEUE, dst_reg, src_reg, insn);
+
+ case offsetof(struct __sk_buff, protocol):
+ return convert_skb_access(SKF_AD_PROTOCOL, dst_reg, src_reg, insn);
+
+ case offsetof(struct __sk_buff, vlan_present):
+ return convert_skb_access(SKF_AD_VLAN_TAG_PRESENT,
+ dst_reg, src_reg, insn);
+
+ case offsetof(struct __sk_buff, vlan_tci):
+ return convert_skb_access(SKF_AD_VLAN_TAG,
+ dst_reg, src_reg, insn);
}
return insn - insn_buf;
diff --git a/samples/bpf/test_verifier.c b/samples/bpf/test_verifier.c
index df6dbb6576f6..75d561f9fd6a 100644
--- a/samples/bpf/test_verifier.c
+++ b/samples/bpf/test_verifier.c
@@ -658,6 +658,15 @@ static struct bpf_test tests[] = {
BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
offsetof(struct __sk_buff, queue_mapping)),
BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 0, 0),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, protocol)),
+ BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 0, 0),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, vlan_present)),
+ BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 0, 0),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, vlan_tci)),
+ BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 0, 0),
BPF_EXIT_INSN(),
},
.result = ACCEPT,
--
1.7.9.5
^ permalink raw reply related
* Re: [PATCH v2 tip/core/rcu 01/22] smpboot: Add common code for notification from dying CPU
From: Peter Zijlstra @ 2015-03-17 8:18 UTC (permalink / raw)
To: Paul E. McKenney
Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA, mingo-DgEjT+Ai2ygdnm+yROfE0A,
laijs-BthXqXjhjHXQFUHtdCDX3A, dipankar-xthvdsQ13ZrQT0dZR+AlfA,
akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b,
mathieu.desnoyers-vg+e7yoeK/dWk0Htik3J/w,
josh-iaAMLnmF4UmaiuxdJuQwMA, tglx-hfZtesqFncYOwBW4kG4KsQ,
rostedt-nx8X9YLhiw1AfugRpC6u6w, dhowells-H+wXaHxf7aLQT0dZR+AlfA,
edumazet-hpIqsD4AKlfQT0dZR+AlfA, dvhart-VuQAYsv1563Yd54FQh9/CA,
fweisbec-Re5JQEeQqe8AvxtiuMwx3w, oleg-H+wXaHxf7aLQT0dZR+AlfA,
bobby.prani-Re5JQEeQqe8AvxtiuMwx3w,
linux-api-u79uwXL29TY76Z2rM5mHXA,
linux-arch-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1426531086-23825-1-git-send-email-paulmck-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
On Mon, Mar 16, 2015 at 11:37:45AM -0700, Paul E. McKenney wrote:
> From: "Paul E. McKenney" <paulmck-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
>
> RCU ignores offlined CPUs, so they cannot safely run RCU read-side code.
> (They -can- use SRCU, but not RCU.) This means that any use of RCU
> during or after the call to arch_cpu_idle_dead(). Unfortunately,
> commit 2ed53c0d6cc99 added a complete() call, which will contain RCU
> read-side critical sections if there is a task waiting to be awakened.
Got a little more detail there?
^ permalink raw reply
* Re: [PATCH 0/2] Move away from non-failing small allocations
From: Michal Hocko @ 2015-03-17 9:07 UTC (permalink / raw)
To: Andrew Morton
Cc: Johannes Weiner, Dave Chinner, Mel Gorman, Rik van Riel,
Wu Fengguang, linux-mm, LKML, Linux API
In-Reply-To: <20150316153843.af945a9e452404c22c4db999@linux-foundation.org>
On Mon 16-03-15 15:38:43, Andrew Morton wrote:
> On Wed, 11 Mar 2015 16:54:52 -0400 Michal Hocko <mhocko@suse.cz> wrote:
>
> > as per discussion at LSF/MM summit few days back it seems there is a
> > general agreement on moving away from "small allocations do not fail"
> > concept.
>
> Such a change affects basically every part of the kernel and every
> kernel developer. I expect most developers will say "it works well
> enough and I'm not getting any bug reports so why should I spend time
> on this?". It would help if we were to explain the justification very
> clearly. https://lwn.net/Articles/636017/ is Jon's writeup of the
> conference discussion.
OK, I thought that the description in the patch 1/2 was clear about the
motivation. I can try harder of course. Which part do you miss there? Or
was it the cover that wasn't specific enough?
> Realistically, I don't think this overall effort will be successful -
> we'll add the knob, it won't get enough testing and any attempt to
> alter the default will be us deliberately destabilizing the kernel
> without knowing how badly :(
Without the knob we do not allow users to test this at all though and
the transition will _never_ happen. Which is IMHO bad.
> I wonder if we can alter the behaviour only for filesystem code, so we
> constrain the new behaviour just to that code where we're having
> problems. Most/all fs code goes via vfs methods so there's a reasonably
> small set of places where we can call
We are seeing issues with the fs code now because the test cases which
led to the current discussion exercise FS code. The code which does
lock(); kmalloc(GFP_KERNEL) is not reduced there though. I am pretty sure
we can find other subsystems if we try hard enough.
> static inline void enter_fs_code(struct super_block *sb)
> {
> if (sb->my_small_allocations_can_fail)
> current->small_allocations_can_fail++;
> }
>
> that way (or something similar) we can select the behaviour on a per-fs
> basis and the rest of the kernel remains unaffected. Other subsystems
> can opt in as well.
This is basically leading to GFP_MAYFAIL which is completely backwards
(the hard requirement should be an exception not a default rule).
I really do not want to end up with stuffing random may_fail annotations
all over the kernel.
--
Michal Hocko
SUSE Labs
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply
* Re: [PATCH net-next] bpf: allow BPF programs access 'protocol' and 'vlan_tci' fields
From: Daniel Borkmann @ 2015-03-17 9:22 UTC (permalink / raw)
To: Alexei Starovoitov, David S. Miller
Cc: Thomas Graf, linux-api-u79uwXL29TY76Z2rM5mHXA,
netdev-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1426554362-29991-1-git-send-email-ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>
On 03/17/2015 02:06 AM, Alexei Starovoitov wrote:
> as a follow on to patch 70006af95515 ("bpf: allow eBPF access skb fields")
> this patch allows 'protocol' and 'vlan_tci' fields to be accessible
> from extended BPF programs.
>
> The usage of 'protocol', 'vlan_present' and 'vlan_tci' fields is the same as
> corresponding SKF_AD_PROTOCOL, SKF_AD_VLAN_TAG_PRESENT and SKF_AD_VLAN_TAG
> accesses in classic BPF.
>
> Signed-off-by: Alexei Starovoitov <ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>
Ok, code looks good to me.
> 1.
> I was thinking to drop ntohs() from 'protocol' field for extended BPF, since
> the programs could do:
> if (skb->protocol == htons(ETH_P_IP))
> which would have saved one or two cpu cycles.
> But having similar behavior between classic and extended seems to be better.
I'm thinking that skb->protocol == htons(ETH_P_IP) might actually
be more obvious, and, as you mentioned, the compiler can already
resolve the htons() during compile time instead of runtime, which
would be another plus.
Either behavior we should document later anyway.
The question to me here is, do we need to keep similar behavior?
After all, the way of programming both from a user perspective is
quite different (i.e. bpf_asm versus C/LLVM).
Similarly, I was wondering, if just exporting raw skb->vlan_tci is
already sufficient, and the user can e.g. write helpers to extract
bits himself from that protocol field?
> 2.
> 'vlan_tci' name is picked to match real sk_buff->vlan_tci field
> and matches tpacket's tp_vlan_tci field.
^ permalink raw reply
* Re: [v9 1/5] vfs: adds general codes to enforces project quota limits
From: Jan Kara @ 2015-03-17 9:37 UTC (permalink / raw)
To: Dave Chinner
Cc: Jan Kara, Li Xi, linux-fsdevel, linux-ext4, linux-api, tytso,
adilger, viro, hch, dmonakhov, dchinner
In-Reply-To: <20150316214930.GE28557@dastard>
On Tue 17-03-15 08:49:30, Dave Chinner wrote:
> On Mon, Mar 16, 2015 at 03:29:44PM +0100, Jan Kara wrote:
> > On Wed 11-03-15 12:03:19, Li Xi wrote:
> > > This patch adds support for a new quota type PRJQUOTA for project quota
> > > enforcement. Also a new method get_projid() is added into dquot_operations
> > > structure.
> > >
> > > Signed-off-by: Li Xi <lixi@ddn.com>
> > > Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
> > > Reviewed-by: Jan Kara <jack@suse.cz>
> > ...
> > > diff --git a/fs/quota/quota.c b/fs/quota/quota.c
> > > index 2aa4151..c76b350 100644
> > > --- a/fs/quota/quota.c
> > > +++ b/fs/quota/quota.c
> > > @@ -30,11 +30,15 @@ static int check_quotactl_permission(struct super_block *sb, int type, int cmd,
> > > case Q_XGETQSTATV:
> > > case Q_XQUOTASYNC:
> > > break;
> > > - /* allow to query information for dquots we "own" */
> > > + /*
> > > + * allow to query information for dquots we "own"
> > > + * always allow querying project quota
> > > + */
> > > case Q_GETQUOTA:
> > > case Q_XGETQUOTA:
> > > if ((type == USRQUOTA && uid_eq(current_euid(), make_kuid(current_user_ns(), id))) ||
> > > - (type == GRPQUOTA && in_egroup_p(make_kgid(current_user_ns(), id))))
> > > + (type == GRPQUOTA && in_egroup_p(make_kgid(current_user_ns(), id))) ||
> > > + (type == PRJQUOTA))
> > > break;
> > I wanted to merge this patch but this hunk caught my eye. Why do we
> > suddently allow querying of project quotas? Traditionally that has been
> > allowed only with CAP_SYS_ADMIN. I agree it looks too restrictive to me but
> > unless that's a bug, I think we have to adhere to original behavior and
> > drop this hunk. Dave, was that behavior of project quotas intended?
>
> This is for quota reports, right?
Yes.
> Project quotas are managed by the administrator as individual users
> may not even have access to all the files under a project and hence
> often cannot do anything about running out of quota space. i.e. users
> don't own project quotas like they "own" user and group quotas.
> user/group quotas imply the user has permission to access/modify the
> files within the quota, whereas that is not true of project quotas.
>
> e.g. Think about a project that compartmentalises information along
> user acess bounds: even if a user can't access parts of the project
> quota space, allowing them to query the accounting of space used by
> the project is leaking information about how much data there is in
> the project they can't access....
OK, thanks for the explanation. So Q_GETQUOTA and Q_XGETQUOTA for
PRJQUOTA have to stay for CAP_SYS_ADMIN capable processes only (at least
for now).
Honza
--
Jan Kara <jack@suse.cz>
SUSE Labs, CR
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox