[PATCH v2 net-next 05/12] bpf: convert stackmap to pre-allocation

netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

From: Alexei Starovoitov <ast@fb.com>
To: "David S . Miller" <davem@davemloft.net>
Cc: Daniel Borkmann <daniel@iogearbox.net>,
	Daniel Wagner <daniel.wagner@bmw-carit.de>,
	Tom Zanussi <tom.zanussi@linux.intel.com>,
	Wang Nan <wangnan0@huawei.com>, He Kuang <hekuang@huawei.com>,
	Martin KaFai Lau <kafai@fb.com>,
	Brendan Gregg <brendan.d.gregg@gmail.com>,
	<netdev@vger.kernel.org>, <linux-kernel@vger.kernel.org>,
	<kernel-team@fb.com>
Subject: [PATCH v2 net-next 05/12] bpf: convert stackmap to pre-allocation
Date: Mon, 7 Mar 2016 21:57:17 -0800	[thread overview]
Message-ID: <1457416641-306326-6-git-send-email-ast@fb.com> (raw)
In-Reply-To: <1457416641-306326-1-git-send-email-ast@fb.com>

It was observed that calling bpf_get_stackid() from a kprobe inside
slub or from spin_unlock causes similar deadlock as with hashmap,
therefore convert stackmap to use pre-allocated memory.

The call_rcu is no longer feasible mechanism, since delayed freeing
causes bpf_get_stackid() to fail unpredictably when number of actual
stacks is significantly less than user requested max_entries.
Since elements are no longer freed into slub, we can push elements into
freelist immediately and let them be recycled.
However the very unlikley race between user space map_lookup() and
program-side recycling is possible:
     cpu0                          cpu1
     ----                          ----
user does lookup(stackidX)
starts copying ips into buffer
                                   delete(stackidX)
                                   calls bpf_get_stackid()
				   which recyles the element and
                                   overwrites with new stack trace

To avoid user space seeing a partial stack trace consisting of two
merged stack traces, do bucket = xchg(, NULL); copy; xchg(,bucket);
to preserve consistent stack trace delivery to user space.
Now we can move memset(,0) of left-over element value from critical
path of bpf_get_stackid() into slow-path of user space lookup.
Also disallow lookup() from bpf program, since it's useless and
program shouldn't be messing with collected stack trace.

Note that similar race between user space lookup and kernel side updates
is also present in hashmap, but it's not a new race. bpf programs were
always allowed to modify hash and array map elements while user space
is copying them.

Fixes: d5a3b1f69186 ("bpf: introduce BPF_MAP_TYPE_STACK_TRACE")
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h   |  1 +
 kernel/bpf/stackmap.c | 86 ++++++++++++++++++++++++++++++++++++++++-----------
 kernel/bpf/syscall.c  |  2 ++
 3 files changed, 71 insertions(+), 18 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index efd1d4ca95c6..21ee41b92e8a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -195,6 +195,7 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
 			   u64 flags);
 int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
 			    u64 flags);
+int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value);
 
 /* memcpy that is used with 8-byte aligned pointers, power-of-8 size and
  * forced to use 'long' read/writes to try to atomically copy long counters.
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index f0a02c344358..499d9e933f8e 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -10,9 +10,10 @@
 #include <linux/vmalloc.h>
 #include <linux/stacktrace.h>
 #include <linux/perf_event.h>
+#include "percpu_freelist.h"
 
 struct stack_map_bucket {
-	struct rcu_head rcu;
+	struct pcpu_freelist_node fnode;
 	u32 hash;
 	u32 nr;
 	u64 ip[];
@@ -20,10 +21,34 @@ struct stack_map_bucket {
 
 struct bpf_stack_map {
 	struct bpf_map map;
+	void *elems;
+	struct pcpu_freelist freelist;
 	u32 n_buckets;
-	struct stack_map_bucket __rcu *buckets[];
+	struct stack_map_bucket *buckets[];
 };
 
+static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
+{
+	u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
+	int err;
+
+	smap->elems = vzalloc(elem_size * smap->map.max_entries);
+	if (!smap->elems)
+		return -ENOMEM;
+
+	err = pcpu_freelist_init(&smap->freelist);
+	if (err)
+		goto free_elems;
+
+	pcpu_freelist_populate(&smap->freelist, smap->elems, elem_size,
+			       smap->map.max_entries);
+	return 0;
+
+free_elems:
+	vfree(smap->elems);
+	return err;
+}
+
 /* Called from syscall */
 static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 {
@@ -70,12 +95,22 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 	smap->n_buckets = n_buckets;
 	smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
 
+	err = bpf_map_precharge_memlock(smap->map.pages);
+	if (err)
+		goto free_smap;
+
 	err = get_callchain_buffers();
 	if (err)
 		goto free_smap;
 
+	err = prealloc_elems_and_freelist(smap);
+	if (err)
+		goto put_buffers;
+
 	return &smap->map;
 
+put_buffers:
+	put_callchain_buffers();
 free_smap:
 	kvfree(smap);
 	return ERR_PTR(err);
@@ -121,7 +156,7 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
 	ips = trace->ip + skip + init_nr;
 	hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0);
 	id = hash & (smap->n_buckets - 1);
-	bucket = rcu_dereference(smap->buckets[id]);
+	bucket = READ_ONCE(smap->buckets[id]);
 
 	if (bucket && bucket->hash == hash) {
 		if (flags & BPF_F_FAST_STACK_CMP)
@@ -135,19 +170,18 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
 	if (bucket && !(flags & BPF_F_REUSE_STACKID))
 		return -EEXIST;
 
-	new_bucket = kmalloc(sizeof(struct stack_map_bucket) + map->value_size,
-			     GFP_ATOMIC | __GFP_NOWARN);
+	new_bucket = (struct stack_map_bucket *)
+		pcpu_freelist_pop(&smap->freelist);
 	if (unlikely(!new_bucket))
 		return -ENOMEM;
 
 	memcpy(new_bucket->ip, ips, trace_len);
-	memset(new_bucket->ip + trace_len / 8, 0, map->value_size - trace_len);
 	new_bucket->hash = hash;
 	new_bucket->nr = trace_nr;
 
 	old_bucket = xchg(&smap->buckets[id], new_bucket);
 	if (old_bucket)
-		kfree_rcu(old_bucket, rcu);
+		pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
 	return id;
 }
 
@@ -160,17 +194,34 @@ const struct bpf_func_proto bpf_get_stackid_proto = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
-/* Called from syscall or from eBPF program */
+/* Called from eBPF program */
 static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
 {
+	return NULL;
+}
+
+/* Called from syscall */
+int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
+{
 	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
-	struct stack_map_bucket *bucket;
-	u32 id = *(u32 *)key;
+	struct stack_map_bucket *bucket, *old_bucket;
+	u32 id = *(u32 *)key, trace_len;
 
 	if (unlikely(id >= smap->n_buckets))
-		return NULL;
-	bucket = rcu_dereference(smap->buckets[id]);
-	return bucket ? bucket->ip : NULL;
+		return -ENOENT;
+
+	bucket = xchg(&smap->buckets[id], NULL);
+	if (!bucket)
+		return -ENOENT;
+
+	trace_len = bucket->nr * sizeof(u64);
+	memcpy(value, bucket->ip, trace_len);
+	memset(value + trace_len, 0, map->value_size - trace_len);
+
+	old_bucket = xchg(&smap->buckets[id], bucket);
+	if (old_bucket)
+		pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
+	return 0;
 }
 
 static int stack_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
@@ -196,7 +247,7 @@ static int stack_map_delete_elem(struct bpf_map *map, void *key)
 
 	old_bucket = xchg(&smap->buckets[id], NULL);
 	if (old_bucket) {
-		kfree_rcu(old_bucket, rcu);
+		pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
 		return 0;
 	} else {
 		return -ENOENT;
@@ -207,13 +258,12 @@ static int stack_map_delete_elem(struct bpf_map *map, void *key)
 static void stack_map_free(struct bpf_map *map)
 {
 	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
-	int i;
 
+	/* wait for bpf programs to complete before freeing stack map */
 	synchronize_rcu();
 
-	for (i = 0; i < smap->n_buckets; i++)
-		if (smap->buckets[i])
-			kfree_rcu(smap->buckets[i], rcu);
+	vfree(smap->elems);
+	pcpu_freelist_destroy(&smap->freelist);
 	kvfree(smap);
 	put_callchain_buffers();
 }
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index cbd94b2144ff..2978d0d08869 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -290,6 +290,8 @@ static int map_lookup_elem(union bpf_attr *attr)
 		err = bpf_percpu_hash_copy(map, key, value);
 	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
 		err = bpf_percpu_array_copy(map, key, value);
+	} else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
+		err = bpf_stackmap_copy(map, key, value);
 	} else {
 		rcu_read_lock();
 		ptr = map->ops->map_lookup_elem(map, key);
-- 
2.8.0.rc1

next prev parent reply	other threads:[~2016-03-08  5:57 UTC|newest]

Thread overview: 14+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-03-08  5:57 [PATCH v2 net-next 0/12] bpf: map pre-alloc Alexei Starovoitov
2016-03-08  5:57 ` [PATCH v2 net-next 01/12] bpf: prevent kprobe+bpf deadlocks Alexei Starovoitov
2016-03-08  5:57 ` [PATCH v2 net-next 02/12] bpf: introduce percpu_freelist Alexei Starovoitov
2016-03-08  5:57 ` [PATCH v2 net-next 03/12] bpf: pre-allocate hash map elements Alexei Starovoitov
2016-03-08  5:57 ` [PATCH v2 net-next 04/12] bpf: check for reserved flag bits in array and stack maps Alexei Starovoitov
2016-03-08  5:57 ` Alexei Starovoitov [this message]
2016-03-08  5:57 ` [PATCH v2 net-next 06/12] samples/bpf: make map creation more verbose Alexei Starovoitov
2016-03-08  5:57 ` [PATCH v2 net-next 07/12] samples/bpf: move ksym_search() into library Alexei Starovoitov
2016-03-08  5:57 ` [PATCH v2 net-next 08/12] samples/bpf: add map_flags to bpf loader Alexei Starovoitov
2016-03-08  5:57 ` [PATCH v2 net-next 09/12] samples/bpf: test both pre-alloc and normal maps Alexei Starovoitov
2016-03-08  9:13 ` [PATCH v2 net-next 0/12] bpf: map pre-alloc Daniel Wagner
2016-03-08 16:38   ` Alexei Starovoitov
2016-03-08 20:31 ` David Miller
2016-03-08 23:05   ` Alexei Starovoitov

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:efd1d4ca95c dfblob:21ee41b92e8 dfblob:f0a02c34435
dfblob:499d9e933f8 dfblob:cbd94b2144f dfblob:2978d0d0886 )
 OR (
bs:"[PATCH v2 net-next 05/12] bpf: convert stackmap to pre-allocation" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1457416641-306326-6-git-send-email-ast@fb.com \
    --to=ast@fb.com \
    --cc=brendan.d.gregg@gmail.com \
    --cc=daniel.wagner@bmw-carit.de \
    --cc=daniel@iogearbox.net \
    --cc=davem@davemloft.net \
    --cc=hekuang@huawei.com \
    --cc=kafai@fb.com \
    --cc=kernel-team@fb.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=netdev@vger.kernel.org \
    --cc=tom.zanussi@linux.intel.com \
    --cc=wangnan0@huawei.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).