* [PATCH bpf v2 1/5] bpf: Check rcu_read_lock_trace_held() before calling bpf map helpers
2023-11-13 12:33 [PATCH bpf v2 0/5] bpf: Fix the release of inner map Hou Tao
@ 2023-11-13 12:33 ` Hou Tao
2023-11-13 12:33 ` [PATCH bpf v2 2/5] bpf: Add map and need_defer parameters to .map_fd_put_ptr() Hou Tao
` (3 subsequent siblings)
4 siblings, 0 replies; 14+ messages in thread
From: Hou Tao @ 2023-11-13 12:33 UTC (permalink / raw)
To: bpf
Cc: Martin KaFai Lau, Alexei Starovoitov, Andrii Nakryiko, Song Liu,
Hao Luo, Yonghong Song, Daniel Borkmann, KP Singh,
Stanislav Fomichev, Jiri Olsa, John Fastabend, houtao1
From: Hou Tao <houtao1@huawei.com>
The three bpf_map_{lookup,update,delete}_elem() helpers are also
available to sleepable bpf programs, so add the corresponding lock
assertion for sleepable bpf programs as well; otherwise the following
warning is reported when a sleepable bpf program manipulates a bpf map
under interpreter mode (aka bpf_jit_enable=0):
WARNING: CPU: 3 PID: 4985 at kernel/bpf/helpers.c:40 ......
CPU: 3 PID: 4985 Comm: test_progs Not tainted 6.6.0+ #2
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996) ......
RIP: 0010:bpf_map_lookup_elem+0x54/0x60
......
Call Trace:
<TASK>
? __warn+0xa5/0x240
? bpf_map_lookup_elem+0x54/0x60
? report_bug+0x1ba/0x1f0
? handle_bug+0x40/0x80
? exc_invalid_op+0x18/0x50
? asm_exc_invalid_op+0x1b/0x20
? __pfx_bpf_map_lookup_elem+0x10/0x10
? rcu_lockdep_current_cpu_online+0x65/0xb0
? rcu_is_watching+0x23/0x50
? bpf_map_lookup_elem+0x54/0x60
? __pfx_bpf_map_lookup_elem+0x10/0x10
___bpf_prog_run+0x513/0x3b70
__bpf_prog_run32+0x9d/0xd0
? __bpf_prog_enter_sleepable_recur+0xad/0x120
? __bpf_prog_enter_sleepable_recur+0x3e/0x120
bpf_trampoline_6442580665+0x4d/0x1000
__x64_sys_getpgid+0x5/0x30
? do_syscall_64+0x36/0xb0
entry_SYSCALL_64_after_hwframe+0x6e/0x76
</TASK>
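For reference, a minimal sleepable program that reaches the helper
under tasks trace RCU looks roughly like the sketch below (the attach
point matches the stack trace above; the map and program names are
illustrative, not taken from the actual reproducer):

  /* Hypothetical reproducer sketch: a sleepable fentry program doing a
   * map lookup. With bpf_jit_enable=0 it runs in the interpreter and
   * trips the WARN_ON_ONCE() above, because only rcu_read_lock_trace()
   * is held.
   */
  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>
  #include <bpf/bpf_tracing.h>

  struct {
          __uint(type, BPF_MAP_TYPE_ARRAY);
          __uint(max_entries, 1);
          __type(key, __u32);
          __type(value, __u64);
  } array SEC(".maps");

  SEC("fentry.s/__x64_sys_getpgid")
  int BPF_PROG(on_getpgid)
  {
          __u32 key = 0;

          bpf_map_lookup_elem(&array, &key);
          return 0;
  }

  char _license[] SEC("license") = "GPL";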
Signed-off-by: Hou Tao <houtao1@huawei.com>
---
kernel/bpf/helpers.c | 13 ++++++++-----
1 file changed, 8 insertions(+), 5 deletions(-)
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 56b0c1f678ee7..f43038931935e 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -32,12 +32,13 @@
*
* Different map implementations will rely on rcu in map methods
* lookup/update/delete, therefore eBPF programs must run under rcu lock
- * if program is allowed to access maps, so check rcu_read_lock_held in
- * all three functions.
+ * if program is allowed to access maps, so check rcu_read_lock_held() or
+ * rcu_read_lock_trace_held() in all three functions.
*/
BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key)
{
- WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
+ !rcu_read_lock_bh_held());
return (unsigned long) map->ops->map_lookup_elem(map, key);
}
@@ -53,7 +54,8 @@ const struct bpf_func_proto bpf_map_lookup_elem_proto = {
BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key,
void *, value, u64, flags)
{
- WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
+ !rcu_read_lock_bh_held());
return map->ops->map_update_elem(map, key, value, flags);
}
@@ -70,7 +72,8 @@ const struct bpf_func_proto bpf_map_update_elem_proto = {
BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key)
{
- WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
+ !rcu_read_lock_bh_held());
return map->ops->map_delete_elem(map, key);
}
--
2.29.2
* [PATCH bpf v2 2/5] bpf: Add map and need_defer parameters to .map_fd_put_ptr()
2023-11-13 12:33 [PATCH bpf v2 0/5] bpf: Fix the release of inner map Hou Tao
2023-11-13 12:33 ` [PATCH bpf v2 1/5] bpf: Check rcu_read_lock_trace_held() before calling bpf map helpers Hou Tao
@ 2023-11-13 12:33 ` Hou Tao
2023-11-13 12:33 ` [PATCH bpf v2 3/5] bpf: Defer the free of inner map when necessary Hou Tao
` (2 subsequent siblings)
4 siblings, 0 replies; 14+ messages in thread
From: Hou Tao @ 2023-11-13 12:33 UTC (permalink / raw)
To: bpf
Cc: Martin KaFai Lau, Alexei Starovoitov, Andrii Nakryiko, Song Liu,
Hao Luo, Yonghong Song, Daniel Borkmann, KP Singh,
Stanislav Fomichev, Jiri Olsa, John Fastabend, houtao1
From: Hou Tao <houtao1@huawei.com>
map is the pointer to the outer map, and need_defer needs some
explanation: it tells the implementation to defer the reference
release of the passed element and to ensure that the element is still
alive until the bpf program which may manipulate it has exited.
The following three cases invoke map_fd_put_ptr(), each passing a
different need_defer value:
1) releasing the reference of the old element in the map during map
   update or map deletion. The release must be deferred, otherwise the
   bpf program may incur a use-after-free problem, so need_defer needs
   to be true.
2) releasing the reference of the to-be-added element in the error path
   of map update. The to-be-added element is not visible to any bpf
   program, so it is OK to pass false for the need_defer parameter.
3) releasing the references of all elements in the map during map
   release. Any bpf program which has access to the map must have
   exited and been released, so need_defer=false will be OK.
These two parameters will be used by the following patches to fix the
potential use-after-free problem for map-in-map.
Signed-off-by: Hou Tao <houtao1@huawei.com>
---
include/linux/bpf.h | 6 +++++-
kernel/bpf/arraymap.c | 12 +++++++-----
kernel/bpf/hashtab.c | 6 +++---
kernel/bpf/map_in_map.c | 2 +-
kernel/bpf/map_in_map.h | 2 +-
5 files changed, 17 insertions(+), 11 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index b4825d3cdb292..7aaf625191fca 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -106,7 +106,11 @@ struct bpf_map_ops {
/* funcs called by prog_array and perf_event_array map */
void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file,
int fd);
- void (*map_fd_put_ptr)(void *ptr);
+ /* If need_defer is true, the implementation should guarantee that
+ * the to-be-put element is still alive until the bpf program, which
+ * may manipulate it, exits.
+ */
+ void (*map_fd_put_ptr)(struct bpf_map *map, void *ptr, bool need_defer);
int (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf);
u32 (*map_fd_sys_lookup_elem)(void *ptr);
void (*map_seq_show_elem)(struct bpf_map *map, void *key,
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 2058e89b5ddd0..bd90c3c090323 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -867,7 +867,7 @@ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
}
if (old_ptr)
- map->ops->map_fd_put_ptr(old_ptr);
+ map->ops->map_fd_put_ptr(map, old_ptr, true);
return 0;
}
@@ -890,7 +890,7 @@ static long fd_array_map_delete_elem(struct bpf_map *map, void *key)
}
if (old_ptr) {
- map->ops->map_fd_put_ptr(old_ptr);
+ map->ops->map_fd_put_ptr(map, old_ptr, true);
return 0;
} else {
return -ENOENT;
@@ -913,8 +913,9 @@ static void *prog_fd_array_get_ptr(struct bpf_map *map,
return prog;
}
-static void prog_fd_array_put_ptr(void *ptr)
+static void prog_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool deferred)
{
+ /* bpf_prog is freed after one RCU or tasks trace grace period */
bpf_prog_put(ptr);
}
@@ -1239,8 +1240,9 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map,
return ee;
}
-static void perf_event_fd_array_put_ptr(void *ptr)
+static void perf_event_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool deferred)
{
+ /* bpf_perf_event is freed after one RCU grace period */
bpf_event_entry_free_rcu(ptr);
}
@@ -1294,7 +1296,7 @@ static void *cgroup_fd_array_get_ptr(struct bpf_map *map,
return cgroup_get_from_fd(fd);
}
-static void cgroup_fd_array_put_ptr(void *ptr)
+static void cgroup_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool deferred)
{
/* cgroup_put free cgrp after a rcu grace period */
cgroup_put(ptr);
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index fd8d4b0addfca..5b9146fa825fd 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -897,7 +897,7 @@ static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
if (map->ops->map_fd_put_ptr) {
ptr = fd_htab_map_get_ptr(map, l);
- map->ops->map_fd_put_ptr(ptr);
+ map->ops->map_fd_put_ptr(map, ptr, true);
}
}
@@ -2484,7 +2484,7 @@ static void fd_htab_map_free(struct bpf_map *map)
hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
void *ptr = fd_htab_map_get_ptr(map, l);
- map->ops->map_fd_put_ptr(ptr);
+ map->ops->map_fd_put_ptr(map, ptr, false);
}
}
@@ -2525,7 +2525,7 @@ int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
ret = htab_map_update_elem(map, key, &ptr, map_flags);
if (ret)
- map->ops->map_fd_put_ptr(ptr);
+ map->ops->map_fd_put_ptr(map, ptr, false);
return ret;
}
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index cd5eafaba97e2..7d5754d18fd04 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -127,7 +127,7 @@ void *bpf_map_fd_get_ptr(struct bpf_map *map,
return inner_map;
}
-void bpf_map_fd_put_ptr(void *ptr)
+void bpf_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool deferred)
{
/* ptr->ops->map_free() has to go through one
* rcu grace period by itself.
diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h
index bcb7534afb3c0..7d61602354de8 100644
--- a/kernel/bpf/map_in_map.h
+++ b/kernel/bpf/map_in_map.h
@@ -13,7 +13,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd);
void bpf_map_meta_free(struct bpf_map *map_meta);
void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file,
int ufd);
-void bpf_map_fd_put_ptr(void *ptr);
+void bpf_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool need_defer);
u32 bpf_map_fd_sys_lookup_elem(void *ptr);
#endif
--
2.29.2
* [PATCH bpf v2 3/5] bpf: Defer the free of inner map when necessary
2023-11-13 12:33 [PATCH bpf v2 0/5] bpf: Fix the release of inner map Hou Tao
2023-11-13 12:33 ` [PATCH bpf v2 1/5] bpf: Check rcu_read_lock_trace_held() before calling bpf map helpers Hou Tao
2023-11-13 12:33 ` [PATCH bpf v2 2/5] bpf: Add map and need_defer parameters to .map_fd_put_ptr() Hou Tao
@ 2023-11-13 12:33 ` Hou Tao
2023-11-13 12:33 ` [PATCH bpf v2 4/5] bpf: Optimize the free of inner map Hou Tao
2023-11-13 12:33 ` [PATCH bpf v2 5/5] selftests/bpf: Add test cases for " Hou Tao
4 siblings, 0 replies; 14+ messages in thread
From: Hou Tao @ 2023-11-13 12:33 UTC (permalink / raw)
To: bpf
Cc: Martin KaFai Lau, Alexei Starovoitov, Andrii Nakryiko, Song Liu,
Hao Luo, Yonghong Song, Daniel Borkmann, KP Singh,
Stanislav Fomichev, Jiri Olsa, John Fastabend, houtao1
From: Hou Tao <houtao1@huawei.com>
When updating or deleting an inner map in a map array or a map htab,
the inner map may still be accessed by non-sleepable or sleepable bpf
programs. However, bpf_map_fd_put_ptr() decreases the ref-count of the
inner map directly through bpf_map_put(). If the dropped ref-count is
the last one (which is true for most cases), the inner map will be
freed by ops->map_free() in a kworker. But for now most .map_free()
callbacks don't use synchronize_rcu() or its variants to wait for an
RCU grace period to elapse, so after the invocation of ops->map_free
completes, a bpf program which is still accessing the inner map may
incur a use-after-free problem.
Fix it by telling bpf_map_free_deferred() to wait for both one RCU grace
period and one tasks trace RCU grace period before calling
ops->map_free() for inner map.
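For context, synchronize_rcu_mult() from <linux/rcupdate_wait.h> waits
for the listed grace periods concurrently rather than back to back;
conceptually it is equivalent to the serial sketch below, just faster:

  /* conceptual equivalent of
   * synchronize_rcu_mult(call_rcu, call_rcu_tasks_trace):
   */
  synchronize_rcu();              /* wait for one normal RCU GP */
  synchronize_rcu_tasks_trace();  /* wait for one tasks trace RCU GP */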
Fixes: bba1dc0b55ac ("bpf: Remove redundant synchronize_rcu.")
Fixes: 638e4b825d52 ("bpf: Allows per-cpu maps and map-in-map in sleepable programs")
Signed-off-by: Hou Tao <houtao1@huawei.com>
---
include/linux/bpf.h | 1 +
kernel/bpf/map_in_map.c | 11 ++++++++---
kernel/bpf/syscall.c | 5 +++++
3 files changed, 14 insertions(+), 3 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 7aaf625191fca..ec3c90202ffe6 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -292,6 +292,7 @@ struct bpf_map {
} owner;
bool bypass_spec_v1;
bool frozen; /* write-once; write-protected by freeze_mutex */
+ bool free_after_mult_rcu_gp;
s64 __percpu *elem_count;
};
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 7d5754d18fd04..cf33630655661 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -129,10 +129,15 @@ void *bpf_map_fd_get_ptr(struct bpf_map *map,
void bpf_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool deferred)
{
- /* ptr->ops->map_free() has to go through one
- * rcu grace period by itself.
+ struct bpf_map *inner_map = ptr;
+
+ /* The inner map may still be used by both non-sleepable and sleepable
+ * bpf program, so free it after one RCU grace period and one tasks
+ * trace RCU grace period.
*/
- bpf_map_put(ptr);
+ if (deferred)
+ WRITE_ONCE(inner_map->free_after_mult_rcu_gp, true);
+ bpf_map_put(inner_map);
}
u32 bpf_map_fd_sys_lookup_elem(void *ptr)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index a144eb286974b..e2d2701ce2c45 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -35,6 +35,7 @@
#include <linux/rcupdate_trace.h>
#include <linux/memcontrol.h>
#include <linux/trace_events.h>
+#include <linux/rcupdate_wait.h>
#include <net/netfilter/nf_bpf_link.h>
#include <net/netkit.h>
@@ -696,6 +697,10 @@ static void bpf_map_free_deferred(struct work_struct *work)
security_bpf_map_free(map);
bpf_map_release_memcg(map);
+
+ if (READ_ONCE(map->free_after_mult_rcu_gp))
+ synchronize_rcu_mult(call_rcu, call_rcu_tasks_trace);
+
/* implementation dependent freeing */
map->ops->map_free(map);
/* Delay freeing of btf_record for maps, as map_free
--
2.29.2
* [PATCH bpf v2 4/5] bpf: Optimize the free of inner map
2023-11-13 12:33 [PATCH bpf v2 0/5] bpf: Fix the release of inner map Hou Tao
` (2 preceding siblings ...)
2023-11-13 12:33 ` [PATCH bpf v2 3/5] bpf: Defer the free of inner map when necessary Hou Tao
@ 2023-11-13 12:33 ` Hou Tao
2023-11-18 7:34 ` Martin KaFai Lau
2023-11-21 5:19 ` Alexei Starovoitov
2023-11-13 12:33 ` [PATCH bpf v2 5/5] selftests/bpf: Add test cases for " Hou Tao
4 siblings, 2 replies; 14+ messages in thread
From: Hou Tao @ 2023-11-13 12:33 UTC (permalink / raw)
To: bpf
Cc: Martin KaFai Lau, Alexei Starovoitov, Andrii Nakryiko, Song Liu,
Hao Luo, Yonghong Song, Daniel Borkmann, KP Singh,
Stanislav Fomichev, Jiri Olsa, John Fastabend, houtao1
From: Hou Tao <houtao1@huawei.com>
When removing the inner map from the outer map, the inner map will be
freed after one RCU grace period and one RCU tasks trace grace period,
so it is certain that any bpf program which may access the inner map
has exited before the inner map is freed.
However, it is unnecessary to wait for both grace periods if the outer
map is only accessed by sleepable programs or only by non-sleepable
programs. So record the context of the owning bpf programs when adding
the map into env->used_maps, and use the recorded access context to
decide which, and how many, RCU grace periods are needed when freeing
the inner map.
Signed-off-by: Hou Tao <houtao1@huawei.com>
---
include/linux/bpf.h | 9 ++++++++-
kernel/bpf/map_in_map.c | 18 +++++++++++++-----
kernel/bpf/syscall.c | 15 +++++++++++++--
kernel/bpf/verifier.c | 5 +++++
4 files changed, 39 insertions(+), 8 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index ec3c90202ffe6..8faa1af4b39df 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -245,6 +245,12 @@ struct bpf_list_node_kern {
void *owner;
} __attribute__((aligned(8)));
+enum {
+ BPF_MAP_ACC_NORMAL_PROG_CTX = 1,
+ BPF_MAP_ACC_SLEEPABLE_PROG_CTX = 2,
+ BPF_MAP_ACC_PROG_CTX_MASK = BPF_MAP_ACC_NORMAL_PROG_CTX | BPF_MAP_ACC_SLEEPABLE_PROG_CTX,
+};
+
struct bpf_map {
/* The first two cachelines with read-mostly members of which some
* are also accessed in fast-path (e.g. ops, max_entries).
@@ -292,7 +298,8 @@ struct bpf_map {
} owner;
bool bypass_spec_v1;
bool frozen; /* write-once; write-protected by freeze_mutex */
- bool free_after_mult_rcu_gp;
+ atomic_t owned_prog_ctx;
+ atomic_t may_be_accessed_prog_ctx;
s64 __percpu *elem_count;
};
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index cf33630655661..e3d26a89ac5b6 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -131,12 +131,20 @@ void bpf_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool deferred)
{
struct bpf_map *inner_map = ptr;
- /* The inner map may still be used by both non-sleepable and sleepable
- * bpf program, so free it after one RCU grace period and one tasks
- * trace RCU grace period.
+ /* Defer the freeing of inner map according to the owner program
+ * context of outer maps, so unnecessary multiple RCU GP waitings
+ * can be avoided.
*/
- if (deferred)
- WRITE_ONCE(inner_map->free_after_mult_rcu_gp, true);
+ if (deferred) {
+ /* owned_prog_ctx may be updated concurrently by new bpf program
+ * so add smp_mb() below to ensure that reading owned_prog_ctx
+ * will return the newly-set bit when the new bpf program finds
+ * the inner map before it is removed from outer map.
+ */
+ smp_mb();
+ atomic_or(atomic_read(&map->owned_prog_ctx),
+ &inner_map->may_be_accessed_prog_ctx);
+ }
bpf_map_put(inner_map);
}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index e2d2701ce2c45..5a7906f2b027e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -694,12 +694,20 @@ static void bpf_map_free_deferred(struct work_struct *work)
{
struct bpf_map *map = container_of(work, struct bpf_map, work);
struct btf_record *rec = map->record;
+ int acc_ctx;
security_bpf_map_free(map);
bpf_map_release_memcg(map);
- if (READ_ONCE(map->free_after_mult_rcu_gp))
- synchronize_rcu_mult(call_rcu, call_rcu_tasks_trace);
+ acc_ctx = atomic_read(&map->may_be_accessed_prog_ctx) & BPF_MAP_ACC_PROG_CTX_MASK;
+ if (acc_ctx) {
+ if (acc_ctx == BPF_MAP_ACC_NORMAL_PROG_CTX)
+ synchronize_rcu();
+ else if (acc_ctx == BPF_MAP_ACC_SLEEPABLE_PROG_CTX)
+ synchronize_rcu_tasks_trace();
+ else
+ synchronize_rcu_mult(call_rcu, call_rcu_tasks_trace);
+ }
/* implementation dependent freeing */
map->ops->map_free(map);
@@ -5326,6 +5334,9 @@ static int bpf_prog_bind_map(union bpf_attr *attr)
goto out_unlock;
}
+ /* No need to update owned_prog_ctx, because the bpf program doesn't
+ * access the map.
+ */
memcpy(used_maps_new, used_maps_old,
sizeof(used_maps_old[0]) * prog->aux->used_map_cnt);
used_maps_new[prog->aux->used_map_cnt] = map;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index bd1c42eb540f1..d8d5432b240dc 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -18012,12 +18012,17 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
return -E2BIG;
}
+ atomic_or(env->prog->aux->sleepable ? BPF_MAP_ACC_SLEEPABLE_PROG_CTX :
+ BPF_MAP_ACC_NORMAL_PROG_CTX,
+ &map->owned_prog_ctx);
/* hold the map. If the program is rejected by verifier,
* the map will be released by release_maps() or it
* will be used by the valid program until it's unloaded
* and all maps are released in free_used_maps()
*/
bpf_map_inc(map);
+ /* Paired with smp_mb() in bpf_map_fd_put_ptr() */
+ smp_mb__after_atomic();
aux->map_index = env->used_map_cnt;
env->used_maps[env->used_map_cnt++] = map;
--
2.29.2
* Re: [PATCH bpf v2 4/5] bpf: Optimize the free of inner map
2023-11-13 12:33 ` [PATCH bpf v2 4/5] bpf: Optimize the free of inner map Hou Tao
@ 2023-11-18 7:34 ` Martin KaFai Lau
2023-11-18 13:04 ` Hou Tao
2023-11-21 5:19 ` Alexei Starovoitov
1 sibling, 1 reply; 14+ messages in thread
From: Martin KaFai Lau @ 2023-11-18 7:34 UTC (permalink / raw)
To: Hou Tao
Cc: Alexei Starovoitov, Andrii Nakryiko, Song Liu, Hao Luo,
Yonghong Song, Daniel Borkmann, KP Singh, Stanislav Fomichev,
Jiri Olsa, John Fastabend, houtao1, bpf
On 11/13/23 4:33 AM, Hou Tao wrote:
> From: Hou Tao <houtao1@huawei.com>
>
> When removing the inner map from the outer map, the inner map will be
> freed after one RCU grace period and one RCU tasks trace grace
> period, so it is certain that the bpf program which may access the inner
> map, has exited before the inner map is freed.
>
> However there is unnecessary to wait for one RCU grace period or one RCU
> tasks trace grace period if the outer map is only accessed by sleepable
> program or non-sleepable program. So recording the context of the owned
> bpf programs when adding map into env->used_maps and using the recorded
> access context to decide which, and how many, RCU grace periods are
> needed when freeing the inner map.
>
> Signed-off-by: Hou Tao <houtao1@huawei.com>
> ---
> include/linux/bpf.h | 9 ++++++++-
> kernel/bpf/map_in_map.c | 18 +++++++++++++-----
> kernel/bpf/syscall.c | 15 +++++++++++++--
> kernel/bpf/verifier.c | 5 +++++
> 4 files changed, 39 insertions(+), 8 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index ec3c90202ffe6..8faa1af4b39df 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -245,6 +245,12 @@ struct bpf_list_node_kern {
> void *owner;
> } __attribute__((aligned(8)));
>
> +enum {
> + BPF_MAP_ACC_NORMAL_PROG_CTX = 1,
> + BPF_MAP_ACC_SLEEPABLE_PROG_CTX = 2,
nit. It is a bit flag. Use (1U << 0) and (1U << 1) to make it more obvious.
How about renaming this to BPF_MAP_RCU_GP and BPF_MAP_RCU_TT_GP to better
reflect it is used to decide if the map_free needs to wait for any rcu gp?
> + BPF_MAP_ACC_PROG_CTX_MASK = BPF_MAP_ACC_NORMAL_PROG_CTX | BPF_MAP_ACC_SLEEPABLE_PROG_CTX,
> +};
> +
> struct bpf_map {
> /* The first two cachelines with read-mostly members of which some
> * are also accessed in fast-path (e.g. ops, max_entries).
> @@ -292,7 +298,8 @@ struct bpf_map {
> } owner;
> bool bypass_spec_v1;
> bool frozen; /* write-once; write-protected by freeze_mutex */
> - bool free_after_mult_rcu_gp;
> + atomic_t owned_prog_ctx;
Instead of the enum flags, this should only need a true/false value to tell
whether the outer map has ever been used by a sleepable prog or not. may be
renaming this to "atomic used_by_sleepable;"?
> + atomic_t may_be_accessed_prog_ctx;
nit. rename this to "rcu_gp_flags;"
> s64 __percpu *elem_count;
> };
>
> diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
> index cf33630655661..e3d26a89ac5b6 100644
> --- a/kernel/bpf/map_in_map.c
> +++ b/kernel/bpf/map_in_map.c
> @@ -131,12 +131,20 @@ void bpf_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool deferred)
> {
> struct bpf_map *inner_map = ptr;
>
> - /* The inner map may still be used by both non-sleepable and sleepable
> - * bpf program, so free it after one RCU grace period and one tasks
> - * trace RCU grace period.
> + /* Defer the freeing of inner map according to the owner program
> + * context of outer maps, so unnecessary multiple RCU GP waitings
> + * can be avoided.
> */
> - if (deferred)
> - WRITE_ONCE(inner_map->free_after_mult_rcu_gp, true);
> + if (deferred) {
> + /* owned_prog_ctx may be updated concurrently by new bpf program
> + * so add smp_mb() below to ensure that reading owned_prog_ctx
> + * will return the newly-set bit when the new bpf program finds
> + * the inner map before it is removed from outer map.
> + */
> + smp_mb();
This part got my head spinning a little, so it is better to ask. The
owned_prog_ctx is set during verification time. There are many instructions till
the prog is actually verified, attached (another syscall) and then run to do the
actual lookup(&outer_map). Is this level of reordering practically possible?
> + atomic_or(atomic_read(&map->owned_prog_ctx),
> + &inner_map->may_be_accessed_prog_ctx);
> + }
> bpf_map_put(inner_map);
> }
>
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index e2d2701ce2c45..5a7906f2b027e 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -694,12 +694,20 @@ static void bpf_map_free_deferred(struct work_struct *work)
> {
> struct bpf_map *map = container_of(work, struct bpf_map, work);
> struct btf_record *rec = map->record;
> + int acc_ctx;
>
> security_bpf_map_free(map);
> bpf_map_release_memcg(map);
>
> - if (READ_ONCE(map->free_after_mult_rcu_gp))
> - synchronize_rcu_mult(call_rcu, call_rcu_tasks_trace);
> + acc_ctx = atomic_read(&map->may_be_accessed_prog_ctx) & BPF_MAP_ACC_PROG_CTX_MASK;
The mask should not be needed.
> + if (acc_ctx) {
> + if (acc_ctx == BPF_MAP_ACC_NORMAL_PROG_CTX)
> + synchronize_rcu();
> + else if (acc_ctx == BPF_MAP_ACC_SLEEPABLE_PROG_CTX)
> + synchronize_rcu_tasks_trace();
> + else
> + synchronize_rcu_mult(call_rcu, call_rcu_tasks_trace);
Is it better to add a rcu_head to the map and then use the call_rcu()
variants? e.g. when there are many deletes happening to the outer map
during a process restart to re-populate the outer map, it is relatively
much cheaper to add a rcu_head to the map compared to adding one for
each elem. wdyt?
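A rough sketch of that direction, assuming a new rcu_head field in
struct bpf_map and two hypothetical callbacks (all names illustrative):

  /* Hypothetical sketch: chain call_rcu_tasks_trace() into call_rcu()
   * so that bpf_map_free_deferred() does not block on either grace
   * period.
   */
  static void bpf_map_free_rcu(struct rcu_head *rcu)
  {
          struct bpf_map *map = container_of(rcu, struct bpf_map, rcu);

          /* both grace periods have elapsed at this point */
          map->ops->map_free(map);
  }

  static void bpf_map_free_rcu_tasks_trace(struct rcu_head *rcu)
  {
          /* the tasks trace RCU GP has elapsed; now wait for a normal
           * RCU GP
           */
          call_rcu(rcu, bpf_map_free_rcu);
  }

  /* in bpf_map_free_deferred(), instead of blocking: */
          call_rcu_tasks_trace(&map->rcu, bpf_map_free_rcu_tasks_trace);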
> + }
>
> /* implementation dependent freeing */
> map->ops->map_free(map);
> @@ -5326,6 +5334,9 @@ static int bpf_prog_bind_map(union bpf_attr *attr)
> goto out_unlock;
> }
>
> + /* No need to update owned_prog_ctx, because the bpf program doesn't
> + * access the map.
> + */
> memcpy(used_maps_new, used_maps_old,
> sizeof(used_maps_old[0]) * prog->aux->used_map_cnt);
> used_maps_new[prog->aux->used_map_cnt] = map;
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index bd1c42eb540f1..d8d5432b240dc 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -18012,12 +18012,17 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
> return -E2BIG;
> }
>
> + atomic_or(env->prog->aux->sleepable ? BPF_MAP_ACC_SLEEPABLE_PROG_CTX :
> + BPF_MAP_ACC_NORMAL_PROG_CTX,
> + &map->owned_prog_ctx);
> /* hold the map. If the program is rejected by verifier,
> * the map will be released by release_maps() or it
> * will be used by the valid program until it's unloaded
> * and all maps are released in free_used_maps()
> */
> bpf_map_inc(map);
> + /* Paired with smp_mb() in bpf_map_fd_put_ptr() */
> + smp_mb__after_atomic();
>
> aux->map_index = env->used_map_cnt;
> env->used_maps[env->used_map_cnt++] = map;
* Re: [PATCH bpf v2 4/5] bpf: Optimize the free of inner map
2023-11-18 7:34 ` Martin KaFai Lau
@ 2023-11-18 13:04 ` Hou Tao
2023-11-18 23:28 ` Martin KaFai Lau
0 siblings, 1 reply; 14+ messages in thread
From: Hou Tao @ 2023-11-18 13:04 UTC (permalink / raw)
To: Martin KaFai Lau
Cc: Alexei Starovoitov, Andrii Nakryiko, Song Liu, Hao Luo,
Yonghong Song, Daniel Borkmann, KP Singh, Stanislav Fomichev,
Jiri Olsa, John Fastabend, houtao1, bpf
Hi,
On 11/18/2023 3:34 PM, Martin KaFai Lau wrote:
> On 11/13/23 4:33 AM, Hou Tao wrote:
>> From: Hou Tao <houtao1@huawei.com>
>>
>> When removing the inner map from the outer map, the inner map will be
>> freed after one RCU grace period and one RCU tasks trace grace
>> period, so it is certain that the bpf program which may access the inner
>> map, has exited before the inner map is freed.
>>
>> However there is unnecessary to wait for one RCU grace period or one RCU
>> tasks trace grace period if the outer map is only accessed by sleepable
>> program or non-sleepable program. So recording the context of the owned
>> bpf programs when adding map into env->used_maps and using the recorded
>> access context to decide which, and how many, RCU grace periods are
>> needed when freeing the inner map.
>>
>> Signed-off-by: Hou Tao <houtao1@huawei.com>
>> ---
>> include/linux/bpf.h | 9 ++++++++-
>> kernel/bpf/map_in_map.c | 18 +++++++++++++-----
>> kernel/bpf/syscall.c | 15 +++++++++++++--
>> kernel/bpf/verifier.c | 5 +++++
>> 4 files changed, 39 insertions(+), 8 deletions(-)
>>
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index ec3c90202ffe6..8faa1af4b39df 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -245,6 +245,12 @@ struct bpf_list_node_kern {
>> void *owner;
>> } __attribute__((aligned(8)));
>> +enum {
>> + BPF_MAP_ACC_NORMAL_PROG_CTX = 1,
>> + BPF_MAP_ACC_SLEEPABLE_PROG_CTX = 2,
>
> nit. It is a bit flag. Use (1U << 0) and (1U << 1) to make it more
> obvious.
Will fix.
>
> How about renaming this to BPF_MAP_RCU_GP and BPF_MAP_RCU_TT_GP to
> better reflect it is used to decide if the map_free needs to wait for
> any rcu gp?
OK. It is fine with me.
>
>> + BPF_MAP_ACC_PROG_CTX_MASK = BPF_MAP_ACC_NORMAL_PROG_CTX |
>> BPF_MAP_ACC_SLEEPABLE_PROG_CTX,
>> +};
>> +
>> struct bpf_map {
>> /* The first two cachelines with read-mostly members of which some
>> * are also accessed in fast-path (e.g. ops, max_entries).
>> @@ -292,7 +298,8 @@ struct bpf_map {
>> } owner;
>> bool bypass_spec_v1;
>> bool frozen; /* write-once; write-protected by freeze_mutex */
>> - bool free_after_mult_rcu_gp;
>> + atomic_t owned_prog_ctx;
>
> Instead of the enum flags, this should only need a true/false value to
> tell whether the outer map has ever been used by a sleepable prog or
> not. may be renaming this to "atomic used_by_sleepable;"?
I have considered that. But there are maps which are only accessed
from userspace (e.g. maps used in BPF_PROG_BIND_MAP or maps in
test_maps), so I thought one bool is not enough.
>
>> + atomic_t may_be_accessed_prog_ctx;
>
> nit. rename this to "rcu_gp_flags;"
Will do.
>
>> s64 __percpu *elem_count;
>> };
>> diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
>> index cf33630655661..e3d26a89ac5b6 100644
>> --- a/kernel/bpf/map_in_map.c
>> +++ b/kernel/bpf/map_in_map.c
>> @@ -131,12 +131,20 @@ void bpf_map_fd_put_ptr(struct bpf_map *map,
>> void *ptr, bool deferred)
>> {
>> struct bpf_map *inner_map = ptr;
>> - /* The inner map may still be used by both non-sleepable and
>> sleepable
>> - * bpf program, so free it after one RCU grace period and one tasks
>> - * trace RCU grace period.
>> + /* Defer the freeing of inner map according to the owner program
>> + * context of outer maps, so unnecessary multiple RCU GP waitings
>> + * can be avoided.
>> */
>> - if (deferred)
>> - WRITE_ONCE(inner_map->free_after_mult_rcu_gp, true);
>> + if (deferred) {
>> + /* owned_prog_ctx may be updated concurrently by new bpf
>> program
>> + * so add smp_mb() below to ensure that reading owned_prog_ctx
>> + * will return the newly-set bit when the new bpf program finds
>> + * the inner map before it is removed from outer map.
>> + */
>> + smp_mb();
>
> This part took my head spinning a little, so it is better to ask. The
> owned_prog_ctx is set during verification time. There are many
> instructions till the prog is actually verified, attached (another
> syscall) and then run to do the actual lookup(&outer_map). Is this
> level of reordering practically possible?
Er, I added the memory barrier due to uncertainty. According to [1],
a syscall doesn't imply a memory barrier. And if I understand
correctly, when the program is loaded and attached through
bpf_sys_bpf, there will be no new syscall.
[1]:
https://www.kernel.org/doc/html/latest/core-api/wrappers/memory-barriers.html
>
>> + atomic_or(atomic_read(&map->owned_prog_ctx),
>> + &inner_map->may_be_accessed_prog_ctx);
>> + }
>> bpf_map_put(inner_map);
>> }
>> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
>> index e2d2701ce2c45..5a7906f2b027e 100644
>> --- a/kernel/bpf/syscall.c
>> +++ b/kernel/bpf/syscall.c
>> @@ -694,12 +694,20 @@ static void bpf_map_free_deferred(struct
>> work_struct *work)
>> {
>> struct bpf_map *map = container_of(work, struct bpf_map, work);
>> struct btf_record *rec = map->record;
>> + int acc_ctx;
>> security_bpf_map_free(map);
>> bpf_map_release_memcg(map);
>> - if (READ_ONCE(map->free_after_mult_rcu_gp))
>> - synchronize_rcu_mult(call_rcu, call_rcu_tasks_trace);
>> + acc_ctx = atomic_read(&map->may_be_accessed_prog_ctx) &
>> BPF_MAP_ACC_PROG_CTX_MASK;
>
> The mask should not be needed.
Yep. Will remove it.
>
>> + if (acc_ctx) {
>> + if (acc_ctx == BPF_MAP_ACC_NORMAL_PROG_CTX)
>> + synchronize_rcu();
>> + else if (acc_ctx == BPF_MAP_ACC_SLEEPABLE_PROG_CTX)
>> + synchronize_rcu_tasks_trace();
>> + else
>> + synchronize_rcu_mult(call_rcu, call_rcu_tasks_trace);
>
> Is it better to add a rcu_head to the map and then use call_rcu_().
> e.g. when there is many delete happened to the outer map during a
> process restart to re-populate the outer map. It is relatively much
> cheaper to add a rcu_head to the map comparing to adding one for each
> elem. wdyt?
Good idea. call_rcu() will be much cheaper than synchronize_rcu(). Will do.
>
>> + }
>> /* implementation dependent freeing */
>> map->ops->map_free(map);
>> @@ -5326,6 +5334,9 @@ static int bpf_prog_bind_map(union bpf_attr *attr)
>> goto out_unlock;
>> }
>> + /* No need to update owned_prog_ctx, because the bpf program
>> doesn't
>> + * access the map.
>> + */
>> memcpy(used_maps_new, used_maps_old,
>> sizeof(used_maps_old[0]) * prog->aux->used_map_cnt);
>> used_maps_new[prog->aux->used_map_cnt] = map;
>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>> index bd1c42eb540f1..d8d5432b240dc 100644
>> --- a/kernel/bpf/verifier.c
>> +++ b/kernel/bpf/verifier.c
>> @@ -18012,12 +18012,17 @@ static int resolve_pseudo_ldimm64(struct
>> bpf_verifier_env *env)
>> return -E2BIG;
>> }
>> + atomic_or(env->prog->aux->sleepable ?
>> BPF_MAP_ACC_SLEEPABLE_PROG_CTX :
>> + BPF_MAP_ACC_NORMAL_PROG_CTX,
>> + &map->owned_prog_ctx);
>> /* hold the map. If the program is rejected by verifier,
>> * the map will be released by release_maps() or it
>> * will be used by the valid program until it's unloaded
>> * and all maps are released in free_used_maps()
>> */
>> bpf_map_inc(map);
>> + /* Paired with smp_mb() in bpf_map_fd_put_ptr() */
>> + smp_mb__after_atomic();
>> aux->map_index = env->used_map_cnt;
>> env->used_maps[env->used_map_cnt++] = map;
* Re: [PATCH bpf v2 4/5] bpf: Optimize the free of inner map
2023-11-18 13:04 ` Hou Tao
@ 2023-11-18 23:28 ` Martin KaFai Lau
2023-11-21 2:27 ` Hou Tao
0 siblings, 1 reply; 14+ messages in thread
From: Martin KaFai Lau @ 2023-11-18 23:28 UTC (permalink / raw)
To: Hou Tao
Cc: Alexei Starovoitov, Andrii Nakryiko, Song Liu, Hao Luo,
Yonghong Song, Daniel Borkmann, KP Singh, Stanislav Fomichev,
Jiri Olsa, John Fastabend, houtao1, bpf
On 11/18/23 5:04 AM, Hou Tao wrote:
>>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>>> index ec3c90202ffe6..8faa1af4b39df 100644
>>> --- a/include/linux/bpf.h
>>> +++ b/include/linux/bpf.h
>>> @@ -245,6 +245,12 @@ struct bpf_list_node_kern {
>>> void *owner;
>>> } __attribute__((aligned(8)));
>>> +enum {
>>> + BPF_MAP_ACC_NORMAL_PROG_CTX = 1,
>>> + BPF_MAP_ACC_SLEEPABLE_PROG_CTX = 2,
>>
>> nit. It is a bit flag. Use (1U << 0) and (1U << 1) to make it more
>> obvious.
>
> Will fix.
>>
>> How about renaming this to BPF_MAP_RCU_GP and BPF_MAP_RCU_TT_GP to
>> better reflect it is used to decide if the map_free needs to wait for
>> any rcu gp?
>
> OK. It is fine with me.
>>
>>> + BPF_MAP_ACC_PROG_CTX_MASK = BPF_MAP_ACC_NORMAL_PROG_CTX |
>>> BPF_MAP_ACC_SLEEPABLE_PROG_CTX,
>>> +};
>>> +
>>> struct bpf_map {
>>> /* The first two cachelines with read-mostly members of which some
>>> * are also accessed in fast-path (e.g. ops, max_entries).
>>> @@ -292,7 +298,8 @@ struct bpf_map {
>>> } owner;
>>> bool bypass_spec_v1;
>>> bool frozen; /* write-once; write-protected by freeze_mutex */
>>> - bool free_after_mult_rcu_gp;
>>> + atomic_t owned_prog_ctx;
>>
>> Instead of the enum flags, this should only need a true/false value to
>> tell whether the outer map has ever been used by a sleepable prog or
>> not. may be renaming this to "atomic used_by_sleepable;"?
>
> I have considered about that. But there is maps which is only accessed
> in userspace (e.g. map used in BPF_PROG_BIND_MAP or maps in test_maps),
> so I though one bool is not enough.
Good point.
In this case, a nit on the name. The "prog_ctx" part in the owned_prog_ctx
usually means a different thing, like the "struct __sk_buff". How about using
this pair of names?
/* may be just "long" and then set_bit? */
atomic_t used_in_rcu_gp;
atomic_t free_by_rcu_gp;
shortened the name by removing the "_flags" suggested in the earlier comment.
>>
>>> + atomic_t may_be_accessed_prog_ctx;
>>
>> nit. rename this to "rcu_gp_flags;"
>
> Will do.
>>
>>> s64 __percpu *elem_count;
>>> };
>>> diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
>>> index cf33630655661..e3d26a89ac5b6 100644
>>> --- a/kernel/bpf/map_in_map.c
>>> +++ b/kernel/bpf/map_in_map.c
>>> @@ -131,12 +131,20 @@ void bpf_map_fd_put_ptr(struct bpf_map *map,
>>> void *ptr, bool deferred)
>>> {
>>> struct bpf_map *inner_map = ptr;
>>> - /* The inner map may still be used by both non-sleepable and
>>> sleepable
>>> - * bpf program, so free it after one RCU grace period and one tasks
>>> - * trace RCU grace period.
>>> + /* Defer the freeing of inner map according to the owner program
>>> + * context of outer maps, so unnecessary multiple RCU GP waitings
>>> + * can be avoided.
>>> */
>>> - if (deferred)
>>> - WRITE_ONCE(inner_map->free_after_mult_rcu_gp, true);
>>> + if (deferred) {
>>> + /* owned_prog_ctx may be updated concurrently by new bpf
>>> program
>>> + * so add smp_mb() below to ensure that reading owned_prog_ctx
>>> + * will return the newly-set bit when the new bpf program finds
>>> + * the inner map before it is removed from outer map.
>>> + */
>>> + smp_mb();
>>
>> This part took my head spinning a little, so it is better to ask. The
>> owned_prog_ctx is set during verification time. There are many
>> instructions till the prog is actually verified, attached (another
>> syscall) and then run to do the actual lookup(&outer_map). Is this
>> level of reordering practically possible?
>
> Er, I added the memory barrier due to uncertainty. According to [1],
My immediate thought was a more straightforward spin lock instead.
Agree that smp_mb() works as well, ok.
> syscall doesn't imply a memory barrier. And If I understand correctly,
> when the program is loaded and attached through bpf_sys_bpf, there will
> be no new syscall. >
> [1]:
> https://www.kernel.org/doc/html/latest/core-api/wrappers/memory-barriers.html
>
>>
>>> + atomic_or(atomic_read(&map->owned_prog_ctx),
>>> + &inner_map->may_be_accessed_prog_ctx);
>>> + }
>>> bpf_map_put(inner_map);
>>> }
>>> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
>>> index e2d2701ce2c45..5a7906f2b027e 100644
>>> --- a/kernel/bpf/syscall.c
>>> +++ b/kernel/bpf/syscall.c
>>> @@ -694,12 +694,20 @@ static void bpf_map_free_deferred(struct
>>> work_struct *work)
>>> {
>>> struct bpf_map *map = container_of(work, struct bpf_map, work);
>>> struct btf_record *rec = map->record;
>>> + int acc_ctx;
>>> security_bpf_map_free(map);
>>> bpf_map_release_memcg(map);
>>> - if (READ_ONCE(map->free_after_mult_rcu_gp))
>>> - synchronize_rcu_mult(call_rcu, call_rcu_tasks_trace);
>>> + acc_ctx = atomic_read(&map->may_be_accessed_prog_ctx) &
>>> BPF_MAP_ACC_PROG_CTX_MASK;
>>
>> The mask should not be needed.
>
> Yep. Will remove it.
>>
>>> + if (acc_ctx) {
>>> + if (acc_ctx == BPF_MAP_ACC_NORMAL_PROG_CTX)
>>> + synchronize_rcu();
>>> + else if (acc_ctx == BPF_MAP_ACC_SLEEPABLE_PROG_CTX)
>>> + synchronize_rcu_tasks_trace();
>>> + else
>>> + synchronize_rcu_mult(call_rcu, call_rcu_tasks_trace);
>>
>> Is it better to add a rcu_head to the map and then use call_rcu_().
>> e.g. when there is many delete happened to the outer map during a
>> process restart to re-populate the outer map. It is relatively much
>> cheaper to add a rcu_head to the map comparing to adding one for each
>> elem. wdyt?
>
> Good idea. call_rcu() will be much cheaper than synchronize_rcu(). Will do.
>>
>>> + }
>>> /* implementation dependent freeing */
>>> map->ops->map_free(map);
>>> @@ -5326,6 +5334,9 @@ static int bpf_prog_bind_map(union bpf_attr *attr)
>>> goto out_unlock;
>>> }
>>> + /* No need to update owned_prog_ctx, because the bpf program
>>> doesn't
>>> + * access the map.
>>> + */
>>> memcpy(used_maps_new, used_maps_old,
>>> sizeof(used_maps_old[0]) * prog->aux->used_map_cnt);
>>> used_maps_new[prog->aux->used_map_cnt] = map;
>>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>>> index bd1c42eb540f1..d8d5432b240dc 100644
>>> --- a/kernel/bpf/verifier.c
>>> +++ b/kernel/bpf/verifier.c
>>> @@ -18012,12 +18012,17 @@ static int resolve_pseudo_ldimm64(struct
>>> bpf_verifier_env *env)
>>> return -E2BIG;
>>> }
>>> + atomic_or(env->prog->aux->sleepable ?
>>> BPF_MAP_ACC_SLEEPABLE_PROG_CTX :
>>> + BPF_MAP_ACC_NORMAL_PROG_CTX,
>>> + &map->owned_prog_ctx);
This is setting a bit. Would set_bit() work as well?
>>> /* hold the map. If the program is rejected by verifier,
>>> * the map will be released by release_maps() or it
>>> * will be used by the valid program until it's unloaded
>>> * and all maps are released in free_used_maps()
>>> */
>>> bpf_map_inc(map);
>>> + /* Paired with smp_mb() in bpf_map_fd_put_ptr() */
>>> + smp_mb__after_atomic();
To make it clearer, can this be moved immediately after the set_bit / atomic_or
above?
>>> aux->map_index = env->used_map_cnt;
>>> env->used_maps[env->used_map_cnt++] = map;
>
* Re: [PATCH bpf v2 4/5] bpf: Optimize the free of inner map
2023-11-18 23:28 ` Martin KaFai Lau
@ 2023-11-21 2:27 ` Hou Tao
0 siblings, 0 replies; 14+ messages in thread
From: Hou Tao @ 2023-11-21 2:27 UTC (permalink / raw)
To: Martin KaFai Lau
Cc: Alexei Starovoitov, Andrii Nakryiko, Song Liu, Hao Luo,
Yonghong Song, Daniel Borkmann, KP Singh, Stanislav Fomichev,
Jiri Olsa, John Fastabend, houtao1, bpf
Hi,
On 11/19/2023 7:28 AM, Martin KaFai Lau wrote:
> On 11/18/23 5:04 AM, Hou Tao wrote:
>>>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>>>> index ec3c90202ffe6..8faa1af4b39df 100644
>>>> --- a/include/linux/bpf.h
>>>> +++ b/include/linux/bpf.h
>>>> @@ -245,6 +245,12 @@ struct bpf_list_node_kern {
>>>> void *owner;
>>>> } __attribute__((aligned(8)));
>>>> +enum {
>>>> + BPF_MAP_ACC_NORMAL_PROG_CTX = 1,
>>>> + BPF_MAP_ACC_SLEEPABLE_PROG_CTX = 2,
>>>
>>> nit. It is a bit flag. Use (1U << 0) and (1U << 1) to make it more
>>> obvious.
>>
>> Will fix.
>>>
>>> How about renaming this to BPF_MAP_RCU_GP and BPF_MAP_RCU_TT_GP to
>>> better reflect it is used to decide if the map_free needs to wait for
>>> any rcu gp?
>>
>> OK. It is fine with me.
>>>
>>>> + BPF_MAP_ACC_PROG_CTX_MASK = BPF_MAP_ACC_NORMAL_PROG_CTX |
>>>> BPF_MAP_ACC_SLEEPABLE_PROG_CTX,
>>>> +};
>>>> +
>>>> struct bpf_map {
>>>> /* The first two cachelines with read-mostly members of
>>>> which some
>>>> * are also accessed in fast-path (e.g. ops, max_entries).
>>>> @@ -292,7 +298,8 @@ struct bpf_map {
>>>> } owner;
>>>> bool bypass_spec_v1;
>>>> bool frozen; /* write-once; write-protected by freeze_mutex */
>>>> - bool free_after_mult_rcu_gp;
>>>> + atomic_t owned_prog_ctx;
>>>
>>> Instead of the enum flags, this should only need a true/false value to
>>> tell whether the outer map has ever been used by a sleepable prog or
>>> not. may be renaming this to "atomic used_by_sleepable;"?
>>
>> I have considered about that. But there is maps which is only accessed
>> in userspace (e.g. map used in BPF_PROG_BIND_MAP or maps in test_maps),
>> so I though one bool is not enough.
>
> Good point.
>
> In this case, a nit on the name. The "prog_ctx" part in the
> owned_prog_ctx usually means a different thing, like the "struct
> __sk_buff". How about using this pair of names?
>
> /* may be just "long" and then set_bit? */
> atomic_t used_in_rcu_gp;
> atomic_t free_by_rcu_gp;
>
> shortened the name by removing the "_flags" suggested in the earlier
> comment.
Will update the names. The naming is hard, so thanks for the suggestions.
I have tried set_bit() and test_bit(). The reason I switched to
atomic_or() is that the implementation will be much cleaner in
bpf_map_fd_put_ptr(). If using set_bit(), I would have to do the
following in bpf_map_fd_put_ptr() to atomically propagate the bits in
used_in_rcu_gp to free_by_rcu_gp:

	used = READ_ONCE(used_in_rcu_gp);
	if (test_bit(BPF_MAP_RCU_GP, &used))
		set_bit(BPF_MAP_RCU_GP, &free_by_rcu_gp);
	if (test_bit(BPF_MAP_RCU_TT_GP, &used))
		set_bit(BPF_MAP_RCU_TT_GP, &free_by_rcu_gp);

But with atomic_or(), a single statement is enough:

	atomic_or(atomic_read(&used_in_rcu_gp), &free_by_rcu_gp);
I could use set_bit/test_bit() if it is preferred.
>
>>>
>>>> + atomic_t may_be_accessed_prog_ctx;
>>>
>>> nit. rename this to "rcu_gp_flags;"
>>
>> Will do.
>>>
>>>> s64 __percpu *elem_count;
>>>> };
>>>> diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
>>>> index cf33630655661..e3d26a89ac5b6 100644
>>>> --- a/kernel/bpf/map_in_map.c
>>>> +++ b/kernel/bpf/map_in_map.c
>>>> @@ -131,12 +131,20 @@ void bpf_map_fd_put_ptr(struct bpf_map *map,
>>>> void *ptr, bool deferred)
>>>> {
>>>> struct bpf_map *inner_map = ptr;
>>>> - /* The inner map may still be used by both non-sleepable and
>>>> sleepable
>>>> - * bpf program, so free it after one RCU grace period and one
>>>> tasks
>>>> - * trace RCU grace period.
>>>> + /* Defer the freeing of inner map according to the owner program
>>>> + * context of outer maps, so unnecessary multiple RCU GP waitings
>>>> + * can be avoided.
>>>> */
>>>> - if (deferred)
>>>> - WRITE_ONCE(inner_map->free_after_mult_rcu_gp, true);
>>>> + if (deferred) {
>>>> + /* owned_prog_ctx may be updated concurrently by new bpf
>>>> program
>>>> + * so add smp_mb() below to ensure that reading
>>>> owned_prog_ctx
>>>> + * will return the newly-set bit when the new bpf program
>>>> finds
>>>> + * the inner map before it is removed from outer map.
>>>> + */
>>>> + smp_mb();
>>>
>>> This part took my head spinning a little, so it is better to ask. The
>>> owned_prog_ctx is set during verification time. There are many
>>> instructions till the prog is actually verified, attached (another
>>> syscall) and then run to do the actual lookup(&outer_map). Is this
>>> level of reordering practically possible?
>>
>> Er, I added the memory barrier due to uncertainty. According to [1],
> My immediate thought was a more straight forward spin lock instead.
>
> Agree that smp_mb() works as well, ok.
I see. I will keep the smp_mb().
>
>> syscall doesn't imply a memory barrier. And If I understand correctly,
>> when the program is loaded and attached through bpf_sys_bpf, there will
>> be no new syscall. >
>> [1]:
>> https://www.kernel.org/doc/html/latest/core-api/wrappers/memory-barriers.html
>>
>>
>>>
>>>> + atomic_or(atomic_read(&map->owned_prog_ctx),
>>>> + &inner_map->may_be_accessed_prog_ctx);
>>>> + }
>>>> bpf_map_put(inner_map);
>>>> }
>>>> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
>>>> index e2d2701ce2c45..5a7906f2b027e 100644
>>>> --- a/kernel/bpf/syscall.c
>>>> +++ b/kernel/bpf/syscall.c
>>>> @@ -694,12 +694,20 @@ static void bpf_map_free_deferred(struct
>>>> work_struct *work)
>>>> {
>>>> struct bpf_map *map = container_of(work, struct bpf_map, work);
>>>> struct btf_record *rec = map->record;
>>>> + int acc_ctx;
>>>> security_bpf_map_free(map);
>>>> bpf_map_release_memcg(map);
>>>> - if (READ_ONCE(map->free_after_mult_rcu_gp))
>>>> - synchronize_rcu_mult(call_rcu, call_rcu_tasks_trace);
>>>> + acc_ctx = atomic_read(&map->may_be_accessed_prog_ctx) &
>>>> BPF_MAP_ACC_PROG_CTX_MASK;
>>>
>>> The mask should not be needed.
>>
>> Yep. Will remove it.
>>>
>>>> + if (acc_ctx) {
>>>> + if (acc_ctx == BPF_MAP_ACC_NORMAL_PROG_CTX)
>>>> + synchronize_rcu();
>>>> + else if (acc_ctx == BPF_MAP_ACC_SLEEPABLE_PROG_CTX)
>>>> + synchronize_rcu_tasks_trace();
>>>> + else
>>>> + synchronize_rcu_mult(call_rcu, call_rcu_tasks_trace);
>>>
>>> Is it better to add a rcu_head to the map and then use call_rcu_().
>>> e.g. when there is many delete happened to the outer map during a
>>> process restart to re-populate the outer map. It is relatively much
>>> cheaper to add a rcu_head to the map comparing to adding one for each
>>> elem. wdyt?
>>
>> Good idea. call_rcu() will be much cheaper than synchronize_rcu().
>> Will do.
>>>
>>>> + }
>>>> /* implementation dependent freeing */
>>>> map->ops->map_free(map);
>>>> @@ -5326,6 +5334,9 @@ static int bpf_prog_bind_map(union bpf_attr
>>>> *attr)
>>>> goto out_unlock;
>>>> }
>>>> + /* No need to update owned_prog_ctx, because the bpf program
>>>> doesn't
>>>> + * access the map.
>>>> + */
>>>> memcpy(used_maps_new, used_maps_old,
>>>> sizeof(used_maps_old[0]) * prog->aux->used_map_cnt);
>>>> used_maps_new[prog->aux->used_map_cnt] = map;
>>>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>>>> index bd1c42eb540f1..d8d5432b240dc 100644
>>>> --- a/kernel/bpf/verifier.c
>>>> +++ b/kernel/bpf/verifier.c
>>>> @@ -18012,12 +18012,17 @@ static int resolve_pseudo_ldimm64(struct
>>>> bpf_verifier_env *env)
>>>> return -E2BIG;
>>>> }
>>>> + atomic_or(env->prog->aux->sleepable ?
>>>> BPF_MAP_ACC_SLEEPABLE_PROG_CTX :
>>>> + BPF_MAP_ACC_NORMAL_PROG_CTX,
>>>> + &map->owned_prog_ctx);
>
> This is setting a bit. Would set_bit() work as good?
Please see my reply above.
>
>>>> /* hold the map. If the program is rejected by
>>>> verifier,
>>>> * the map will be released by release_maps() or it
>>>> * will be used by the valid program until it's
>>>> unloaded
>>>> * and all maps are released in free_used_maps()
>>>> */
>>>> bpf_map_inc(map);
>>>> + /* Paired with smp_mb() in bpf_map_fd_put_ptr() */
>>>> + smp_mb__after_atomic();
>
> To make it clearer, can this be moved immediately after the set_bit /
> atomic_or above?
Will do. Thanks for all the suggestions.
>
>>>> aux->map_index = env->used_map_cnt;
>>>> env->used_maps[env->used_map_cnt++] = map;
>>
* Re: [PATCH bpf v2 4/5] bpf: Optimize the free of inner map
2023-11-13 12:33 ` [PATCH bpf v2 4/5] bpf: Optimize the free of inner map Hou Tao
2023-11-18 7:34 ` Martin KaFai Lau
@ 2023-11-21 5:19 ` Alexei Starovoitov
2023-11-21 6:45 ` Hou Tao
1 sibling, 1 reply; 14+ messages in thread
From: Alexei Starovoitov @ 2023-11-21 5:19 UTC (permalink / raw)
To: Hou Tao
Cc: bpf, Martin KaFai Lau, Andrii Nakryiko, Song Liu, Hao Luo,
Yonghong Song, Daniel Borkmann, KP Singh, Stanislav Fomichev,
Jiri Olsa, John Fastabend, houtao1
On Mon, Nov 13, 2023 at 08:33:23PM +0800, Hou Tao wrote:
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index e2d2701ce2c45..5a7906f2b027e 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -694,12 +694,20 @@ static void bpf_map_free_deferred(struct work_struct *work)
> {
> struct bpf_map *map = container_of(work, struct bpf_map, work);
> struct btf_record *rec = map->record;
> + int acc_ctx;
>
> security_bpf_map_free(map);
> bpf_map_release_memcg(map);
>
> - if (READ_ONCE(map->free_after_mult_rcu_gp))
> - synchronize_rcu_mult(call_rcu, call_rcu_tasks_trace);
The previous patch 3 is doing too much.
There is maybe_wait_bpf_programs() that will do synchronize_rcu()
when necessary.
The patch 3 could do synchronize_rcu_tasks_trace() only and it will solve the issue.
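I.e., in bpf_map_free_deferred() something like this sketch, relying
on maybe_wait_bpf_programs() for the normal RCU GP:

  if (READ_ONCE(map->free_after_mult_rcu_gp))
          synchronize_rcu_tasks_trace();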
> + acc_ctx = atomic_read(&map->may_be_accessed_prog_ctx) & BPF_MAP_ACC_PROG_CTX_MASK;
> + if (acc_ctx) {
> + if (acc_ctx == BPF_MAP_ACC_NORMAL_PROG_CTX)
> + synchronize_rcu();
> + else if (acc_ctx == BPF_MAP_ACC_SLEEPABLE_PROG_CTX)
> + synchronize_rcu_tasks_trace();
> + else
> + synchronize_rcu_mult(call_rcu, call_rcu_tasks_trace);
and this patch 4 goes too far.
Could you add a sleepable_refcnt, in addition to the existing refcnt,
that is incremented in the outer map when it's used by a sleepable
prog, and when sleepable_refcnt > 0 have the caller of
bpf_map_free_deferred set free_after_mult_rcu_gp
(which should be renamed to free_after_tasks_rcu_gp)?
Patch 3 is simpler and patch 4 is simple too.
No need for atomic_or games.
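Roughly this shape (the field name and placement are illustrative):

  /* struct bpf_map gains a counter of sleepable users: */
          atomic64_t sleepable_refcnt;

  /* resolve_pseudo_ldimm64(), next to bpf_map_inc(map): */
          if (env->prog->aux->sleepable)
                  atomic64_inc(&map->sleepable_refcnt);

  /* bpf_map_fd_put_ptr(), where 'map' is the outer map: */
          if (deferred && atomic64_read(&map->sleepable_refcnt))
                  WRITE_ONCE(inner_map->free_after_tasks_rcu_gp, true);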
In addition I'd like to see an extra patch that demonstrates this UAF
when update/delete is done by syscall bpf prog type.
The test case in patch 5 is doing update/delete from user space.
If that was the only issue we could have easily extended maybe_wait_bpf_programs()
to do synchronize_rcu_tasks_trace() and that would close the issue exposed by patch 5.
But inner maps can indeed be updated by a syscall bpf prog, and since
syscall progs run under rcu_read_lock_trace() we cannot add
synchronize_rcu_tasks_trace() to maybe_wait_bpf_programs() because it
would deadlock.
So let's make sure we have test cases for all combinations where inner maps
can be updated/deleted.
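For reference, a syscall prog deleting an inner map would look roughly
like the sketch below (the context struct and names are illustrative);
it runs under rcu_read_lock_trace(), which is exactly why the syscall
path cannot block on a tasks trace RCU GP:

  /* Hypothetical sketch: BPF_PROG_TYPE_SYSCALL program removing an
   * inner map from an outer map via bpf_sys_bpf().
   */
  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>

  struct args {
          int outer_map_fd;
          __u32 key;
  };

  SEC("syscall")
  int del_inner(struct args *ctx)
  {
          union bpf_attr attr = {};

          attr.map_fd = ctx->outer_map_fd;
          attr.key = (__u64)(unsigned long)&ctx->key;

          /* runs under rcu_read_lock_trace(); blocking on a tasks trace
           * RCU GP inside this syscall path would deadlock
           */
          return bpf_sys_bpf(BPF_MAP_DELETE_ELEM, &attr, sizeof(attr));
  }

  char _license[] SEC("license") = "GPL";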
* Re: [PATCH bpf v2 4/5] bpf: Optimize the free of inner map
2023-11-21 5:19 ` Alexei Starovoitov
@ 2023-11-21 6:45 ` Hou Tao
2023-11-21 17:49 ` Alexei Starovoitov
0 siblings, 1 reply; 14+ messages in thread
From: Hou Tao @ 2023-11-21 6:45 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: bpf, Martin KaFai Lau, Andrii Nakryiko, Song Liu, Hao Luo,
Yonghong Song, Daniel Borkmann, KP Singh, Stanislav Fomichev,
Jiri Olsa, John Fastabend, houtao1
Hi Alexei,
On 11/21/2023 1:19 PM, Alexei Starovoitov wrote:
> On Mon, Nov 13, 2023 at 08:33:23PM +0800, Hou Tao wrote:
>> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
>> index e2d2701ce2c45..5a7906f2b027e 100644
>> --- a/kernel/bpf/syscall.c
>> +++ b/kernel/bpf/syscall.c
>> @@ -694,12 +694,20 @@ static void bpf_map_free_deferred(struct work_struct *work)
>> {
>> struct bpf_map *map = container_of(work, struct bpf_map, work);
>> struct btf_record *rec = map->record;
>> + int acc_ctx;
>>
>> security_bpf_map_free(map);
>> bpf_map_release_memcg(map);
>>
>> - if (READ_ONCE(map->free_after_mult_rcu_gp))
>> - synchronize_rcu_mult(call_rcu, call_rcu_tasks_trace);
> The previous patch 3 is doing too much.
> There is maybe_wait_bpf_programs() that will do synchronize_rcu()
> when necessary.
> The patch 3 could do synchronize_rcu_tasks_trace() only and it will solve the issue.
I didn't follow how synchronize_rcu() in maybe_wait_bpf_programs()
will help bpf_map_free_deferred() to defer the free of the inner map.
Could you please elaborate on that? In my understanding,
bpf_map_update_value() invokes maybe_wait_bpf_programs() after the
deletion of the old inner map from the outer map completes. If the
ref-count held by the outer map is the last one,
bpf_map_free_deferred() will be called when the deletion completes, so
maybe_wait_bpf_programs() will run concurrently with
bpf_map_free_deferred().
>
>> + acc_ctx = atomic_read(&map->may_be_accessed_prog_ctx) & BPF_MAP_ACC_PROG_CTX_MASK;
>> + if (acc_ctx) {
>> + if (acc_ctx == BPF_MAP_ACC_NORMAL_PROG_CTX)
>> + synchronize_rcu();
>> + else if (acc_ctx == BPF_MAP_ACC_SLEEPABLE_PROG_CTX)
>> + synchronize_rcu_tasks_trace();
>> + else
>> + synchronize_rcu_mult(call_rcu, call_rcu_tasks_trace);
> and this patch 4 goes to far.
> Could you add sleepable_refcnt in addition to existing refcnt that is incremented
> in outer map when it's used by sleepable prog and when sleepable_refcnt > 0
> the caller of bpf_map_free_deferred sets free_after_mult_rcu_gp.
> (which should be renamed to free_after_tasks_rcu_gp).
> Patch 3 is simpler and patch 4 is simple too.
> No need for atomic_or games.
>
> In addition I'd like to see an extra patch that demonstrates this UAF
> when update/delete is done by syscall bpf prog type.
> The test case in patch 5 is doing update/delete from user space.
Do you mean update/delete operations on the outer map, right? Because
in patch 5, the inner map is updated from a bpf program instead of
from user space.
> If that was the only issue we could have easily extended maybe_wait_bpf_programs()
> to do synchronize_rcu_tasks_trace() and that would close the issue exposed by patch 5.
> But inner maps can indeed be updated by syscall bpf prog and since they run
> under rcu_read_lock_trace() we cannot add synchronize_rcu_tasks_trace() to
> maybe_wait_bpf_programs() because it will deadlock.
> So let's make sure we have test cases for all combinations where inner maps
> can be updated/deleted.
* Re: [PATCH bpf v2 4/5] bpf: Optimize the free of inner map
2023-11-21 6:45 ` Hou Tao
@ 2023-11-21 17:49 ` Alexei Starovoitov
2023-11-22 1:54 ` Hou Tao
0 siblings, 1 reply; 14+ messages in thread
From: Alexei Starovoitov @ 2023-11-21 17:49 UTC (permalink / raw)
To: Hou Tao
Cc: bpf, Martin KaFai Lau, Andrii Nakryiko, Song Liu, Hao Luo,
Yonghong Song, Daniel Borkmann, KP Singh, Stanislav Fomichev,
Jiri Olsa, John Fastabend, Hou Tao
On Mon, Nov 20, 2023 at 10:45 PM Hou Tao <houtao@huaweicloud.com> wrote:
>
> Hi Alexei,
>
> On 11/21/2023 1:19 PM, Alexei Starovoitov wrote:
> > On Mon, Nov 13, 2023 at 08:33:23PM +0800, Hou Tao wrote:
> >> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> >> index e2d2701ce2c45..5a7906f2b027e 100644
> >> --- a/kernel/bpf/syscall.c
> >> +++ b/kernel/bpf/syscall.c
> >> @@ -694,12 +694,20 @@ static void bpf_map_free_deferred(struct work_struct *work)
> >> {
> >> struct bpf_map *map = container_of(work, struct bpf_map, work);
> >> struct btf_record *rec = map->record;
> >> + int acc_ctx;
> >>
> >> security_bpf_map_free(map);
> >> bpf_map_release_memcg(map);
> >>
> >> - if (READ_ONCE(map->free_after_mult_rcu_gp))
> >> - synchronize_rcu_mult(call_rcu, call_rcu_tasks_trace);
> > The previous patch 3 is doing too much.
> > There is maybe_wait_bpf_programs() that will do synchronize_rcu()
> > when necessary.
> > Patch 3 could do synchronize_rcu_tasks_trace() only, and that would solve the issue.
>
> I didn't follow how synchronize_rcu() in maybe_wait_bpf_programs() will
> help bpf_map_free_deferred() defer the free of the inner map. Could you
> please elaborate on that? In my understanding, bpf_map_update_value()
> invokes maybe_wait_bpf_programs() after the deletion of the old inner
> map from the outer map completes. If the ref-count of the inner map held
> by the outer map is the last one, bpf_map_free_deferred() will be called
> when the deletion completes, so maybe_wait_bpf_programs() will run
> concurrently with bpf_map_free_deferred().
The code was quite different back then.
See commit 1ae80cf31938 ("bpf: wait for running BPF programs when
updating map-in-map"), which was added specifically to address the case
where a bpf prog is looking at the old inner map.
The commit log talks about a slightly different issue, but the end
result was the same: it prevented the UAF, since the map free logic was
waiting for a normal RCU GP back then.
See this comment:
void bpf_map_fd_put_ptr(void *ptr)
{
/* ptr->ops->map_free() has to go through one
* rcu grace period by itself.
*/
bpf_map_put(ptr);
}
that code was added when map-in-map was introduced.
Also see this commit:
https://lore.kernel.org/bpf/20220218181801.2971275-1-eric.dumazet@gmail.com/
In cases of batched updates (when multiple inner maps are deleted from
outer map) we should not call sync_rcu for every element being
deleted.
The introduced latency can be bad.
I guess maybe_wait_bpf_programs() was too much brute force.
It would call sync_rcu() regardless of whether the refcnt dropped to
zero; it mainly cares about user space assumptions.
Patches 3 and 4 will wait for sync_rcu only when refcnt == 0,
so they should be ok.
Now we don't have 'wait for rcu gp' in map_free, so
maybe_wait_bpf_programs() is racy, as you pointed out.
bpf_map_put() will drop the refcnt of the inner map, and it might
proceed into bpf_map_free_deferred()->*_map_free() while a bpf prog is
still observing a pointer to that map.
We need to adjust the comment in maybe_wait_bpf_programs() to say that
it only waits for non-sleepable bpf progs.
Sleepable progs might still see the 'old' inner map after the map_delete
syscall returns to user space.
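For reference, maybe_wait_bpf_programs() is roughly the following (from
kernel/bpf/syscall.c around this time, quoted approximately; the second
sentence of the comment is the kind of clarification meant above):

static void maybe_wait_bpf_programs(struct bpf_map *map)
{
        /* Wait for any running BPF programs to complete so that
         * userspace, when we return to it, knows that all programs
         * that could be running use the new map value. Note that this
         * only waits for non-sleepable progs; sleepable progs run
         * under rcu_read_lock_trace() and are not waited for.
         */
        if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS ||
            map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
                synchronize_rcu();
}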
> >
> >> + acc_ctx = atomic_read(&map->may_be_accessed_prog_ctx) & BPF_MAP_ACC_PROG_CTX_MASK;
> >> + if (acc_ctx) {
> >> + if (acc_ctx == BPF_MAP_ACC_NORMAL_PROG_CTX)
> >> + synchronize_rcu();
> >> + else if (acc_ctx == BPF_MAP_ACC_SLEEPABLE_PROG_CTX)
> >> + synchronize_rcu_tasks_trace();
> >> + else
> >> + synchronize_rcu_mult(call_rcu, call_rcu_tasks_trace);
> > and this patch 4 goes too far.
> > Could you add a sleepable_refcnt in addition to the existing refcnt? It
> > would be incremented in the outer map when the map is used by a sleepable
> > prog, and when sleepable_refcnt > 0 the caller of bpf_map_free_deferred()
> > would set free_after_mult_rcu_gp (which should be renamed to
> > free_after_tasks_rcu_gp).
> > That way patch 3 is simpler and patch 4 is simple too.
> > No need for atomic_or() games.
> >
> > In addition, I'd like to see an extra patch that demonstrates this UAF
> > when the update/delete is done by a syscall bpf prog.
> > The test case in patch 5 does update/delete from user space.
>
> You mean update/delete operations on the outer map, right? Because in
> patch 5, the inner map is updated from a bpf program instead of from
> user space.
patch 5 does:
bpf_map_update_elem(inner_map,...
That's only to trigger the UAF.
We need a test that does bpf_map_update_elem(outer_map,...
from a sleepable bpf prog to make sure we do _not_ have code in
the kernel that synchronously waits for RCU tasks trace GP at that time.
So, you're correct, maybe_wait_bpf_programs() is not sufficient anymore,
but we cannot delete it, since it addresses user space assumptions
about what bpf progs see when the inner map is replaced.
I still don't like the atomic_or() logic and masks.
Why not have a sleepable_refcnt and
if (sleepable_refcnt > 0)
        synchronize_rcu_mult(call_rcu, call_rcu_tasks_trace);
else
        synchronize_rcu();
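Something like the following in bpf_map_free_deferred() (just a sketch;
sleepable_refcnt and free_after_tasks_rcu_gp are the proposed names and
do not exist in the tree yet):

static void bpf_map_free_deferred(struct work_struct *work)
{
        struct bpf_map *map = container_of(work, struct bpf_map, work);

        security_bpf_map_free(map);
        bpf_map_release_memcg(map);

        /* set by the caller of bpf_map_free_deferred() when the dying
         * inner map was held by at least one sleepable prog, i.e. when
         * sleepable_refcnt > 0 at the time the last ref was dropped
         */
        if (READ_ONCE(map->free_after_tasks_rcu_gp))
                synchronize_rcu_mult(call_rcu, call_rcu_tasks_trace);
        else
                synchronize_rcu();

        map->ops->map_free(map);
}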
^ permalink raw reply	[flat|nested] 14+ messages in thread
* Re: [PATCH bpf v2 4/5] bpf: Optimize the free of inner map
2023-11-21 17:49 ` Alexei Starovoitov
@ 2023-11-22 1:54 ` Hou Tao
0 siblings, 0 replies; 14+ messages in thread
From: Hou Tao @ 2023-11-22 1:54 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: bpf, Martin KaFai Lau, Andrii Nakryiko, Song Liu, Hao Luo,
Yonghong Song, Daniel Borkmann, KP Singh, Stanislav Fomichev,
Jiri Olsa, John Fastabend, Hou Tao
Hi,
On 11/22/2023 1:49 AM, Alexei Starovoitov wrote:
> On Mon, Nov 20, 2023 at 10:45 PM Hou Tao <houtao@huaweicloud.com> wrote:
>> Hi Alexei,
>>
>> On 11/21/2023 1:19 PM, Alexei Starovoitov wrote:
>>> On Mon, Nov 13, 2023 at 08:33:23PM +0800, Hou Tao wrote:
>>>> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
>>>> index e2d2701ce2c45..5a7906f2b027e 100644
>>>> --- a/kernel/bpf/syscall.c
>>>> +++ b/kernel/bpf/syscall.c
>>>> @@ -694,12 +694,20 @@ static void bpf_map_free_deferred(struct work_struct *work)
>>>> {
>>>> struct bpf_map *map = container_of(work, struct bpf_map, work);
>>>> struct btf_record *rec = map->record;
>>>> + int acc_ctx;
>>>>
>>>> security_bpf_map_free(map);
>>>> bpf_map_release_memcg(map);
>>>>
>>>> - if (READ_ONCE(map->free_after_mult_rcu_gp))
>>>> - synchronize_rcu_mult(call_rcu, call_rcu_tasks_trace);
>>> The previous patch 3 is doing too much.
>>> There is maybe_wait_bpf_programs() that will do synchronize_rcu()
>>> when necessary.
>>> Patch 3 could do synchronize_rcu_tasks_trace() only, and that would solve the issue.
>> I didn't follow how synchronize_rcu() in maybe_wait_bpf_programs() will
>> help bpf_map_free_deferred() defer the free of the inner map. Could you
>> please elaborate on that? In my understanding, bpf_map_update_value()
>> invokes maybe_wait_bpf_programs() after the deletion of the old inner
>> map from the outer map completes. If the ref-count of the inner map held
>> by the outer map is the last one, bpf_map_free_deferred() will be called
>> when the deletion completes, so maybe_wait_bpf_programs() will run
>> concurrently with bpf_map_free_deferred().
> The code was quite different back then.
> See commit 1ae80cf31938 ("bpf: wait for running BPF programs when
> updating map-in-map"), which was added specifically to address the case
> where a bpf prog is looking at the old inner map.
> The commit log talks about a slightly different issue, but the end
> result was the same: it prevented the UAF, since the map free logic was
> waiting for a normal RCU GP back then.
> See this comment:
> void bpf_map_fd_put_ptr(void *ptr)
> {
> /* ptr->ops->map_free() has to go through one
> * rcu grace period by itself.
> */
> bpf_map_put(ptr);
> }
>
> that code was added when map-in-map was introduced.
I see. Thanks for the explanation. If there is still a synchronize_rcu()
in .map_free(), using synchronize_rcu_tasks_trace() in
bpf_map_free_deferred() will be enough.
>
> Also see this commit:
> https://lore.kernel.org/bpf/20220218181801.2971275-1-eric.dumazet@gmail.com/
>
> In cases of batched updates (when multiple inner maps are deleted from
> outer map) we should not call sync_rcu for every element being
> deleted.
> The introduced latency can be bad.
>
> I guess maybe_wait_bpf_programs() was too much brute force.
> It would call sync_rcu() regardless of whether the refcnt dropped to
> zero; it mainly cares about user space assumptions.
> Patches 3 and 4 will wait for sync_rcu only when refcnt == 0,
> so they should be ok.
>
> Now we don't have 'wait for rcu gp' in map_free, so
> maybe_wait_bpf_programs() is racy, as you pointed out.
> bpf_map_put() will drop the refcnt of the inner map, and it might
> proceed into bpf_map_free_deferred()->*_map_free() while a bpf prog is
> still observing a pointer to that map.
> 
> We need to adjust the comment in maybe_wait_bpf_programs() to say that
> it only waits for non-sleepable bpf progs.
> Sleepable progs might still see the 'old' inner map after the map_delete
> syscall returns to user space.
Could we make update/delete operations from the bpf_sys_bpf() helper a
special case that doesn't call synchronize_rcu_tasks_trace() in
maybe_wait_bpf_programs(), but do call synchronize_rcu_tasks_trace() for
all other cases?
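For context, such an update comes from a syscall prog along these lines
(a sketch only; the ctx layout and fd plumbing are assumed here, not
taken from the series):

struct update_ctx {
        int outer_map_fd;
        int inner_map_fd;
        int key;
};

SEC("syscall")
int update_outer_map(struct update_ctx *ctx)
{
        union bpf_attr attr = {};
        int key = ctx->key;
        int inner_fd = ctx->inner_map_fd;

        attr.map_fd = ctx->outer_map_fd;
        attr.key = (__u64)(unsigned long)&key;
        attr.value = (__u64)(unsigned long)&inner_fd;

        /* this prog runs under rcu_read_lock_trace(), which is why a
         * synchronize_rcu_tasks_trace() in maybe_wait_bpf_programs()
         * would deadlock on this path
         */
        return bpf_sys_bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
}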
>
>
>>>> + acc_ctx = atomic_read(&map->may_be_accessed_prog_ctx) & BPF_MAP_ACC_PROG_CTX_MASK;
>>>> + if (acc_ctx) {
>>>> + if (acc_ctx == BPF_MAP_ACC_NORMAL_PROG_CTX)
>>>> + synchronize_rcu();
>>>> + else if (acc_ctx == BPF_MAP_ACC_SLEEPABLE_PROG_CTX)
>>>> + synchronize_rcu_tasks_trace();
>>>> + else
>>>> + synchronize_rcu_mult(call_rcu, call_rcu_tasks_trace);
>>> and this patch 4 goes too far.
>>> Could you add a sleepable_refcnt in addition to the existing refcnt? It
>>> would be incremented in the outer map when the map is used by a sleepable
>>> prog, and when sleepable_refcnt > 0 the caller of bpf_map_free_deferred()
>>> would set free_after_mult_rcu_gp (which should be renamed to
>>> free_after_tasks_rcu_gp).
>>> That way patch 3 is simpler and patch 4 is simple too.
>>> No need for atomic_or() games.
>>>
>>> In addition, I'd like to see an extra patch that demonstrates this UAF
>>> when the update/delete is done by a syscall bpf prog.
>>> The test case in patch 5 does update/delete from user space.
>> You mean update/delete operations on the outer map, right? Because in
>> patch 5, the inner map is updated from a bpf program instead of from
>> user space.
> patch 5 does:
> bpf_map_update_elem(inner_map,...
> 
> That's only to trigger the UAF.
> We need a test that does bpf_map_update_elem(outer_map,...
> from a sleepable bpf prog to make sure we do _not_ have code in
> the kernel that synchronously waits for RCU tasks trace GP at that time.
I think we already have such a test case in prog_tests/map_ptr.c. It
uses a light skeleton to set up the map-in-map defined in
progs/map_ptr_kern.c. I got the deadlock when trying to do
synchronize_rcu_tasks_trace() in bpf_map_fd_put_ptr(), but I could add
an explicit one.
> So, you're correct, maybe_wait_bpf_programs() is not sufficient anymore,
> but we cannot delete it, since it addresses user space assumptions
> about what bpf progs see when the inner map is replaced.
>
> I still don't like the atomic_or() logic and masks.
> Why not have a sleepable_refcnt and
> if (sleepable_refcnt > 0)
>         synchronize_rcu_mult(call_rcu, call_rcu_tasks_trace);
> else
>         synchronize_rcu();
I think the main reason is that there are four possible cases for the
free of the inner map:
(1) call neither synchronize_rcu() nor synchronize_rcu_tasks_trace():
    the outer map is only accessed from user space
(2) only call synchronize_rcu():
    the outer map is only accessed by non-sleepable bpf programs
(3) only call synchronize_rcu_tasks_trace():
    the outer map is only accessed by sleepable bpf programs
(4) call both synchronize_rcu() and synchronize_rcu_tasks_trace():
    the outer map is accessed by both kinds of bpf programs
Using only sleepable_refcnt cannot express all four cases, and we also
need to atomically copy the state from the outer map to the inner map,
because one inner map may be used concurrently by multiple outer maps,
so atomic_or() and masks were chosen.
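For illustration, the encoding works along these lines (a sketch; the
flag values are assumed here, only the names come from patch 4):

/* one bit per kind of program context the map may be accessed from */
#define BPF_MAP_ACC_NORMAL_PROG_CTX     0x1
#define BPF_MAP_ACC_SLEEPABLE_PROG_CTX  0x2
#define BPF_MAP_ACC_PROG_CTX_MASK \
        (BPF_MAP_ACC_NORMAL_PROG_CTX | BPF_MAP_ACC_SLEEPABLE_PROG_CTX)

/* when an inner map fd is stored into an outer map, atomically
 * propagate the outer map's bits into the inner map, since one inner
 * map may be shared by multiple outer maps
 */
atomic_or(atomic_read(&outer_map->may_be_accessed_prog_ctx),
          &inner_map->may_be_accessed_prog_ctx);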
^ permalink raw reply [flat|nested] 14+ messages in thread
* [PATCH bpf v2 5/5] selftests/bpf: Add test cases for inner map
2023-11-13 12:33 [PATCH bpf v2 0/5] bpf: Fix the release of inner map Hou Tao
` (3 preceding siblings ...)
2023-11-13 12:33 ` [PATCH bpf v2 4/5] bpf: Optimize the free of inner map Hou Tao
@ 2023-11-13 12:33 ` Hou Tao
4 siblings, 0 replies; 14+ messages in thread
From: Hou Tao @ 2023-11-13 12:33 UTC (permalink / raw)
To: bpf
Cc: Martin KaFai Lau, Alexei Starovoitov, Andrii Nakryiko, Song Liu,
Hao Luo, Yonghong Song, Daniel Borkmann, KP Singh,
Stanislav Fomichev, Jiri Olsa, John Fastabend, houtao1
From: Hou Tao <houtao1@huawei.com>
Add test cases to exercise the race between the destruction of the inner
map due to a map-in-map update and the access of the inner map from a
bpf program. The following 4 combinations are added:
(1) array map in map array + bpf program
(2) array map in map array + sleepable bpf program
(3) array map in map htab + bpf program
(4) array map in map htab + sleepable bpf program
Before applying the fixes, when running `./test_progs -a map_in_map`, the
following error was reported:
==================================================================
BUG: KASAN: slab-use-after-free in array_map_update_elem+0x48/0x3e0
Read of size 4 at addr ffff888114f33824 by task test_progs/1858
CPU: 1 PID: 1858 Comm: test_progs Tainted: G O 6.6.0+ #7
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996) ......
Call Trace:
<TASK>
dump_stack_lvl+0x4a/0x90
print_report+0xd2/0x620
kasan_report+0xd1/0x110
__asan_load4+0x81/0xa0
array_map_update_elem+0x48/0x3e0
bpf_prog_be94a9f26772f5b7_access_map_in_array+0xe6/0xf6
trace_call_bpf+0x1aa/0x580
kprobe_perf_func+0xdd/0x430
kprobe_dispatcher+0xa0/0xb0
kprobe_ftrace_handler+0x18b/0x2e0
0xffffffffc02280f7
RIP: 0010:__x64_sys_getpgid+0x1/0x30
......
</TASK>
Allocated by task 1857:
kasan_save_stack+0x26/0x50
kasan_set_track+0x25/0x40
kasan_save_alloc_info+0x1e/0x30
__kasan_kmalloc+0x98/0xa0
__kmalloc_node+0x6a/0x150
__bpf_map_area_alloc+0x141/0x170
bpf_map_area_alloc+0x10/0x20
array_map_alloc+0x11f/0x310
map_create+0x28a/0xb40
__sys_bpf+0x753/0x37c0
__x64_sys_bpf+0x44/0x60
do_syscall_64+0x36/0xb0
entry_SYSCALL_64_after_hwframe+0x6e/0x76
Freed by task 11:
kasan_save_stack+0x26/0x50
kasan_set_track+0x25/0x40
kasan_save_free_info+0x2b/0x50
__kasan_slab_free+0x113/0x190
slab_free_freelist_hook+0xd7/0x1e0
__kmem_cache_free+0x170/0x260
kfree+0x9b/0x160
kvfree+0x2d/0x40
bpf_map_area_free+0xe/0x20
array_map_free+0x120/0x2c0
bpf_map_free_deferred+0xd7/0x1e0
process_one_work+0x462/0x990
worker_thread+0x370/0x670
kthread+0x1b0/0x200
ret_from_fork+0x3a/0x70
ret_from_fork_asm+0x1b/0x30
Last potentially related work creation:
kasan_save_stack+0x26/0x50
__kasan_record_aux_stack+0x94/0xb0
kasan_record_aux_stack_noalloc+0xb/0x20
__queue_work+0x331/0x950
queue_work_on+0x75/0x80
bpf_map_put+0xfa/0x160
bpf_map_fd_put_ptr+0xe/0x20
bpf_fd_array_map_update_elem+0x174/0x1b0
bpf_map_update_value+0x2b7/0x4a0
__sys_bpf+0x2551/0x37c0
__x64_sys_bpf+0x44/0x60
do_syscall_64+0x36/0xb0
entry_SYSCALL_64_after_hwframe+0x6e/0x76
Signed-off-by: Hou Tao <houtao1@huawei.com>
---
.../selftests/bpf/prog_tests/map_in_map.c | 141 ++++++++++++++++++
.../selftests/bpf/progs/access_map_in_map.c | 93 ++++++++++++
2 files changed, 234 insertions(+)
create mode 100644 tools/testing/selftests/bpf/prog_tests/map_in_map.c
create mode 100644 tools/testing/selftests/bpf/progs/access_map_in_map.c
diff --git a/tools/testing/selftests/bpf/prog_tests/map_in_map.c b/tools/testing/selftests/bpf/prog_tests/map_in_map.c
new file mode 100644
index 0000000000000..d2a10eb4e5b52
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/map_in_map.c
@@ -0,0 +1,141 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2023. Huawei Technologies Co., Ltd */
+#define _GNU_SOURCE
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <test_progs.h>
+#include <bpf/btf.h>
+#include "access_map_in_map.skel.h"
+
+struct thread_ctx {
+ pthread_barrier_t barrier;
+ int outer_map_fd;
+ int start, abort;
+ int loop, err;
+};
+
+static int wait_for_start_or_abort(struct thread_ctx *ctx)
+{
+ while (!ctx->start && !ctx->abort)
+ usleep(1);
+ return ctx->abort ? -1 : 0;
+}
+
+static void *update_map_fn(void *data)
+{
+ struct thread_ctx *ctx = data;
+ int loop = ctx->loop, err = 0;
+
+ if (wait_for_start_or_abort(ctx) < 0)
+ return NULL;
+ pthread_barrier_wait(&ctx->barrier);
+
+ while (loop-- > 0) {
+ int fd, zero = 0;
+
+ fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, NULL, 4, 4, 1, NULL);
+ if (fd < 0) {
+ err |= 1;
+ pthread_barrier_wait(&ctx->barrier);
+ continue;
+ }
+
+ /* Remove the old inner map */
+ if (bpf_map_update_elem(ctx->outer_map_fd, &zero, &fd, 0) < 0)
+ err |= 2;
+ close(fd);
+ pthread_barrier_wait(&ctx->barrier);
+ }
+
+ ctx->err = err;
+
+ return NULL;
+}
+
+static void *access_map_fn(void *data)
+{
+ struct thread_ctx *ctx = data;
+ int loop = ctx->loop;
+
+ if (wait_for_start_or_abort(ctx) < 0)
+ return NULL;
+ pthread_barrier_wait(&ctx->barrier);
+
+ while (loop-- > 0) {
+ /* Access the old inner map */
+ syscall(SYS_getpgid);
+ pthread_barrier_wait(&ctx->barrier);
+ }
+
+ return NULL;
+}
+
+static void test_map_in_map_access(const char *prog_name, const char *map_name)
+{
+ struct access_map_in_map *skel;
+ struct bpf_map *outer_map;
+ struct bpf_program *prog;
+ struct thread_ctx ctx;
+ pthread_t tid[2];
+ int err;
+
+ skel = access_map_in_map__open();
+ if (!ASSERT_OK_PTR(skel, "access_map_in_map open"))
+ return;
+
+ prog = bpf_object__find_program_by_name(skel->obj, prog_name);
+ if (!ASSERT_OK_PTR(prog, "find program"))
+ goto out;
+ bpf_program__set_autoload(prog, true);
+
+ outer_map = bpf_object__find_map_by_name(skel->obj, map_name);
+ if (!ASSERT_OK_PTR(outer_map, "find map"))
+ goto out;
+
+ err = access_map_in_map__load(skel);
+ if (!ASSERT_OK(err, "access_map_in_map load"))
+ goto out;
+
+ err = access_map_in_map__attach(skel);
+ if (!ASSERT_OK(err, "access_map_in_map attach"))
+ goto out;
+
+ skel->bss->tgid = getpid();
+
+ memset(&ctx, 0, sizeof(ctx));
+ pthread_barrier_init(&ctx.barrier, NULL, 2);
+ ctx.outer_map_fd = bpf_map__fd(outer_map);
+ ctx.loop = 4;
+
+ err = pthread_create(&tid[0], NULL, update_map_fn, &ctx);
+ if (!ASSERT_OK(err, "close_thread"))
+ goto out;
+
+ err = pthread_create(&tid[1], NULL, access_map_fn, &ctx);
+ if (!ASSERT_OK(err, "read_thread")) {
+ ctx.abort = 1;
+ pthread_join(tid[0], NULL);
+ goto out;
+ }
+
+ ctx.start = 1;
+ pthread_join(tid[0], NULL);
+ pthread_join(tid[1], NULL);
+
+ ASSERT_OK(ctx.err, "err");
+out:
+ access_map_in_map__destroy(skel);
+}
+
+void test_map_in_map(void)
+{
+ if (test__start_subtest("acc_map_in_array"))
+ test_map_in_map_access("access_map_in_array", "outer_array_map");
+ if (test__start_subtest("sleepable_acc_map_in_array"))
+ test_map_in_map_access("sleepable_access_map_in_array", "outer_array_map");
+ if (test__start_subtest("acc_map_in_htab"))
+ test_map_in_map_access("access_map_in_htab", "outer_htab_map");
+ if (test__start_subtest("sleepable_acc_map_in_htab"))
+ test_map_in_map_access("sleepable_access_map_in_htab", "outer_htab_map");
+}
+
diff --git a/tools/testing/selftests/bpf/progs/access_map_in_map.c b/tools/testing/selftests/bpf/progs/access_map_in_map.c
new file mode 100644
index 0000000000000..1126871c2ebd8
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/access_map_in_map.c
@@ -0,0 +1,93 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2023. Huawei Technologies Co., Ltd */
+#include <linux/bpf.h>
+#include <time.h>
+#include <bpf/bpf_helpers.h>
+
+#include "bpf_misc.h"
+
+struct inner_map_type {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(key_size, 4);
+ __uint(value_size, 4);
+ __uint(max_entries, 1);
+} inner_map SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
+ __type(key, int);
+ __type(value, int);
+ __uint(max_entries, 1);
+ __array(values, struct inner_map_type);
+} outer_array_map SEC(".maps") = {
+ .values = {
+ [0] = &inner_map,
+ },
+};
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH_OF_MAPS);
+ __type(key, int);
+ __type(value, int);
+ __uint(max_entries, 1);
+ __array(values, struct inner_map_type);
+} outer_htab_map SEC(".maps") = {
+ .values = {
+ [0] = &inner_map,
+ },
+};
+
+char _license[] SEC("license") = "GPL";
+
+int tgid = 0;
+
+static int acc_map_in_map(void *outer_map)
+{
+ int i, key, value = 0xdeadbeef;
+ void *inner_map;
+
+ if ((bpf_get_current_pid_tgid() >> 32) != tgid)
+ return 0;
+
+ /* Find nonexistent inner map */
+ key = 1;
+ inner_map = bpf_map_lookup_elem(outer_map, &key);
+ if (inner_map)
+ return 0;
+
+ /* Find the old inner map */
+ key = 0;
+ inner_map = bpf_map_lookup_elem(outer_map, &key);
+ if (!inner_map)
+ return 0;
+
+ /* Wait for the old inner map to be replaced */
+ for (i = 0; i < 2048; i++)
+ bpf_map_update_elem(inner_map, &key, &value, 0);
+
+ return 0;
+}
+
+SEC("?kprobe/" SYS_PREFIX "sys_getpgid")
+int access_map_in_array(void *ctx)
+{
+ return acc_map_in_map(&outer_array_map);
+}
+
+SEC("?fentry.s/" SYS_PREFIX "sys_getpgid")
+int sleepable_access_map_in_array(void *ctx)
+{
+ return acc_map_in_map(&outer_array_map);
+}
+
+SEC("?kprobe/" SYS_PREFIX "sys_getpgid")
+int access_map_in_htab(void *ctx)
+{
+ return acc_map_in_map(&outer_htab_map);
+}
+
+SEC("?fentry.s/" SYS_PREFIX "sys_getpgid")
+int sleepable_access_map_in_htab(void *ctx)
+{
+ return acc_map_in_map(&outer_htab_map);
+}
--
2.29.2
^ permalink raw reply related [flat|nested] 14+ messages in thread