From: Yonghong Song <yhs@fb.com>
To: Denis Salopek <denis.salopek@sartura.hr>, <bpf@vger.kernel.org>
Cc: <juraj.vijtiuk@sartura.hr>, <luka.oreskovic@sartura.hr>,
<luka.perkov@sartura.hr>
Subject: Re: [PATCH v2 bpf-next] bpf: add lookup_and_delete_elem support to hashtab
Date: Mon, 1 Mar 2021 18:02:37 -0800 [thread overview]
Message-ID: <d8104f41-4302-7b68-5bc1-fb014a261e42@fb.com> (raw)
In-Reply-To: <YDtk/vr/lk62L4KP@gmail.com>
On 2/28/21 1:40 AM, Denis Salopek wrote:
> Extend the existing bpf_map_lookup_and_delete_elem() functionality to
> hashtab maps, in addition to stacks and queues.
> Create a new hashtab bpf_map_ops function that does lookup and deletion
> of the element under the same bucket lock and add the created map_ops to
> bpf.h.
> Add the appropriate test cases to 'maps' and 'lru_map' selftests
> accompanied with new test_progs.
>
> Cc: Juraj Vijtiuk <juraj.vijtiuk@sartura.hr>
> Cc: Luka Oreskovic <luka.oreskovic@sartura.hr>
> Cc: Luka Perkov <luka.perkov@sartura.hr>
> Signed-off-by: Denis Salopek <denis.salopek@sartura.hr>
> ---
> v2: Add functionality for LRU/per-CPU, add test_progs tests.
> ---
> include/linux/bpf.h | 2 +
> kernel/bpf/hashtab.c | 80 +++++
> kernel/bpf/syscall.c | 14 +-
> .../bpf/prog_tests/lookup_and_delete.c | 283 ++++++++++++++++++
> .../bpf/progs/test_lookup_and_delete.c | 26 ++
> tools/testing/selftests/bpf/test_lru_map.c | 8 +
> tools/testing/selftests/bpf/test_maps.c | 19 +-
> 7 files changed, 430 insertions(+), 2 deletions(-)
> create mode 100644 tools/testing/selftests/bpf/prog_tests/lookup_and_delete.c
> create mode 100644 tools/testing/selftests/bpf/progs/test_lookup_and_delete.c
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 4c730863fa77..0bcc4f89af40 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -67,6 +67,8 @@ struct bpf_map_ops {
> void *(*map_lookup_elem_sys_only)(struct bpf_map *map, void *key);
> int (*map_lookup_batch)(struct bpf_map *map, const union bpf_attr *attr,
> union bpf_attr __user *uattr);
> + int (*map_lookup_and_delete_elem)(struct bpf_map *map, void *key,
> + void *value);
> int (*map_lookup_and_delete_batch)(struct bpf_map *map,
> const union bpf_attr *attr,
> union bpf_attr __user *uattr);
> diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
> index 330d721dd2af..8c3334d1b6b3 100644
> --- a/kernel/bpf/hashtab.c
> +++ b/kernel/bpf/hashtab.c
> @@ -1401,6 +1401,82 @@ static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
> rcu_read_unlock();
> }
>
> +static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
> + void *value, bool is_lru_map,
> + bool is_percpu)
> +{
> + struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
> + u32 hash, key_size, value_size;
> + struct hlist_nulls_head *head;
> + int cpu, off = 0, ret;
> + struct htab_elem *l;
> + unsigned long flags;
> + void __percpu *pptr;
> + struct bucket *b;
> +
> + key_size = map->key_size;
> + value_size = round_up(map->value_size, 8);
> +
> + hash = htab_map_hash(key, key_size, htab->hashrnd);
> + b = __select_bucket(htab, hash);
> + head = &b->head;
> +
> + ret = htab_lock_bucket(htab, b, hash, &flags);
> + if (ret)
> + return ret;
> +
> + l = lookup_elem_raw(head, hash, key, key_size);
> + if (l) {
> + if (is_percpu) {
> + pptr = htab_elem_get_ptr(l, key_size);
> + for_each_possible_cpu(cpu) {
> + bpf_long_memcpy(value + off,
> + per_cpu_ptr(pptr, cpu),
> + value_size);
> + off += value_size;
> + }
> + } else {
> + copy_map_value(map, value, l->key + round_up(key_size, 8));
For hashtab lookup elem, BPF_F_LOCK flag may be set by user, I think
hashtab lookup_and_delete_elem should also support this flag, so user
can ensure they always get a lock protected sane value.
We have the following libbpf APIs.
LIBBPF_API int bpf_map_lookup_elem(int fd, const void *key, void *value);
LIBBPF_API int bpf_map_lookup_elem_flags(int fd, const void *key, void
*value,
__u64 flags);
LIBBPF_API int bpf_map_lookup_and_delete_elem(int fd, const void *key,
void *value);
Previously, bpf_map_lookup_and_delete_elem only supports queue/stack,
which does not need flags as it does not support BPF_F_LOCK so we
are fine.
Maybe similar to bpf_map_lookup_elem_flags() we add a
bpf_map_lookup_and_delete_elem_flags()? Maybe libbpf v1.0
can consolidate into a better uniform api.
> + }
> +
> + hlist_nulls_del_rcu(&l->hash_node);
> + if (!is_lru_map)
> + free_htab_elem(htab, l);
> + } else
> + ret = -ENOENT;
> +
> + htab_unlock_bucket(htab, b, hash, flags);
> +
> + if (is_lru_map && l)
> + bpf_lru_push_free(&htab->lru, &l->lru_node);
> +
> + return ret;
> +}
> +
> +static int htab_map_lookup_and_delete_elem(struct bpf_map *map,
> + void *key, void *value)
> +{
> + return __htab_map_lookup_and_delete_elem(map, key, value, false, false);
> +}
> +
> +static int htab_percpu_map_lookup_and_delete_elem(struct bpf_map *map,
> + void *key, void *value)
> +{
> + return __htab_map_lookup_and_delete_elem(map, key, value, false, true);
> +}
> +
> +static int htab_lru_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
> + void *value)
> +{
> + return __htab_map_lookup_and_delete_elem(map, key, value, true, false);
> +}
> +
> +static int htab_lru_percpu_map_lookup_and_delete_elem(struct bpf_map *map,
> + void *key, void *value)
> +{
> + return __htab_map_lookup_and_delete_elem(map, key, value, true, true);
> +}
> +
> static int
> __htab_map_lookup_and_delete_batch(struct bpf_map *map,
> const union bpf_attr *attr,
> @@ -1934,6 +2010,7 @@ const struct bpf_map_ops htab_map_ops = {
> .map_free = htab_map_free,
> .map_get_next_key = htab_map_get_next_key,
> .map_lookup_elem = htab_map_lookup_elem,
> + .map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem,
> .map_update_elem = htab_map_update_elem,
> .map_delete_elem = htab_map_delete_elem,
> .map_gen_lookup = htab_map_gen_lookup,
> @@ -1954,6 +2031,7 @@ const struct bpf_map_ops htab_lru_map_ops = {
> .map_free = htab_map_free,
> .map_get_next_key = htab_map_get_next_key,
> .map_lookup_elem = htab_lru_map_lookup_elem,
> + .map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem,
> .map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,
> .map_update_elem = htab_lru_map_update_elem,
> .map_delete_elem = htab_lru_map_delete_elem,
> @@ -2077,6 +2155,7 @@ const struct bpf_map_ops htab_percpu_map_ops = {
> .map_free = htab_map_free,
> .map_get_next_key = htab_map_get_next_key,
> .map_lookup_elem = htab_percpu_map_lookup_elem,
> + .map_lookup_and_delete_elem = htab_percpu_map_lookup_and_delete_elem,
> .map_update_elem = htab_percpu_map_update_elem,
> .map_delete_elem = htab_map_delete_elem,
> .map_seq_show_elem = htab_percpu_map_seq_show_elem,
> @@ -2096,6 +2175,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = {
> .map_free = htab_map_free,
> .map_get_next_key = htab_map_get_next_key,
> .map_lookup_elem = htab_lru_percpu_map_lookup_elem,
> + .map_lookup_and_delete_elem = htab_lru_percpu_map_lookup_and_delete_elem,
> .map_update_elem = htab_lru_percpu_map_update_elem,
> .map_delete_elem = htab_lru_map_delete_elem,
> .map_seq_show_elem = htab_percpu_map_seq_show_elem,
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index c859bc46d06c..2634aa4a2f37 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -1495,7 +1495,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
> goto err_put;
> }
>
> - value_size = map->value_size;
> + value_size = bpf_map_value_size(map);
>
> err = -ENOMEM;
> value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
> @@ -1505,6 +1505,18 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
> if (map->map_type == BPF_MAP_TYPE_QUEUE ||
> map->map_type == BPF_MAP_TYPE_STACK) {
> err = map->ops->map_pop_elem(map, value);
> + } else if (map->map_type == BPF_MAP_TYPE_HASH ||
> + map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
> + map->map_type == BPF_MAP_TYPE_LRU_HASH ||
> + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
> + if (!bpf_map_is_dev_bound(map)) {
> + bpf_disable_instrumentation();
> + rcu_read_lock();
> + err = map->ops->map_lookup_and_delete_elem(map, key, value);
> + rcu_read_unlock();
> + bpf_enable_instrumentation();
> + maybe_wait_bpf_programs(map);
maybe_wait_bpf_programs(map) is mostly for map-in-map.
but I think it is okay to put it here in case in the future
we will support map-in-map here. If maybe_wait_bpf_programs()
get inlined which mostly likely is the case, the compiler
should be able to optimize it away.
> + }
> } else {
> err = -ENOTSUPP;
> }
> diff --git a/tools/testing/selftests/bpf/prog_tests/lookup_and_delete.c b/tools/testing/selftests/bpf/prog_tests/lookup_and_delete.c
> new file mode 100644
> index 000000000000..05123bbcdc1c
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/prog_tests/lookup_and_delete.c
> @@ -0,0 +1,283 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +
> +#include <test_progs.h>
> +#include "test_lookup_and_delete.skel.h"
> +
> +#define START_VALUE 1234
> +#define NEW_VALUE 4321
> +#define MAX_ENTRIES 2
> +
> +static int duration;
> +static int nr_cpus;
> +
> +static int fill_values(int map_fd)
> +{
> + __u64 key, value = START_VALUE;
> + int err;
> +
> + for (key = 1; key < MAX_ENTRIES + 1; key++) {
> + err = bpf_map_update_elem(map_fd, &key, &value, BPF_NOEXIST);
> + if (CHECK(err, "bpf_map_update_elem", "failed\n"))
You can use
if (!ASSERT_OK(err, "bpf_map_update_elem"))
to save you from explicit "failed" string.
The same for some later other CHECK usages.
> + return -1;
> + }
> +
> + return 0;
> +}
> +
> +static int fill_values_percpu(int map_fd)
> +{
> + BPF_DECLARE_PERCPU(__u64, value);
> + int i, err;
> + u64 key;
> +
> + for (i = 0; i < nr_cpus; i++)
> + bpf_percpu(value, i) = START_VALUE;
> +
> + for (key = 1; key < MAX_ENTRIES + 1; key++) {
> + err = bpf_map_update_elem(map_fd, &key, value, BPF_NOEXIST);
> + if (CHECK(err, "bpf_map_update_elem", "failed\n"))
> + return -1;
> + }
> +
> + return 0;
> +}
> +
[...]
> diff --git a/tools/testing/selftests/bpf/progs/test_lookup_and_delete.c b/tools/testing/selftests/bpf/progs/test_lookup_and_delete.c
> new file mode 100644
> index 000000000000..eb19de8bb415
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/progs/test_lookup_and_delete.c
> @@ -0,0 +1,26 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +#include "vmlinux.h"
> +#include <bpf/bpf_helpers.h>
> +
> +__u32 set_pid;
> +__u64 set_key;
> +__u64 set_value;
Please add "= 0" to the above declaration to make
it llvm10 friendly.
> +
> +struct {
> + __uint(type, BPF_MAP_TYPE_HASH);
> + __uint(max_entries, 2);
> + __type(key, __u64);
> + __type(value, __u64);
> +} hash_map SEC(".maps");
> +
> +SEC("tp/syscalls/sys_enter_getpgid")
> +int bpf_lookup_and_delete_test(const void *ctx)
> +{
> + if (set_pid == bpf_get_current_pid_tgid() >> 32)
> + bpf_map_update_elem(&hash_map, &set_key, &set_value, BPF_NOEXIST);
> +
> + return 0;
> +}
> +
> +char _license[] SEC("license") = "GPL";
[...]
next prev parent reply other threads:[~2021-03-02 10:38 UTC|newest]
Thread overview: 4+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-02-28 9:40 [PATCH v2 bpf-next] bpf: add lookup_and_delete_elem support to hashtab Denis Salopek
2021-03-02 2:02 ` Yonghong Song [this message]
2021-03-13 8:54 ` Denis Salopek
2021-03-15 0:08 ` Yonghong Song
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=d8104f41-4302-7b68-5bc1-fb014a261e42@fb.com \
--to=yhs@fb.com \
--cc=bpf@vger.kernel.org \
--cc=denis.salopek@sartura.hr \
--cc=juraj.vijtiuk@sartura.hr \
--cc=luka.oreskovic@sartura.hr \
--cc=luka.perkov@sartura.hr \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox