From: Mykyta Yatsenko <mykyta.yatsenko5@gmail.com>
To: bpf@vger.kernel.org, ast@kernel.org, andrii@kernel.org,
daniel@iogearbox.net, kafai@meta.com, kernel-team@meta.com,
eddyz87@gmail.com, memxor@gmail.com,
herbert@gondor.apana.org.au
Cc: Mykyta Yatsenko <yatsenko@meta.com>
Subject: [PATCH bpf-next v4 04/11] bpf: Implement iteration ops for resizable hashtab
Date: Wed, 13 May 2026 15:36:07 -0700 [thread overview]
Message-ID: <20260513-rhash-v4-4-dd3d541ccb0b@meta.com> (raw)
In-Reply-To: <20260513-rhash-v4-0-dd3d541ccb0b@meta.com>
From: Mykyta Yatsenko <yatsenko@meta.com>
Implement rhtab_map_get_next_key(), batch lookup/lookup-and-delete,
and the seq_file BPF iterator for BPF_MAP_TYPE_RHASH.
All three use rhashtable_next_key() with the previous key as cursor.
This avoids the rhashtable walker state machine and its spinlock
acquisition per enter/exit, and lets callers stay stateless across
syscalls. ERR_PTR(-ENOENT) from rhashtable_next_key() (cursor was
concurrently deleted) is handled per callsite: get_next_key falls
back to the first key (matches BPF UAPI semantics of htab); the
seq_file iterator and batch terminate iteration.
Also implements rhtab_map_mem_usage() to report memory consumption.
Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
---
kernel/bpf/hashtab.c | 290 ++++++++++++++++++++++++++++++++++++++++++++++++--
kernel/bpf/map_iter.c | 3 +-
2 files changed, 284 insertions(+), 9 deletions(-)
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index da1754e1a525..61eb88cb9229 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -3021,68 +3021,342 @@ static void rhtab_map_free_internal_structs(struct bpf_map *map)
}
static int rhtab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+ __must_hold_shared(RCU)
{
- return -EOPNOTSUPP;
+ struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map);
+ struct rhtab_elem *elem;
+
+ elem = rhashtable_next_key(&rhtab->ht, key, rhtab_params);
+
+ /* if not found, return the first key */
+ if (PTR_ERR(elem) == -ENOENT)
+ elem = rhashtable_next_key(&rhtab->ht, NULL, rhtab_params);
+
+ if (!elem)
+ return -ENOENT;
+
+ memcpy(next_key, elem->data, map->key_size);
+ return 0;
}
static void rhtab_map_seq_show_elem(struct bpf_map *map, void *key, struct seq_file *m)
{
+ void *value;
+
+ /* Guarantee that hashtab value is not freed */
+ guard(rcu)();
+
+ value = rhtab_map_lookup_elem(map, key);
+ if (!value)
+ return;
+
+ btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
+ seq_puts(m, ": ");
+ btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
+ seq_putc(m, '\n');
}
static long bpf_each_rhash_elem(struct bpf_map *map, bpf_callback_t callback_fn,
void *callback_ctx, u64 flags)
{
- return -EOPNOTSUPP;
+ struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map);
+ void *prev_key = NULL;
+ struct rhtab_elem *elem;
+ int num_elems = 0;
+ void *key_buf = NULL;
+ u64 ret = 0;
+
+ if (flags != 0)
+ return -EINVAL;
+
+ /*
+ * Buffer for stashing the cursor across rcu_read_unlock() in the
+ * cond_resched path.
+ */
+ key_buf = kmalloc_nolock(map->key_size, 0, NUMA_NO_NODE);
+ if (!key_buf)
+ return -ENOMEM;
+
+ rcu_read_lock();
+ /*
+ * Best-effort iteration: if rhashtable is concurrently resized or
+ * elements are deleted/inserted, there may be missed or duplicate
+ * elements visited.
+ */
+ while ((elem = rhashtable_next_key(&rhtab->ht, prev_key, rhtab_params))) {
+ if (IS_ERR(elem))
+ break;
+ num_elems++;
+ ret = callback_fn((u64)(long)map,
+ (u64)(long)elem->data,
+ (u64)(long)rhtab_elem_value(elem, map->key_size),
+ (u64)(long)callback_ctx, 0);
+ if (ret)
+ break;
+
+ prev_key = elem->data; /* valid while RCU held */
+
+ if (need_resched()) {
+ memcpy(key_buf, elem->data, map->key_size);
+ rcu_read_unlock();
+ cond_resched();
+ rcu_read_lock();
+ prev_key = key_buf;
+ }
+ }
+ rcu_read_unlock();
+
+ kfree_nolock(key_buf);
+ return num_elems;
}
static u64 rhtab_map_mem_usage(const struct bpf_map *map)
{
- return 0;
+ struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map);
+ u64 num_entries;
+
+ /* Excludes rhashtable bucket overhead (~ nelems * sizeof(void *) at 75% load). */
+ num_entries = atomic_read(&rhtab->ht.nelems);
+ return sizeof(struct bpf_rhtab) + rhtab->elem_size * num_entries;
+}
+
+static int __rhtab_map_lookup_and_delete_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr,
+ bool do_delete)
+{
+ struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map);
+ void __user *uvalues = u64_to_user_ptr(attr->batch.values);
+ void __user *ukeys = u64_to_user_ptr(attr->batch.keys);
+ void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
+ void *cursor = NULL, *keys = NULL, *values = NULL, *dst_key, *dst_val;
+ struct rhtab_elem **del_elems = NULL;
+ u32 max_count, total, key_size, value_size, i;
+ bool has_next_cursor = false;
+ struct rhtab_elem *elem;
+ u64 elem_map_flags, map_flags;
+ int ret = 0;
+
+ elem_map_flags = attr->batch.elem_flags;
+ ret = bpf_map_check_op_flags(map, elem_map_flags, BPF_F_LOCK);
+ if (ret)
+ return ret;
+
+ map_flags = attr->batch.flags;
+ if (map_flags)
+ return -EINVAL;
+
+ max_count = attr->batch.count;
+ if (!max_count)
+ return 0;
+
+ if (put_user(0, &uattr->batch.count))
+ return -EFAULT;
+
+ key_size = map->key_size;
+ value_size = map->value_size;
+
+ keys = kvmalloc_array(max_count, key_size, GFP_USER | __GFP_NOWARN);
+ values = kvmalloc_array(max_count, value_size, GFP_USER | __GFP_NOWARN);
+ if (do_delete)
+ del_elems = kvmalloc_array(max_count, sizeof(void *),
+ GFP_USER | __GFP_NOWARN);
+ cursor = kmalloc(key_size, GFP_USER | __GFP_NOWARN);
+
+ if (!keys || !values || !cursor || (do_delete && !del_elems)) {
+ ret = -ENOMEM;
+ goto free;
+ }
+
+ if (ubatch && copy_from_user(cursor, ubatch, key_size)) {
+ ret = -EFAULT;
+ goto free;
+ }
+
+ dst_key = keys;
+ dst_val = values;
+ total = 0;
+
+ rcu_read_lock();
+
+ /*
+ * If the cursor key was concurrently deleted, rhashtable_next_key()
+ * returns ERR_PTR(-ENOENT); the batch terminates with no elements
+ * and userspace must restart from a NULL cursor.
+ */
+ elem = rhashtable_next_key(&rhtab->ht, ubatch ? cursor : NULL,
+ rhtab_params);
+ while (elem && !IS_ERR(elem) && total < max_count) {
+ memcpy(dst_key, elem->data, key_size);
+ rhtab_read_elem_value(map, dst_val, elem, elem_map_flags);
+ check_and_init_map_value(map, dst_val);
+
+ if (do_delete)
+ del_elems[total] = elem;
+
+ elem = rhashtable_next_key(&rhtab->ht, dst_key, rhtab_params);
+ dst_key += key_size;
+ dst_val += value_size;
+ total++;
+
+ /* Bail to userspace to avoid stalls. */
+ if (need_resched())
+ break;
+ }
+
+ if (elem && !IS_ERR(elem)) {
+ /* Stash next key as the cursor */
+ memcpy(cursor, elem->data, key_size);
+ has_next_cursor = true;
+ }
+
+ if (do_delete) {
+ for (i = 0; i < total; i++)
+ rhtab_delete_elem(rhtab, del_elems[i], NULL, 0);
+ }
+
+ rcu_read_unlock();
+
+ if (total == 0) {
+ ret = -ENOENT;
+ goto free;
+ }
+
+ /* No more elements after this batch. */
+ if (!has_next_cursor)
+ ret = -ENOENT;
+
+ if (copy_to_user(ukeys, keys, total * key_size) ||
+ copy_to_user(uvalues, values, total * value_size) ||
+ put_user(total, &uattr->batch.count) ||
+ (has_next_cursor &&
+ copy_to_user(u64_to_user_ptr(attr->batch.out_batch),
+ cursor, key_size))) {
+ ret = -EFAULT;
+ goto free;
+ }
+
+free:
+ kfree(cursor);
+ kvfree(keys);
+ kvfree(values);
+ kvfree(del_elems);
+ return ret;
}
static int rhtab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
- return -EOPNOTSUPP;
+ return __rhtab_map_lookup_and_delete_batch(map, attr, uattr, false);
}
static int rhtab_map_lookup_and_delete_batch(struct bpf_map *map, const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
- return -EOPNOTSUPP;
+ return __rhtab_map_lookup_and_delete_batch(map, attr, uattr, true);
}
struct bpf_iter_seq_rhash_map_info {
struct bpf_map *map;
struct bpf_rhtab *rhtab;
+ void *last_key;
+ bool last_key_valid;
};
static void *bpf_rhash_map_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires_shared(RCU)
{
- return NULL;
+ struct bpf_iter_seq_rhash_map_info *info = seq->private;
+ void *key = *pos > 0 && info->last_key_valid ? info->last_key : NULL;
+ struct rhtab_elem *elem;
+
+ rcu_read_lock();
+ elem = rhashtable_next_key(&info->rhtab->ht, key,
+ rhtab_params);
+ if (IS_ERR_OR_NULL(elem))
+ return NULL;
+ if (*pos == 0)
+ ++*pos;
+ return elem;
}
static void *bpf_rhash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
- return NULL;
+ struct bpf_iter_seq_rhash_map_info *info = seq->private;
+ struct rhtab_elem *elem = v;
+ void *next;
+
+ memcpy(info->last_key, elem->data, info->map->key_size);
+ info->last_key_valid = true;
+ ++*pos;
+
+ next = rhashtable_next_key(&info->rhtab->ht, info->last_key,
+ rhtab_params);
+ if (IS_ERR(next))
+ return NULL;
+ return next;
+}
+
+static int __bpf_rhash_map_seq_show(struct seq_file *seq,
+ struct rhtab_elem *elem)
+{
+ struct bpf_iter_seq_rhash_map_info *info = seq->private;
+ struct bpf_iter__bpf_map_elem ctx = {};
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+ int ret = 0;
+
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, elem == NULL);
+ if (prog) {
+ ctx.meta = &meta;
+ ctx.map = info->map;
+ if (elem) {
+ ctx.key = elem->data;
+ ctx.value = rhtab_elem_value(elem, info->map->key_size);
+ }
+ ret = bpf_iter_run_prog(prog, &ctx);
+ }
+
+ return ret;
}
static int bpf_rhash_map_seq_show(struct seq_file *seq, void *v)
{
- return 0;
+ return __bpf_rhash_map_seq_show(seq, v);
}
static void bpf_rhash_map_seq_stop(struct seq_file *seq, void *v)
+ __releases_shared(RCU)
{
+ if (!v)
+ (void)__bpf_rhash_map_seq_show(seq, NULL);
+
+ rcu_read_unlock();
}
static int bpf_iter_init_rhash_map(void *priv_data, struct bpf_iter_aux_info *aux)
{
+ struct bpf_iter_seq_rhash_map_info *info = priv_data;
+ struct bpf_map *map = aux->map;
+
+ info->last_key_valid = false;
+ info->last_key = kmalloc(map->key_size, GFP_USER);
+ if (!info->last_key)
+ return -ENOMEM;
+
+ bpf_map_inc_with_uref(map);
+ info->map = map;
+ info->rhtab = container_of(map, struct bpf_rhtab, map);
return 0;
}
static void bpf_iter_fini_rhash_map(void *priv_data)
{
+ struct bpf_iter_seq_rhash_map_info *info = priv_data;
+
+ kfree(info->last_key);
+ bpf_map_put_with_uref(info->map);
}
static const struct seq_operations bpf_rhash_map_seq_ops = {
diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c
index 261a03ea73d3..4a2aafbe28b4 100644
--- a/kernel/bpf/map_iter.c
+++ b/kernel/bpf/map_iter.c
@@ -119,7 +119,8 @@ static int bpf_iter_attach_map(struct bpf_prog *prog,
is_percpu = true;
else if (map->map_type != BPF_MAP_TYPE_HASH &&
map->map_type != BPF_MAP_TYPE_LRU_HASH &&
- map->map_type != BPF_MAP_TYPE_ARRAY)
+ map->map_type != BPF_MAP_TYPE_ARRAY &&
+ map->map_type != BPF_MAP_TYPE_RHASH)
goto put_map;
key_acc_size = prog->aux->max_rdonly_access;
--
2.53.0-Meta
next prev parent reply other threads:[~2026-05-13 22:37 UTC|newest]
Thread overview: 15+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-05-13 22:36 [PATCH bpf-next v4 00/11] bpf: Introduce resizable hash map Mykyta Yatsenko
2026-05-13 22:36 ` [PATCH bpf-next v4 01/11] rhashtable: Add rhashtable_next_key() API Mykyta Yatsenko
2026-05-13 23:21 ` bot+bpf-ci
2026-05-13 22:36 ` [PATCH bpf-next v4 02/11] rhashtable: Add selftest for rhashtable_next_key() Mykyta Yatsenko
2026-05-13 22:36 ` [PATCH bpf-next v4 03/11] bpf: Implement resizable hashmap basic functions Mykyta Yatsenko
2026-05-13 23:21 ` bot+bpf-ci
2026-05-13 22:36 ` Mykyta Yatsenko [this message]
2026-05-13 22:36 ` [PATCH bpf-next v4 05/11] bpf: Allow special fields in resizable hashtab Mykyta Yatsenko
2026-05-13 23:21 ` bot+bpf-ci
2026-05-13 22:36 ` [PATCH bpf-next v4 06/11] bpf: Optimize word-sized keys for resizable hashtable Mykyta Yatsenko
2026-05-13 22:36 ` [PATCH bpf-next v4 07/11] libbpf: Support " Mykyta Yatsenko
2026-05-13 22:36 ` [PATCH bpf-next v4 08/11] selftests/bpf: Add basic tests for resizable hash map Mykyta Yatsenko
2026-05-13 22:36 ` [PATCH bpf-next v4 09/11] selftests/bpf: Add BPF iterator " Mykyta Yatsenko
2026-05-13 22:36 ` [PATCH bpf-next v4 10/11] bpftool: Add rhash map documentation Mykyta Yatsenko
2026-05-13 22:36 ` [PATCH bpf-next v4 11/11] selftests/bpf: Add resizable hashmap to benchmarks Mykyta Yatsenko
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260513-rhash-v4-4-dd3d541ccb0b@meta.com \
--to=mykyta.yatsenko5@gmail.com \
--cc=andrii@kernel.org \
--cc=ast@kernel.org \
--cc=bpf@vger.kernel.org \
--cc=daniel@iogearbox.net \
--cc=eddyz87@gmail.com \
--cc=herbert@gondor.apana.org.au \
--cc=kafai@meta.com \
--cc=kernel-team@meta.com \
--cc=memxor@gmail.com \
--cc=yatsenko@meta.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox