From: Mykyta Yatsenko <mykyta.yatsenko5@gmail.com>
To: bpf@vger.kernel.org, ast@kernel.org, andrii@kernel.org,
daniel@iogearbox.net, kafai@meta.com, kernel-team@meta.com,
eddyz87@gmail.com, memxor@gmail.com,
herbert@gondor.apana.org.au
Cc: Mykyta Yatsenko <yatsenko@meta.com>
Subject: [PATCH bpf-next v4 04/11] bpf: Implement iteration ops for resizable hashtab
Date: Wed, 13 May 2026 15:36:07 -0700 [thread overview]
Message-ID: <20260513-rhash-v4-4-dd3d541ccb0b@meta.com> (raw)
In-Reply-To: <20260513-rhash-v4-0-dd3d541ccb0b@meta.com>
From: Mykyta Yatsenko <yatsenko@meta.com>
Implement rhtab_map_get_next_key(), batch lookup/lookup-and-delete,
and the seq_file BPF iterator for BPF_MAP_TYPE_RHASH.
All three use rhashtable_next_key() with the previous key as cursor.
This avoids the rhashtable walker state machine and its spinlock
acquisition per enter/exit, and lets callers stay stateless across
syscalls. ERR_PTR(-ENOENT) from rhashtable_next_key() (cursor was
concurrently deleted) is handled per callsite: get_next_key falls
back to the first key (matches BPF UAPI semantics of htab); the
seq_file iterator and batch terminate iteration.
Also implement rhtab_map_mem_usage() to report memory consumption.
Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
---
kernel/bpf/hashtab.c | 290 ++++++++++++++++++++++++++++++++++++++++++++++++--
kernel/bpf/map_iter.c | 3 +-
2 files changed, 284 insertions(+), 9 deletions(-)
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index da1754e1a525..61eb88cb9229 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -3021,68 +3021,342 @@ static void rhtab_map_free_internal_structs(struct bpf_map *map)
}
static int rhtab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+ __must_hold_shared(RCU)
{
- return -EOPNOTSUPP;
+ struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map);
+ struct rhtab_elem *elem;
+
+ elem = rhashtable_next_key(&rhtab->ht, key, rhtab_params);
+
+ /* if not found, return the first key */
+ if (PTR_ERR(elem) == -ENOENT)
+ elem = rhashtable_next_key(&rhtab->ht, NULL, rhtab_params);
+
+ if (!elem)
+ return -ENOENT;
+
+ memcpy(next_key, elem->data, map->key_size);
+ return 0;
}
static void rhtab_map_seq_show_elem(struct bpf_map *map, void *key, struct seq_file *m)
{
+ void *value;
+
+ /* Guarantee that hashtab value is not freed */
+ guard(rcu)();
+
+ value = rhtab_map_lookup_elem(map, key);
+ if (!value)
+ return;
+
+ btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
+ seq_puts(m, ": ");
+ btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
+ seq_putc(m, '\n');
}
static long bpf_each_rhash_elem(struct bpf_map *map, bpf_callback_t callback_fn,
void *callback_ctx, u64 flags)
{
- return -EOPNOTSUPP;
+ struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map);
+ void *prev_key = NULL;
+ struct rhtab_elem *elem;
+ int num_elems = 0;
+ void *key_buf = NULL;
+ u64 ret = 0;
+
+ if (flags != 0)
+ return -EINVAL;
+
+ /*
+ * Buffer for stashing the cursor across rcu_read_unlock() in the
+ * cond_resched path.
+ */
+ key_buf = kmalloc_nolock(map->key_size, 0, NUMA_NO_NODE);
+ if (!key_buf)
+ return -ENOMEM;
+
+ rcu_read_lock();
+ /*
+ * Best-effort iteration: if rhashtable is concurrently resized or
+ * elements are deleted/inserted, there may be missed or duplicate
+ * elements visited.
+ */
+ while ((elem = rhashtable_next_key(&rhtab->ht, prev_key, rhtab_params))) {
+ if (IS_ERR(elem))
+ break;
+ num_elems++;
+ ret = callback_fn((u64)(long)map,
+ (u64)(long)elem->data,
+ (u64)(long)rhtab_elem_value(elem, map->key_size),
+ (u64)(long)callback_ctx, 0);
+ if (ret)
+ break;
+
+ prev_key = elem->data; /* valid while RCU held */
+
+ if (need_resched()) {
+ memcpy(key_buf, elem->data, map->key_size);
+ rcu_read_unlock();
+ cond_resched();
+ rcu_read_lock();
+ prev_key = key_buf;
+ }
+ }
+ rcu_read_unlock();
+
+ kfree_nolock(key_buf);
+ return num_elems;
}
static u64 rhtab_map_mem_usage(const struct bpf_map *map)
{
- return 0;
+ struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map);
+ u64 num_entries;
+
+ /* Excludes rhashtable bucket overhead (~ nelems * sizeof(void *) at 75% load). */
+ num_entries = atomic_read(&rhtab->ht.nelems);
+ return sizeof(struct bpf_rhtab) + rhtab->elem_size * num_entries;
+}
+
+static int __rhtab_map_lookup_and_delete_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr,
+ bool do_delete)
+{
+ struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map);
+ void __user *uvalues = u64_to_user_ptr(attr->batch.values);
+ void __user *ukeys = u64_to_user_ptr(attr->batch.keys);
+ void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
+ void *cursor = NULL, *keys = NULL, *values = NULL, *dst_key, *dst_val;
+ struct rhtab_elem **del_elems = NULL;
+ u32 max_count, total, key_size, value_size, i;
+ bool has_next_cursor = false;
+ struct rhtab_elem *elem;
+ u64 elem_map_flags, map_flags;
+ int ret = 0;
+
+ elem_map_flags = attr->batch.elem_flags;
+ ret = bpf_map_check_op_flags(map, elem_map_flags, BPF_F_LOCK);
+ if (ret)
+ return ret;
+
+ map_flags = attr->batch.flags;
+ if (map_flags)
+ return -EINVAL;
+
+ max_count = attr->batch.count;
+ if (!max_count)
+ return 0;
+
+ if (put_user(0, &uattr->batch.count))
+ return -EFAULT;
+
+ key_size = map->key_size;
+ value_size = map->value_size;
+
+ keys = kvmalloc_array(max_count, key_size, GFP_USER | __GFP_NOWARN);
+ values = kvmalloc_array(max_count, value_size, GFP_USER | __GFP_NOWARN);
+ if (do_delete)
+ del_elems = kvmalloc_array(max_count, sizeof(void *),
+ GFP_USER | __GFP_NOWARN);
+ cursor = kmalloc(key_size, GFP_USER | __GFP_NOWARN);
+
+ if (!keys || !values || !cursor || (do_delete && !del_elems)) {
+ ret = -ENOMEM;
+ goto free;
+ }
+
+ if (ubatch && copy_from_user(cursor, ubatch, key_size)) {
+ ret = -EFAULT;
+ goto free;
+ }
+
+ dst_key = keys;
+ dst_val = values;
+ total = 0;
+
+ rcu_read_lock();
+
+ /*
+ * If the cursor key was concurrently deleted, rhashtable_next_key()
+ * returns ERR_PTR(-ENOENT); the batch terminates with no elements
+ * and userspace must restart from a NULL cursor.
+ */
+ elem = rhashtable_next_key(&rhtab->ht, ubatch ? cursor : NULL,
+ rhtab_params);
+ while (elem && !IS_ERR(elem) && total < max_count) {
+ memcpy(dst_key, elem->data, key_size);
+ rhtab_read_elem_value(map, dst_val, elem, elem_map_flags);
+ check_and_init_map_value(map, dst_val);
+
+ if (do_delete)
+ del_elems[total] = elem;
+
+ elem = rhashtable_next_key(&rhtab->ht, dst_key, rhtab_params);
+ dst_key += key_size;
+ dst_val += value_size;
+ total++;
+
+ /* Bail to userspace to avoid stalls. */
+ if (need_resched())
+ break;
+ }
+
+ if (elem && !IS_ERR(elem)) {
+ /* Stash next key as the cursor */
+ memcpy(cursor, elem->data, key_size);
+ has_next_cursor = true;
+ }
+
+ if (do_delete) {
+ for (i = 0; i < total; i++)
+ rhtab_delete_elem(rhtab, del_elems[i], NULL, 0);
+ }
+
+ rcu_read_unlock();
+
+ if (total == 0) {
+ ret = -ENOENT;
+ goto free;
+ }
+
+ /* No more elements after this batch. */
+ if (!has_next_cursor)
+ ret = -ENOENT;
+
+ if (copy_to_user(ukeys, keys, total * key_size) ||
+ copy_to_user(uvalues, values, total * value_size) ||
+ put_user(total, &uattr->batch.count) ||
+ (has_next_cursor &&
+ copy_to_user(u64_to_user_ptr(attr->batch.out_batch),
+ cursor, key_size))) {
+ ret = -EFAULT;
+ goto free;
+ }
+
+free:
+ kfree(cursor);
+ kvfree(keys);
+ kvfree(values);
+ kvfree(del_elems);
+ return ret;
}
static int rhtab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
- return -EOPNOTSUPP;
+ return __rhtab_map_lookup_and_delete_batch(map, attr, uattr, false);
}
static int rhtab_map_lookup_and_delete_batch(struct bpf_map *map, const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
- return -EOPNOTSUPP;
+ return __rhtab_map_lookup_and_delete_batch(map, attr, uattr, true);
}
struct bpf_iter_seq_rhash_map_info {
struct bpf_map *map;
struct bpf_rhtab *rhtab;
+ void *last_key;
+ bool last_key_valid;
};
static void *bpf_rhash_map_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires_shared(RCU)
{
- return NULL;
+ struct bpf_iter_seq_rhash_map_info *info = seq->private;
+ void *key = *pos > 0 && info->last_key_valid ? info->last_key : NULL;
+ struct rhtab_elem *elem;
+
+ rcu_read_lock();
+ elem = rhashtable_next_key(&info->rhtab->ht, key,
+ rhtab_params);
+ if (IS_ERR_OR_NULL(elem))
+ return NULL;
+ if (*pos == 0)
+ ++*pos;
+ return elem;
}
static void *bpf_rhash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
- return NULL;
+ struct bpf_iter_seq_rhash_map_info *info = seq->private;
+ struct rhtab_elem *elem = v;
+ void *next;
+
+ memcpy(info->last_key, elem->data, info->map->key_size);
+ info->last_key_valid = true;
+ ++*pos;
+
+ next = rhashtable_next_key(&info->rhtab->ht, info->last_key,
+ rhtab_params);
+ if (IS_ERR(next))
+ return NULL;
+ return next;
+}
+
+static int __bpf_rhash_map_seq_show(struct seq_file *seq,
+ struct rhtab_elem *elem)
+{
+ struct bpf_iter_seq_rhash_map_info *info = seq->private;
+ struct bpf_iter__bpf_map_elem ctx = {};
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+ int ret = 0;
+
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, elem == NULL);
+ if (prog) {
+ ctx.meta = &meta;
+ ctx.map = info->map;
+ if (elem) {
+ ctx.key = elem->data;
+ ctx.value = rhtab_elem_value(elem, info->map->key_size);
+ }
+ ret = bpf_iter_run_prog(prog, &ctx);
+ }
+
+ return ret;
}
static int bpf_rhash_map_seq_show(struct seq_file *seq, void *v)
{
- return 0;
+ return __bpf_rhash_map_seq_show(seq, v);
}
static void bpf_rhash_map_seq_stop(struct seq_file *seq, void *v)
+ __releases_shared(RCU)
{
+ if (!v)
+ (void)__bpf_rhash_map_seq_show(seq, NULL);
+
+ rcu_read_unlock();
}
static int bpf_iter_init_rhash_map(void *priv_data, struct bpf_iter_aux_info *aux)
{
+ struct bpf_iter_seq_rhash_map_info *info = priv_data;
+ struct bpf_map *map = aux->map;
+
+ info->last_key_valid = false;
+ info->last_key = kmalloc(map->key_size, GFP_USER);
+ if (!info->last_key)
+ return -ENOMEM;
+
+ bpf_map_inc_with_uref(map);
+ info->map = map;
+ info->rhtab = container_of(map, struct bpf_rhtab, map);
return 0;
}
static void bpf_iter_fini_rhash_map(void *priv_data)
{
+ struct bpf_iter_seq_rhash_map_info *info = priv_data;
+
+ kfree(info->last_key);
+ bpf_map_put_with_uref(info->map);
}
static const struct seq_operations bpf_rhash_map_seq_ops = {
diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c
index 261a03ea73d3..4a2aafbe28b4 100644
--- a/kernel/bpf/map_iter.c
+++ b/kernel/bpf/map_iter.c
@@ -119,7 +119,8 @@ static int bpf_iter_attach_map(struct bpf_prog *prog,
is_percpu = true;
else if (map->map_type != BPF_MAP_TYPE_HASH &&
map->map_type != BPF_MAP_TYPE_LRU_HASH &&
- map->map_type != BPF_MAP_TYPE_ARRAY)
+ map->map_type != BPF_MAP_TYPE_ARRAY &&
+ map->map_type != BPF_MAP_TYPE_RHASH)
goto put_map;
key_acc_size = prog->aux->max_rdonly_access;
--
2.53.0-Meta
next prev parent reply other threads:[~2026-05-13 22:37 UTC|newest]
Thread overview: 17+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-05-13 22:36 [PATCH bpf-next v4 00/11] bpf: Introduce resizable hash map Mykyta Yatsenko
2026-05-13 22:36 ` [PATCH bpf-next v4 01/11] rhashtable: Add rhashtable_next_key() API Mykyta Yatsenko
2026-05-13 23:21 ` bot+bpf-ci
2026-05-13 22:36 ` [PATCH bpf-next v4 02/11] rhashtable: Add selftest for rhashtable_next_key() Mykyta Yatsenko
2026-05-13 22:36 ` [PATCH bpf-next v4 03/11] bpf: Implement resizable hashmap basic functions Mykyta Yatsenko
2026-05-13 23:21 ` bot+bpf-ci
2026-05-14 10:59 ` Mykyta Yatsenko
2026-05-13 22:36 ` Mykyta Yatsenko [this message]
2026-05-13 22:36 ` [PATCH bpf-next v4 05/11] bpf: Allow special fields in resizable hashtab Mykyta Yatsenko
2026-05-13 23:21 ` bot+bpf-ci
2026-05-14 11:00 ` Mykyta Yatsenko
2026-05-13 22:36 ` [PATCH bpf-next v4 06/11] bpf: Optimize word-sized keys for resizable hashtable Mykyta Yatsenko
2026-05-13 22:36 ` [PATCH bpf-next v4 07/11] libbpf: Support " Mykyta Yatsenko
2026-05-13 22:36 ` [PATCH bpf-next v4 08/11] selftests/bpf: Add basic tests for resizable hash map Mykyta Yatsenko
2026-05-13 22:36 ` [PATCH bpf-next v4 09/11] selftests/bpf: Add BPF iterator " Mykyta Yatsenko
2026-05-13 22:36 ` [PATCH bpf-next v4 10/11] bpftool: Add rhash map documentation Mykyta Yatsenko
2026-05-13 22:36 ` [PATCH bpf-next v4 11/11] selftests/bpf: Add resizable hashmap to benchmarks Mykyta Yatsenko
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260513-rhash-v4-4-dd3d541ccb0b@meta.com \
--to=mykyta.yatsenko5@gmail.com \
--cc=andrii@kernel.org \
--cc=ast@kernel.org \
--cc=bpf@vger.kernel.org \
--cc=daniel@iogearbox.net \
--cc=eddyz87@gmail.com \
--cc=herbert@gondor.apana.org.au \
--cc=kafai@meta.com \
--cc=kernel-team@meta.com \
--cc=memxor@gmail.com \
--cc=yatsenko@meta.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.