All of lore.kernel.org
 help / color / mirror / Atom feed
From: Martin KaFai Lau <martin.lau@linux.dev>
To: bpf@vger.kernel.org
Cc: 'Alexei Starovoitov ' <ast@kernel.org>,
	'Andrii Nakryiko ' <andrii@kernel.org>,
	'Daniel Borkmann ' <daniel@iogearbox.net>,
	netdev@vger.kernel.org, kernel-team@meta.com,
	'Amery Hung ' <ameryhung@gmail.com>
Subject: [RFC PATCH bpf-next 03/12] bpf: Add bpf_rbtree_{root,left,right} kfunc
Date: Fri, 18 Apr 2025 15:46:41 -0700	[thread overview]
Message-ID: <20250418224652.105998-4-martin.lau@linux.dev> (raw)
In-Reply-To: <20250418224652.105998-1-martin.lau@linux.dev>

From: Martin KaFai Lau <martin.lau@kernel.org>

In the kernel fq qdisc implementation, it requires to traverse a rbtree
stored with the networking "flows".

In the later bpf selftests prog, the much simplified logic that uses
the bpf_rbtree_{root,left,right} to traverse the tree is like:

struct fq_flow {
	struct bpf_rb_node	fq_node;
	struct bpf_rb_node	rate_node;
	struct bpf_refcount	refcount;
	unsigned long		sk_long;
};

struct fq_flow_root {
	struct bpf_spin_lock lock;
	struct bpf_rb_root root __contains(fq_flow, fq_node);
};

struct fq_flow *fq_classify(...)
{
	struct bpf_rb_node *tofree[FQ_GC_MAX];
	struct fq_flow_root *root;
	struct fq_flow *gc_f, *f;
	struct bpf_rb_node *p;
	int i, fcnt = 0;

	/* ... */

	f = NULL;
	bpf_spin_lock(&root->lock);
	p = bpf_rbtree_root(&root->root);
	while (can_loop) {
		if (!p)
			break;

		gc_f = bpf_rb_entry(p, struct fq_flow, fq_node);
		if (gc_f->sk_long == sk_long) {
			f = bpf_refcount_acquire(gc_f);
			break;
		}

		/* To be removed from the rbtree */
		if (fcnt < FQ_GC_MAX && fq_gc_candidate(gc_f, jiffies_now))
			tofree[fcnt++] = p;

		if (gc_f->sk_long > sk_long)
			p = bpf_rbtree_left(&root->root, p);
		else
			p = bpf_rbtree_right(&root->root, p);
	}

	/* remove from the rbtree */
	for (i = 0; i < fcnt; i++) {
		p = tofree[i];
		tofree[i] = bpf_rbtree_remove(&root->root, p);
	}

	bpf_spin_unlock(&root->lock);

	/* bpf_obj_drop the fq_flow(s) that have just been removed
	 * from the rbtree.
	 */
	for (i = 0; i < fcnt; i++) {
		p = tofree[i];
		if (p) {
			gc_f = bpf_rb_entry(p, struct fq_flow, fq_node);
			bpf_obj_drop(gc_f);
		}
	}

	return f;

}

The above simplified code needs to traverse the rbtree for two purposes,
1) find the flow with the desired sk_long value
2) while searching for the sk_long, collect flows that are
   the fq_gc_candidate. They will be removed from the rbtree.

This patch adds the bpf_rbtree_{root,left,right} kfunc to enable
the rbtree traversal. The returned bpf_rb_node pointer will be a
non-owning reference which is the same as the returned pointer
of the exisiting bpf_rbtree_first kfunc.

Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 kernel/bpf/helpers.c  | 30 ++++++++++++++++++++++++++++++
 kernel/bpf/verifier.c | 23 ++++++++++++++++++-----
 2 files changed, 48 insertions(+), 5 deletions(-)

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index e3a2662f4e33..36150d340c16 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2366,6 +2366,33 @@ __bpf_kfunc struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root)
 	return (struct bpf_rb_node *)rb_first_cached(r);
 }
 
+__bpf_kfunc struct bpf_rb_node *bpf_rbtree_root(struct bpf_rb_root *root)
+{
+	struct rb_root_cached *r = (struct rb_root_cached *)root;
+
+	return (struct bpf_rb_node *)r->rb_root.rb_node;
+}
+
+__bpf_kfunc struct bpf_rb_node *bpf_rbtree_left(struct bpf_rb_root *root, struct bpf_rb_node *node)
+{
+	struct bpf_rb_node_kern *node_internal = (struct bpf_rb_node_kern *)node;
+
+	if (READ_ONCE(node_internal->owner) != root)
+		return NULL;
+
+	return (struct bpf_rb_node *)node_internal->rb_node.rb_left;
+}
+
+__bpf_kfunc struct bpf_rb_node *bpf_rbtree_right(struct bpf_rb_root *root, struct bpf_rb_node *node)
+{
+	struct bpf_rb_node_kern *node_internal = (struct bpf_rb_node_kern *)node;
+
+	if (READ_ONCE(node_internal->owner) != root)
+		return NULL;
+
+	return (struct bpf_rb_node *)node_internal->rb_node.rb_right;
+}
+
 /**
  * bpf_task_acquire - Acquire a reference to a task. A task acquired by this
  * kfunc which is not stored in a map as a kptr, must be released by calling
@@ -3214,6 +3241,9 @@ BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE)
 BTF_ID_FLAGS(func, bpf_rbtree_remove, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_rbtree_add_impl)
 BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_rbtree_root, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_rbtree_left, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_rbtree_right, KF_RET_NULL)
 
 #ifdef CONFIG_CGROUPS
 BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index bf14da00f09a..3624de1c6925 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -12081,6 +12081,9 @@ enum special_kfunc_type {
 	KF_bpf_rbtree_remove,
 	KF_bpf_rbtree_add_impl,
 	KF_bpf_rbtree_first,
+	KF_bpf_rbtree_root,
+	KF_bpf_rbtree_left,
+	KF_bpf_rbtree_right,
 	KF_bpf_dynptr_from_skb,
 	KF_bpf_dynptr_from_xdp,
 	KF_bpf_dynptr_slice,
@@ -12121,6 +12124,9 @@ BTF_ID(func, bpf_rdonly_cast)
 BTF_ID(func, bpf_rbtree_remove)
 BTF_ID(func, bpf_rbtree_add_impl)
 BTF_ID(func, bpf_rbtree_first)
+BTF_ID(func, bpf_rbtree_root)
+BTF_ID(func, bpf_rbtree_left)
+BTF_ID(func, bpf_rbtree_right)
 #ifdef CONFIG_NET
 BTF_ID(func, bpf_dynptr_from_skb)
 BTF_ID(func, bpf_dynptr_from_xdp)
@@ -12156,6 +12162,9 @@ BTF_ID(func, bpf_rcu_read_unlock)
 BTF_ID(func, bpf_rbtree_remove)
 BTF_ID(func, bpf_rbtree_add_impl)
 BTF_ID(func, bpf_rbtree_first)
+BTF_ID(func, bpf_rbtree_root)
+BTF_ID(func, bpf_rbtree_left)
+BTF_ID(func, bpf_rbtree_right)
 #ifdef CONFIG_NET
 BTF_ID(func, bpf_dynptr_from_skb)
 BTF_ID(func, bpf_dynptr_from_xdp)
@@ -12591,7 +12600,10 @@ static bool is_bpf_rbtree_api_kfunc(u32 btf_id)
 {
 	return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl] ||
 	       btf_id == special_kfunc_list[KF_bpf_rbtree_remove] ||
-	       btf_id == special_kfunc_list[KF_bpf_rbtree_first];
+	       btf_id == special_kfunc_list[KF_bpf_rbtree_first] ||
+	       btf_id == special_kfunc_list[KF_bpf_rbtree_root] ||
+	       btf_id == special_kfunc_list[KF_bpf_rbtree_left] ||
+	       btf_id == special_kfunc_list[KF_bpf_rbtree_right];
 }
 
 static bool is_bpf_iter_num_api_kfunc(u32 btf_id)
@@ -12691,7 +12703,9 @@ static bool check_kfunc_is_graph_node_api(struct bpf_verifier_env *env,
 		break;
 	case BPF_RB_NODE:
 		ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_remove] ||
-		       kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl]);
+		       kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl] ||
+		       kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_left] ||
+		       kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_right]);
 		break;
 	default:
 		verbose(env, "verifier internal error: unexpected graph node argument type %s\n",
@@ -13216,15 +13230,14 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 				}
 			} else {
 				if (!type_is_non_owning_ref(reg->type) || reg->ref_obj_id) {
-					verbose(env, "rbtree_remove node input must be non-owning ref\n");
+					verbose(env, "%s can only take non-owning bpf_rb_node pointer\n", func_name);
 					return -EINVAL;
 				}
 				if (in_rbtree_lock_required_cb(env)) {
-					verbose(env, "rbtree_remove not allowed in rbtree cb\n");
+					verbose(env, "%s not allowed in rbtree cb\n", func_name);
 					return -EINVAL;
 				}
 			}
-
 			ret = process_kf_arg_ptr_to_rbtree_node(env, reg, regno, meta);
 			if (ret < 0)
 				return ret;
-- 
2.47.1


  parent reply	other threads:[~2025-04-18 22:47 UTC|newest]

Thread overview: 27+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-04-18 22:46 [RFC PATCH bpf-next 00/12] bpf: A fq example similar to the kernel sch_fq.c implementation Martin KaFai Lau
2025-04-18 22:46 ` [RFC PATCH bpf-next 01/12] bpf: Check KF_bpf_rbtree_add_impl for the "case KF_ARG_PTR_TO_RB_NODE" Martin KaFai Lau
2025-04-22  1:05   ` Kumar Kartikeya Dwivedi
2025-04-18 22:46 ` [RFC PATCH bpf-next 02/12] bpf: Simplify reg0 marking for the rbtree kfuncs that return a bpf_rb_node pointer Martin KaFai Lau
2025-04-22  1:14   ` Kumar Kartikeya Dwivedi
2025-04-18 22:46 ` Martin KaFai Lau [this message]
2025-04-22  1:43   ` [RFC PATCH bpf-next 03/12] bpf: Add bpf_rbtree_{root,left,right} kfunc Kumar Kartikeya Dwivedi
2025-04-18 22:46 ` [RFC PATCH bpf-next 04/12] selftests/bpf: Adjust failure message in the rbtree_fail test Martin KaFai Lau
2025-04-22  1:44   ` Kumar Kartikeya Dwivedi
2025-04-18 22:46 ` [RFC PATCH bpf-next 05/12] bpf: Allow refcounted bpf_rb_node used in bpf_rbtree_{remove,left,right} Martin KaFai Lau
2025-04-22  2:32   ` Kumar Kartikeya Dwivedi
2025-04-18 22:46 ` [RFC PATCH bpf-next 06/12] selftests/bpf: Adjust test that does not allow refcounted node in rbtree_remove Martin KaFai Lau
2025-04-22  2:36   ` Kumar Kartikeya Dwivedi
2025-04-22  2:48     ` Kumar Kartikeya Dwivedi
2025-04-18 22:46 ` [RFC PATCH bpf-next 07/12] selftests/bpf: Add rbtree_search test Martin KaFai Lau
2025-04-22  3:03   ` Kumar Kartikeya Dwivedi
2025-04-18 22:46 ` [RFC PATCH bpf-next 08/12] bpf: Simplify reg0 marking for the list kfuncs that return a bpf_list_node pointer Martin KaFai Lau
2025-04-22  3:05   ` Kumar Kartikeya Dwivedi
2025-04-18 22:46 ` [RFC PATCH bpf-next 09/12] bpf: Add bpf_list_{front,back} kfunc Martin KaFai Lau
2025-04-22  3:07   ` Kumar Kartikeya Dwivedi
2025-04-18 22:46 ` [RFC PATCH bpf-next 10/12] selftests/bpf: Add test for bpf_list_{front,back} Martin KaFai Lau
2025-04-22  3:08   ` Kumar Kartikeya Dwivedi
2025-04-25 23:28     ` Martin KaFai Lau
2025-04-18 22:46 ` [RFC PATCH bpf-next 11/12] bpf: net: Add a qdisc kfunc to set sk_pacing_status Martin KaFai Lau
2025-04-18 22:46 ` [RFC PATCH bpf-next 12/12] selftests/bpf: A bpf fq implementation similar to the kernel sch_fq Martin KaFai Lau
2025-04-25  0:13   ` Alexei Starovoitov
2025-04-25 23:50     ` Martin KaFai Lau

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20250418224652.105998-4-martin.lau@linux.dev \
    --to=martin.lau@linux.dev \
    --cc=ameryhung@gmail.com \
    --cc=andrii@kernel.org \
    --cc=ast@kernel.org \
    --cc=bpf@vger.kernel.org \
    --cc=daniel@iogearbox.net \
    --cc=kernel-team@meta.com \
    --cc=netdev@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.