From: Amery Hung <ameryhung@gmail.com>
To: netdev@vger.kernel.org
Cc: bpf@vger.kernel.org, daniel@iogearbox.net, andrii@kernel.org,
alexei.starovoitov@gmail.com, martin.lau@kernel.org,
kuba@kernel.org, edumazet@google.com, xiyou.wangcong@gmail.com,
cong.wang@bytedance.com, jhs@mojatatu.com, sinquersw@gmail.com,
toke@redhat.com, jiri@resnulli.us, stfomichev@gmail.com,
ekarani.silvestre@ccc.ufcg.edu.br, yangpeihao@sjtu.edu.cn,
yepeilin.cs@gmail.com, ameryhung@gmail.com, ming.lei@redhat.com,
kernel-team@meta.com
Subject: [PATCH bpf-next v3 08/18] bpf: net_sched: Support implementation of Qdisc_ops in bpf
Date: Fri, 31 Jan 2025 11:28:47 -0800 [thread overview]
Message-ID: <20250131192912.133796-9-ameryhung@gmail.com> (raw)
In-Reply-To: <20250131192912.133796-1-ameryhung@gmail.com>
From: Amery Hung <amery.hung@bytedance.com>
Enable users to implement a classless qdisc using bpf. The last few
patches in this series has prepared struct_ops to support core operators
in Qdisc_ops. The recent advancement in bpf such as allocated
objects, bpf list and bpf rbtree has also provided powerful and flexible
building blocks to realize sophisticated scheduling algorithms. Therefore,
in this patch, we start allowing qdisc to be implemented using bpf
struct_ops. Users can implement Qdisc_ops.{enqueue, dequeue, init, reset,
and .destroy in Qdisc_ops in bpf and register the qdisc dynamically into
the kernel.
We do not allow users to attach bpf qdiscs to classful qdiscs. This is to
prevent accidentally breaking existings clasful qdiscs if they rely on
some data in the child qdisc. This restrication can potentially be lifted
in the future. Note that, we still allow bpf qdisc to be attached to mq.
Co-developed-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Amery Hung <amery.hung@bytedance.com>
---
net/sched/Kconfig | 12 +++
net/sched/Makefile | 1 +
net/sched/bpf_qdisc.c | 210 ++++++++++++++++++++++++++++++++++++++++
net/sched/sch_api.c | 14 ++-
net/sched/sch_generic.c | 3 +-
5 files changed, 236 insertions(+), 4 deletions(-)
create mode 100644 net/sched/bpf_qdisc.c
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 8180d0c12fce..ccd0255da5a5 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -403,6 +403,18 @@ config NET_SCH_ETS
If unsure, say N.
+config NET_SCH_BPF
+ bool "BPF-based Qdisc"
+ depends on BPF_SYSCALL && BPF_JIT && DEBUG_INFO_BTF
+ help
+ This option allows BPF-based queueing disiplines. With BPF struct_ops,
+ users can implement supported operators in Qdisc_ops using BPF programs.
+ The queue holding skb can be built with BPF maps or graphs.
+
+ Say Y here if you want to use BPF-based Qdisc.
+
+ If unsure, say N.
+
menuconfig NET_SCH_DEFAULT
bool "Allow override default queue discipline"
help
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 82c3f78ca486..904d784902d1 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -62,6 +62,7 @@ obj-$(CONFIG_NET_SCH_FQ_PIE) += sch_fq_pie.o
obj-$(CONFIG_NET_SCH_CBS) += sch_cbs.o
obj-$(CONFIG_NET_SCH_ETF) += sch_etf.o
obj-$(CONFIG_NET_SCH_TAPRIO) += sch_taprio.o
+obj-$(CONFIG_NET_SCH_BPF) += bpf_qdisc.o
obj-$(CONFIG_NET_CLS_U32) += cls_u32.o
obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o
diff --git a/net/sched/bpf_qdisc.c b/net/sched/bpf_qdisc.c
new file mode 100644
index 000000000000..00f3232f4a98
--- /dev/null
+++ b/net/sched/bpf_qdisc.c
@@ -0,0 +1,210 @@
+#include <linux/types.h>
+#include <linux/bpf_verifier.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/filter.h>
+#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+
+static struct bpf_struct_ops bpf_Qdisc_ops;
+
+struct bpf_sk_buff_ptr {
+ struct sk_buff *skb;
+};
+
+static int bpf_qdisc_init(struct btf *btf)
+{
+ return 0;
+}
+
+static const struct bpf_func_proto *
+bpf_qdisc_get_func_proto(enum bpf_func_id func_id,
+ const struct bpf_prog *prog)
+{
+ /* Tail call is disabled since there is no gaurantee valid refcounted
+ * kptrs will always be passed to another bpf program with __ref arguments.
+ */
+ switch (func_id) {
+ case BPF_FUNC_tail_call:
+ return NULL;
+ default:
+ return bpf_base_func_proto(func_id, prog);
+ }
+}
+
+BTF_ID_LIST_SINGLE(bpf_sk_buff_ids, struct, sk_buff)
+BTF_ID_LIST_SINGLE(bpf_sk_buff_ptr_ids, struct, bpf_sk_buff_ptr)
+
+static bool bpf_qdisc_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ struct btf *btf = prog->aux->attach_btf;
+ u32 arg;
+
+ arg = btf_ctx_arg_idx(btf, prog->aux->attach_func_proto, off);
+ if (bpf_struct_ops_prog_moff(prog) == offsetof(struct Qdisc_ops, enqueue)) {
+ if (arg == 2 && type == BPF_READ) {
+ info->reg_type = PTR_TO_BTF_ID | PTR_TRUSTED;
+ info->btf = btf;
+ info->btf_id = bpf_sk_buff_ptr_ids[0];
+ return true;
+ }
+ }
+
+ return bpf_tracing_btf_ctx_access(off, size, type, prog, info);
+}
+
+static int bpf_qdisc_btf_struct_access(struct bpf_verifier_log *log,
+ const struct bpf_reg_state *reg,
+ int off, int size)
+{
+ const struct btf_type *t, *skbt;
+ size_t end;
+
+ skbt = btf_type_by_id(reg->btf, bpf_sk_buff_ids[0]);
+ t = btf_type_by_id(reg->btf, reg->btf_id);
+ if (t != skbt) {
+ bpf_log(log, "only read is supported\n");
+ return -EACCES;
+ }
+
+ switch (off) {
+ case offsetof(struct sk_buff, tstamp):
+ end = offsetofend(struct sk_buff, tstamp);
+ break;
+ case offsetof(struct sk_buff, priority):
+ end = offsetofend(struct sk_buff, priority);
+ break;
+ case offsetof(struct sk_buff, mark):
+ end = offsetofend(struct sk_buff, mark);
+ break;
+ case offsetof(struct sk_buff, queue_mapping):
+ end = offsetofend(struct sk_buff, queue_mapping);
+ break;
+ case offsetof(struct sk_buff, cb) + offsetof(struct qdisc_skb_cb, tc_classid):
+ end = offsetof(struct sk_buff, cb) +
+ offsetofend(struct qdisc_skb_cb, tc_classid);
+ break;
+ case offsetof(struct sk_buff, cb) + offsetof(struct qdisc_skb_cb, data[0]) ...
+ offsetof(struct sk_buff, cb) + offsetof(struct qdisc_skb_cb,
+ data[QDISC_CB_PRIV_LEN - 1]):
+ end = offsetof(struct sk_buff, cb) +
+ offsetofend(struct qdisc_skb_cb, data[QDISC_CB_PRIV_LEN - 1]);
+ break;
+ case offsetof(struct sk_buff, tc_index):
+ end = offsetofend(struct sk_buff, tc_index);
+ break;
+ default:
+ bpf_log(log, "no write support to sk_buff at off %d\n", off);
+ return -EACCES;
+ }
+
+ if (off + size > end) {
+ bpf_log(log,
+ "write access at off %d with size %d beyond the member of sk_buff ended at %zu\n",
+ off, size, end);
+ return -EACCES;
+ }
+
+ return 0;
+}
+
+static const struct bpf_verifier_ops bpf_qdisc_verifier_ops = {
+ .get_func_proto = bpf_qdisc_get_func_proto,
+ .is_valid_access = bpf_qdisc_is_valid_access,
+ .btf_struct_access = bpf_qdisc_btf_struct_access,
+};
+
+static int bpf_qdisc_init_member(const struct btf_type *t,
+ const struct btf_member *member,
+ void *kdata, const void *udata)
+{
+ const struct Qdisc_ops *uqdisc_ops;
+ struct Qdisc_ops *qdisc_ops;
+ u32 moff;
+
+ uqdisc_ops = (const struct Qdisc_ops *)udata;
+ qdisc_ops = (struct Qdisc_ops *)kdata;
+
+ moff = __btf_member_bit_offset(t, member) / 8;
+ switch (moff) {
+ case offsetof(struct Qdisc_ops, peek):
+ qdisc_ops->peek = qdisc_peek_dequeued;
+ return 0;
+ case offsetof(struct Qdisc_ops, id):
+ if (bpf_obj_name_cpy(qdisc_ops->id, uqdisc_ops->id,
+ sizeof(qdisc_ops->id)) <= 0)
+ return -EINVAL;
+ return 1;
+ }
+
+ return 0;
+}
+
+static int bpf_qdisc_reg(void *kdata, struct bpf_link *link)
+{
+ return register_qdisc(kdata);
+}
+
+static void bpf_qdisc_unreg(void *kdata, struct bpf_link *link)
+{
+ return unregister_qdisc(kdata);
+}
+
+static int Qdisc_ops__enqueue(struct sk_buff *skb__ref, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ return 0;
+}
+
+static struct sk_buff *Qdisc_ops__dequeue(struct Qdisc *sch)
+{
+ return NULL;
+}
+
+static struct sk_buff *Qdisc_ops__peek(struct Qdisc *sch)
+{
+ return NULL;
+}
+
+static int Qdisc_ops__init(struct Qdisc *sch, struct nlattr *arg,
+ struct netlink_ext_ack *extack)
+{
+ return 0;
+}
+
+static void Qdisc_ops__reset(struct Qdisc *sch)
+{
+}
+
+static void Qdisc_ops__destroy(struct Qdisc *sch)
+{
+}
+
+static struct Qdisc_ops __bpf_ops_qdisc_ops = {
+ .enqueue = Qdisc_ops__enqueue,
+ .dequeue = Qdisc_ops__dequeue,
+ .peek = Qdisc_ops__peek,
+ .init = Qdisc_ops__init,
+ .reset = Qdisc_ops__reset,
+ .destroy = Qdisc_ops__destroy,
+};
+
+static struct bpf_struct_ops bpf_Qdisc_ops = {
+ .verifier_ops = &bpf_qdisc_verifier_ops,
+ .reg = bpf_qdisc_reg,
+ .unreg = bpf_qdisc_unreg,
+ .init_member = bpf_qdisc_init_member,
+ .init = bpf_qdisc_init,
+ .name = "Qdisc_ops",
+ .cfi_stubs = &__bpf_ops_qdisc_ops,
+ .owner = THIS_MODULE,
+};
+
+static int __init bpf_qdisc_kfunc_init(void)
+{
+ return register_bpf_struct_ops(&bpf_Qdisc_ops, Qdisc_ops);
+}
+late_initcall(bpf_qdisc_kfunc_init);
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index e3e91cf867eb..c8057e0692a6 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -25,6 +25,7 @@
#include <linux/hrtimer.h>
#include <linux/slab.h>
#include <linux/hashtable.h>
+#include <linux/bpf.h>
#include <net/net_namespace.h>
#include <net/sock.h>
@@ -358,7 +359,7 @@ static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
read_lock(&qdisc_mod_lock);
for (q = qdisc_base; q; q = q->next) {
if (nla_strcmp(kind, q->id) == 0) {
- if (!try_module_get(q->owner))
+ if (!bpf_try_module_get(q, q->owner))
q = NULL;
break;
}
@@ -1200,6 +1201,13 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
return -EINVAL;
}
+ if (new &&
+ !(parent->flags & TCQ_F_MQROOT) &&
+ new->ops->owner == BPF_MODULE_OWNER) {
+ NL_SET_ERR_MSG(extack, "BPF qdisc not supported on a non root");
+ return -EINVAL;
+ }
+
if (new &&
!(parent->flags & TCQ_F_MQROOT) &&
rcu_access_pointer(new->stab)) {
@@ -1287,7 +1295,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
/* We will try again qdisc_lookup_ops,
* so don't keep a reference.
*/
- module_put(ops->owner);
+ bpf_module_put(ops, ops->owner);
err = -EAGAIN;
goto err_out;
}
@@ -1398,7 +1406,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
netdev_put(dev, &sch->dev_tracker);
qdisc_free(sch);
err_out2:
- module_put(ops->owner);
+ bpf_module_put(ops, ops->owner);
err_out:
*errp = err;
return NULL;
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 14ab2f4c190a..e6fda9f20272 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -24,6 +24,7 @@
#include <linux/if_vlan.h>
#include <linux/skb_array.h>
#include <linux/if_macvlan.h>
+#include <linux/bpf.h>
#include <net/sch_generic.h>
#include <net/pkt_sched.h>
#include <net/dst.h>
@@ -1078,7 +1079,7 @@ static void __qdisc_destroy(struct Qdisc *qdisc)
ops->destroy(qdisc);
lockdep_unregister_key(&qdisc->root_lock_key);
- module_put(ops->owner);
+ bpf_module_put(ops, ops->owner);
netdev_put(dev, &qdisc->dev_tracker);
trace_qdisc_destroy(qdisc);
--
2.47.1
next prev parent reply other threads:[~2025-01-31 19:29 UTC|newest]
Thread overview: 33+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-01-31 19:28 [PATCH bpf-next v3 00/18] bpf qdisc Amery Hung
2025-01-31 19:28 ` [PATCH bpf-next v3 01/18] bpf: Make every prog keep a copy of ctx_arg_info Amery Hung
2025-02-04 5:57 ` Eduard Zingerman
2025-01-31 19:28 ` [PATCH bpf-next v3 02/18] bpf: Support getting referenced kptr from struct_ops argument Amery Hung
2025-02-04 5:57 ` Eduard Zingerman
2025-01-31 19:28 ` [PATCH bpf-next v3 03/18] selftests/bpf: Test referenced kptr arguments of struct_ops programs Amery Hung
2025-02-04 5:58 ` Eduard Zingerman
2025-01-31 19:28 ` [PATCH bpf-next v3 04/18] bpf: Allow struct_ops prog to return referenced kptr Amery Hung
2025-02-04 5:58 ` Eduard Zingerman
2025-01-31 19:28 ` [PATCH bpf-next v3 05/18] selftests/bpf: Test returning referenced kptr from struct_ops programs Amery Hung
2025-02-04 5:58 ` Eduard Zingerman
2025-01-31 19:28 ` [PATCH bpf-next v3 06/18] bpf: Prepare to reuse get_ctx_arg_idx Amery Hung
2025-01-31 19:28 ` [PATCH bpf-next v3 07/18] bpf: Generalize finding member offset of struct_ops prog Amery Hung
2025-02-04 5:59 ` Eduard Zingerman
2025-01-31 19:28 ` Amery Hung [this message]
2025-02-04 22:18 ` [PATCH bpf-next v3 08/18] bpf: net_sched: Support implementation of Qdisc_ops in bpf Jakub Kicinski
2025-02-04 23:21 ` Amery Hung
2025-02-05 1:27 ` Jakub Kicinski
2025-02-05 4:13 ` Amery Hung
2025-01-31 19:28 ` [PATCH bpf-next v3 09/18] bpf: net_sched: Add basic bpf qdisc kfuncs Amery Hung
2025-01-31 19:28 ` [PATCH bpf-next v3 10/18] bpf: Search and add kfuncs in struct_ops prologue and epilogue Amery Hung
2025-02-04 5:59 ` Eduard Zingerman
2025-01-31 19:28 ` [PATCH bpf-next v3 11/18] bpf: net_sched: Add a qdisc watchdog timer Amery Hung
2025-01-31 19:28 ` [PATCH bpf-next v3 12/18] bpf: net_sched: Support updating bstats Amery Hung
2025-01-31 19:28 ` [PATCH bpf-next v3 13/18] bpf: net_sched: Support updating qstats Amery Hung
2025-01-31 19:28 ` [PATCH bpf-next v3 14/18] bpf: net_sched: Allow writing to more Qdisc members Amery Hung
2025-01-31 19:28 ` [PATCH bpf-next v3 15/18] libbpf: Support creating and destroying qdisc Amery Hung
2025-01-31 19:28 ` [PATCH bpf-next v3 16/18] selftests/bpf: Add a basic fifo qdisc test Amery Hung
2025-01-31 19:28 ` [PATCH bpf-next v3 17/18] selftests/bpf: Add a bpf fq qdisc to selftest Amery Hung
2025-01-31 19:28 ` [PATCH bpf-next v3 18/18] selftests/bpf: Test attaching bpf qdisc to mq and non root Amery Hung
2025-02-01 0:45 ` Amery Hung
2025-02-04 5:58 ` Eduard Zingerman
2025-02-04 18:17 ` Amery Hung
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20250131192912.133796-9-ameryhung@gmail.com \
--to=ameryhung@gmail.com \
--cc=alexei.starovoitov@gmail.com \
--cc=andrii@kernel.org \
--cc=bpf@vger.kernel.org \
--cc=cong.wang@bytedance.com \
--cc=daniel@iogearbox.net \
--cc=edumazet@google.com \
--cc=ekarani.silvestre@ccc.ufcg.edu.br \
--cc=jhs@mojatatu.com \
--cc=jiri@resnulli.us \
--cc=kernel-team@meta.com \
--cc=kuba@kernel.org \
--cc=martin.lau@kernel.org \
--cc=ming.lei@redhat.com \
--cc=netdev@vger.kernel.org \
--cc=sinquersw@gmail.com \
--cc=stfomichev@gmail.com \
--cc=toke@redhat.com \
--cc=xiyou.wangcong@gmail.com \
--cc=yangpeihao@sjtu.edu.cn \
--cc=yepeilin.cs@gmail.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).