BPF List
 help / color / mirror / Atom feed
* [PATCH bpf-next 1/2] bpf: Allow top down cgroup prog ordering
@ 2025-02-06 22:59 Yonghong Song
  2025-02-06 23:00 ` [PATCH bpf-next 2/2] selftests/bpf: Add selftests allowing cgroup prog top-down ordering Yonghong Song
  2025-02-11 22:57 ` [PATCH bpf-next 1/2] bpf: Allow top down cgroup prog ordering Andrii Nakryiko
  0 siblings, 2 replies; 4+ messages in thread
From: Yonghong Song @ 2025-02-06 22:59 UTC (permalink / raw)
  To: bpf
  Cc: Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann, kernel-team,
	Martin KaFai Lau

Currently for bpf progs in a cgroup hierarchy, the effective prog array
is computed from bottom cgroup to upper cgroups. For example, the following
cgroup hierarchy
    root cgroup: p1, p2
        subcgroup: p3, p4
have BPF_F_ALLOW_MULTI for both cgroup levels.
The effective cgroup array ordering looks like
    p3 p4 p1 p2
and at run time, the progs will execute based on that order.

But in some cases, it is desirable to have the root prog execute earlier than
children progs. For example,
  - prog p1 intends to collect original pkt dest addresses.
  - prog p3 will modify original pkt dest addresses to a proxy address for
    security reasons.
The end result is that prog p1 gets proxy address which is not what it
wants. Also, putting p1 to every child cgroup is not desirable either as it
will duplicate itself in many child cgroups. And this is exactly a use case
we are encountering in Meta.

To fix this issue, let us introduce a flag BPF_F_PRIO_TOPDOWN. If the flag
is specified at attachment time, the prog has higher priority and the
ordering with that flag will be from top to bottom. For example, in the
above example,
    root cgroup: p1, p2
        subcgroup: p3, p4
Let us say p1, p2 and p4 are marked with BPF_F_PRIO_TOPDOWN. The final
effective array ordering will be
    p1 p2 p4 p3

Suggested-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
---
 include/linux/bpf-cgroup.h     |  1 +
 include/uapi/linux/bpf.h       |  1 +
 kernel/bpf/cgroup.c            | 37 +++++++++++++++++++++++++++++++---
 kernel/bpf/syscall.c           |  3 ++-
 tools/include/uapi/linux/bpf.h |  1 +
 5 files changed, 39 insertions(+), 4 deletions(-)

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 7fc69083e745..3d4f221df9ef 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -111,6 +111,7 @@ struct bpf_prog_list {
 	struct bpf_prog *prog;
 	struct bpf_cgroup_link *link;
 	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE];
+	bool is_prio_topdown;
 };
 
 int cgroup_bpf_inherit(struct cgroup *cgrp);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index fff6cdb8d11a..7ae8e8751e78 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1207,6 +1207,7 @@ enum bpf_perf_event_type {
 #define BPF_F_BEFORE		(1U << 3)
 #define BPF_F_AFTER		(1U << 4)
 #define BPF_F_ID		(1U << 5)
+#define BPF_F_PRIO_TOPDOWN	(1U << 6)
 #define BPF_F_LINK		BPF_F_LINK /* 1 << 13 */
 
 /* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 46e5db65dbc8..f31250c6025b 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -382,6 +382,21 @@ static u32 prog_list_length(struct hlist_head *head)
 	return cnt;
 }
 
+static u32 prog_list_length_with_topdown_cnt(struct hlist_head *head, int *topdown_cnt)
+{
+	struct bpf_prog_list *pl;
+	u32 cnt = 0;
+
+	hlist_for_each_entry(pl, head, node) {
+		if (!prog_list_prog(pl))
+			continue;
+		cnt++;
+		if (pl->is_prio_topdown)
+			(*topdown_cnt) += 1;
+	}
+	return cnt;
+}
+
 /* if parent has non-overridable prog attached,
  * disallow attaching new programs to the descendent cgroup.
  * if parent has overridable or multi-prog, allow attaching
@@ -423,12 +438,13 @@ static int compute_effective_progs(struct cgroup *cgrp,
 	struct bpf_prog_array *progs;
 	struct bpf_prog_list *pl;
 	struct cgroup *p = cgrp;
-	int cnt = 0;
+	int i, cnt = 0, topdown_cnt = 0, fstart, bstart, init_bstart;
 
 	/* count number of effective programs by walking parents */
 	do {
 		if (cnt == 0 || (p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
-			cnt += prog_list_length(&p->bpf.progs[atype]);
+			cnt += prog_list_length_with_topdown_cnt(&p->bpf.progs[atype],
+								 &topdown_cnt);
 		p = cgroup_parent(p);
 	} while (p);
 
@@ -439,20 +455,34 @@ static int compute_effective_progs(struct cgroup *cgrp,
 	/* populate the array with effective progs */
 	cnt = 0;
 	p = cgrp;
+	fstart = topdown_cnt;
+	bstart = topdown_cnt - 1;
 	do {
 		if (cnt > 0 && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
 			continue;
 
+		init_bstart = bstart;
 		hlist_for_each_entry(pl, &p->bpf.progs[atype], node) {
 			if (!prog_list_prog(pl))
 				continue;
 
-			item = &progs->items[cnt];
+			if (!pl->is_prio_topdown) {
+				item = &progs->items[fstart];
+				fstart++;
+			} else {
+				item = &progs->items[bstart];
+				bstart--;
+			}
 			item->prog = prog_list_prog(pl);
 			bpf_cgroup_storages_assign(item->cgroup_storage,
 						   pl->storage);
 			cnt++;
 		}
+
+		/* reverse topdown priority progs ordering at this cgroup level */
+		for (i = 0; i < (init_bstart - bstart)/2; i++)
+			swap(progs->items[init_bstart - i], progs->items[bstart + 1 + i]);
+
 	} while ((p = cgroup_parent(p)));
 
 	*array = progs;
@@ -698,6 +728,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
 
 	pl->prog = prog;
 	pl->link = link;
+	pl->is_prio_topdown = !!(flags & BPF_F_PRIO_TOPDOWN);
 	bpf_cgroup_storages_assign(pl->storage, storage);
 	cgrp->bpf.flags[atype] = saved_flags;
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index c420edbfb7c8..2711f6769263 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -4168,7 +4168,8 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
 #define BPF_F_ATTACH_MASK_BASE	\
 	(BPF_F_ALLOW_OVERRIDE |	\
 	 BPF_F_ALLOW_MULTI |	\
-	 BPF_F_REPLACE)
+	 BPF_F_REPLACE |	\
+	 BPF_F_PRIO_TOPDOWN)
 
 #define BPF_F_ATTACH_MASK_MPROG	\
 	(BPF_F_REPLACE |	\
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 2acf9b336371..de962fbb7c4c 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1207,6 +1207,7 @@ enum bpf_perf_event_type {
 #define BPF_F_BEFORE		(1U << 3)
 #define BPF_F_AFTER		(1U << 4)
 #define BPF_F_ID		(1U << 5)
+#define BPF_F_PRIO_TOPDOWN	(1U << 6)
 #define BPF_F_LINK		BPF_F_LINK /* 1 << 13 */
 
 /* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the
-- 
2.43.5


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* [PATCH bpf-next 2/2] selftests/bpf: Add selftests allowing cgroup prog top-down ordering
  2025-02-06 22:59 [PATCH bpf-next 1/2] bpf: Allow top down cgroup prog ordering Yonghong Song
@ 2025-02-06 23:00 ` Yonghong Song
  2025-02-11 22:57 ` [PATCH bpf-next 1/2] bpf: Allow top down cgroup prog ordering Andrii Nakryiko
  1 sibling, 0 replies; 4+ messages in thread
From: Yonghong Song @ 2025-02-06 23:00 UTC (permalink / raw)
  To: bpf
  Cc: Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann, kernel-team,
	Martin KaFai Lau

Add a few selftests which allow cgroup prog top-down ordering.

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
---
 .../selftests/bpf/prog_tests/cgroup_topdown.c | 128 ++++++++++++++++++
 .../selftests/bpf/progs/cgroup_topdown.c      |  41 ++++++
 2 files changed, 169 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/cgroup_topdown.c
 create mode 100644 tools/testing/selftests/bpf/progs/cgroup_topdown.c

diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_topdown.c b/tools/testing/selftests/bpf/prog_tests/cgroup_topdown.c
new file mode 100644
index 000000000000..61e89ab7f6fd
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/cgroup_topdown.c
@@ -0,0 +1,128 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
+#include <test_progs.h>
+#include "cgroup_helpers.h"
+#include "cgroup_topdown.skel.h"
+
+static int run_getsockopt_test(int cg_parent, int cg_child, int sock_fd, bool all_topdown)
+{
+	LIBBPF_OPTS(bpf_prog_attach_opts, opts);
+	enum bpf_attach_type prog_c_atype, prog_c2_atype, prog_p_atype, prog_p2_atype;
+	int prog_c_fd, prog_c2_fd, prog_p_fd, prog_p2_fd;
+	struct cgroup_topdown *skel = NULL;
+	struct bpf_program *prog;
+	__u8 *result, buf;
+	socklen_t optlen;
+	int err = 0;
+
+	skel = cgroup_topdown__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "cgroup_topdown__open_and_load"))
+		return 0;
+
+	buf = 0x00;
+	err = setsockopt(sock_fd, SOL_IP, IP_TOS, &buf, 1);
+	if (!ASSERT_OK(err, "setsockopt"))
+		goto close_skel;
+
+	opts.flags = BPF_F_ALLOW_MULTI;
+	if (all_topdown)
+		opts.flags |= BPF_F_PRIO_TOPDOWN;
+	prog = skel->progs.child;
+	prog_c_fd = bpf_program__fd(prog);
+	prog_c_atype = bpf_program__expected_attach_type(prog);
+	err = bpf_prog_attach_opts(prog_c_fd, cg_child, prog_c_atype, &opts);
+	if (!ASSERT_OK(err, "bpf_prog_attach_opts-child"))
+		goto close_skel;
+
+	opts.flags = BPF_F_ALLOW_MULTI | BPF_F_PRIO_TOPDOWN;
+	prog = skel->progs.child_2;
+	prog_c2_fd = bpf_program__fd(prog);
+	prog_c2_atype = bpf_program__expected_attach_type(prog);
+	err = bpf_prog_attach_opts(prog_c2_fd, cg_child, prog_c2_atype, &opts);
+	if (!ASSERT_OK(err, "bpf_prog_attach_opts-child_2"))
+		goto detach_child;
+
+	optlen = 1;
+	err = getsockopt(sock_fd, SOL_IP, IP_TOS, &buf, &optlen);
+	if (!ASSERT_OK(err, "getsockopt"))
+		goto detach_child_2;
+
+	result = skel->bss->result;
+	if (all_topdown)
+		ASSERT_TRUE(result[0] == 1 && result[1] == 2, "child only");
+	else
+		ASSERT_TRUE(result[0] == 2 && result[1] == 1, "child only");
+
+	skel->bss->idx = 0;
+	memset(result, 0, 4);
+
+	opts.flags = BPF_F_ALLOW_MULTI;
+	if (all_topdown)
+		opts.flags |= BPF_F_PRIO_TOPDOWN;
+	prog = skel->progs.parent;
+	prog_p_fd = bpf_program__fd(prog);
+	prog_p_atype = bpf_program__expected_attach_type(prog);
+	err = bpf_prog_attach_opts(prog_p_fd, cg_parent, prog_p_atype, &opts);
+	if (!ASSERT_OK(err, "bpf_prog_attach_opts-parent"))
+		goto detach_child_2;
+
+	opts.flags = BPF_F_ALLOW_MULTI | BPF_F_PRIO_TOPDOWN;
+	prog = skel->progs.parent_2;
+	prog_p2_fd = bpf_program__fd(prog);
+	prog_p2_atype = bpf_program__expected_attach_type(prog);
+	err = bpf_prog_attach_opts(prog_p2_fd, cg_parent, prog_p2_atype, &opts);
+	if (!ASSERT_OK(err, "bpf_prog_attach_opts-parent_2"))
+		goto detach_parent;
+
+	err = getsockopt(sock_fd, SOL_IP, IP_TOS, &buf, &optlen);
+	if (!ASSERT_OK(err, "getsockopt"))
+		goto detach_parent_2;
+
+	if (all_topdown)
+		ASSERT_TRUE(result[0] == 3 && result[1] == 4 && result[2] == 1 && result[3] == 2,
+			    "parent and child");
+	else
+		ASSERT_TRUE(result[0] == 4 && result[1] == 2 && result[2] == 1 && result[3] == 3,
+			    "parent and child");
+
+detach_parent_2:
+	ASSERT_OK(bpf_prog_detach2(prog_p2_fd, cg_parent, prog_p2_atype),
+		  "bpf_prog_detach2-parent_2");
+detach_parent:
+	ASSERT_OK(bpf_prog_detach2(prog_p_fd, cg_parent, prog_p_atype),
+		  "bpf_prog_detach2-parent");
+detach_child_2:
+	ASSERT_OK(bpf_prog_detach2(prog_c2_fd, cg_child, prog_c2_atype),
+		  "bpf_prog_detach2-child_2");
+detach_child:
+	ASSERT_OK(bpf_prog_detach2(prog_c_fd, cg_child, prog_c_atype),
+		  "bpf_prog_detach2-child");
+close_skel:
+	cgroup_topdown__destroy(skel);
+	return err;
+}
+
+void test_cgroup_topdown(void)
+{
+	int cg_parent = -1, cg_child = -1, sock_fd = -1;
+
+	cg_parent = test__join_cgroup("/parent");
+	if (!ASSERT_GE(cg_parent, 0, "join_cgroup /parent"))
+		goto out;
+
+	cg_child = test__join_cgroup("/parent/child");
+	if (!ASSERT_GE(cg_child, 0, "join_cgroup /parent/child"))
+		goto out;
+
+	sock_fd = socket(AF_INET, SOCK_STREAM, 0);
+	if (!ASSERT_GE(sock_fd, 0, "socket"))
+		goto out;
+
+	ASSERT_OK(run_getsockopt_test(cg_parent, cg_child, sock_fd, false), "getsockopt_test_1");
+	ASSERT_OK(run_getsockopt_test(cg_parent, cg_child, sock_fd, true), "getsockopt_test_2");
+
+out:
+	close(sock_fd);
+	close(cg_child);
+	close(cg_parent);
+}
diff --git a/tools/testing/selftests/bpf/progs/cgroup_topdown.c b/tools/testing/selftests/bpf/progs/cgroup_topdown.c
new file mode 100644
index 000000000000..4ef6202baa0a
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/cgroup_topdown.c
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+
+char _license[] SEC("license") = "GPL";
+
+unsigned int idx;
+__u8 result[4];
+
+SEC("cgroup/getsockopt")
+int child(struct bpf_sockopt *ctx)
+{
+	if (idx < 4)
+		result[idx++] = 1;
+	return 1;
+}
+
+SEC("cgroup/getsockopt")
+int child_2(struct bpf_sockopt *ctx)
+{
+	if (idx < 4)
+		result[idx++] = 2;
+	return 1;
+}
+
+SEC("cgroup/getsockopt")
+int parent(struct bpf_sockopt *ctx)
+{
+	if (idx < 4)
+		result[idx++] = 3;
+	return 1;
+}
+
+SEC("cgroup/getsockopt")
+int parent_2(struct bpf_sockopt *ctx)
+{
+	if (idx < 4)
+		result[idx++] = 4;
+	return 1;
+}
-- 
2.43.5


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH bpf-next 1/2] bpf: Allow top down cgroup prog ordering
  2025-02-06 22:59 [PATCH bpf-next 1/2] bpf: Allow top down cgroup prog ordering Yonghong Song
  2025-02-06 23:00 ` [PATCH bpf-next 2/2] selftests/bpf: Add selftests allowing cgroup prog top-down ordering Yonghong Song
@ 2025-02-11 22:57 ` Andrii Nakryiko
  2025-02-12  5:35   ` Yonghong Song
  1 sibling, 1 reply; 4+ messages in thread
From: Andrii Nakryiko @ 2025-02-11 22:57 UTC (permalink / raw)
  To: Yonghong Song
  Cc: bpf, Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann,
	kernel-team, Martin KaFai Lau

On Thu, Feb 6, 2025 at 3:00 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>
> Currently for bpf progs in a cgroup hierarchy, the effective prog array
> is computed from bottom cgroup to upper cgroups. For example, the following
> cgroup hierarchy
>     root cgroup: p1, p2
>         subcgroup: p3, p4
> have BPF_F_ALLOW_MULTI for both cgroup levels.
> The effective cgroup array ordering looks like
>     p3 p4 p1 p2
> and at run time, the progs will execute based on that order.
>
> But in some cases, it is desirable to have root prog executes earlier than
> children progs. For example,
>   - prog p1 intends to collect original pkt dest addresses.
>   - prog p3 will modify original pkt dest addresses to a proxy address for
>     security reason.
> The end result is that prog p1 gets proxy address which is not what it
> wants. Also, putting p1 to every child cgroup is not desirable either as it
> will duplicate itself in many child cgroups. And this is exactly a use case
> we are encountering in Meta.
>
> To fix this issue, let us introduce a flag BPF_F_PRIO_TOPDOWN. If the flag
> is specified at attachment time, the prog has higher priority and the
> ordering with that flag will be from top to bottom. For example, in the
> above example,
>     root cgroup: p1, p2
>         subcgroup: p3, p4
> Let us say p1, p2 and p4 are marked with BPF_F_PRIO_TOPDOWN. The final

I'm not a big fan of PRIO_TOPDOWN naming, and this example just
provides further argument for why. Between p3 and p4 programs in
subcgroup, there is no notion of TOPDOWN, they are at the same level
of the hierarchy.

In graphs, for DFS, PRIO_TOPDOWN semantics corresponds to pre-order vs
(current and default) post-order. So why not something like
BPF_F_PREORDER or some variation on that?

Also, for your example it would be nicer if p1 and p3 were the default
post-order attachment, while p2 and p4 were pre-order. Then you'd have
p2, p4, p3, p1, where everything is swapped relative to original
ordering ;)

> effective array ordering will be
>     p1 p2 p4 p3
>
> Suggested-by: Andrii Nakryiko <andrii@kernel.org>
> Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
> ---
>  include/linux/bpf-cgroup.h     |  1 +
>  include/uapi/linux/bpf.h       |  1 +
>  kernel/bpf/cgroup.c            | 37 +++++++++++++++++++++++++++++++---
>  kernel/bpf/syscall.c           |  3 ++-
>  tools/include/uapi/linux/bpf.h |  1 +
>  5 files changed, 39 insertions(+), 4 deletions(-)
>
> diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
> index 7fc69083e745..3d4f221df9ef 100644
> --- a/include/linux/bpf-cgroup.h
> +++ b/include/linux/bpf-cgroup.h
> @@ -111,6 +111,7 @@ struct bpf_prog_list {
>         struct bpf_prog *prog;
>         struct bpf_cgroup_link *link;
>         struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE];
> +       bool is_prio_topdown;

let's go with `int flags`, we increase the size of struct
bpf_prog_list by 8 bytes anyways, so let's make this a bit more
generic?

>  };
>
>  int cgroup_bpf_inherit(struct cgroup *cgrp);
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index fff6cdb8d11a..7ae8e8751e78 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -1207,6 +1207,7 @@ enum bpf_perf_event_type {
>  #define BPF_F_BEFORE           (1U << 3)
>  #define BPF_F_AFTER            (1U << 4)
>  #define BPF_F_ID               (1U << 5)
> +#define BPF_F_PRIO_TOPDOWN     (1U << 6)
>  #define BPF_F_LINK             BPF_F_LINK /* 1 << 13 */
>
>  /* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the
> diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
> index 46e5db65dbc8..f31250c6025b 100644
> --- a/kernel/bpf/cgroup.c
> +++ b/kernel/bpf/cgroup.c
> @@ -382,6 +382,21 @@ static u32 prog_list_length(struct hlist_head *head)
>         return cnt;
>  }
>
> +static u32 prog_list_length_with_topdown_cnt(struct hlist_head *head, int *topdown_cnt)

instead of duplicating prog_list_length(), let's add this `int
*topdown_cnt` counter as an optional argument, which prog_list_length
will fill out only if it's provided, i.e., you'll just have:

if (topdown_cnt && pl->is_prio_topdown)
   (*topdown_cnt) += 1;

as one extra condition inside the loop?

> +{
> +       struct bpf_prog_list *pl;
> +       u32 cnt = 0;
> +
> +       hlist_for_each_entry(pl, head, node) {
> +               if (!prog_list_prog(pl))
> +                       continue;
> +               cnt++;
> +               if (pl->is_prio_topdown)
> +                       (*topdown_cnt) += 1;
> +       }
> +       return cnt;
> +}
> +

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH bpf-next 1/2] bpf: Allow top down cgroup prog ordering
  2025-02-11 22:57 ` [PATCH bpf-next 1/2] bpf: Allow top down cgroup prog ordering Andrii Nakryiko
@ 2025-02-12  5:35   ` Yonghong Song
  0 siblings, 0 replies; 4+ messages in thread
From: Yonghong Song @ 2025-02-12  5:35 UTC (permalink / raw)
  To: Andrii Nakryiko
  Cc: bpf, Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann,
	kernel-team, Martin KaFai Lau



On 2/11/25 2:57 PM, Andrii Nakryiko wrote:
> On Thu, Feb 6, 2025 at 3:00 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>> Currently for bpf progs in a cgroup hierarchy, the effective prog array
>> is computed from bottom cgroup to upper cgroups. For example, the following
>> cgroup hierarchy
>>      root cgroup: p1, p2
>>          subcgroup: p3, p4
>> have BPF_F_ALLOW_MULTI for both cgroup levels.
>> The effective cgroup array ordering looks like
>>      p3 p4 p1 p2
>> and at run time, the progs will execute based on that order.
>>
>> But in some cases, it is desirable to have root prog executes earlier than
>> children progs. For example,
>>    - prog p1 intends to collect original pkt dest addresses.
>>    - prog p3 will modify original pkt dest addresses to a proxy address for
>>      security reason.
>> The end result is that prog p1 gets proxy address which is not what it
>> wants. Also, putting p1 to every child cgroup is not desirable either as it
>> will duplicate itself in many child cgroups. And this is exactly a use case
>> we are encountering in Meta.
>>
>> To fix this issue, let us introduce a flag BPF_F_PRIO_TOPDOWN. If the flag
>> is specified at attachment time, the prog has higher priority and the
>> ordering with that flag will be from top to bottom. For example, in the
>> above example,
>>      root cgroup: p1, p2
>>          subcgroup: p3, p4
>> Let us say p1, p2 and p4 are marked with BPF_F_PRIO_TOPDOWN. The final
> I'm not a big fan of PRIO_TOPDOWN naming, and this example just
> provides further argument for why. Between p3 and p4 programs in
> subcgroup, there is no notion of TOPDOWN, they are at the same level
> of the hierarchy.
>
> In graphs, for DFS, PRIO_TOPDOWN semantics corresponds to pre-order vs
> (current and default) post-order. So why not something like
> BPF_F_PREORDER or some variation on that?

BPF_F_PREORDER sounds good to me.

>
> Also, for your example if would be nicer if p1 and p3 were the default
> post-order attachment, while p2 and p4 were pre-order. Then you'd have
> p2, p4, p3, p1, where everything is swapped relative to original
> ordering ;)

Yes, will adjust the test for such scenario!

>
>> effective array ordering will be
>>      p1 p2 p4 p3
>>
>> Suggested-by: Andrii Nakryiko <andrii@kernel.org>
>> Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
>> ---
>>   include/linux/bpf-cgroup.h     |  1 +
>>   include/uapi/linux/bpf.h       |  1 +
>>   kernel/bpf/cgroup.c            | 37 +++++++++++++++++++++++++++++++---
>>   kernel/bpf/syscall.c           |  3 ++-
>>   tools/include/uapi/linux/bpf.h |  1 +
>>   5 files changed, 39 insertions(+), 4 deletions(-)
>>
>> diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
>> index 7fc69083e745..3d4f221df9ef 100644
>> --- a/include/linux/bpf-cgroup.h
>> +++ b/include/linux/bpf-cgroup.h
>> @@ -111,6 +111,7 @@ struct bpf_prog_list {
>>          struct bpf_prog *prog;
>>          struct bpf_cgroup_link *link;
>>          struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE];
>> +       bool is_prio_topdown;
> let's go with `int flags`, we increase the size of struct
> bpf_prog_list by 8 bytes anyways, so let's make this a bit more
> generic?

Ack.

>
>>   };
>>
>>   int cgroup_bpf_inherit(struct cgroup *cgrp);
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index fff6cdb8d11a..7ae8e8751e78 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -1207,6 +1207,7 @@ enum bpf_perf_event_type {
>>   #define BPF_F_BEFORE           (1U << 3)
>>   #define BPF_F_AFTER            (1U << 4)
>>   #define BPF_F_ID               (1U << 5)
>> +#define BPF_F_PRIO_TOPDOWN     (1U << 6)
>>   #define BPF_F_LINK             BPF_F_LINK /* 1 << 13 */
>>
>>   /* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the
>> diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
>> index 46e5db65dbc8..f31250c6025b 100644
>> --- a/kernel/bpf/cgroup.c
>> +++ b/kernel/bpf/cgroup.c
>> @@ -382,6 +382,21 @@ static u32 prog_list_length(struct hlist_head *head)
>>          return cnt;
>>   }
>>
>> +static u32 prog_list_length_with_topdown_cnt(struct hlist_head *head, int *topdown_cnt)
> instead of duplicating prog_list_length(), let's add this `int
> *topdown_cnt` counter as an optional argument, which prog_list_length
> will fill out only if it's provided, i.e., you'll just have:
>
> if (topdown_cnt && pl->is_prio_topdown)
>     (*topdown_cnt) += 1;
>
> as one extra condition inside the loop?

I thought about this as well. I tried to create a new function since
prog_list_length() is used in several different places. This
is not critical path, so yes, I can just add one additional parameter
for prog_list_length() as you suggested.

>
>> +{
>> +       struct bpf_prog_list *pl;
>> +       u32 cnt = 0;
>> +
>> +       hlist_for_each_entry(pl, head, node) {
>> +               if (!prog_list_prog(pl))
>> +                       continue;
>> +               cnt++;
>> +               if (pl->is_prio_topdown)
>> +                       (*topdown_cnt) += 1;
>> +       }
>> +       return cnt;
>> +}
>> +


^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2025-02-12  5:35 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-02-06 22:59 [PATCH bpf-next 1/2] bpf: Allow top down cgroup prog ordering Yonghong Song
2025-02-06 23:00 ` [PATCH bpf-next 2/2] selftests/bpf: Add selftests allowing cgroup prog top-down ordering Yonghong Song
2025-02-11 22:57 ` [PATCH bpf-next 1/2] bpf: Allow top down cgroup prog ordering Andrii Nakryiko
2025-02-12  5:35   ` Yonghong Song

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox