Re: [PATCH bpf-next v3 1/2] bpf: Allow pre-ordering for bpf cgroup progs

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Yonghong Song <yonghong.song@linux.dev>
To: Andrii Nakryiko <andrii.nakryiko@gmail.com>
Cc: bpf@vger.kernel.org, Alexei Starovoitov <ast@kernel.org>,
	Andrii Nakryiko <andrii@kernel.org>,
	Daniel Borkmann <daniel@iogearbox.net>,
	kernel-team@fb.com, Martin KaFai Lau <martin.lau@kernel.org>
Subject: Re: [PATCH bpf-next v3 1/2] bpf: Allow pre-ordering for bpf cgroup progs
Date: Mon, 24 Feb 2025 12:59:42 -0800	[thread overview]
Message-ID: <aec4ff8a-1cdf-43ba-ba76-7ed7911d0367@linux.dev> (raw)
In-Reply-To: <CAEf4Bza-Wz6Tsi=h9hwFg1TGbjsYMBi4BDGEnqtMjSj_GpOFNQ@mail.gmail.com>



On 2/24/25 10:41 AM, Andrii Nakryiko wrote:
> On Thu, Feb 13, 2025 at 8:48 AM Yonghong Song <yonghong.song@linux.dev> wrote:
>> Currently for bpf progs in a cgroup hierarchy, the effective prog array
>> is computed from bottom cgroup to upper cgroups (post-ordering). For
>> example, the following cgroup hierarchy
>>      root cgroup: p1, p2
>>          subcgroup: p3, p4
>> have BPF_F_ALLOW_MULTI for both cgroup levels.
>> The effective cgroup array ordering looks like
>>      p3 p4 p1 p2
>> and at run time, progs will execute based on that order.
>>
>> But in some cases, it is desirable to have root prog executes earlier than
>> children progs (pre-ordering). For example,
>>    - prog p1 intends to collect original pkt dest addresses.
>>    - prog p3 will modify original pkt dest addresses to a proxy address for
>>      security reason.
>> The end result is that prog p1 gets proxy address which is not what it
>> wants. Putting p1 to every child cgroup is not desirable either as it
>> will duplicate itself in many child cgroups. And this is exactly a use case
>> we are encountering in Meta.
>>
>> To fix this issue, let us introduce a flag BPF_F_PREORDER. If the flag
>> is specified at attachment time, the prog has higher priority and the
>> ordering with that flag will be from top to bottom (pre-ordering).
>> For example, in the above example,
>>      root cgroup: p1, p2
>>          subcgroup: p3, p4
>> Let us say p2 and p4 are marked with BPF_F_PREORDER. The final
>> effective array ordering will be
>>      p2 p4 p3 p1
>>
>> Suggested-by: Andrii Nakryiko <andrii@kernel.org>
>> Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
>> ---
>>   include/linux/bpf-cgroup.h     |  1 +
>>   include/uapi/linux/bpf.h       |  1 +
>>   kernel/bpf/cgroup.c            | 33 +++++++++++++++++++++++++--------
>>   kernel/bpf/syscall.c           |  3 ++-
>>   tools/include/uapi/linux/bpf.h |  1 +
>>   5 files changed, 30 insertions(+), 9 deletions(-)
>>
> LGTM, see one suggestion below, but it doesn't change the essence
>
> Acked-by: Andrii Nakryiko <andrii@kernel.org>
>
>> diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
>> index 7fc69083e745..9de7adb68294 100644
>> --- a/include/linux/bpf-cgroup.h
>> +++ b/include/linux/bpf-cgroup.h
>> @@ -111,6 +111,7 @@ struct bpf_prog_list {
>>          struct bpf_prog *prog;
>>          struct bpf_cgroup_link *link;
>>          struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE];
>> +       u32 flags;
>>   };
>>
>>   int cgroup_bpf_inherit(struct cgroup *cgrp);
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index fff6cdb8d11a..beac5cdf2d2c 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -1207,6 +1207,7 @@ enum bpf_perf_event_type {
>>   #define BPF_F_BEFORE           (1U << 3)
>>   #define BPF_F_AFTER            (1U << 4)
>>   #define BPF_F_ID               (1U << 5)
>> +#define BPF_F_PREORDER         (1U << 6)
>>   #define BPF_F_LINK             BPF_F_LINK /* 1 << 13 */
>>
>>   /* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the
>> diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
>> index 46e5db65dbc8..31d33058174c 100644
>> --- a/kernel/bpf/cgroup.c
>> +++ b/kernel/bpf/cgroup.c
>> @@ -369,7 +369,7 @@ static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl)
>>   /* count number of elements in the list.
>>    * it's slow but the list cannot be long
>>    */
>> -static u32 prog_list_length(struct hlist_head *head)
>> +static u32 prog_list_length(struct hlist_head *head, int *preorder_cnt)
>>   {
>>          struct bpf_prog_list *pl;
>>          u32 cnt = 0;
>> @@ -377,6 +377,8 @@ static u32 prog_list_length(struct hlist_head *head)
>>          hlist_for_each_entry(pl, head, node) {
>>                  if (!prog_list_prog(pl))
>>                          continue;
>> +               if (preorder_cnt && (pl->flags & BPF_F_PREORDER))
>> +                       (*preorder_cnt)++;
>>                  cnt++;
>>          }
>>          return cnt;
>> @@ -400,7 +402,7 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp,
>>
>>                  if (flags & BPF_F_ALLOW_MULTI)
>>                          return true;
>> -               cnt = prog_list_length(&p->bpf.progs[atype]);
>> +               cnt = prog_list_length(&p->bpf.progs[atype], NULL);
>>                  WARN_ON_ONCE(cnt > 1);
>>                  if (cnt == 1)
>>                          return !!(flags & BPF_F_ALLOW_OVERRIDE);
>> @@ -423,12 +425,12 @@ static int compute_effective_progs(struct cgroup *cgrp,
>>          struct bpf_prog_array *progs;
>>          struct bpf_prog_list *pl;
>>          struct cgroup *p = cgrp;
>> -       int cnt = 0;
>> +       int i, cnt = 0, preorder_cnt = 0, fstart, bstart, init_bstart;
>>
>>          /* count number of effective programs by walking parents */
>>          do {
>>                  if (cnt == 0 || (p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
>> -                       cnt += prog_list_length(&p->bpf.progs[atype]);
>> +                       cnt += prog_list_length(&p->bpf.progs[atype], &preorder_cnt);
>>                  p = cgroup_parent(p);
>>          } while (p);
>>
>> @@ -439,20 +441,34 @@ static int compute_effective_progs(struct cgroup *cgrp,
>>          /* populate the array with effective progs */
>>          cnt = 0;
>>          p = cgrp;
>> +       fstart = preorder_cnt;
>> +       bstart = preorder_cnt - 1;
>>          do {
>>                  if (cnt > 0 && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
>>                          continue;
>>
>> +               init_bstart = bstart;
>>                  hlist_for_each_entry(pl, &p->bpf.progs[atype], node) {
>>                          if (!prog_list_prog(pl))
>>                                  continue;
>>
>> -                       item = &progs->items[cnt];
>> +                       if (pl->flags & BPF_F_PREORDER) {
>> +                               item = &progs->items[bstart];
>> +                               bstart--;
>> +                       } else {
>> +                               item = &progs->items[fstart];
>> +                               fstart++;
>> +                       }
>>                          item->prog = prog_list_prog(pl);
>>                          bpf_cgroup_storages_assign(item->cgroup_storage,
>>                                                     pl->storage);
>>                          cnt++;
>>                  }
>> +
>> +               /* reverse pre-ordering progs at this cgroup level */
>> +               for (i = 0; i < (init_bstart - bstart)/2; i++)
>> +                       swap(progs->items[init_bstart - i], progs->items[bstart + 1 + i]);
> nit: this is a bit mind-bending to read and verify, let's do it a bit
> more bullet-proof way:
>
> for (i = bstart + 1, j = init_bstart; i < j; i++, j--)
>      swap(progs->items[i], progs->items[j]);
>
> ?

Sound good to me. Will send v4 soon.

>
>> +
>>          } while ((p = cgroup_parent(p)));
>>
>>          *array = progs;
>> @@ -663,7 +679,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
>>                   */
>>                  return -EPERM;
>>
>> -       if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
>> +       if (prog_list_length(progs, NULL) >= BPF_CGROUP_MAX_PROGS)
>>                  return -E2BIG;
>>
>>          pl = find_attach_entry(progs, prog, link, replace_prog,

[...]

     prev parent reply	other threads:[~2025-02-24 20:59 UTC|newest]

Thread overview: 4+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-02-13 16:48 [PATCH bpf-next v3 1/2] bpf: Allow pre-ordering for bpf cgroup progs Yonghong Song
2025-02-13 16:48 ` [PATCH bpf-next v3 2/2] selftests/bpf: Add selftests allowing cgroup prog pre-ordering Yonghong Song
2025-02-24 18:41 ` [PATCH bpf-next v3 1/2] bpf: Allow pre-ordering for bpf cgroup progs Andrii Nakryiko
2025-02-24 20:59   ` Yonghong Song [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=aec4ff8a-1cdf-43ba-ba76-7ed7911d0367@linux.dev \
    --to=yonghong.song@linux.dev \
    --cc=andrii.nakryiko@gmail.com \
    --cc=andrii@kernel.org \
    --cc=ast@kernel.org \
    --cc=bpf@vger.kernel.org \
    --cc=daniel@iogearbox.net \
    --cc=kernel-team@fb.com \
    --cc=martin.lau@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.