From: Stanislav Fomichev <sdf@google.com>
To: bpf@vger.kernel.org
Cc: ast@kernel.org, daniel@iogearbox.net, andrii@kernel.org,
martin.lau@linux.dev, song@kernel.org, yhs@fb.com,
john.fastabend@gmail.com, kpsingh@kernel.org, sdf@google.com,
haoluo@google.com, jolsa@kernel.org
Subject: [PATCH bpf-next] RFC: bpf: query effective progs without cgroup_mutex
Date: Thu, 11 May 2023 08:57:35 -0700
Message-ID: <20230511155740.902548-1-sdf@google.com>

We're observing occasional stalls on heavily loaded machines in the
cgroup_bpf_prog_query path, most likely caused by contention on
cgroup_mutex.

IIUC, cgroup_mutex is there mostly to protect the non-effective
fields (cgrp->bpf.progs), which can be changed by the update path.
For the BPF_F_QUERY_EFFECTIVE case, all we need is to rcu_dereference
a bunch of pointers (and keep them around for consistency), so
let's do that.
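
Roughly, the effective-query path then boils down to the following
(a simplified sketch of the idea, not the exact code in the diff):

	rcu_read_lock();
	for (atype = from_atype; atype <= to_atype; atype++) {
		struct bpf_prog_array *effective;

		effective = rcu_dereference(cgrp->bpf.effective[atype]);
		cnt += bpf_prog_array_length(effective);
		/* ... later, copy up to cnt prog ids back to userspace;
		 * if a concurrent detach shrank the arrays under us,
		 * report -EAGAIN ...
		 */
	}
	rcu_read_unlock();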

Sending this out as an RFC because it looks a bit ugly. It would also
be nice to handle the non-effective case locklessly, but that might
require a larger rework.

Signed-off-by: Stanislav Fomichev <sdf@google.com>
---
 include/linux/bpf-cgroup-defs.h |  2 +-
 include/linux/bpf-cgroup.h      |  1 +
 kernel/bpf/cgroup.c             | 82 +++++++++++++++++-----------------
 3 files changed, 45 insertions(+), 40 deletions(-)

diff --git a/include/linux/bpf-cgroup-defs.h b/include/linux/bpf-cgroup-defs.h
index 7b121bd780eb..df0b8faa1a17 100644
--- a/include/linux/bpf-cgroup-defs.h
+++ b/include/linux/bpf-cgroup-defs.h
@@ -56,7 +56,7 @@ struct cgroup_bpf {
* have either zero or one element
* when BPF_F_ALLOW_MULTI the list can have up to BPF_CGROUP_MAX_PROGS
*/
- struct hlist_head progs[MAX_CGROUP_BPF_ATTACH_TYPE];
+ struct hlist_head __rcu progs[MAX_CGROUP_BPF_ATTACH_TYPE];
u8 flags[MAX_CGROUP_BPF_ATTACH_TYPE];
/* list of cgroup shared storages */
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 57e9e109257e..555e9cbb3a05 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -106,6 +106,7 @@ struct bpf_prog_list {
struct bpf_prog *prog;
struct bpf_cgroup_link *link;
struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE];
+ struct rcu_head rcu;
};
int cgroup_bpf_inherit(struct cgroup *cgrp);
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index a06e118a9be5..92a1b33dcc06 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -285,12 +285,15 @@ static void cgroup_bpf_release(struct work_struct *work)
mutex_lock(&cgroup_mutex);
for (atype = 0; atype < ARRAY_SIZE(cgrp->bpf.progs); atype++) {
- struct hlist_head *progs = &cgrp->bpf.progs[atype];
+ struct hlist_head *progs;
struct bpf_prog_list *pl;
struct hlist_node *pltmp;
+ progs = rcu_dereference_protected(&cgrp->bpf.progs[atype],
+ lockdep_is_held(&cgroup_mutex));
+
hlist_for_each_entry_safe(pl, pltmp, progs, node) {
- hlist_del(&pl->node);
+ hlist_del_rcu(&pl->node);
if (pl->prog) {
if (pl->prog->expected_attach_type == BPF_LSM_CGROUP)
bpf_trampoline_unlink_cgroup_shim(pl->prog);
@@ -301,7 +304,7 @@ static void cgroup_bpf_release(struct work_struct *work)
bpf_trampoline_unlink_cgroup_shim(pl->link->link.prog);
bpf_cgroup_link_auto_detach(pl->link);
}
- kfree(pl);
+ kfree_rcu(pl, rcu);
static_branch_dec(&cgroup_bpf_enabled_key[atype]);
}
old_array = rcu_dereference_protected(
@@ -352,12 +355,12 @@ static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl)
/* count number of elements in the list.
* it's slow but the list cannot be long
*/
-static u32 prog_list_length(struct hlist_head *head)
+static u32 prog_list_length(struct hlist_head __rcu *head)
{
struct bpf_prog_list *pl;
u32 cnt = 0;
- hlist_for_each_entry(pl, head, node) {
+ hlist_for_each_entry_rcu(pl, head, node) {
if (!prog_list_prog(pl))
continue;
cnt++;
@@ -553,7 +556,7 @@ static int update_effective_progs(struct cgroup *cgrp,
#define BPF_CGROUP_MAX_PROGS 64
-static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs,
+static struct bpf_prog_list *find_attach_entry(struct hlist_head __rcu *progs,
struct bpf_prog *prog,
struct bpf_cgroup_link *link,
struct bpf_prog *replace_prog,
@@ -565,10 +568,10 @@ static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs,
if (!allow_multi) {
if (hlist_empty(progs))
return NULL;
- return hlist_entry(progs->first, typeof(*pl), node);
+ return hlist_entry(rcu_dereference_raw(progs)->first, typeof(*pl), node);
}
- hlist_for_each_entry(pl, progs, node) {
+ hlist_for_each_entry_rcu(pl, progs, node) {
if (prog && pl->prog == prog && prog != replace_prog)
/* disallow attaching the same prog twice */
return ERR_PTR(-EINVAL);
@@ -579,7 +582,7 @@ static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs,
/* direct prog multi-attach w/ replacement case */
if (replace_prog) {
- hlist_for_each_entry(pl, progs, node) {
+ hlist_for_each_entry_rcu(pl, progs, node) {
if (pl->prog == replace_prog)
/* a match found */
return pl;
@@ -615,8 +618,8 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
struct bpf_prog *new_prog = prog ? : link->link.prog;
enum cgroup_bpf_attach_type atype;
+ struct hlist_head __rcu *progs;
struct bpf_prog_list *pl;
- struct hlist_head *progs;
int err;
if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) ||
@@ -658,6 +661,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
prog ? : link->link.prog, cgrp))
return -ENOMEM;
+ WRITE_ONCE(cgrp->bpf.flags[atype], saved_flags);
if (pl) {
old_prog = pl->prog;
} else {
@@ -669,12 +673,12 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
return -ENOMEM;
}
if (hlist_empty(progs))
- hlist_add_head(&pl->node, progs);
+ hlist_add_head_rcu(&pl->node, progs);
else
- hlist_for_each(last, progs) {
+ __hlist_for_each_rcu(last, progs) {
if (last->next)
continue;
- hlist_add_behind(&pl->node, last);
+ hlist_add_behind_rcu(&pl->node, last);
break;
}
}
@@ -682,7 +686,6 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
pl->prog = prog;
pl->link = link;
bpf_cgroup_storages_assign(pl->storage, storage);
- cgrp->bpf.flags[atype] = saved_flags;
if (type == BPF_LSM_CGROUP) {
err = bpf_trampoline_link_cgroup_shim(new_prog, atype);
@@ -796,7 +799,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp,
enum cgroup_bpf_attach_type atype;
struct bpf_prog *old_prog;
struct bpf_prog_list *pl;
- struct hlist_head *progs;
+ struct hlist_head __rcu *progs;
bool found = false;
atype = bpf_cgroup_atype_find(link->type, new_prog->aux->attach_btf_id);
@@ -808,7 +811,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp,
if (link->link.prog->type != new_prog->type)
return -EINVAL;
- hlist_for_each_entry(pl, progs, node) {
+ hlist_for_each_entry_rcu(pl, progs, node) {
if (pl->link == link) {
found = true;
break;
@@ -847,7 +850,7 @@ static int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *new_prog,
return ret;
}
-static struct bpf_prog_list *find_detach_entry(struct hlist_head *progs,
+static struct bpf_prog_list *find_detach_entry(struct hlist_head __rcu *progs,
struct bpf_prog *prog,
struct bpf_cgroup_link *link,
bool allow_multi)
@@ -862,7 +865,7 @@ static struct bpf_prog_list *find_detach_entry(struct hlist_head *progs,
/* to maintain backward compatibility NONE and OVERRIDE cgroups
* allow detaching with invalid FD (prog==NULL) in legacy mode
*/
- return hlist_entry(progs->first, typeof(*pl), node);
+ return hlist_entry(rcu_dereference_raw(progs)->first, typeof(*pl), node);
}
if (!prog && !link)
@@ -872,7 +875,7 @@ static struct bpf_prog_list *find_detach_entry(struct hlist_head *progs,
return ERR_PTR(-EINVAL);
/* find the prog or link and detach it */
- hlist_for_each_entry(pl, progs, node) {
+ hlist_for_each_entry_rcu(pl, progs, node) {
if (pl->prog == prog && pl->link == link)
return pl;
}
@@ -894,9 +897,9 @@ static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog,
enum cgroup_bpf_attach_type atype)
{
struct cgroup_subsys_state *css;
+ struct hlist_head __rcu *head;
struct bpf_prog_array *progs;
struct bpf_prog_list *pl;
- struct hlist_head *head;
struct cgroup *cg;
int pos;
@@ -913,7 +916,7 @@ static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog,
continue;
head = &cg->bpf.progs[atype];
- hlist_for_each_entry(pl, head, node) {
+ hlist_for_each_entry_rcu(pl, head, node) {
if (!prog_list_prog(pl))
continue;
if (pl->prog == prog && pl->link == link)
@@ -950,9 +953,9 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
struct bpf_cgroup_link *link, enum bpf_attach_type type)
{
enum cgroup_bpf_attach_type atype;
+ struct hlist_head __rcu *progs;
struct bpf_prog *old_prog;
struct bpf_prog_list *pl;
- struct hlist_head *progs;
u32 attach_btf_id = 0;
u32 flags;
@@ -989,12 +992,12 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
}
/* now can actually delete it from this cgroup list */
- hlist_del(&pl->node);
- kfree(pl);
+ hlist_del_rcu(&pl->node);
+ kfree_rcu(pl, rcu);
if (hlist_empty(progs))
/* last program was detached, reset flags to zero */
- cgrp->bpf.flags[atype] = 0;
+ WRITE_ONCE(cgrp->bpf.flags[atype], 0);
if (old_prog) {
if (type == BPF_LSM_CGROUP)
bpf_trampoline_unlink_cgroup_shim(old_prog);
@@ -1022,10 +1025,10 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
__u32 __user *prog_attach_flags = u64_to_user_ptr(attr->query.prog_attach_flags);
bool effective_query = attr->query.query_flags & BPF_F_QUERY_EFFECTIVE;
__u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
+ struct bpf_prog_array *effective[MAX_CGROUP_BPF_ATTACH_TYPE];
enum bpf_attach_type type = attr->query.attach_type;
enum cgroup_bpf_attach_type from_atype, to_atype;
enum cgroup_bpf_attach_type atype;
- struct bpf_prog_array *effective;
int cnt, ret = 0, i;
int total_cnt = 0;
u32 flags;
@@ -1046,14 +1049,15 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
if (from_atype < 0)
return -EINVAL;
to_atype = from_atype;
- flags = cgrp->bpf.flags[from_atype];
+ flags = READ_ONCE(cgrp->bpf.flags[from_atype]);
}
for (atype = from_atype; atype <= to_atype; atype++) {
if (effective_query) {
- effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
- lockdep_is_held(&cgroup_mutex));
- total_cnt += bpf_prog_array_length(effective);
+ effective[atype] = rcu_dereference_protected(
+ cgrp->bpf.effective[atype],
+ lockdep_is_held(&cgroup_mutex));
+ total_cnt += bpf_prog_array_length(effective[atype]);
} else {
total_cnt += prog_list_length(&cgrp->bpf.progs[atype]);
}
@@ -1076,12 +1080,10 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
for (atype = from_atype; atype <= to_atype && total_cnt; atype++) {
if (effective_query) {
- effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
- lockdep_is_held(&cgroup_mutex));
- cnt = min_t(int, bpf_prog_array_length(effective), total_cnt);
- ret = bpf_prog_array_copy_to_user(effective, prog_ids, cnt);
+ cnt = min_t(int, bpf_prog_array_length(effective[atype]), total_cnt);
+ ret = bpf_prog_array_copy_to_user(effective[atype], prog_ids, cnt);
} else {
- struct hlist_head *progs;
+ struct hlist_head __rcu *progs;
struct bpf_prog_list *pl;
struct bpf_prog *prog;
u32 id;
@@ -1089,7 +1091,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
progs = &cgrp->bpf.progs[atype];
cnt = min_t(int, prog_list_length(progs), total_cnt);
i = 0;
- hlist_for_each_entry(pl, progs, node) {
+ hlist_for_each_entry_rcu(pl, progs, node) {
prog = prog_list_prog(pl);
id = prog->aux->id;
if (copy_to_user(prog_ids + i, &id, sizeof(id)))
@@ -1099,7 +1101,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
}
if (prog_attach_flags) {
- flags = cgrp->bpf.flags[atype];
+ flags = READ_ONCE(cgrp->bpf.flags[atype]);
for (i = 0; i < cnt; i++)
if (copy_to_user(prog_attach_flags + i,
@@ -1112,6 +1114,8 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
prog_ids += cnt;
total_cnt -= cnt;
}
+ if (total_cnt != 0)
+ return -EAGAIN; /* raced with the detach */
return ret;
}
@@ -1120,9 +1124,9 @@ static int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
{
int ret;
- mutex_lock(&cgroup_mutex);
+ rcu_read_lock();
ret = __cgroup_bpf_query(cgrp, attr, uattr);
- mutex_unlock(&cgroup_mutex);
+ rcu_read_unlock();
return ret;
}
--
2.40.1.521.gf1e218fcd8-goog