* [PATCH bpf-next] RFC: bpf: query effective progs without cgroup_mutex
@ 2023-05-05 18:45 Stanislav Fomichev
2023-05-05 18:47 ` Stanislav Fomichev
From: Stanislav Fomichev @ 2023-05-05 18:45 UTC (permalink / raw)
To: bpf
Cc: ast, daniel, andrii, martin.lau, song, yhs, john.fastabend,
kpsingh, sdf, haoluo, jolsa
We're observing some stalls on the heavily loaded machines
in the cgroup_bpf_prog_query path. This is likely due to
being blocked on cgroup_mutex.
IIUC, the cgroup_mutex is there mostly to protect the non-effective
fields (cgrp->bpf.progs) which might be changed by the update path.
For the BPF_F_QUERY_EFFECTIVE case, all we need is to rcu_dereference
a bunch of pointers (and keep them around for consistency), so
let's do it.
Sending out as an RFC because it looks a bit ugly. It would be
nice to handle the non-effective case locklessly as well, but that
might require a larger rework.
Signed-off-by: Stanislav Fomichev <sdf@google.com>
---
kernel/bpf/cgroup.c | 30 ++++++++++++++++++++----------
1 file changed, 20 insertions(+), 10 deletions(-)
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index a06e118a9be5..c9d4b66e2c15 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1022,10 +1022,10 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
__u32 __user *prog_attach_flags = u64_to_user_ptr(attr->query.prog_attach_flags);
bool effective_query = attr->query.query_flags & BPF_F_QUERY_EFFECTIVE;
__u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
+ struct bpf_prog_array *effective[MAX_CGROUP_BPF_ATTACH_TYPE];
enum bpf_attach_type type = attr->query.attach_type;
enum cgroup_bpf_attach_type from_atype, to_atype;
enum cgroup_bpf_attach_type atype;
- struct bpf_prog_array *effective;
int cnt, ret = 0, i;
int total_cnt = 0;
u32 flags;
@@ -1051,9 +1051,9 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
for (atype = from_atype; atype <= to_atype; atype++) {
if (effective_query) {
- effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
- lockdep_is_held(&cgroup_mutex));
- total_cnt += bpf_prog_array_length(effective);
+ effective[atype] = rcu_dereference_protected(cgrp->bpf.effective[atype],
+ lockdep_is_held(&cgroup_mutex));
+ total_cnt += bpf_prog_array_length(effective[atype]);
} else {
total_cnt += prog_list_length(&cgrp->bpf.progs[atype]);
}
@@ -1076,10 +1076,8 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
for (atype = from_atype; atype <= to_atype && total_cnt; atype++) {
if (effective_query) {
- effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
- lockdep_is_held(&cgroup_mutex));
- cnt = min_t(int, bpf_prog_array_length(effective), total_cnt);
- ret = bpf_prog_array_copy_to_user(effective, prog_ids, cnt);
+ cnt = min_t(int, bpf_prog_array_length(effective[atype]), total_cnt);
+ ret = bpf_prog_array_copy_to_user(effective[atype], prog_ids, cnt);
} else {
struct hlist_head *progs;
struct bpf_prog_list *pl;
@@ -1118,11 +1116,23 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
static int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
+ __u32 __user *prog_attach_flags = u64_to_user_ptr(attr->query.prog_attach_flags);
+ bool effective_query = attr->query.query_flags & BPF_F_QUERY_EFFECTIVE;
+ bool need_mutex = false;
int ret;
- mutex_lock(&cgroup_mutex);
+ if (effective_query && !prog_attach_flags)
+ need_mutex = false;
+
+ if (need_mutex)
+ mutex_lock(&cgroup_mutex);
+ else
+ rcu_read_lock();
ret = __cgroup_bpf_query(cgrp, attr, uattr);
- mutex_unlock(&cgroup_mutex);
+ if (need_mutex)
+ mutex_unlock(&cgroup_mutex);
+ else
+ rcu_read_unlock();
return ret;
}
--
2.40.1.521.gf1e218fcd8-goog
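For context, a rough sketch of the userspace side of the query path discussed
above: a raw bpf(2) BPF_PROG_QUERY call with BPF_F_QUERY_EFFECTIVE, which is
what ends up in cgroup_bpf_prog_query() and used to contend on cgroup_mutex.
The cgroup path, attach type, and buffer size are placeholders for
illustration only; they are not taken from the patch.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

int main(void)
{
        __u32 prog_ids[64] = {0};
        union bpf_attr attr;
        int cgroup_fd;

        /* placeholder path; any cgroup v2 directory fd works as target_fd */
        cgroup_fd = open("/sys/fs/cgroup", O_RDONLY);
        if (cgroup_fd < 0)
                return 1;

        memset(&attr, 0, sizeof(attr));
        attr.query.target_fd = cgroup_fd;
        attr.query.attach_type = BPF_CGROUP_INET_INGRESS;
        attr.query.query_flags = BPF_F_QUERY_EFFECTIVE;
        attr.query.prog_ids = (__u64)(unsigned long)prog_ids;
        attr.query.prog_cnt = sizeof(prog_ids) / sizeof(prog_ids[0]);

        /* this syscall is the path that was blocking on cgroup_mutex */
        if (syscall(__NR_bpf, BPF_PROG_QUERY, &attr, sizeof(attr)) < 0) {
                perror("BPF_PROG_QUERY");
                return 1;
        }

        printf("%u effective prog(s) attached\n", attr.query.prog_cnt);
        return 0;
}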
* Re: [PATCH bpf-next] RFC: bpf: query effective progs without cgroup_mutex
2023-05-05 18:45 [PATCH bpf-next] RFC: bpf: query effective progs without cgroup_mutex Stanislav Fomichev
@ 2023-05-05 18:47 ` Stanislav Fomichev
2023-05-05 21:35 ` Stanislav Fomichev
From: Stanislav Fomichev @ 2023-05-05 18:47 UTC (permalink / raw)
To: bpf
Cc: ast, daniel, andrii, martin.lau, song, yhs, john.fastabend,
kpsingh, haoluo, jolsa
On Fri, May 5, 2023 at 11:45 AM Stanislav Fomichev <sdf@google.com> wrote:
>
> We're observing some stalls on the heavily loaded machines
> in the cgroup_bpf_prog_query path. This is likely due to
> being blocked on cgroup_mutex.
>
> IIUC, the cgroup_mutex is there mostly to protect the non-effective
> fields (cgrp->bpf.progs) which might be changed by the update path.
> For the BPF_F_QUERY_EFFECTIVE case, all we need is to rcu_dereference
> a bunch of pointers (and keep them around for consistency), so
> let's do it.
>
> Sending out as an RFC because it looks a bit ugly. It would be
> nice to handle the non-effective case locklessly as well, but that
> might require a larger rework.
>
> Signed-off-by: Stanislav Fomichev <sdf@google.com>
> ---
> kernel/bpf/cgroup.c | 30 ++++++++++++++++++++----------
> 1 file changed, 20 insertions(+), 10 deletions(-)
>
> diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
> index a06e118a9be5..c9d4b66e2c15 100644
> --- a/kernel/bpf/cgroup.c
> +++ b/kernel/bpf/cgroup.c
> @@ -1022,10 +1022,10 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
> __u32 __user *prog_attach_flags = u64_to_user_ptr(attr->query.prog_attach_flags);
> bool effective_query = attr->query.query_flags & BPF_F_QUERY_EFFECTIVE;
> __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
> + struct bpf_prog_array *effective[MAX_CGROUP_BPF_ATTACH_TYPE];
> enum bpf_attach_type type = attr->query.attach_type;
> enum cgroup_bpf_attach_type from_atype, to_atype;
> enum cgroup_bpf_attach_type atype;
> - struct bpf_prog_array *effective;
> int cnt, ret = 0, i;
> int total_cnt = 0;
> u32 flags;
> @@ -1051,9 +1051,9 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
>
> for (atype = from_atype; atype <= to_atype; atype++) {
> if (effective_query) {
> - effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
> - lockdep_is_held(&cgroup_mutex));
> - total_cnt += bpf_prog_array_length(effective);
> + effective[atype] = rcu_dereference_protected(cgrp->bpf.effective[atype],
> + lockdep_is_held(&cgroup_mutex));
> + total_cnt += bpf_prog_array_length(effective[atype]);
> } else {
> total_cnt += prog_list_length(&cgrp->bpf.progs[atype]);
> }
> @@ -1076,10 +1076,8 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
>
> for (atype = from_atype; atype <= to_atype && total_cnt; atype++) {
> if (effective_query) {
> - effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
> - lockdep_is_held(&cgroup_mutex));
> - cnt = min_t(int, bpf_prog_array_length(effective), total_cnt);
> - ret = bpf_prog_array_copy_to_user(effective, prog_ids, cnt);
> + cnt = min_t(int, bpf_prog_array_length(effective[atype]), total_cnt);
> + ret = bpf_prog_array_copy_to_user(effective[atype], prog_ids, cnt);
> } else {
> struct hlist_head *progs;
> struct bpf_prog_list *pl;
> @@ -1118,11 +1116,23 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
> static int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
> union bpf_attr __user *uattr)
> {
> + __u32 __user *prog_attach_flags = u64_to_user_ptr(attr->query.prog_attach_flags);
> + bool effective_query = attr->query.query_flags & BPF_F_QUERY_EFFECTIVE;
> + bool need_mutex = false;
Oops, this has to be true, but you get the idea...
> int ret;
>
> - mutex_lock(&cgroup_mutex);
> + if (effective_query && !prog_attach_flags)
> + need_mutex = false;
> +
> + if (need_mutex)
> + mutex_lock(&cgroup_mutex);
> + else
> + rcu_read_lock();
> ret = __cgroup_bpf_query(cgrp, attr, uattr);
> - mutex_unlock(&cgroup_mutex);
> + if (need_mutex)
> + mutex_unlock(&cgroup_mutex);
> + else
> + rcu_read_unlock();
> return ret;
> }
>
> --
> 2.40.1.521.gf1e218fcd8-goog
>
* Re: [PATCH bpf-next] RFC: bpf: query effective progs without cgroup_mutex
2023-05-05 18:47 ` Stanislav Fomichev
@ 2023-05-05 21:35 ` Stanislav Fomichev
From: Stanislav Fomichev @ 2023-05-05 21:35 UTC (permalink / raw)
To: bpf
Cc: ast, daniel, andrii, martin.lau, song, yhs, john.fastabend,
kpsingh, haoluo, jolsa
On 05/05, Stanislav Fomichev wrote:
> On Fri, May 5, 2023 at 11:45 AM Stanislav Fomichev <sdf@google.com> wrote:
> >
> > We're observing some stalls on the heavily loaded machines
> > in the cgroup_bpf_prog_query path. This is likely due to
> > being blocked on cgroup_mutex.
> >
> > IIUC, the cgroup_mutex is there mostly to protect the non-effective
> > fields (cgrp->bpf.progs) which might be changed by the update path.
> > For the BPF_F_QUERY_EFFECTIVE case, all we need is to rcu_dereference
> > a bunch of pointers (and keep them around for consistency), so
> > let's do it.
> >
> > Sending out as an RFC because it looks a bit ugly. It would be
> > nice to handle the non-effective case locklessly as well, but that
> > might require a larger rework.
> >
> > Signed-off-by: Stanislav Fomichev <sdf@google.com>
Ah, that won't work: we can't copy to user space while holding the RCU
read lock, since copy_to_user() may fault and sleep.
Maybe we can even go as far as converting the progs list to RCU.
This should allow us to drop the mutex altogether during the query path.
As long as we return EAGAIN on a racy detach, we should be fine, it seems.
Will try to play with this idea.
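If the query path does start returning -EAGAIN when it races with a detach,
as suggested above, userspace callers would want a small retry loop. The
sketch below shows what that could look like; the helper name and calling
convention are made up for illustration, and it assumes the usual headers for
a raw bpf(2) caller (<errno.h>, <string.h>, <unistd.h>, <sys/syscall.h>,
<linux/bpf.h>).

static int query_effective_retry(int cgroup_fd, enum bpf_attach_type type,
                                 __u32 *prog_ids, __u32 *prog_cnt)
{
        union bpf_attr attr;
        int err;

        do {
                memset(&attr, 0, sizeof(attr));
                attr.query.target_fd = cgroup_fd;
                attr.query.attach_type = type;
                attr.query.query_flags = BPF_F_QUERY_EFFECTIVE;
                attr.query.prog_ids = (__u64)(unsigned long)prog_ids;
                attr.query.prog_cnt = *prog_cnt;

                err = syscall(__NR_bpf, BPF_PROG_QUERY, &attr, sizeof(attr));
                /* -EAGAIN would mean the snapshot raced with a detach; retry */
        } while (err < 0 && errno == EAGAIN);

        if (!err)
                *prog_cnt = attr.query.prog_cnt;
        return err;
}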
* [PATCH bpf-next] RFC: bpf: query effective progs without cgroup_mutex
@ 2023-05-11 15:57 Stanislav Fomichev
2023-05-11 15:59 ` Stanislav Fomichev
From: Stanislav Fomichev @ 2023-05-11 15:57 UTC (permalink / raw)
To: bpf
Cc: ast, daniel, andrii, martin.lau, song, yhs, john.fastabend,
kpsingh, sdf, haoluo, jolsa
We're observing some stalls on the heavily loaded machines
in the cgroup_bpf_prog_query path. This is likely due to
being blocked on cgroup_mutex.
IIUC, the cgroup_mutex is there mostly to protect the non-effective
fields (cgrp->bpf.progs) which might be changed by the update path.
For the BPF_F_QUERY_EFFECTIVE case, all we need is to rcu_dereference
a bunch of pointers (and keep them around for consistency), so
let's do it.
Sending out as an RFC because it looks a bit ugly. It would be
nice to handle the non-effective case locklessly as well, but that
might require a larger rework.
Signed-off-by: Stanislav Fomichev <sdf@google.com>
---
include/linux/bpf-cgroup-defs.h | 2 +-
include/linux/bpf-cgroup.h | 1 +
kernel/bpf/cgroup.c | 80 +++++++++++++++++----------------
3 files changed, 44 insertions(+), 39 deletions(-)
diff --git a/include/linux/bpf-cgroup-defs.h b/include/linux/bpf-cgroup-defs.h
index 7b121bd780eb..df0b8faa1a17 100644
--- a/include/linux/bpf-cgroup-defs.h
+++ b/include/linux/bpf-cgroup-defs.h
@@ -56,7 +56,7 @@ struct cgroup_bpf {
* have either zero or one element
* when BPF_F_ALLOW_MULTI the list can have up to BPF_CGROUP_MAX_PROGS
*/
- struct hlist_head progs[MAX_CGROUP_BPF_ATTACH_TYPE];
+ struct hlist_head __rcu progs[MAX_CGROUP_BPF_ATTACH_TYPE];
u8 flags[MAX_CGROUP_BPF_ATTACH_TYPE];
/* list of cgroup shared storages */
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 57e9e109257e..555e9cbb3a05 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -106,6 +106,7 @@ struct bpf_prog_list {
struct bpf_prog *prog;
struct bpf_cgroup_link *link;
struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE];
+ struct rcu_head rcu;
};
int cgroup_bpf_inherit(struct cgroup *cgrp);
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index a06e118a9be5..92a1b33dcc06 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -285,12 +285,15 @@ static void cgroup_bpf_release(struct work_struct *work)
mutex_lock(&cgroup_mutex);
for (atype = 0; atype < ARRAY_SIZE(cgrp->bpf.progs); atype++) {
- struct hlist_head *progs = &cgrp->bpf.progs[atype];
+ struct hlist_head *progs;
struct bpf_prog_list *pl;
struct hlist_node *pltmp;
+ progs = rcu_dereference_protected(&cgrp->bpf.progs[atype],
+ lockdep_is_held(&cgroup_mutex));
+
hlist_for_each_entry_safe(pl, pltmp, progs, node) {
- hlist_del(&pl->node);
+ hlist_del_rcu(&pl->node);
if (pl->prog) {
if (pl->prog->expected_attach_type == BPF_LSM_CGROUP)
bpf_trampoline_unlink_cgroup_shim(pl->prog);
@@ -301,7 +304,7 @@ static void cgroup_bpf_release(struct work_struct *work)
bpf_trampoline_unlink_cgroup_shim(pl->link->link.prog);
bpf_cgroup_link_auto_detach(pl->link);
}
- kfree(pl);
+ kfree_rcu(pl, rcu);
static_branch_dec(&cgroup_bpf_enabled_key[atype]);
}
old_array = rcu_dereference_protected(
@@ -352,12 +355,12 @@ static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl)
/* count number of elements in the list.
* it's slow but the list cannot be long
*/
-static u32 prog_list_length(struct hlist_head *head)
+static u32 prog_list_length(struct hlist_head __rcu *head)
{
struct bpf_prog_list *pl;
u32 cnt = 0;
- hlist_for_each_entry(pl, head, node) {
+ hlist_for_each_entry_rcu(pl, head, node) {
if (!prog_list_prog(pl))
continue;
cnt++;
@@ -553,7 +556,7 @@ static int update_effective_progs(struct cgroup *cgrp,
#define BPF_CGROUP_MAX_PROGS 64
-static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs,
+static struct bpf_prog_list *find_attach_entry(struct hlist_head __rcu *progs,
struct bpf_prog *prog,
struct bpf_cgroup_link *link,
struct bpf_prog *replace_prog,
@@ -565,10 +568,10 @@ static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs,
if (!allow_multi) {
if (hlist_empty(progs))
return NULL;
- return hlist_entry(progs->first, typeof(*pl), node);
+ return hlist_entry(rcu_dereference_raw(progs)->first, typeof(*pl), node);
}
- hlist_for_each_entry(pl, progs, node) {
+ hlist_for_each_entry_rcu(pl, progs, node) {
if (prog && pl->prog == prog && prog != replace_prog)
/* disallow attaching the same prog twice */
return ERR_PTR(-EINVAL);
@@ -579,7 +582,7 @@ static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs,
/* direct prog multi-attach w/ replacement case */
if (replace_prog) {
- hlist_for_each_entry(pl, progs, node) {
+ hlist_for_each_entry_rcu(pl, progs, node) {
if (pl->prog == replace_prog)
/* a match found */
return pl;
@@ -615,8 +618,8 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
struct bpf_prog *new_prog = prog ? : link->link.prog;
enum cgroup_bpf_attach_type atype;
+ struct hlist_head __rcu *progs;
struct bpf_prog_list *pl;
- struct hlist_head *progs;
int err;
if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) ||
@@ -658,6 +661,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
prog ? : link->link.prog, cgrp))
return -ENOMEM;
+ WRITE_ONCE(cgrp->bpf.flags[atype], saved_flags);
if (pl) {
old_prog = pl->prog;
} else {
@@ -669,12 +673,12 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
return -ENOMEM;
}
if (hlist_empty(progs))
- hlist_add_head(&pl->node, progs);
+ hlist_add_head_rcu(&pl->node, progs);
else
- hlist_for_each(last, progs) {
+ __hlist_for_each_rcu(last, progs) {
if (last->next)
continue;
- hlist_add_behind(&pl->node, last);
+ hlist_add_behind_rcu(&pl->node, last);
break;
}
}
@@ -682,7 +686,6 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
pl->prog = prog;
pl->link = link;
bpf_cgroup_storages_assign(pl->storage, storage);
- cgrp->bpf.flags[atype] = saved_flags;
if (type == BPF_LSM_CGROUP) {
err = bpf_trampoline_link_cgroup_shim(new_prog, atype);
@@ -796,7 +799,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp,
enum cgroup_bpf_attach_type atype;
struct bpf_prog *old_prog;
struct bpf_prog_list *pl;
- struct hlist_head *progs;
+ struct hlist_head __rcu *progs;
bool found = false;
atype = bpf_cgroup_atype_find(link->type, new_prog->aux->attach_btf_id);
@@ -808,7 +811,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp,
if (link->link.prog->type != new_prog->type)
return -EINVAL;
- hlist_for_each_entry(pl, progs, node) {
+ hlist_for_each_entry_rcu(pl, progs, node) {
if (pl->link == link) {
found = true;
break;
@@ -847,7 +850,7 @@ static int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *new_prog,
return ret;
}
-static struct bpf_prog_list *find_detach_entry(struct hlist_head *progs,
+static struct bpf_prog_list *find_detach_entry(struct hlist_head __rcu *progs,
struct bpf_prog *prog,
struct bpf_cgroup_link *link,
bool allow_multi)
@@ -862,7 +865,7 @@ static struct bpf_prog_list *find_detach_entry(struct hlist_head *progs,
/* to maintain backward compatibility NONE and OVERRIDE cgroups
* allow detaching with invalid FD (prog==NULL) in legacy mode
*/
- return hlist_entry(progs->first, typeof(*pl), node);
+ return hlist_entry(rcu_dereference_raw(progs)->first, typeof(*pl), node);
}
if (!prog && !link)
@@ -872,7 +875,7 @@ static struct bpf_prog_list *find_detach_entry(struct hlist_head *progs,
return ERR_PTR(-EINVAL);
/* find the prog or link and detach it */
- hlist_for_each_entry(pl, progs, node) {
+ hlist_for_each_entry_rcu(pl, progs, node) {
if (pl->prog == prog && pl->link == link)
return pl;
}
@@ -894,9 +897,9 @@ static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog,
enum cgroup_bpf_attach_type atype)
{
struct cgroup_subsys_state *css;
+ struct hlist_head __rcu *head;
struct bpf_prog_array *progs;
struct bpf_prog_list *pl;
- struct hlist_head *head;
struct cgroup *cg;
int pos;
@@ -913,7 +916,7 @@ static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog,
continue;
head = &cg->bpf.progs[atype];
- hlist_for_each_entry(pl, head, node) {
+ hlist_for_each_entry_rcu(pl, head, node) {
if (!prog_list_prog(pl))
continue;
if (pl->prog == prog && pl->link == link)
@@ -950,9 +953,9 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
struct bpf_cgroup_link *link, enum bpf_attach_type type)
{
enum cgroup_bpf_attach_type atype;
+ struct hlist_head __rcu *progs;
struct bpf_prog *old_prog;
struct bpf_prog_list *pl;
- struct hlist_head *progs;
u32 attach_btf_id = 0;
u32 flags;
@@ -989,12 +992,12 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
}
/* now can actually delete it from this cgroup list */
- hlist_del(&pl->node);
+ hlist_del_rcu(&pl->node);
kfree(pl);
if (hlist_empty(progs))
/* last program was detached, reset flags to zero */
- cgrp->bpf.flags[atype] = 0;
+ WRITE_ONCE(cgrp->bpf.flags[atype], 0);
if (old_prog) {
if (type == BPF_LSM_CGROUP)
bpf_trampoline_unlink_cgroup_shim(old_prog);
@@ -1022,10 +1025,10 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
__u32 __user *prog_attach_flags = u64_to_user_ptr(attr->query.prog_attach_flags);
bool effective_query = attr->query.query_flags & BPF_F_QUERY_EFFECTIVE;
__u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
+ struct bpf_prog_array *effective[MAX_CGROUP_BPF_ATTACH_TYPE];
enum bpf_attach_type type = attr->query.attach_type;
enum cgroup_bpf_attach_type from_atype, to_atype;
enum cgroup_bpf_attach_type atype;
- struct bpf_prog_array *effective;
int cnt, ret = 0, i;
int total_cnt = 0;
u32 flags;
@@ -1046,14 +1049,15 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
if (from_atype < 0)
return -EINVAL;
to_atype = from_atype;
- flags = cgrp->bpf.flags[from_atype];
+ flags = READ_ONCE(cgrp->bpf.flags[from_atype]);
}
for (atype = from_atype; atype <= to_atype; atype++) {
if (effective_query) {
- effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
- lockdep_is_held(&cgroup_mutex));
- total_cnt += bpf_prog_array_length(effective);
+ effective[atype] = rcu_dereference_protected(
+ cgrp->bpf.effective[atype],
+ lockdep_is_held(&cgroup_mutex));
+ total_cnt += bpf_prog_array_length(effective[atype]);
} else {
total_cnt += prog_list_length(&cgrp->bpf.progs[atype]);
}
@@ -1076,12 +1080,10 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
for (atype = from_atype; atype <= to_atype && total_cnt; atype++) {
if (effective_query) {
- effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
- lockdep_is_held(&cgroup_mutex));
- cnt = min_t(int, bpf_prog_array_length(effective), total_cnt);
- ret = bpf_prog_array_copy_to_user(effective, prog_ids, cnt);
+ cnt = min_t(int, bpf_prog_array_length(effective[atype]), total_cnt);
+ ret = bpf_prog_array_copy_to_user(effective[atype], prog_ids, cnt);
} else {
- struct hlist_head *progs;
+ struct hlist_head __rcu *progs;
struct bpf_prog_list *pl;
struct bpf_prog *prog;
u32 id;
@@ -1089,7 +1091,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
progs = &cgrp->bpf.progs[atype];
cnt = min_t(int, prog_list_length(progs), total_cnt);
i = 0;
- hlist_for_each_entry(pl, progs, node) {
+ hlist_for_each_entry_rcu(pl, progs, node) {
prog = prog_list_prog(pl);
id = prog->aux->id;
if (copy_to_user(prog_ids + i, &id, sizeof(id)))
@@ -1099,7 +1101,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
}
if (prog_attach_flags) {
- flags = cgrp->bpf.flags[atype];
+ flags = READ_ONCE(cgrp->bpf.flags[atype]);
for (i = 0; i < cnt; i++)
if (copy_to_user(prog_attach_flags + i,
@@ -1112,6 +1114,8 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
prog_ids += cnt;
total_cnt -= cnt;
}
+ if (total_cnt != 0)
+ return -EAGAIN; /* raced with the detach */
return ret;
}
@@ -1120,9 +1124,9 @@ static int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
{
int ret;
- mutex_lock(&cgroup_mutex);
+ rcu_read_lock();
ret = __cgroup_bpf_query(cgrp, attr, uattr);
- mutex_unlock(&cgroup_mutex);
+ rcu_read_unlock();
return ret;
}
--
2.40.1.521.gf1e218fcd8-goog
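For reference, the generic RCU hlist pattern this second patch builds on
looks roughly like the sketch below: writers keep serializing on an
update-side lock (cgroup_mutex here) and switch to the _rcu list helpers plus
kfree_rcu(), while readers only hold rcu_read_lock(). This is an illustrative
kernel-style sketch with made-up names, not code from the patch.

#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct item {
        struct hlist_node node;
        struct rcu_head rcu;
};

/* writer side, called with the update-side lock held */
static void item_del(struct item *it)
{
        hlist_del_rcu(&it->node);       /* concurrent readers may still see it */
        kfree_rcu(it, rcu);             /* freed only after a grace period */
}

/* lockless reader side */
static unsigned int item_count(struct hlist_head *head)
{
        struct item *it;
        unsigned int cnt = 0;

        rcu_read_lock();
        hlist_for_each_entry_rcu(it, head, node)
                cnt++;
        rcu_read_unlock();
        return cnt;
}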
* Re: [PATCH bpf-next] RFC: bpf: query effective progs without cgroup_mutex
2023-05-11 15:57 Stanislav Fomichev
@ 2023-05-11 15:59 ` Stanislav Fomichev
From: Stanislav Fomichev @ 2023-05-11 15:59 UTC (permalink / raw)
To: bpf
Cc: ast, daniel, andrii, martin.lau, song, yhs, john.fastabend,
kpsingh, haoluo, jolsa
On Thu, May 11, 2023 at 8:57 AM Stanislav Fomichev <sdf@google.com> wrote:
>
> We're observing some stalls on the heavily loaded machines
> in the cgroup_bpf_prog_query path. This is likely due to
> being blocked on cgroup_mutex.
>
> IIUC, the cgroup_mutex is there mostly to protect the non-effective
> fields (cgrp->bpf.progs) which might be changed by the update path.
> For the BPF_F_QUERY_EFFECTIVE case, all we need is to rcu_dereference
> a bunch of pointers (and keep them around for consistency), so
> let's do it.
>
> Sending out as an RFC because it looks a bit ugly. It would be
> nice to handle the non-effective case locklessly as well, but that
> might require a larger rework.
Oops, please ignore this one. Somehow it got into my unrelated sockopt series...
> Signed-off-by: Stanislav Fomichev <sdf@google.com>
> ---
> include/linux/bpf-cgroup-defs.h | 2 +-
> include/linux/bpf-cgroup.h | 1 +
> kernel/bpf/cgroup.c | 80 +++++++++++++++++----------------
> 3 files changed, 44 insertions(+), 39 deletions(-)
>
> diff --git a/include/linux/bpf-cgroup-defs.h b/include/linux/bpf-cgroup-defs.h
> index 7b121bd780eb..df0b8faa1a17 100644
> --- a/include/linux/bpf-cgroup-defs.h
> +++ b/include/linux/bpf-cgroup-defs.h
> @@ -56,7 +56,7 @@ struct cgroup_bpf {
> * have either zero or one element
> * when BPF_F_ALLOW_MULTI the list can have up to BPF_CGROUP_MAX_PROGS
> */
> - struct hlist_head progs[MAX_CGROUP_BPF_ATTACH_TYPE];
> + struct hlist_head __rcu progs[MAX_CGROUP_BPF_ATTACH_TYPE];
> u8 flags[MAX_CGROUP_BPF_ATTACH_TYPE];
>
> /* list of cgroup shared storages */
> diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
> index 57e9e109257e..555e9cbb3a05 100644
> --- a/include/linux/bpf-cgroup.h
> +++ b/include/linux/bpf-cgroup.h
> @@ -106,6 +106,7 @@ struct bpf_prog_list {
> struct bpf_prog *prog;
> struct bpf_cgroup_link *link;
> struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE];
> + struct rcu_head rcu;
> };
>
> int cgroup_bpf_inherit(struct cgroup *cgrp);
> diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
> index a06e118a9be5..92a1b33dcc06 100644
> --- a/kernel/bpf/cgroup.c
> +++ b/kernel/bpf/cgroup.c
> @@ -285,12 +285,15 @@ static void cgroup_bpf_release(struct work_struct *work)
> mutex_lock(&cgroup_mutex);
>
> for (atype = 0; atype < ARRAY_SIZE(cgrp->bpf.progs); atype++) {
> - struct hlist_head *progs = &cgrp->bpf.progs[atype];
> + struct hlist_head *progs;
> struct bpf_prog_list *pl;
> struct hlist_node *pltmp;
>
> + progs = rcu_dereference_protected(&cgrp->bpf.progs[atype],
> + lockdep_is_held(&cgroup_mutex));
> +
> hlist_for_each_entry_safe(pl, pltmp, progs, node) {
> - hlist_del(&pl->node);
> + hlist_del_rcu(&pl->node);
> if (pl->prog) {
> if (pl->prog->expected_attach_type == BPF_LSM_CGROUP)
> bpf_trampoline_unlink_cgroup_shim(pl->prog);
> @@ -301,7 +304,7 @@ static void cgroup_bpf_release(struct work_struct *work)
> bpf_trampoline_unlink_cgroup_shim(pl->link->link.prog);
> bpf_cgroup_link_auto_detach(pl->link);
> }
> - kfree(pl);
> + kfree_rcu(pl, rcu);
> static_branch_dec(&cgroup_bpf_enabled_key[atype]);
> }
> old_array = rcu_dereference_protected(
> @@ -352,12 +355,12 @@ static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl)
> /* count number of elements in the list.
> * it's slow but the list cannot be long
> */
> -static u32 prog_list_length(struct hlist_head *head)
> +static u32 prog_list_length(struct hlist_head __rcu *head)
> {
> struct bpf_prog_list *pl;
> u32 cnt = 0;
>
> - hlist_for_each_entry(pl, head, node) {
> + hlist_for_each_entry_rcu(pl, head, node) {
> if (!prog_list_prog(pl))
> continue;
> cnt++;
> @@ -553,7 +556,7 @@ static int update_effective_progs(struct cgroup *cgrp,
>
> #define BPF_CGROUP_MAX_PROGS 64
>
> -static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs,
> +static struct bpf_prog_list *find_attach_entry(struct hlist_head __rcu *progs,
> struct bpf_prog *prog,
> struct bpf_cgroup_link *link,
> struct bpf_prog *replace_prog,
> @@ -565,10 +568,10 @@ static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs,
> if (!allow_multi) {
> if (hlist_empty(progs))
> return NULL;
> - return hlist_entry(progs->first, typeof(*pl), node);
> + return hlist_entry(rcu_dereference_raw(progs)->first, typeof(*pl), node);
> }
>
> - hlist_for_each_entry(pl, progs, node) {
> + hlist_for_each_entry_rcu(pl, progs, node) {
> if (prog && pl->prog == prog && prog != replace_prog)
> /* disallow attaching the same prog twice */
> return ERR_PTR(-EINVAL);
> @@ -579,7 +582,7 @@ static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs,
>
> /* direct prog multi-attach w/ replacement case */
> if (replace_prog) {
> - hlist_for_each_entry(pl, progs, node) {
> + hlist_for_each_entry_rcu(pl, progs, node) {
> if (pl->prog == replace_prog)
> /* a match found */
> return pl;
> @@ -615,8 +618,8 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
> struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
> struct bpf_prog *new_prog = prog ? : link->link.prog;
> enum cgroup_bpf_attach_type atype;
> + struct hlist_head __rcu *progs;
> struct bpf_prog_list *pl;
> - struct hlist_head *progs;
> int err;
>
> if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) ||
> @@ -658,6 +661,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
> prog ? : link->link.prog, cgrp))
> return -ENOMEM;
>
> + WRITE_ONCE(cgrp->bpf.flags[atype], saved_flags);
> if (pl) {
> old_prog = pl->prog;
> } else {
> @@ -669,12 +673,12 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
> return -ENOMEM;
> }
> if (hlist_empty(progs))
> - hlist_add_head(&pl->node, progs);
> + hlist_add_head_rcu(&pl->node, progs);
> else
> - hlist_for_each(last, progs) {
> + __hlist_for_each_rcu(last, progs) {
> if (last->next)
> continue;
> - hlist_add_behind(&pl->node, last);
> + hlist_add_behind_rcu(&pl->node, last);
> break;
> }
> }
> @@ -682,7 +686,6 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
> pl->prog = prog;
> pl->link = link;
> bpf_cgroup_storages_assign(pl->storage, storage);
> - cgrp->bpf.flags[atype] = saved_flags;
>
> if (type == BPF_LSM_CGROUP) {
> err = bpf_trampoline_link_cgroup_shim(new_prog, atype);
> @@ -796,7 +799,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp,
> enum cgroup_bpf_attach_type atype;
> struct bpf_prog *old_prog;
> struct bpf_prog_list *pl;
> - struct hlist_head *progs;
> + struct hlist_head __rcu *progs;
> bool found = false;
>
> atype = bpf_cgroup_atype_find(link->type, new_prog->aux->attach_btf_id);
> @@ -808,7 +811,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp,
> if (link->link.prog->type != new_prog->type)
> return -EINVAL;
>
> - hlist_for_each_entry(pl, progs, node) {
> + hlist_for_each_entry_rcu(pl, progs, node) {
> if (pl->link == link) {
> found = true;
> break;
> @@ -847,7 +850,7 @@ static int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *new_prog,
> return ret;
> }
>
> -static struct bpf_prog_list *find_detach_entry(struct hlist_head *progs,
> +static struct bpf_prog_list *find_detach_entry(struct hlist_head __rcu *progs,
> struct bpf_prog *prog,
> struct bpf_cgroup_link *link,
> bool allow_multi)
> @@ -862,7 +865,7 @@ static struct bpf_prog_list *find_detach_entry(struct hlist_head *progs,
> /* to maintain backward compatibility NONE and OVERRIDE cgroups
> * allow detaching with invalid FD (prog==NULL) in legacy mode
> */
> - return hlist_entry(progs->first, typeof(*pl), node);
> + return hlist_entry(rcu_dereference_raw(progs)->first, typeof(*pl), node);
> }
>
> if (!prog && !link)
> @@ -872,7 +875,7 @@ static struct bpf_prog_list *find_detach_entry(struct hlist_head *progs,
> return ERR_PTR(-EINVAL);
>
> /* find the prog or link and detach it */
> - hlist_for_each_entry(pl, progs, node) {
> + hlist_for_each_entry_rcu(pl, progs, node) {
> if (pl->prog == prog && pl->link == link)
> return pl;
> }
> @@ -894,9 +897,9 @@ static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog,
> enum cgroup_bpf_attach_type atype)
> {
> struct cgroup_subsys_state *css;
> + struct hlist_head __rcu *head;
> struct bpf_prog_array *progs;
> struct bpf_prog_list *pl;
> - struct hlist_head *head;
> struct cgroup *cg;
> int pos;
>
> @@ -913,7 +916,7 @@ static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog,
> continue;
>
> head = &cg->bpf.progs[atype];
> - hlist_for_each_entry(pl, head, node) {
> + hlist_for_each_entry_rcu(pl, head, node) {
> if (!prog_list_prog(pl))
> continue;
> if (pl->prog == prog && pl->link == link)
> @@ -950,9 +953,9 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
> struct bpf_cgroup_link *link, enum bpf_attach_type type)
> {
> enum cgroup_bpf_attach_type atype;
> + struct hlist_head __rcu *progs;
> struct bpf_prog *old_prog;
> struct bpf_prog_list *pl;
> - struct hlist_head *progs;
> u32 attach_btf_id = 0;
> u32 flags;
>
> @@ -989,12 +992,12 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
> }
>
> /* now can actually delete it from this cgroup list */
> - hlist_del(&pl->node);
> + hlist_del_rcu(&pl->node);
>
> kfree(pl);
> if (hlist_empty(progs))
> /* last program was detached, reset flags to zero */
> - cgrp->bpf.flags[atype] = 0;
> + WRITE_ONCE(cgrp->bpf.flags[atype], 0);
> if (old_prog) {
> if (type == BPF_LSM_CGROUP)
> bpf_trampoline_unlink_cgroup_shim(old_prog);
> @@ -1022,10 +1025,10 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
> __u32 __user *prog_attach_flags = u64_to_user_ptr(attr->query.prog_attach_flags);
> bool effective_query = attr->query.query_flags & BPF_F_QUERY_EFFECTIVE;
> __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
> + struct bpf_prog_array *effective[MAX_CGROUP_BPF_ATTACH_TYPE];
> enum bpf_attach_type type = attr->query.attach_type;
> enum cgroup_bpf_attach_type from_atype, to_atype;
> enum cgroup_bpf_attach_type atype;
> - struct bpf_prog_array *effective;
> int cnt, ret = 0, i;
> int total_cnt = 0;
> u32 flags;
> @@ -1046,14 +1049,15 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
> if (from_atype < 0)
> return -EINVAL;
> to_atype = from_atype;
> - flags = cgrp->bpf.flags[from_atype];
> + flags = READ_ONCE(cgrp->bpf.flags[from_atype]);
> }
>
> for (atype = from_atype; atype <= to_atype; atype++) {
> if (effective_query) {
> - effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
> - lockdep_is_held(&cgroup_mutex));
> - total_cnt += bpf_prog_array_length(effective);
> + effective[atype] = rcu_dereference_protected(
> + cgrp->bpf.effective[atype],
> + lockdep_is_held(&cgroup_mutex));
> + total_cnt += bpf_prog_array_length(effective[atype]);
> } else {
> total_cnt += prog_list_length(&cgrp->bpf.progs[atype]);
> }
> @@ -1076,12 +1080,10 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
>
> for (atype = from_atype; atype <= to_atype && total_cnt; atype++) {
> if (effective_query) {
> - effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
> - lockdep_is_held(&cgroup_mutex));
> - cnt = min_t(int, bpf_prog_array_length(effective), total_cnt);
> - ret = bpf_prog_array_copy_to_user(effective, prog_ids, cnt);
> + cnt = min_t(int, bpf_prog_array_length(effective[atype]), total_cnt);
> + ret = bpf_prog_array_copy_to_user(effective[atype], prog_ids, cnt);
> } else {
> - struct hlist_head *progs;
> + struct hlist_head __rcu *progs;
> struct bpf_prog_list *pl;
> struct bpf_prog *prog;
> u32 id;
> @@ -1089,7 +1091,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
> progs = &cgrp->bpf.progs[atype];
> cnt = min_t(int, prog_list_length(progs), total_cnt);
> i = 0;
> - hlist_for_each_entry(pl, progs, node) {
> + hlist_for_each_entry_rcu(pl, progs, node) {
> prog = prog_list_prog(pl);
> id = prog->aux->id;
> if (copy_to_user(prog_ids + i, &id, sizeof(id)))
> @@ -1099,7 +1101,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
> }
>
> if (prog_attach_flags) {
> - flags = cgrp->bpf.flags[atype];
> + flags = READ_ONCE(cgrp->bpf.flags[atype]);
>
> for (i = 0; i < cnt; i++)
> if (copy_to_user(prog_attach_flags + i,
> @@ -1112,6 +1114,8 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
> prog_ids += cnt;
> total_cnt -= cnt;
> }
> + if (total_cnt != 0)
> + return -EAGAIN; /* raced with the detach */
> return ret;
> }
>
> @@ -1120,9 +1124,9 @@ static int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
> {
> int ret;
>
> - mutex_lock(&cgroup_mutex);
> + rcu_read_lock();
> ret = __cgroup_bpf_query(cgrp, attr, uattr);
> - mutex_unlock(&cgroup_mutex);
> + rcu_read_unlock();
> return ret;
> }
>
> --
> 2.40.1.521.gf1e218fcd8-goog
>