Re: [PATCH bpf v3 4/6] bpf: Optimize the free of inner map

BPF List
 help / color / mirror / Atom feed

From: Yonghong Song <yonghong.song@linux.dev>
To: Hou Tao <houtao@huaweicloud.com>, bpf@vger.kernel.org
Cc: Martin KaFai Lau <martin.lau@linux.dev>,
	Alexei Starovoitov <alexei.starovoitov@gmail.com>,
	Andrii Nakryiko <andrii@kernel.org>, Song Liu <song@kernel.org>,
	Hao Luo <haoluo@google.com>,
	Daniel Borkmann <daniel@iogearbox.net>,
	KP Singh <kpsingh@kernel.org>,
	Stanislav Fomichev <sdf@google.com>, Jiri Olsa <jolsa@kernel.org>,
	John Fastabend <john.fastabend@gmail.com>,
	houtao1@huawei.com
Subject: Re: [PATCH bpf v3 4/6] bpf: Optimize the free of inner map
Date: Sat, 25 Nov 2023 23:13:23 -0800	[thread overview]
Message-ID: <03909d48-646d-4d71-b7bd-0b7510b0bd4f@linux.dev> (raw)
In-Reply-To: <20231124113033.503338-5-houtao@huaweicloud.com>


On 11/24/23 6:30 AM, Hou Tao wrote:
> From: Hou Tao <houtao1@huawei.com>
>
> When removing the inner map from the outer map, the inner map will be
> freed after one RCU grace period and one RCU tasks trace grace
> period, so it is certain that the bpf program, which may access the
> inner map, has exited before the inner map is freed.
>
> However there is unnecessary to wait for any RCU grace period, one RCU
> grace period or one RCU tasks trace grace period if the outer map is
> only accessed by userspace, sleepable program or non-sleepable program.
> So recording the sleepable attributes of the owned bpf programs when
> adding the outer map into env->used_maps, copying the recorded
> attributes to inner map atomically when removing inner map from the
> outer map and using the recorded attributes in the inner map to decide
> which, and how many, RCU grace periods are needed when freeing the
> inner map.
>
> Signed-off-by: Hou Tao <houtao1@huawei.com>
> ---
>   include/linux/bpf.h     |  8 +++++++-
>   kernel/bpf/map_in_map.c | 19 ++++++++++++++-----
>   kernel/bpf/syscall.c    | 15 +++++++++++++--
>   kernel/bpf/verifier.c   |  4 ++++
>   4 files changed, 38 insertions(+), 8 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 15a6bb951b70..c5b549f352d7 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -245,6 +245,11 @@ struct bpf_list_node_kern {
>   	void *owner;
>   } __attribute__((aligned(8)));
>   
> +enum {
> +	BPF_MAP_RCU_GP = BIT(0),
> +	BPF_MAP_RCU_TT_GP = BIT(1),
> +};
> +
>   struct bpf_map {
>   	/* The first two cachelines with read-mostly members of which some
>   	 * are also accessed in fast-path (e.g. ops, max_entries).
> @@ -296,7 +301,8 @@ struct bpf_map {
>   	} owner;
>   	bool bypass_spec_v1;
>   	bool frozen; /* write-once; write-protected by freeze_mutex */
> -	bool free_after_mult_rcu_gp;
> +	atomic_t used_in_rcu_gp;
> +	atomic_t free_by_rcu_gp;
>   	s64 __percpu *elem_count;
>   };
>   
> diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
> index cf3363065566..d044ee677107 100644
> --- a/kernel/bpf/map_in_map.c
> +++ b/kernel/bpf/map_in_map.c
> @@ -131,12 +131,21 @@ void bpf_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool deferred)
>   {
>   	struct bpf_map *inner_map = ptr;
>   
> -	/* The inner map may still be used by both non-sleepable and sleepable
> -	 * bpf program, so free it after one RCU grace period and one tasks
> -	 * trace RCU grace period.
> +	/* Defer the freeing of inner map according to the attribute of bpf
> +	 * program which owns the outer map, so unnecessary multiple RCU GP
> +	 * waitings can be avoided.
>   	 */
> -	if (deferred)
> -		WRITE_ONCE(inner_map->free_after_mult_rcu_gp, true);
> +	if (deferred) {
> +		/* used_in_rcu_gp may be updated concurrently by new bpf
> +		 * program, so add smp_mb() to guarantee the order between
> +		 * used_in_rcu_gp and lookup/deletion operation of inner map.
> +		 * If a new bpf program finds the inner map before it is
> +		 * removed from outer map, reading used_in_rcu_gp below will
> +		 * return the newly-set bit set by the new bpf program.
> +		 */
> +		smp_mb();

smp_mb__before_atomic()?

> +		atomic_or(atomic_read(&map->used_in_rcu_gp), &inner_map->free_by_rcu_gp);
> +	}
>   	bpf_map_put(inner_map);
>   }
>   
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 88882cb58121..014a8cd55a41 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -734,7 +734,10 @@ static void bpf_map_free_rcu_gp(struct rcu_head *rcu)
>   
>   static void bpf_map_free_mult_rcu_gp(struct rcu_head *rcu)
>   {
> -	if (rcu_trace_implies_rcu_gp())
> +	struct bpf_map *map = container_of(rcu, struct bpf_map, rcu);
> +
> +	if (!(atomic_read(&map->free_by_rcu_gp) & BPF_MAP_RCU_GP) ||
> +	    rcu_trace_implies_rcu_gp())
>   		bpf_map_free_rcu_gp(rcu);
>   	else
>   		call_rcu(rcu, bpf_map_free_rcu_gp);
> @@ -746,11 +749,16 @@ static void bpf_map_free_mult_rcu_gp(struct rcu_head *rcu)
>   void bpf_map_put(struct bpf_map *map)
>   {
>   	if (atomic64_dec_and_test(&map->refcnt)) {
> +		int free_by_rcu_gp;
> +
>   		/* bpf_map_free_id() must be called first */
>   		bpf_map_free_id(map);
>   		btf_put(map->btf);
>   
> -		if (READ_ONCE(map->free_after_mult_rcu_gp))
> +		free_by_rcu_gp = atomic_read(&map->free_by_rcu_gp);
> +		if (free_by_rcu_gp == BPF_MAP_RCU_GP)
> +			call_rcu(&map->rcu, bpf_map_free_rcu_gp);
> +		else if (free_by_rcu_gp)
>   			call_rcu_tasks_trace(&map->rcu, bpf_map_free_mult_rcu_gp);
>   		else
>   			bpf_map_free_in_work(map);
> @@ -5343,6 +5351,9 @@ static int bpf_prog_bind_map(union bpf_attr *attr)
>   		goto out_unlock;
>   	}
>   
> +	/* No need to update used_in_rcu_gp, because the bpf program doesn't
> +	 * access the map.
> +	 */
>   	memcpy(used_maps_new, used_maps_old,
>   	       sizeof(used_maps_old[0]) * prog->aux->used_map_cnt);
>   	used_maps_new[prog->aux->used_map_cnt] = map;
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 6da370a047fe..3b86c02077f1 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -18051,6 +18051,10 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
>   				return -E2BIG;
>   			}
>   
> +			atomic_or(env->prog->aux->sleepable ? BPF_MAP_RCU_TT_GP : BPF_MAP_RCU_GP,
> +				  &map->used_in_rcu_gp);
> +			/* Pairs with smp_mb() in bpf_map_fd_put_ptr() */
> +			smp_mb__before_atomic();

smp_mb__after_atomic()?

Just curious, are two smp_mb*() memory barriers in this patch truely necessary or just
want to be cautious?

>   			/* hold the map. If the program is rejected by verifier,
>   			 * the map will be released by release_maps() or it
>   			 * will be used by the valid program until it's unloaded

next prev parent reply	other threads:[~2023-11-26  7:13 UTC|newest]

Thread overview: 14+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-11-24 11:30 [PATCH bpf v3 0/6] bpf: Fix the release of inner map Hou Tao
2023-11-24 11:30 ` [PATCH bpf v3 1/6] bpf: Check rcu_read_lock_trace_held() before calling bpf map helpers Hou Tao
2023-11-24 11:30 ` [PATCH bpf v3 2/6] bpf: Add map and need_defer parameters to .map_fd_put_ptr() Hou Tao
2023-11-24 11:30 ` [PATCH bpf v3 3/6] bpf: Defer the free of inner map when necessary Hou Tao
2023-11-24 11:30 ` [PATCH bpf v3 4/6] bpf: Optimize the free of inner map Hou Tao
2023-11-26  7:13   ` Yonghong Song [this message]
2023-11-27  1:24     ` Hou Tao
2023-11-27  1:49   ` Alexei Starovoitov
2023-11-27  2:47     ` Hou Tao
2023-11-27  3:07       ` Hou Tao
2023-11-27  3:20       ` Alexei Starovoitov
2023-11-27  3:54         ` Hou Tao
2023-11-24 11:30 ` [PATCH bpf v3 5/6] selftests/bpf: Add test cases for " Hou Tao
2023-11-24 11:30 ` [PATCH bpf v3 6/6] selftests/bpf: Test outer map update operations in syscall program Hou Tao

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=03909d48-646d-4d71-b7bd-0b7510b0bd4f@linux.dev \
    --to=yonghong.song@linux.dev \
    --cc=alexei.starovoitov@gmail.com \
    --cc=andrii@kernel.org \
    --cc=bpf@vger.kernel.org \
    --cc=daniel@iogearbox.net \
    --cc=haoluo@google.com \
    --cc=houtao1@huawei.com \
    --cc=houtao@huaweicloud.com \
    --cc=john.fastabend@gmail.com \
    --cc=jolsa@kernel.org \
    --cc=kpsingh@kernel.org \
    --cc=martin.lau@linux.dev \
    --cc=sdf@google.com \
    --cc=song@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox