Re: [PATCH bpf-next v3] bpf: Do not allocate percpu memory at init stage

BPF List
 help / color / mirror / Atom feed

From: Yonghong Song <yonghong.song@linux.dev>
To: Hou Tao <houtao@huaweicloud.com>, bpf@vger.kernel.org
Cc: Alexei Starovoitov <ast@kernel.org>,
	Andrii Nakryiko <andrii@kernel.org>,
	Daniel Borkmann <daniel@iogearbox.net>,
	kernel-team@fb.com, Martin KaFai Lau <martin.lau@kernel.org>,
	"Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Subject: Re: [PATCH bpf-next v3] bpf: Do not allocate percpu memory at init stage
Date: Mon, 13 Nov 2023 22:23:32 -0500	[thread overview]
Message-ID: <05207c41-2e2f-49d6-a8bf-a2820701242f@linux.dev> (raw)
In-Reply-To: <50a70429-169f-0d44-86da-d5fe6a9d59e6@huaweicloud.com>


On 11/13/23 4:42 AM, Hou Tao wrote:
> Hi,
>
> On 11/11/2023 9:39 AM, Yonghong Song wrote:
>> Kirill Shutemov reported significant percpu memory consumption increase after
>> booting in 288-cpu VM ([1]) due to commit 41a5db8d8161 ("bpf: Add support for
>> non-fix-size percpu mem allocation"). The percpu memory consumption is
>> increased from 111MB to 969MB. The number is from /proc/meminfo.
>>
>> I tried to reproduce the issue with my local VM which at most supports upto
>> 255 cpus. With 252 cpus, without the above commit, the percpu memory
>> consumption immediately after boot is 57MB while with the above commit the
>> percpu memory consumption is 231MB.
>>
>> This is not good since so far percpu memory from bpf memory allocator is not
>> widely used yet. Let us change pre-allocation in init stage to on-demand
>> allocation when verifier detects there is a need of percpu memory for bpf
>> program. With this change, percpu memory consumption after boot can be reduced
>> signicantly.
>>
>>    [1] https://lore.kernel.org/lkml/20231109154934.4saimljtqx625l3v@box.shutemov.name/
>>
>> Fixes: 41a5db8d8161 ("bpf: Add support for non-fix-size percpu mem allocation")
>> Reported-and-tested-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
>> Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
>> ---
>>   include/linux/bpf.h   |  2 +-
>>   kernel/bpf/core.c     |  8 +++-----
>>   kernel/bpf/verifier.c | 20 ++++++++++++++++++--
>>   3 files changed, 22 insertions(+), 8 deletions(-)
>>
>> Changelog:
>>    v2 -> v3:
>>      - Use dedicated mutex lock (bpf_percpu_ma_lock)
>>    v1 -> v2:
>>      - Add proper Reported-and-tested-by tag.
>>      - Do a check of !bpf_global_percpu_ma_set before acquiring verifier_lock.
>>
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index 35bff17396c0..6762dac3ef76 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -56,7 +56,7 @@ extern struct idr btf_idr;
>>   extern spinlock_t btf_idr_lock;
>>   extern struct kobject *btf_kobj;
>>   extern struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma;
>> -extern bool bpf_global_ma_set, bpf_global_percpu_ma_set;
>> +extern bool bpf_global_ma_set;
>>   
>>   typedef u64 (*bpf_callback_t)(u64, u64, u64, u64, u64);
>>   typedef int (*bpf_iter_init_seq_priv_t)(void *private_data,
>> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
>> index 08626b519ce2..cd3afe57ece3 100644
>> --- a/kernel/bpf/core.c
>> +++ b/kernel/bpf/core.c
>> @@ -64,8 +64,8 @@
>>   #define OFF	insn->off
>>   #define IMM	insn->imm
>>   
>> -struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma;
>> -bool bpf_global_ma_set, bpf_global_percpu_ma_set;
>> +struct bpf_mem_alloc bpf_global_ma;
>> +bool bpf_global_ma_set;
>>   
>>   /* No hurry in this branch
>>    *
>> @@ -2934,9 +2934,7 @@ static int __init bpf_global_ma_init(void)
>>   
>>   	ret = bpf_mem_alloc_init(&bpf_global_ma, 0, false);
>>   	bpf_global_ma_set = !ret;
>> -	ret = bpf_mem_alloc_init(&bpf_global_percpu_ma, 0, true);
>> -	bpf_global_percpu_ma_set = !ret;
>> -	return !bpf_global_ma_set || !bpf_global_percpu_ma_set;
>> +	return ret;
>>   }
>>   late_initcall(bpf_global_ma_init);
>>   #endif
>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>> index a2267d5ed14e..6da370a047fe 100644
>> --- a/kernel/bpf/verifier.c
>> +++ b/kernel/bpf/verifier.c
>> @@ -26,6 +26,7 @@
>>   #include <linux/poison.h>
>>   #include <linux/module.h>
>>   #include <linux/cpumask.h>
>> +#include <linux/bpf_mem_alloc.h>
>>   #include <net/xdp.h>
>>   
>>   #include "disasm.h"
>> @@ -41,6 +42,9 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
>>   #undef BPF_LINK_TYPE
>>   };
>>   
>> +struct bpf_mem_alloc bpf_global_percpu_ma;
>> +static bool bpf_global_percpu_ma_set;
>> +
>>   /* bpf_check() is a static code analyzer that walks eBPF program
>>    * instruction by instruction and updates register/stack state.
>>    * All paths of conditional branches are analyzed until 'bpf_exit' insn.
>> @@ -336,6 +340,7 @@ struct bpf_kfunc_call_arg_meta {
>>   struct btf *btf_vmlinux;
>>   
>>   static DEFINE_MUTEX(bpf_verifier_lock);
>> +static DEFINE_MUTEX(bpf_percpu_ma_lock);
>>   
>>   static const struct bpf_line_info *
>>   find_linfo(const struct bpf_verifier_env *env, u32 insn_off)
>> @@ -12091,8 +12096,19 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
>>   				if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set)
>>   					return -ENOMEM;
>>   
>> -				if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl] && !bpf_global_percpu_ma_set)
>> -					return -ENOMEM;
>> +				if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
>> +					if (!bpf_global_percpu_ma_set) {
>> +						mutex_lock(&bpf_percpu_ma_lock);
>> +						if (!bpf_global_percpu_ma_set) {
>> +							err = bpf_mem_alloc_init(&bpf_global_percpu_ma, 0, true);
>> +							if (!err)
>> +								bpf_global_percpu_ma_set = true;
>> +						}
> A dumb question here: do we need some memory barrier to guarantee the
> memory order between bpf_global_percpu_ma_set and bpf_global_percpu_ma ?

We should be fine. There is a control dependence on '!err' for
'bpf_global_percpu_ma_set = true'.

>> +						mutex_unlock(&bpf_percpu_ma_lock);
>> +						if (err)
>> +							return err;
>> +					}
>> +				}
>>   
>>   				if (((u64)(u32)meta.arg_constant.value) != meta.arg_constant.value) {
>>   					verbose(env, "local type ID argument must be in range [0, U32_MAX]\n");

next prev parent reply	other threads:[~2023-11-14  3:23 UTC|newest]

Thread overview: 6+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-11-11  1:39 [PATCH bpf-next v3] bpf: Do not allocate percpu memory at init stage Yonghong Song
2023-11-13 12:42 ` Hou Tao
2023-11-14  3:23   ` Yonghong Song [this message]
2023-11-14  4:03     ` Hou Tao
2023-11-14  4:12 ` Hou Tao
2023-11-15 16:00 ` patchwork-bot+netdevbpf

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=05207c41-2e2f-49d6-a8bf-a2820701242f@linux.dev \
    --to=yonghong.song@linux.dev \
    --cc=andrii@kernel.org \
    --cc=ast@kernel.org \
    --cc=bpf@vger.kernel.org \
    --cc=daniel@iogearbox.net \
    --cc=houtao@huaweicloud.com \
    --cc=kernel-team@fb.com \
    --cc=kirill.shutemov@linux.intel.com \
    --cc=martin.lau@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox