From: Leon Hwang <leon.hwang@linux.dev>
To: Andrii Nakryiko <andrii.nakryiko@gmail.com>
Cc: bpf@vger.kernel.org, ast@kernel.org, daniel@iogearbox.net,
andrii@kernel.org, yonghong.song@linux.dev, song@kernel.org,
eddyz87@gmail.com, qmo@kernel.org, dxu@dxuuu.xyz,
kernel-patches-bot@fb.com
Subject: Re: [PATCH bpf-next 2/4] bpf, libbpf: Support global percpu data
Date: Fri, 7 Feb 2025 17:48:32 +0800 [thread overview]
Message-ID: <d4b3d0eb-3a04-44f6-b714-01f3f4396052@linux.dev> (raw)
In-Reply-To: <CAEf4Bza87kazAf2HfMZdqLCw4RfXRF+zMmfQs4cO_PYE2zKjOQ@mail.gmail.com>
On 6/2/25 08:09, Andrii Nakryiko wrote:
> On Mon, Jan 27, 2025 at 8:22 AM Leon Hwang <leon.hwang@linux.dev> wrote:
>>
>> This patch introduces support for global percpu data in libbpf. A new
>> section named ".percpu" is added, similar to the existing ".data" section.
>> Internal maps are created for ".percpu" sections, which are then
>> initialized and populated accordingly.
>>
>> The changes include:
>>
>> * Introduction of the ".percpu" section in libbpf.
>> * Creation of internal maps for percpu data.
>> * Initialization and population of these maps.
>>
>> This enhancement allows BPF programs to efficiently manage and access
>> percpu global data, improving performance for use cases that require
>> percpu buffer.
>>
>> Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
>> ---
>> tools/lib/bpf/libbpf.c | 172 ++++++++++++++++++++++++++++++++---------
>> 1 file changed, 135 insertions(+), 37 deletions(-)
>>
>> diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
>> index 194809da51725..6da6004c5c84d 100644
>> --- a/tools/lib/bpf/libbpf.c
>> +++ b/tools/lib/bpf/libbpf.c
>> @@ -516,6 +516,7 @@ struct bpf_struct_ops {
>> };
>>
>> #define DATA_SEC ".data"
>> +#define PERCPU_DATA_SEC ".percpu"
>> #define BSS_SEC ".bss"
>> #define RODATA_SEC ".rodata"
>> #define KCONFIG_SEC ".kconfig"
>> @@ -530,6 +531,7 @@ enum libbpf_map_type {
>> LIBBPF_MAP_BSS,
>> LIBBPF_MAP_RODATA,
>> LIBBPF_MAP_KCONFIG,
>> + LIBBPF_MAP_PERCPU_DATA,
>
> nit: let's keep it shorter: LIBBPF_MAP_PERCPU
>
Ack.
>> };
>>
>> struct bpf_map_def {
>> @@ -562,6 +564,7 @@ struct bpf_map {
>> __u32 btf_value_type_id;
>> __u32 btf_vmlinux_value_type_id;
>> enum libbpf_map_type libbpf_type;
>> + void *data;
>> void *mmaped;
>> struct bpf_struct_ops *st_ops;
>> struct bpf_map *inner_map;
>> @@ -640,6 +643,7 @@ enum sec_type {
>> SEC_DATA,
>> SEC_RODATA,
>> SEC_ST_OPS,
>> + SEC_PERCPU_DATA,
>
> ditto, just SEC_PERCPU?
>
Ack.
>> };
>>
>> struct elf_sec_desc {
>> @@ -1923,13 +1927,24 @@ static bool map_is_mmapable(struct bpf_object *obj, struct bpf_map *map)
>> return false;
>> }
>>
>> +static void map_copy_data(struct bpf_map *map, const void *data)
>> +{
>> + bool is_percpu_data = map->libbpf_type == LIBBPF_MAP_PERCPU_DATA;
>> + size_t data_sz = map->def.value_size;
>> +
>> + if (data)
>> + memcpy(is_percpu_data ? map->data : map->mmaped, data, data_sz);
>> +}
>> +
>> static int
>> bpf_object__init_internal_map(struct bpf_object *obj, enum libbpf_map_type type,
>> const char *real_name, int sec_idx, void *data, size_t data_sz)
>> {
>> + bool is_percpu_data = type == LIBBPF_MAP_PERCPU_DATA;
>> struct bpf_map_def *def;
>> struct bpf_map *map;
>> size_t mmap_sz;
>> + size_t elem_sz;
>
> nit: just:
>
> size_t mmap_sz, elem_sz;
>
>> int err;
>>
>> map = bpf_object__add_map(obj);
>> @@ -1948,7 +1963,8 @@ bpf_object__init_internal_map(struct bpf_object *obj, enum libbpf_map_type type,
>> }
>>
>> def = &map->def;
>> - def->type = BPF_MAP_TYPE_ARRAY;
>> + def->type = is_percpu_data ? BPF_MAP_TYPE_PERCPU_ARRAY
>> + : BPF_MAP_TYPE_ARRAY;
>
> nit: single line
>
>> def->key_size = sizeof(int);
>> def->value_size = data_sz;
>> def->max_entries = 1;
>
> [...]
>
>> + if (map_is_mmapable(obj, map))
>> + def->map_flags |= BPF_F_MMAPABLE;
>> +
>> + mmap_sz = bpf_map_mmap_sz(map);
>> + map->mmaped = mmap(NULL, mmap_sz, PROT_READ | PROT_WRITE,
>> + MAP_SHARED | MAP_ANONYMOUS, -1, 0);
>> + if (map->mmaped == MAP_FAILED) {
>> + err = -errno;
>> + map->mmaped = NULL;
>> + pr_warn("map '%s': failed to alloc content buffer: %s\n",
>> + map->name, errstr(err));
>> + goto free_name;
>> + }
>> +
>> + map_copy_data(map, data);
>
> why not memcpy() here? you know it's not percpu map, so why obscuring
> that memcpy?
>
>
>> + }
>>
>> pr_debug("map %td is \"%s\"\n", map - obj->maps, map->name);
>> return 0;
>
> [...]
>
>> @@ -5125,23 +5180,54 @@ static int
>> bpf_object__populate_internal_map(struct bpf_object *obj, struct bpf_map *map)
>> {
>> enum libbpf_map_type map_type = map->libbpf_type;
>> - int err, zero = 0;
>> + bool is_percpu_data = map_type == LIBBPF_MAP_PERCPU_DATA;
>> + int err = 0, zero = 0;
>> + void *data = NULL;
>> + int num_cpus, i;
>> + size_t data_sz;
>> + size_t elem_sz;
>> size_t mmap_sz;
>
> nit: keep those size_t variables grouped: `size_t mmap_sz, data_sz, elem_sz;`
>
Ack.
>>
>> + data_sz = map->def.value_size;
>> + if (is_percpu_data) {
>> + num_cpus = libbpf_num_possible_cpus();
>> + if (num_cpus < 0) {
>> + err = libbpf_err_errno(num_cpus);
>
> why? num_cpus *IS* error code if num_cpus < 0
>
Ack. Thanks for pointing this out.
>> + pr_warn("map '%s': failed to get num_cpus: %s\n",
>> + bpf_map__name(map), errstr(err));
>
> this is unlikely to happen, I'd drop pr_warn()
>
Ack.
>> + return err;
>> + }
>> +
>> + elem_sz = roundup(data_sz, 8);
>> + data_sz = elem_sz * num_cpus;
>> + data = malloc(data_sz);
>> + if (!data) {
>> + err = -ENOMEM;
>> + pr_warn("map '%s': failed to malloc memory: %s\n",
>> + bpf_map__name(map), errstr(err));
>
> -ENOMEM is rather self-descriptive (and generally not expected), so
> don't add pr_warn() for such cases
>
Ack.
>> + return err;
>> + }
>> +
>> + for (i = 0; i < num_cpus; i++)
>> + memcpy(data + i * elem_sz, map->data, elem_sz);
>> + } else {
>> + data = map->mmaped;
>> + }
>> +
>
> [...]
>
>> static void bpf_map__destroy(struct bpf_map *map);
>> @@ -8120,7 +8209,9 @@ static int bpf_object__sanitize_maps(struct bpf_object *obj)
>> struct bpf_map *m;
>>
>> bpf_object__for_each_map(m, obj) {
>> - if (!bpf_map__is_internal(m))
>> + if (!bpf_map__is_internal(m) ||
>> + /* percpu data map is internal and not-mmapable. */
>> + m->libbpf_type == LIBBPF_MAP_PERCPU_DATA)
>
> original logic would work anyways, no? let's not add unnecessary
> special casing here
>
The original logic works for it. I'll remove it.
>> continue;
>> if (!kernel_supports(obj, FEAT_ARRAY_MMAP))
>> m->def.map_flags &= ~BPF_F_MMAPABLE;
>> @@ -9041,6 +9132,8 @@ static void bpf_map__destroy(struct bpf_map *map)
>> if (map->mmaped && map->mmaped != map->obj->arena_data)
>> munmap(map->mmaped, bpf_map_mmap_sz(map));
>> map->mmaped = NULL;
>> + if (map->data)
>> + zfree(&map->data);
>>
>
> this whole map->mmaped and map->data duality and duplication seems not
> worth it, tbh. Maybe we should keep using map->mmaped (we probably
> could name it more generically at some point, but I don't want to
> start bike shedding now) even for malloc'ed memory? After all, we
> already have ARENA as another special case? WDYT, can your changes be
> implemented by reusing map->mmaped, taking into account a type of map?
>
It is better to reuse map->mmaped. I'll take a try.
> pw-bot: cr
>
>> if (map->st_ops) {
>> zfree(&map->st_ops->data);
>> @@ -10132,14 +10225,18 @@ int bpf_map__fd(const struct bpf_map *map)
>>
>> static bool map_uses_real_name(const struct bpf_map *map)
>> {
>> - /* Since libbpf started to support custom .data.* and .rodata.* maps,
>> - * their user-visible name differs from kernel-visible name. Users see
>> - * such map's corresponding ELF section name as a map name.
>> - * This check distinguishes .data/.rodata from .data.* and .rodata.*
>> - * maps to know which name has to be returned to the user.
>> + /* Since libbpf started to support custom .data.*, .percpu.* and
>> + * .rodata.* maps, their user-visible name differs from kernel-visible
>> + * name. Users see such map's corresponding ELF section name as a map
>> + * name. This check distinguishes .data/.percpu/.rodata from .data.*,
>> + * .percpu.* and .rodata.* maps to know which name has to be returned to
>> + * the user.
>> */
>> if (map->libbpf_type == LIBBPF_MAP_DATA && strcmp(map->real_name, DATA_SEC) != 0)
>> return true;
>> + if (map->libbpf_type == LIBBPF_MAP_PERCPU_DATA &&
>> + strcmp(map->real_name, PERCPU_DATA_SEC) != 0)
>> + return true;
>
> nit: shorten LIBBPF_MAP_PERCPU_DATA and keep single line, please
>
Ack.
>> if (map->libbpf_type == LIBBPF_MAP_RODATA && strcmp(map->real_name, RODATA_SEC) != 0)
>> return true;
>> return false;
>> @@ -10348,7 +10445,8 @@ int bpf_map__set_initial_value(struct bpf_map *map,
>> if (map->obj->loaded || map->reused)
>> return libbpf_err(-EBUSY);
>>
>> - if (!map->mmaped || map->libbpf_type == LIBBPF_MAP_KCONFIG)
>> + if ((!map->mmaped && !map->data) ||
>> + map->libbpf_type == LIBBPF_MAP_KCONFIG)
>> return libbpf_err(-EINVAL);
>>
>> if (map->def.type == BPF_MAP_TYPE_ARENA)
>> @@ -10358,7 +10456,7 @@ int bpf_map__set_initial_value(struct bpf_map *map,
>> if (size != actual_sz)
>> return libbpf_err(-EINVAL);
>>
>> - memcpy(map->mmaped, data, size);
>> + map_copy_data(map, data);
>> return 0;
>> }
>>
>> @@ -10370,7 +10468,7 @@ void *bpf_map__initial_value(const struct bpf_map *map, size_t *psize)
>> return map->st_ops->data;
>> }
>>
>> - if (!map->mmaped)
>> + if (!map->mmaped && !map->data)
>> return NULL;
>>
>> if (map->def.type == BPF_MAP_TYPE_ARENA)
>> @@ -10378,7 +10476,7 @@ void *bpf_map__initial_value(const struct bpf_map *map, size_t *psize)
>> else
>> *psize = map->def.value_size;
>>
>> - return map->mmaped;
>> + return map->libbpf_type == LIBBPF_MAP_PERCPU_DATA ? map->data : map->mmaped;
>
> Good chunk of changes like this wouldn't be necessary if we just reuse
> map->mmaped.
>
Yes. you're right. It's unnecessary to change it if reuse map->mmaped.
>
>> }
>>
>> bool bpf_map__is_internal(const struct bpf_map *map)
>> --
>> 2.47.1
>>
Thanks,
Leon
next prev parent reply other threads:[~2025-02-07 9:48 UTC|newest]
Thread overview: 20+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-01-27 16:21 [PATCH bpf-next 0/4] bpf: Introduce global percpu data Leon Hwang
2025-01-27 16:21 ` [PATCH bpf-next 1/4] " Leon Hwang
2025-02-06 0:09 ` Andrii Nakryiko
2025-02-07 9:42 ` Leon Hwang
2025-02-08 0:23 ` Alexei Starovoitov
2025-02-10 9:35 ` Leon Hwang
2025-01-27 16:21 ` [PATCH bpf-next 2/4] bpf, libbpf: Support " Leon Hwang
2025-02-06 0:09 ` Andrii Nakryiko
2025-02-07 9:48 ` Leon Hwang [this message]
2025-01-27 16:21 ` [PATCH bpf-next 3/4] bpf, bpftool: Generate skeleton for " Leon Hwang
2025-02-06 0:09 ` Andrii Nakryiko
2025-02-07 9:52 ` Leon Hwang
2025-01-27 16:21 ` [PATCH bpf-next 4/4] selftests/bpf: Add a case to test " Leon Hwang
2025-02-06 0:09 ` Andrii Nakryiko
2025-02-07 10:00 ` Leon Hwang
2025-02-07 19:48 ` Andrii Nakryiko
2025-02-10 9:52 ` Leon Hwang
2025-02-11 0:15 ` Andrii Nakryiko
2025-02-12 1:50 ` Leon Hwang
-- strict thread matches above, loose matches on Subject: below --
2025-02-13 16:06 [PATCH bpf-next 0/4] bpf: Introduce " Leon Hwang
2025-02-13 16:06 ` [PATCH bpf-next 2/4] bpf, libbpf: Support " Leon Hwang
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=d4b3d0eb-3a04-44f6-b714-01f3f4396052@linux.dev \
--to=leon.hwang@linux.dev \
--cc=andrii.nakryiko@gmail.com \
--cc=andrii@kernel.org \
--cc=ast@kernel.org \
--cc=bpf@vger.kernel.org \
--cc=daniel@iogearbox.net \
--cc=dxu@dxuuu.xyz \
--cc=eddyz87@gmail.com \
--cc=kernel-patches-bot@fb.com \
--cc=qmo@kernel.org \
--cc=song@kernel.org \
--cc=yonghong.song@linux.dev \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.