From: Mykyta Yatsenko <mykyta.yatsenko5@gmail.com>
To: Andrii Nakryiko <andrii.nakryiko@gmail.com>
Cc: bpf@vger.kernel.org, ast@kernel.org, andrii@kernel.org,
daniel@iogearbox.net, kafai@meta.com, kernel-team@meta.com,
Mykyta Yatsenko <yatsenko@meta.com>
Subject: Re: [PATCH bpf-next] bpftool: improve btf c dump sorting stability
Date: Thu, 5 Sep 2024 23:33:01 +0100 [thread overview]
Message-ID: <d70142da-3f08-4d5f-9556-bf442570942d@gmail.com> (raw)
In-Reply-To: <CAEf4BzYnrpdBcTFpA_MsM+6qyW3Cmcte9zhv2vrJsnYKQrFhAQ@mail.gmail.com>
On 05/09/2024 21:31, Andrii Nakryiko wrote:
> On Thu, Sep 5, 2024 at 1:12 PM Mykyta Yatsenko
> <mykyta.yatsenko5@gmail.com> wrote:
>> From: Mykyta Yatsenko <yatsenko@meta.com>
>>
>> Existing algorithm for BTF C dump sorting uses only types and names of
>> the structs and unions for ordering. As dump contains structs with the
>> same names but different contents, relative to each other ordering of
>> those structs will be accidental.
>> This patch addresses this problem by introducing a new sorting field
>> that contains hash of the struct/union field names and types to
>> disambiguate comparison of the non-unique named structs.
>>
> Did you check how stable generated vmlinux.h now is when just
> rebuilding kernel with no actual changes? Does it still have some
> variation?
It's stable with this change, no variation at all (I tested up to about 5
times in a row rebuilding kernel and generating vmlinux.h), though, I
would not
be surprised if some edge case for same-named structs is not covered.
It may get triggered in future by some change in kernel structures.
> LGTM, but let's keep the btf_type_sort_name() as is? See below.
>
>> Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
>> ---
>> tools/bpf/bpftool/btf.c | 93 ++++++++++++++++++++++++++++++++++-------
>> 1 file changed, 79 insertions(+), 14 deletions(-)
>>
>> diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c
>> index 3b57ba095ab6..0e7151bfc3d5 100644
>> --- a/tools/bpf/bpftool/btf.c
>> +++ b/tools/bpf/bpftool/btf.c
>> @@ -50,6 +50,7 @@ struct sort_datum {
>> int type_rank;
>> const char *sort_name;
>> const char *own_name;
>> + __u64 disambig_hash;
>> };
>>
>> static const char *btf_int_enc_str(__u8 encoding)
>> @@ -557,17 +558,6 @@ static const char *btf_type_sort_name(const struct btf *btf, __u32 index, bool f
>> const struct btf_type *t = btf__type_by_id(btf, index);
>>
>> switch (btf_kind(t)) {
>> - case BTF_KIND_ENUM:
>> - case BTF_KIND_ENUM64: {
>> - int name_off = t->name_off;
>> -
>> - if (!from_ref && !name_off && btf_vlen(t))
>> - name_off = btf_kind(t) == BTF_KIND_ENUM64 ?
>> - btf_enum64(t)->name_off :
>> - btf_enum(t)->name_off;
>> -
>> - return btf__name_by_offset(btf, name_off);
>> - }
> Why remove this? anonymous enums will usually still have some
> meaningful prefix for each enumerator value, and so sorting based on
> those prefixes (effective) still seems useful for more logical
> ordering
Removed this just to simplify code a little bit. With the hash
function we use for disambiguation same-named enums, enums with
less number of elements would be pushed higher which is decent sorting
as well.
I see why we would prefer to cluster enums with similar prefixes
together. I'll
revert this removal in v2.
>
> pw-bot: cr
>
>> case BTF_KIND_ARRAY:
>> return btf_type_sort_name(btf, btf_array(t)->type, true);
>> case BTF_KIND_TYPE_TAG:
>> @@ -584,20 +574,94 @@ static const char *btf_type_sort_name(const struct btf *btf, __u32 index, bool f
>> return NULL;
>> }
>>
>> +static __u64 hasher(__u64 hash, __u64 val)
>> +{
>> + return hash * 31 + val;
>> +}
>> +
>> +static __u64 btf_name_hasher(__u64 hash, const struct btf *btf, __u32 name_off)
>> +{
>> + if (!name_off)
>> + return hash;
>> +
>> + return hasher(hash, str_hash(btf__name_by_offset(btf, name_off)));
>> +}
>> +
>> +static __u64 btf_type_disambig_hash(const struct btf *btf, __u32 index, bool include_members)
> nit: s/index/id/ ?
Ack
>
>> +{
>> + const struct btf_type *t = btf__type_by_id(btf, index);
>> + int i;
>> + size_t hash = 0;
>> +
>> + hash = btf_name_hasher(hash, btf, t->name_off);
>> +
>> + switch (btf_kind(t)) {
>> + case BTF_KIND_ENUM:
>> + case BTF_KIND_ENUM64:
>> + for (i = 0; i < btf_vlen(t); i++) {
>> + __u32 name_off = btf_is_enum(t) ?
>> + btf_enum(t)[i].name_off :
>> + btf_enum64(t)[i].name_off;
>> +
>> + hash = btf_name_hasher(hash, btf, name_off);
>> + }
>> + break;
>> + case BTF_KIND_STRUCT:
>> + case BTF_KIND_UNION:
>> + if (!include_members)
>> + break;
>> + for (i = 0; i < btf_vlen(t); i++) {
>> + const struct btf_member *m = btf_members(t) + i;
>> +
>> + hash = btf_name_hasher(hash, btf, m->name_off);
>> + /* resolve field type's name and hash it as well */
>> + hash = hasher(hash, btf_type_disambig_hash(btf, m->type, false));
>> + }
>> + break;
>> + case BTF_KIND_TYPE_TAG:
>> + case BTF_KIND_CONST:
>> + case BTF_KIND_PTR:
>> + case BTF_KIND_VOLATILE:
>> + case BTF_KIND_RESTRICT:
>> + case BTF_KIND_TYPEDEF:
>> + case BTF_KIND_DECL_TAG:
>> + hash = hasher(hash, btf_type_disambig_hash(btf, t->type, include_members));
>> + break;
>> + case BTF_KIND_ARRAY: {
>> + struct btf_array *arr = btf_array(t);
>> +
>> + hash = hasher(hash, arr->nelems);
>> + hash = hasher(hash, btf_type_disambig_hash(btf, arr->type, include_members));
>> + break;
>> + }
>> + default:
>> + break;
>> + }
>> + return hash;
>> +}
>> +
>> static int btf_type_compare(const void *left, const void *right)
>> {
>> const struct sort_datum *d1 = (const struct sort_datum *)left;
>> const struct sort_datum *d2 = (const struct sort_datum *)right;
>> int r;
>>
>> - if (d1->type_rank != d2->type_rank)
>> - return d1->type_rank < d2->type_rank ? -1 : 1;
>> + r = d1->type_rank - d2->type_rank;
>> + if (r)
>> + return r;
>>
>> r = strcmp(d1->sort_name, d2->sort_name);
>> if (r)
>> return r;
>>
>> - return strcmp(d1->own_name, d2->own_name);
>> + r = strcmp(d1->own_name, d2->own_name);
>> + if (r)
>> + return r;
> when I was playing with this code I had stong desire to do something
> like below to cut down on visual noise
>
> r = d1->type_rank - d2->type_rank;
> r = r ?: strcmp(d1->sort_name, d2->sort_name);
> r = r ?: strcmp(d1->own_name, d2->own_name);
> if (r)
> return r;
>
> WDYT?
Looks a little hard to parse for me (maybe a skill issue). Early exits
help to disregard
context when reading the code (not sure if this makes sense), so I prefer
them. Essentially this is the same thing as theexisting code, but more
compact,
makes sense, I'll include it in v2.
>> +
>> + if (d1->disambig_hash != d2->disambig_hash)
>> + return d1->disambig_hash < d2->disambig_hash ? -1 : 1;
>> +
>> + return d1->index - d2->index;
>> }
>>
>> static struct sort_datum *sort_btf_c(const struct btf *btf)
>> @@ -618,6 +682,7 @@ static struct sort_datum *sort_btf_c(const struct btf *btf)
>> d->type_rank = btf_type_rank(btf, i, false);
>> d->sort_name = btf_type_sort_name(btf, i, false);
>> d->own_name = btf__name_by_offset(btf, t->name_off);
>> + d->disambig_hash = btf_type_disambig_hash(btf, i, true);
>> }
>>
>> qsort(datums, n, sizeof(struct sort_datum), btf_type_compare);
>> --
>> 2.46.0
>>
prev parent reply other threads:[~2024-09-05 22:33 UTC|newest]
Thread overview: 3+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-09-05 20:12 [PATCH bpf-next] bpftool: improve btf c dump sorting stability Mykyta Yatsenko
2024-09-05 20:31 ` Andrii Nakryiko
2024-09-05 22:33 ` Mykyta Yatsenko [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=d70142da-3f08-4d5f-9556-bf442570942d@gmail.com \
--to=mykyta.yatsenko5@gmail.com \
--cc=andrii.nakryiko@gmail.com \
--cc=andrii@kernel.org \
--cc=ast@kernel.org \
--cc=bpf@vger.kernel.org \
--cc=daniel@iogearbox.net \
--cc=kafai@meta.com \
--cc=kernel-team@meta.com \
--cc=yatsenko@meta.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox