* Re: [PATCHv3 bpf-next 20/24] selftests/bpf: Add tracing multi cookies test
From: Jiri Olsa @ 2026-03-17 17:18 UTC (permalink / raw)
To: Leon Hwang
Cc: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko, bpf,
linux-trace-kernel, Martin KaFai Lau, Eduard Zingerman, Song Liu,
Yonghong Song, Menglong Dong, Steven Rostedt
In-Reply-To: <e01da3b0-ff72-403d-8f3f-4312d35e6ec7@linux.dev>
On Tue, Mar 17, 2026 at 11:06:25AM +0800, Leon Hwang wrote:
SNIP
> > diff --git a/tools/testing/selftests/bpf/progs/tracing_multi_check.c b/tools/testing/selftests/bpf/progs/tracing_multi_check.c
> > index 0e3248312dd5..e6047d5a078a 100644
> > --- a/tools/testing/selftests/bpf/progs/tracing_multi_check.c
> > +++ b/tools/testing/selftests/bpf/progs/tracing_multi_check.c
> > @@ -7,6 +7,7 @@
> > char _license[] SEC("license") = "GPL";
> >
> > int pid = 0;
> > +bool test_cookies = false;
> >
> > extern const void bpf_fentry_test1 __ksym;
> > extern const void bpf_fentry_test2 __ksym;
> > @@ -28,7 +29,7 @@ extern const void bpf_testmod_fentry_test11 __ksym;
> > void tracing_multi_arg_check(__u64 *ctx, __u64 *test_result, bool is_return)
> > {
> > void *ip = (void *) bpf_get_func_ip(ctx);
> > - __u64 value = 0, ret = 0;
> > + __u64 value = 0, ret = 0, cookie = 0;
> > long err = 0;
> >
> > if (bpf_get_current_pid_tgid() >> 32 != pid)
> > @@ -36,6 +37,8 @@ void tracing_multi_arg_check(__u64 *ctx, __u64 *test_result, bool is_return)
> >
> > if (is_return)
> > err |= bpf_get_func_ret(ctx, &ret);
> > + if (test_cookies)
> > + cookie = test_cookies ? bpf_get_attach_cookie(ctx) : 0;
> ^ dup test_cookies check ? Can drop this one.
heh nice, yea, thanks
jirka
^ permalink raw reply
* Re: [PATCHv3 bpf-next 19/24] selftests/bpf: Add tracing multi intersect tests
From: Jiri Olsa @ 2026-03-17 17:18 UTC (permalink / raw)
To: Leon Hwang
Cc: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko, bpf,
linux-trace-kernel, Martin KaFai Lau, Eduard Zingerman, Song Liu,
Yonghong Song, Menglong Dong, Steven Rostedt
In-Reply-To: <63fa6255-8682-495d-abe1-d30d72a7ece9@linux.dev>
On Tue, Mar 17, 2026 at 11:05:29AM +0800, Leon Hwang wrote:
SNIP
> > diff --git a/tools/testing/selftests/bpf/prog_tests/tracing_multi.c b/tools/testing/selftests/bpf/prog_tests/tracing_multi.c
> > index e9042d8d4760..b7818f438d6e 100644
> > --- a/tools/testing/selftests/bpf/prog_tests/tracing_multi.c
> > +++ b/tools/testing/selftests/bpf/prog_tests/tracing_multi.c
> > @@ -6,6 +6,7 @@
> > #include "bpf/libbpf_internal.h"
> > #include "tracing_multi.skel.h"
> > #include "tracing_multi_module.skel.h"
> > +#include "tracing_multi_intersect.skel.h"
> > #include "trace_helpers.h"
> >
> > static const char * const bpf_fentry_test[] = {
> > @@ -31,6 +32,20 @@ static const char * const bpf_testmod_fentry_test[] = {
> >
> > #define FUNCS_CNT (ARRAY_SIZE(bpf_fentry_test))
> >
> > +static int get_random_funcs(const char **funcs)
> > +{
> > + int i, cnt = 0;
> > +
> > + for (i = 0; i < FUNCS_CNT; i++) {
> > + if (rand() % 2)
> ^ srand() is missing for rand() ?
it's in test_progs main
> > +static void test_intersect(void)
> > +{
> > + struct tracing_multi_intersect *skel;
> > + const struct bpf_program *progs[4];
> > + __u64 *test_results[4];
> > + __u32 i;
> > +
> > + skel = tracing_multi_intersect__open_and_load();
> > + if (!ASSERT_OK_PTR(skel, "tracing_multi_intersect__open_and_load"))
> > + return;
> > +
> > + skel->bss->pid = getpid();
> > +
> > + progs[0] = skel->progs.fentry_1;
> > + progs[1] = skel->progs.fexit_1;
> > + progs[2] = skel->progs.fentry_2;
> > + progs[3] = skel->progs.fexit_2;
> > +
> > + test_results[0] = &skel->bss->test_result_fentry_1;
> > + test_results[1] = &skel->bss->test_result_fexit_1;
> > + test_results[2] = &skel->bss->test_result_fentry_2;
> > + test_results[3] = &skel->bss->test_result_fexit_2;
> > +
> > + for (i = 1; i < 16; i++)
> > + __test_intersect(i, progs, test_results);
> > +
> > + tracing_multi_intersect__destroy(skel);
> > +}
> > +
> > void test_tracing_multi_test(void)
> > {
> > #ifndef __x86_64__
> > @@ -347,4 +444,6 @@ void test_tracing_multi_test(void)
> > test_module_link_api_pattern();
> > if (test__start_subtest("module_link_api_ids"))
> > test_module_link_api_ids();
> > + if (test__start_subtest("intersect"))
> > + test_intersect();
> > }
> > diff --git a/tools/testing/selftests/bpf/progs/tracing_multi_intersect_attach.c b/tools/testing/selftests/bpf/progs/tracing_multi_intersect_attach.c
> > new file mode 100644
> > index 000000000000..b8aecbf44093
> > --- /dev/null
> > +++ b/tools/testing/selftests/bpf/progs/tracing_multi_intersect_attach.c
> > @@ -0,0 +1,42 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +#include <stdbool.h>
> > +#include <linux/bpf.h>
> NIT: ^ vmlinux.h is better than stdbool.h + bpf.h.
ok, thanks
jirka
^ permalink raw reply
* Re: [PATCHv3 bpf-next 17/24] selftests/bpf: Add tracing multi skel/pattern/ids attach tests
From: Jiri Olsa @ 2026-03-17 17:18 UTC (permalink / raw)
To: Leon Hwang
Cc: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko, bpf,
linux-trace-kernel, Martin KaFai Lau, Eduard Zingerman, Song Liu,
Yonghong Song, Menglong Dong, Steven Rostedt
In-Reply-To: <39113fe0-2e9f-4f11-afe9-efb676a48149@linux.dev>
On Tue, Mar 17, 2026 at 11:04:57AM +0800, Leon Hwang wrote:
SNIP
> > +static __u32 *get_ids(const char * const funcs[], int funcs_cnt, const char *mod)
> > +{
> > + struct btf *btf, *vmlinux_btf;
> > + __u32 nr, type_id, cnt = 0;
> > + void *root = NULL;
> > + __u32 *ids = NULL;
> > + int i, err = 0;
> > +
> > + btf = btf__load_vmlinux_btf();
> > + if (!ASSERT_OK_PTR(btf, "btf__load_vmlinux_btf"))
> > + return NULL;
> > +
> > + if (mod) {
> > + vmlinux_btf = btf;
> > + btf = btf__load_module_btf(mod, vmlinux_btf);
> > + if (!ASSERT_OK_PTR(btf, "btf__load_module_btf"))
> > + return NULL;
> ^ vmlinux_btf does not get released.
right, needs to be 'goto out'
>
> > + }
> > +
> > + ids = calloc(funcs_cnt, sizeof(ids[0]));
> > + if (!ids)
> > + goto out;
> > +
> > + /*
> > + * We sort function names by name and search them
> > + * below for each function.
> > + */
> > + for (i = 0; i < funcs_cnt; i++)
> > + tsearch(&funcs[i], &root, compare);
> ^ tdestroy() is missing to free tree nodes?
right, will add, thanks
jirka
^ permalink raw reply
* Re: [PATCH mm-unstable v15 05/13] mm/khugepaged: generalize collapse_huge_page for mTHP collapse
From: Randy Dunlap @ 2026-03-17 17:16 UTC (permalink / raw)
To: Lorenzo Stoakes (Oracle), Nico Pache
Cc: linux-doc, linux-kernel, linux-mm, linux-trace-kernel, aarcange,
akpm, anshuman.khandual, apopple, baohua, baolin.wang, byungchul,
catalin.marinas, cl, corbet, dave.hansen, david, dev.jain, gourry,
hannes, hughd, jack, jackmanb, jannh, jglisse, joshua.hahnjy, kas,
lance.yang, Liam.Howlett, lorenzo.stoakes, mathieu.desnoyers,
matthew.brost, mhiramat, mhocko, peterx, pfalcato, rakie.kim,
raquini, richard.weiyang, rientjes, rostedt, rppt, ryan.roberts,
shivankg, sunnanyong, surenb, thomas.hellstrom, tiwai,
usamaarif642, vbabka, vishal.moola, wangkefeng.wang, will, willy,
yang, ying.huang, ziy, zokeefe
In-Reply-To: <9f0b8790-eace-4caa-a0c0-45f66285887f@lucifer.local>
On 3/17/26 9:51 AM, Lorenzo Stoakes (Oracle) wrote:
> On Wed, Feb 25, 2026 at 08:24:27PM -0700, Nico Pache wrote:
>> Pass an order and offset to collapse_huge_page to support collapsing anon
>> memory to arbitrary orders within a PMD. order indicates what mTHP size we
>> are attempting to collapse to, and offset indicates were in the PMD to
>> start the collapse attempt.
>>
>> For non-PMD collapse we must leave the anon VMA write locked until after
>> we collapse the mTHP-- in the PMD case all the pages are isolated, but in
> The '--' seems weird here 🙂 maybe meant to be ' - '?
"--" is common typewriter(!) style for "dash".
Single "-" is a hyphen.
--
~Randy
^ permalink raw reply
* Re: [PATCH mm-unstable v15 09/13] mm/khugepaged: introduce collapse_allowable_orders helper function
From: Lorenzo Stoakes (Oracle) @ 2026-03-17 17:08 UTC (permalink / raw)
To: Nico Pache
Cc: linux-doc, linux-kernel, linux-mm, linux-trace-kernel, aarcange,
akpm, anshuman.khandual, apopple, baohua, baolin.wang, byungchul,
catalin.marinas, cl, corbet, dave.hansen, david, dev.jain, gourry,
hannes, hughd, jack, jackmanb, jannh, jglisse, joshua.hahnjy, kas,
lance.yang, Liam.Howlett, lorenzo.stoakes, mathieu.desnoyers,
matthew.brost, mhiramat, mhocko, peterx, pfalcato, rakie.kim,
raquini, rdunlap, richard.weiyang, rientjes, rostedt, rppt,
ryan.roberts, shivankg, sunnanyong, surenb, thomas.hellstrom,
tiwai, usamaarif642, vbabka, vishal.moola, wangkefeng.wang, will,
willy, yang, ying.huang, ziy, zokeefe
In-Reply-To: <20260226032542.233891-1-npache@redhat.com>
On Wed, Feb 25, 2026 at 08:25:42PM -0700, Nico Pache wrote:
> Add collapse_allowable_orders() to generalize THP order eligibility. The
> function determines which THP orders are permitted based on collapse
> context (khugepaged vs madv_collapse).
>
> This consolidates collapse configuration logic and provides a clean
> interface for future mTHP collapse support where the orders may be
> different.
>
> Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
> mm/khugepaged.c | 16 +++++++++++++---
> 1 file changed, 13 insertions(+), 3 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 2e66d660ee8e..2fdfb6d42cf9 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -486,12 +486,22 @@ static unsigned int collapse_max_ptes_none(unsigned int order)
> return -EINVAL;
> }
>
> +/* Check what orders are allowed based on the vma and collapse type */
> +static unsigned long collapse_allowable_orders(struct vm_area_struct *vma,
> + vm_flags_t vm_flags, bool is_khugepaged)
You're always passing vma->vm_flags, maybe just pass vma and you can grab
vma->vm_flags?
Really it would be better for it to be &vma->flags, but probably best to wait
for me to do a follow up VMA flags series for that.
> +{
> + enum tva_type tva_flags = is_khugepaged ? TVA_KHUGEPAGED : TVA_FORCED_COLLAPSE;
> + unsigned long orders = BIT(HPAGE_PMD_ORDER);
Const?
Also not sure if we decided BIT() was right here or not :P For me fine though.
> +
> + return thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders);
> +}
> +
> void khugepaged_enter_vma(struct vm_area_struct *vma,
> vm_flags_t vm_flags)
> {
> if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) &&
> hugepage_pmd_enabled()) {
> - if (thp_vma_allowable_order(vma, vm_flags, TVA_KHUGEPAGED, PMD_ORDER))
> + if (collapse_allowable_orders(vma, vm_flags, /*is_khugepaged=*/true))
I agree with David, let's pass through the enum value please :)
> __khugepaged_enter(vma->vm_mm);
> }
> }
> @@ -2637,7 +2647,7 @@ static unsigned int collapse_scan_mm_slot(unsigned int pages, enum scan_result *
> progress++;
> break;
> }
> - if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) {
> + if (!collapse_allowable_orders(vma, vma->vm_flags, /*is_khugepaged=*/true)) {
> progress++;
> continue;
> }
> @@ -2949,7 +2959,7 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
> BUG_ON(vma->vm_start > start);
> BUG_ON(vma->vm_end < end);
>
> - if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER))
> + if (!collapse_allowable_orders(vma, vma->vm_flags, /*is_khugepaged=*/false))
> return -EINVAL;
>
> cc = kmalloc_obj(*cc);
> --
> 2.53.0
>
Cheers, Lorenzo
^ permalink raw reply
* Re: [PATCH mm-unstable v15 07/13] mm/khugepaged: add per-order mTHP collapse failure statistics
From: Lorenzo Stoakes (Oracle) @ 2026-03-17 17:05 UTC (permalink / raw)
To: Nico Pache
Cc: linux-doc, linux-kernel, linux-mm, linux-trace-kernel, aarcange,
akpm, anshuman.khandual, apopple, baohua, baolin.wang, byungchul,
catalin.marinas, cl, corbet, dave.hansen, david, dev.jain, gourry,
hannes, hughd, jack, jackmanb, jannh, jglisse, joshua.hahnjy, kas,
lance.yang, Liam.Howlett, lorenzo.stoakes, mathieu.desnoyers,
matthew.brost, mhiramat, mhocko, peterx, pfalcato, rakie.kim,
raquini, rdunlap, richard.weiyang, rientjes, rostedt, rppt,
ryan.roberts, shivankg, sunnanyong, surenb, thomas.hellstrom,
tiwai, usamaarif642, vbabka, vishal.moola, wangkefeng.wang, will,
willy, yang, ying.huang, ziy, zokeefe
In-Reply-To: <20260226032504.233594-1-npache@redhat.com>
On Wed, Feb 25, 2026 at 08:25:04PM -0700, Nico Pache wrote:
> Add three new mTHP statistics to track collapse failures for different
> orders when encountering swap PTEs, excessive none PTEs, and shared PTEs:
>
> - collapse_exceed_swap_pte: Increment when mTHP collapse fails due to swap
> PTEs
>
> - collapse_exceed_none_pte: Counts when mTHP collapse fails due to
> exceeding the none PTE threshold for the given order
>
> - collapse_exceed_shared_pte: Counts when mTHP collapse fails due to shared
> PTEs
>
> These statistics complement the existing THP_SCAN_EXCEED_* events by
> providing per-order granularity for mTHP collapse attempts. The stats are
> exposed via sysfs under
> `/sys/kernel/mm/transparent_hugepage/hugepages-*/stats/` for each
> supported hugepage size.
>
> As we currently dont support collapsing mTHPs that contain a swap or
> shared entry, those statistics keep track of how often we are
> encountering failed mTHP collapses due to these restrictions.
>
> Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
> Documentation/admin-guide/mm/transhuge.rst | 24 ++++++++++++++++++++++
> include/linux/huge_mm.h | 3 +++
> mm/huge_memory.c | 7 +++++++
> mm/khugepaged.c | 16 ++++++++++++---
> 4 files changed, 47 insertions(+), 3 deletions(-)
>
> diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
> index c51932e6275d..eebb1f6bbc6c 100644
> --- a/Documentation/admin-guide/mm/transhuge.rst
> +++ b/Documentation/admin-guide/mm/transhuge.rst
> @@ -714,6 +714,30 @@ nr_anon_partially_mapped
> an anonymous THP as "partially mapped" and count it here, even though it
> is not actually partially mapped anymore.
>
> +collapse_exceed_none_pte
> + The number of collapse attempts that failed due to exceeding the
> + max_ptes_none threshold. For mTHP collapse, Currently only max_ptes_none
> + values of 0 and (HPAGE_PMD_NR - 1) are supported. Any other value will
> + emit a warning and no mTHP collapse will be attempted. khugepaged will
It's weird to document this here but not elsewhere in the document? I mean I
made this comment on the documentation patch also.
Not sure if I missed you adding it to another bit of the docs? :)
> + try to collapse to the largest enabled (m)THP size; if it fails, it will
> + try the next lower enabled mTHP size. This counter records the number of
> + times a collapse attempt was skipped for exceeding the max_ptes_none
> + threshold, and khugepaged will move on to the next available mTHP size.
> +
> +collapse_exceed_swap_pte
> + The number of anonymous mTHP PTE ranges which were unable to collapse due
> + to containing at least one swap PTE. Currently khugepaged does not
> + support collapsing mTHP regions that contain a swap PTE. This counter can
> + be used to monitor the number of khugepaged mTHP collapses that failed
> + due to the presence of a swap PTE.
> +
> +collapse_exceed_shared_pte
> + The number of anonymous mTHP PTE ranges which were unable to collapse due
> + to containing at least one shared PTE. Currently khugepaged does not
> + support collapsing mTHP PTE ranges that contain a shared PTE. This
> + counter can be used to monitor the number of khugepaged mTHP collapses
> + that failed due to the presence of a shared PTE.
All of these talk about 'ranges' that could be of any size. Are these useful
metrics? Counting a bunch of failures and not knowing if they are 256 KB
failures or 16 KB failures or whatever is maybe not so useful information?
Also, from the code, aren't you treating PMD events the same as mTHP ones from
the point of view of these counters? Maybe worth documenting that?
> +
> As the system ages, allocating huge pages may be expensive as the
> system uses memory compaction to copy data around memory to free a
> huge page for use. There are some counters in ``/proc/vmstat`` to help
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index 9941fc6d7bd8..e8777bb2347d 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -144,6 +144,9 @@ enum mthp_stat_item {
> MTHP_STAT_SPLIT_DEFERRED,
> MTHP_STAT_NR_ANON,
> MTHP_STAT_NR_ANON_PARTIALLY_MAPPED,
> + MTHP_STAT_COLLAPSE_EXCEED_SWAP,
> + MTHP_STAT_COLLAPSE_EXCEED_NONE,
> + MTHP_STAT_COLLAPSE_EXCEED_SHARED,
> __MTHP_STAT_COUNT
> };
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 228f35e962b9..1049a207a257 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -642,6 +642,10 @@ DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED);
> DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED);
> DEFINE_MTHP_STAT_ATTR(nr_anon, MTHP_STAT_NR_ANON);
> DEFINE_MTHP_STAT_ATTR(nr_anon_partially_mapped, MTHP_STAT_NR_ANON_PARTIALLY_MAPPED);
> +DEFINE_MTHP_STAT_ATTR(collapse_exceed_swap_pte, MTHP_STAT_COLLAPSE_EXCEED_SWAP);
> +DEFINE_MTHP_STAT_ATTR(collapse_exceed_none_pte, MTHP_STAT_COLLAPSE_EXCEED_NONE);
> +DEFINE_MTHP_STAT_ATTR(collapse_exceed_shared_pte, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
Is there a reason there's such a difference between the names and the actual
enum names?
> +
>
> static struct attribute *anon_stats_attrs[] = {
> &anon_fault_alloc_attr.attr,
> @@ -658,6 +662,9 @@ static struct attribute *anon_stats_attrs[] = {
> &split_deferred_attr.attr,
> &nr_anon_attr.attr,
> &nr_anon_partially_mapped_attr.attr,
> + &collapse_exceed_swap_pte_attr.attr,
> + &collapse_exceed_none_pte_attr.attr,
> + &collapse_exceed_shared_pte_attr.attr,
> NULL,
> };
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index c739f26dd61e..a6cf90e09e4a 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -595,7 +595,9 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
> continue;
> } else {
> result = SCAN_EXCEED_NONE_PTE;
> - count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
> + if (is_pmd_order(order))
> + count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
> + count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_NONE);
It's a bit gross to have separate stats for both thp and mthp but maybe
unavoidable from a legacy stand point.
Why are we dropping the _PTE suffix?
> goto out;
> }
> }
> @@ -631,10 +633,17 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
> * shared may cause a future higher order collapse on a
> * rescan of the same range.
> */
> - if (!is_pmd_order(order) || (cc->is_khugepaged &&
> - shared > khugepaged_max_ptes_shared)) {
OK losing track here :) as the series sadly doesn't currently apply so can't
browser file as is.
In the code I'm looking at, there's also a ++shared here that I guess another
patch removed?
Is this in the folio_maybe_mapped_shared() branch?
> + if (!is_pmd_order(order)) {
> + result = SCAN_EXCEED_SHARED_PTE;
> + count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
> + goto out;
> + }
> +
> + if (cc->is_khugepaged &&
> + shared > khugepaged_max_ptes_shared) {
> result = SCAN_EXCEED_SHARED_PTE;
> count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
> + count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
> goto out;
Anyway I'm a bit lost on this logic until a respin but this looks like a LOT of
code duplication. I see David alluded to a refactoring so maybe what he suggests
will help (not had a chance to check what it is specifically :P)
> }
> }
> @@ -1081,6 +1090,7 @@ static enum scan_result __collapse_huge_page_swapin(struct mm_struct *mm,
> * range.
> */
> if (!is_pmd_order(order)) {
> + count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SWAP);
Hmm I thought we were incrementing mthp stats for pmd sized also?
> pte_unmap(pte);
> mmap_read_unlock(mm);
> result = SCAN_EXCEED_SWAP_PTE;
> --
> 2.53.0
>
Cheers, Lorenzo
^ permalink raw reply
* Re: [PATCH mm-unstable v15 05/13] mm/khugepaged: generalize collapse_huge_page for mTHP collapse
From: Lorenzo Stoakes (Oracle) @ 2026-03-17 16:51 UTC (permalink / raw)
To: Nico Pache
Cc: linux-doc, linux-kernel, linux-mm, linux-trace-kernel, aarcange,
akpm, anshuman.khandual, apopple, baohua, baolin.wang, byungchul,
catalin.marinas, cl, corbet, dave.hansen, david, dev.jain, gourry,
hannes, hughd, jack, jackmanb, jannh, jglisse, joshua.hahnjy, kas,
lance.yang, Liam.Howlett, lorenzo.stoakes, mathieu.desnoyers,
matthew.brost, mhiramat, mhocko, peterx, pfalcato, rakie.kim,
raquini, rdunlap, richard.weiyang, rientjes, rostedt, rppt,
ryan.roberts, shivankg, sunnanyong, surenb, thomas.hellstrom,
tiwai, usamaarif642, vbabka, vishal.moola, wangkefeng.wang, will,
willy, yang, ying.huang, ziy, zokeefe
In-Reply-To: <20260226032427.233282-1-npache@redhat.com>
On Wed, Feb 25, 2026 at 08:24:27PM -0700, Nico Pache wrote:
> Pass an order and offset to collapse_huge_page to support collapsing anon
> memory to arbitrary orders within a PMD. order indicates what mTHP size we
> are attempting to collapse to, and offset indicates were in the PMD to
> start the collapse attempt.
>
> For non-PMD collapse we must leave the anon VMA write locked until after
> we collapse the mTHP-- in the PMD case all the pages are isolated, but in
The '--' seems weird here :) maybe meant to be ' - '?
> the mTHP case this is not true, and we must keep the lock to prevent
> changes to the VMA from occurring.
You mean changes to the page tables right? rmap won't alter VMA parameters
without a VMA lock. Better to be specific.
>
> Also convert these BUG_ON's to WARN_ON_ONCE's as these conditions, while
> unexpected, should not bring down the system.
>
> Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> Tested-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
> mm/khugepaged.c | 102 +++++++++++++++++++++++++++++-------------------
> 1 file changed, 62 insertions(+), 40 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 99f78f0e44c6..fb3ba8fe5a6c 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1150,44 +1150,53 @@ static enum scan_result alloc_charge_folio(struct folio **foliop, struct mm_stru
> return SCAN_SUCCEED;
> }
>
> -static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long address,
> - int referenced, int unmapped, struct collapse_control *cc)
> +static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long start_addr,
> + int referenced, int unmapped, struct collapse_control *cc,
> + bool *mmap_locked, unsigned int order)
This is getting horrible, could we maybe look at passing through a helper
struct or something?
> {
> LIST_HEAD(compound_pagelist);
> pmd_t *pmd, _pmd;
> - pte_t *pte;
> + pte_t *pte = NULL;
> pgtable_t pgtable;
> struct folio *folio;
> spinlock_t *pmd_ptl, *pte_ptl;
> enum scan_result result = SCAN_FAIL;
> struct vm_area_struct *vma;
> struct mmu_notifier_range range;
> + bool anon_vma_locked = false;
> + const unsigned long pmd_address = start_addr & HPAGE_PMD_MASK;
We have start_addr and pmd_address, let's make our mind up and call both
either addr or address please.
>
> - VM_BUG_ON(address & ~HPAGE_PMD_MASK);
> + VM_WARN_ON_ONCE(pmd_address & ~HPAGE_PMD_MASK);
You just masked this with HPAGE_PMD_MASK then check & ~HPAGE_PMD_MASK? :)
Can we just drop it? :)
>
> /*
> * Before allocating the hugepage, release the mmap_lock read lock.
> * The allocation can take potentially a long time if it involves
> * sync compaction, and we do not need to hold the mmap_lock during
> * that. We will recheck the vma after taking it again in write mode.
> + * If collapsing mTHPs we may have already released the read_lock.
> */
> - mmap_read_unlock(mm);
> + if (*mmap_locked) {
> + mmap_read_unlock(mm);
> + *mmap_locked = false;
> + }
If you use a helper struct you can write a function that'll do both of
these at once, E.g.:
static void scan_mmap_unlock(struct scan_state *scan)
{
if (!scan->mmap_locked)
return;
mmap_read_unlock(scan->mm);
scan->mmap_locked = false;
}
...
scan_mmap_unlock(scan_state);
>
> - result = alloc_charge_folio(&folio, mm, cc, HPAGE_PMD_ORDER);
> + result = alloc_charge_folio(&folio, mm, cc, order);
> if (result != SCAN_SUCCEED)
> goto out_nolock;
>
> mmap_read_lock(mm);
> - result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
> - HPAGE_PMD_ORDER);
> + *mmap_locked = true;
> + result = hugepage_vma_revalidate(mm, pmd_address, true, &vma, cc, order);
Be nice to add a /*expect_anon=*/true, here so we can read what parameter
that is at a glance.
> if (result != SCAN_SUCCEED) {
> mmap_read_unlock(mm);
> + *mmap_locked = false;
> goto out_nolock;
> }
>
> - result = find_pmd_or_thp_or_none(mm, address, &pmd);
> + result = find_pmd_or_thp_or_none(mm, pmd_address, &pmd);
> if (result != SCAN_SUCCEED) {
> mmap_read_unlock(mm);
> + *mmap_locked = false;
> goto out_nolock;
> }
>
> @@ -1197,13 +1206,16 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
> * released when it fails. So we jump out_nolock directly in
> * that case. Continuing to collapse causes inconsistency.
> */
> - result = __collapse_huge_page_swapin(mm, vma, address, pmd,
> - referenced, HPAGE_PMD_ORDER);
> - if (result != SCAN_SUCCEED)
> + result = __collapse_huge_page_swapin(mm, vma, start_addr, pmd,
> + referenced, order);
> + if (result != SCAN_SUCCEED) {
> + *mmap_locked = false;
> goto out_nolock;
> + }
> }
>
> mmap_read_unlock(mm);
> + *mmap_locked = false;
> /*
> * Prevent all access to pagetables with the exception of
> * gup_fast later handled by the ptep_clear_flush and the VM
> @@ -1213,20 +1225,20 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
> * mmap_lock.
> */
> mmap_write_lock(mm);
Hmm you take an mmap... write lock here then don/t set *mmap_locked =
true... It's inconsistent and bug prone.
I'm also seriously not a fan of switching between mmap read and write lock
here but keeping an *mmap_locked parameter here which is begging for a bug.
In general though, you seem to always make sure in the (fairly hideous
honestly) error goto labels to have the mmap lock dropped, so what is the
point in keeping the *mmap_locked parameter updated throughou this anyway?
Are we ever exiting with it set? If not why not drop the parameter/helper
struct field and just have the caller understand that it's dropped on exit
(and document that).
Since you're just dropping the lock on entry, why not have the caller do
that and document that you have to enter unlocked anyway?
> - result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
> - HPAGE_PMD_ORDER);
> + result = hugepage_vma_revalidate(mm, pmd_address, true, &vma, cc, order);
> if (result != SCAN_SUCCEED)
> goto out_up_write;
> /* check if the pmd is still valid */
> vma_start_write(vma);
> - result = check_pmd_still_valid(mm, address, pmd);
> + result = check_pmd_still_valid(mm, pmd_address, pmd);
> if (result != SCAN_SUCCEED)
> goto out_up_write;
>
> anon_vma_lock_write(vma->anon_vma);
> + anon_vma_locked = true;
Again with a helper struct you can abstract this and avoid more noise.
E.g. scan_anon_vma_lock_write(scan);
>
> - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
> - address + HPAGE_PMD_SIZE);
> + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, start_addr,
> + start_addr + (PAGE_SIZE << order));
I hate this open-coded 'start_addr + (PAGE_SIZE << order)' construct.
If you use a helper struct (theme here :) you could have a macro that
generates it set an end param to this.
> mmu_notifier_invalidate_range_start(&range);
>
> pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
> @@ -1238,24 +1250,21 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
> * Parallel GUP-fast is fine since GUP-fast will back off when
> * it detects PMD is changed.
> */
> - _pmd = pmdp_collapse_flush(vma, address, pmd);
> + _pmd = pmdp_collapse_flush(vma, pmd_address, pmd);
> spin_unlock(pmd_ptl);
> mmu_notifier_invalidate_range_end(&range);
> tlb_remove_table_sync_one();
>
> - pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
> + pte = pte_offset_map_lock(mm, &_pmd, start_addr, &pte_ptl);
> if (pte) {
> - result = __collapse_huge_page_isolate(vma, address, pte, cc,
> - HPAGE_PMD_ORDER,
> - &compound_pagelist);
> + result = __collapse_huge_page_isolate(vma, start_addr, pte, cc,
> + order, &compound_pagelist);
Will this work correctly with the non-PMD aligned start_addr?
> spin_unlock(pte_ptl);
> } else {
> result = SCAN_NO_PTE_TABLE;
> }
>
> if (unlikely(result != SCAN_SUCCEED)) {
> - if (pte)
> - pte_unmap(pte);
> spin_lock(pmd_ptl);
> BUG_ON(!pmd_none(*pmd));
Can we downgrade to WARN_ON_ONCE() as we pass by any BUG_ON()'s please?
Since we're churning here anyway it's worth doing :)
> /*
> @@ -1265,21 +1274,21 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
> */
> pmd_populate(mm, pmd, pmd_pgtable(_pmd));
> spin_unlock(pmd_ptl);
> - anon_vma_unlock_write(vma->anon_vma);
> goto out_up_write;
> }
>
> /*
> - * All pages are isolated and locked so anon_vma rmap
> - * can't run anymore.
> + * For PMD collapse all pages are isolated and locked so anon_vma
> + * rmap can't run anymore. For mTHP collapse we must hold the lock
This is really unclear. What does 'can't run anymore' mean? Why must we
hold the lock for mTHP?
I realise the previous comment was equally as unclear but let's make this
make sense please :)
> */
> - anon_vma_unlock_write(vma->anon_vma);
> + if (is_pmd_order(order)) {
> + anon_vma_unlock_write(vma->anon_vma);
> + anon_vma_locked = false;
> + }
>
> result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
> - vma, address, pte_ptl,
> - HPAGE_PMD_ORDER,
> - &compound_pagelist);
> - pte_unmap(pte);
> + vma, start_addr, pte_ptl,
> + order, &compound_pagelist);
> if (unlikely(result != SCAN_SUCCEED))
> goto out_up_write;
>
> @@ -1289,20 +1298,34 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
> * write.
> */
> __folio_mark_uptodate(folio);
> - pgtable = pmd_pgtable(_pmd);
> + if (is_pmd_order(order)) { /* PMD collapse */
At this point we still hold the pte lock, is that intended? Are we sure
there won't be any issues leaving it held during the operations that now
happen before you release it?
> + pgtable = pmd_pgtable(_pmd);
>
> - spin_lock(pmd_ptl);
> - BUG_ON(!pmd_none(*pmd));
> - pgtable_trans_huge_deposit(mm, pmd, pgtable);
> - map_anon_folio_pmd_nopf(folio, pmd, vma, address);
> + spin_lock(pmd_ptl);
> + WARN_ON_ONCE(!pmd_none(*pmd));
> + pgtable_trans_huge_deposit(mm, pmd, pgtable);
> + map_anon_folio_pmd_nopf(folio, pmd, vma, pmd_address);
If we're PMD order start_addr == pmd_address right?
> + } else { /* mTHP collapse */
> + spin_lock(pmd_ptl);
> + WARN_ON_ONCE(!pmd_none(*pmd));
You duplicate both of these lines in both branches, pull them out?
> + map_anon_folio_pte_nopf(folio, pte, vma, start_addr, /*uffd_wp=*/ false);
> + smp_wmb(); /* make PTEs visible before PMD. See pmd_install() */
It'd be much nicer to call pmd_install() :)
Or maybe even to separate out the unlocked bit from pmd_install(), put that
in e.g. __pmd_install(), then use that after lock acquired?
> + pmd_populate(mm, pmd, pmd_pgtable(_pmd));
> + }
> spin_unlock(pmd_ptl);
>
> folio = NULL;
Not your code but... why? I guess to avoid the folio_put() below but
gross. Anyway this function needs refactoring, can be a follow up.
>
> result = SCAN_SUCCEED;
> out_up_write:
> + if (anon_vma_locked)
> + anon_vma_unlock_write(vma->anon_vma);
> + if (pte)
> + pte_unmap(pte);
Again can be helped with helper struct :)
> mmap_write_unlock(mm);
> + *mmap_locked = false;
And this... I also hate the break from if (*mmap_locked) ... etc.
> out_nolock:
> + WARN_ON_ONCE(*mmap_locked);
Should be a VM_WARN_ON_ONCE() if we keep it.
> if (folio)
> folio_put(folio);
> trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
> @@ -1483,9 +1506,8 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
> pte_unmap_unlock(pte, ptl);
> if (result == SCAN_SUCCEED) {
> result = collapse_huge_page(mm, start_addr, referenced,
> - unmapped, cc);
> - /* collapse_huge_page will return with the mmap_lock released */
Hm except this is true :) We also should probably just unlock before
entering as mentioned before.
> - *mmap_locked = false;
> + unmapped, cc, mmap_locked,
> + HPAGE_PMD_ORDER);
> }
> out:
> trace_mm_khugepaged_scan_pmd(mm, folio, referenced,
> --
> 2.53.0
>
Cheers, Lorenzo
^ permalink raw reply
* Re: [RFC v5 6/7] ext4: fast commit: add lock_updates tracepoint
From: Steven Rostedt @ 2026-03-17 16:21 UTC (permalink / raw)
To: Li Chen
Cc: Zhang Yi, Theodore Ts'o, Andreas Dilger, Masami Hiramatsu,
Mathieu Desnoyers, linux-ext4, linux-kernel, linux-trace-kernel,
Vineeth Remanan Pillai
In-Reply-To: <20260317084624.457185-7-me@linux.beauty>
On Tue, 17 Mar 2026 16:46:21 +0800
Li Chen <me@linux.beauty> wrote:
> Commit-time fast commit snapshots run under jbd2_journal_lock_updates(),
> so it is useful to quantify the time spent with updates locked and to
> understand why snapshotting can fail.
>
> Add a new tracepoint, ext4_fc_lock_updates, reporting the time spent in
> the updates-locked window along with the number of snapshotted inodes
> and ranges. Record the first snapshot failure reason in a stable snap_err
> field for tooling.
>
> Signed-off-by: Li Chen <me@linux.beauty>
> ---
> fs/ext4/ext4.h | 15 ++++++++
> fs/ext4/fast_commit.c | 71 +++++++++++++++++++++++++++++--------
> include/trace/events/ext4.h | 61 +++++++++++++++++++++++++++++++
> 3 files changed, 132 insertions(+), 15 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 68a64fa0be926..b9e146f3dd9e4 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1037,6 +1037,21 @@ enum {
>
> struct ext4_fc_inode_snap;
>
> +/*
> + * Snapshot failure reasons for ext4_fc_lock_updates tracepoint.
> + * Keep these stable for tooling.
> + */
> +enum ext4_fc_snap_err {
> + EXT4_FC_SNAP_ERR_NONE = 0,
> + EXT4_FC_SNAP_ERR_ES_MISS = 1,
> + EXT4_FC_SNAP_ERR_ES_DELAYED = 2,
> + EXT4_FC_SNAP_ERR_ES_OTHER = 3,
> + EXT4_FC_SNAP_ERR_INODES_CAP = 4,
> + EXT4_FC_SNAP_ERR_RANGES_CAP = 5,
> + EXT4_FC_SNAP_ERR_NOMEM = 6,
> + EXT4_FC_SNAP_ERR_INODE_LOC = 7,
You don't need to explicitly state the assignments, the enum will increment
them without them.
> +};
> +
> /*
> * fourth extended file system inode data in memory
> */
> diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
> index d1eefee609120..4929e2990b292 100644
> --- a/fs/ext4/fast_commit.c
> +++ b/fs/ext4/fast_commit.c
> @@ -193,6 +193,12 @@ static struct kmem_cache *ext4_fc_range_cachep;
> #define EXT4_FC_SNAPSHOT_MAX_INODES 1024
> #define EXT4_FC_SNAPSHOT_MAX_RANGES 2048
>
> +static inline void ext4_fc_set_snap_err(int *snap_err, int err)
> +{
> + if (snap_err && *snap_err == EXT4_FC_SNAP_ERR_NONE)
> + *snap_err = err;
> +}
> +
> static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
> {
> BUFFER_TRACE(bh, "");
> @@ -983,11 +989,12 @@ static void ext4_fc_free_inode_snap(struct inode *inode)
> static int ext4_fc_snapshot_inode_data(struct inode *inode,
> struct list_head *ranges,
> unsigned int nr_ranges_total,
> - unsigned int *nr_rangesp)
> + unsigned int *nr_rangesp,
> + int *snap_err)
> {
> struct ext4_inode_info *ei = EXT4_I(inode);
> - unsigned int nr_ranges = 0;
> ext4_lblk_t start_lblk, end_lblk, cur_lblk;
> + unsigned int nr_ranges = 0;
>
> spin_lock(&ei->i_fc_lock);
> if (ei->i_fc_lblk_len == 0) {
> @@ -1010,11 +1017,16 @@ static int ext4_fc_snapshot_inode_data(struct inode *inode,
> struct ext4_fc_range *range;
> ext4_lblk_t len;
>
> - if (!ext4_es_lookup_extent(inode, cur_lblk, NULL, &es, NULL))
> + if (!ext4_es_lookup_extent(inode, cur_lblk, NULL, &es, NULL)) {
> + ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_ES_MISS);
> return -EAGAIN;
> + }
>
> - if (ext4_es_is_delayed(&es))
> + if (ext4_es_is_delayed(&es)) {
> + ext4_fc_set_snap_err(snap_err,
> + EXT4_FC_SNAP_ERR_ES_DELAYED);
> return -EAGAIN;
> + }
>
> len = es.es_len - (cur_lblk - es.es_lblk);
> if (len > end_lblk - cur_lblk + 1)
> @@ -1024,12 +1036,17 @@ static int ext4_fc_snapshot_inode_data(struct inode *inode,
> continue;
> }
>
> - if (nr_ranges_total + nr_ranges >= EXT4_FC_SNAPSHOT_MAX_RANGES)
> + if (nr_ranges_total + nr_ranges >= EXT4_FC_SNAPSHOT_MAX_RANGES) {
> + ext4_fc_set_snap_err(snap_err,
> + EXT4_FC_SNAP_ERR_RANGES_CAP);
> return -E2BIG;
> + }
>
> range = kmem_cache_alloc(ext4_fc_range_cachep, GFP_NOFS);
> - if (!range)
> + if (!range) {
> + ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_NOMEM);
> return -ENOMEM;
> + }
> nr_ranges++;
>
> range->lblk = cur_lblk;
> @@ -1054,6 +1071,7 @@ static int ext4_fc_snapshot_inode_data(struct inode *inode,
> range->len = max;
> } else {
> kmem_cache_free(ext4_fc_range_cachep, range);
> + ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_ES_OTHER);
> return -EAGAIN;
> }
>
> @@ -1070,7 +1088,7 @@ static int ext4_fc_snapshot_inode_data(struct inode *inode,
>
> static int ext4_fc_snapshot_inode(struct inode *inode,
> unsigned int nr_ranges_total,
> - unsigned int *nr_rangesp)
> + unsigned int *nr_rangesp, int *snap_err)
> {
> struct ext4_inode_info *ei = EXT4_I(inode);
> struct ext4_fc_inode_snap *snap;
> @@ -1082,8 +1100,10 @@ static int ext4_fc_snapshot_inode(struct inode *inode,
> int alloc_ctx;
>
> ret = ext4_get_inode_loc_noio(inode, &iloc);
> - if (ret)
> + if (ret) {
> + ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_INODE_LOC);
> return ret;
> + }
>
> if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
> inode_len = EXT4_INODE_SIZE(inode->i_sb);
> @@ -1092,6 +1112,7 @@ static int ext4_fc_snapshot_inode(struct inode *inode,
>
> snap = kmalloc(struct_size(snap, inode_buf, inode_len), GFP_NOFS);
> if (!snap) {
> + ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_NOMEM);
> brelse(iloc.bh);
> return -ENOMEM;
> }
> @@ -1102,7 +1123,7 @@ static int ext4_fc_snapshot_inode(struct inode *inode,
> brelse(iloc.bh);
>
> ret = ext4_fc_snapshot_inode_data(inode, &ranges, nr_ranges_total,
> - &nr_ranges);
> + &nr_ranges, snap_err);
> if (ret) {
> kfree(snap);
> ext4_fc_free_ranges(&ranges);
> @@ -1203,7 +1224,10 @@ static int ext4_fc_alloc_snapshot_inodes(struct super_block *sb,
> unsigned int *nr_inodesp);
>
> static int ext4_fc_snapshot_inodes(journal_t *journal, struct inode **inodes,
> - unsigned int inodes_size)
> + unsigned int inodes_size,
> + unsigned int *nr_inodesp,
> + unsigned int *nr_rangesp,
> + int *snap_err)
> {
> struct super_block *sb = journal->j_private;
> struct ext4_sb_info *sbi = EXT4_SB(sb);
> @@ -1221,6 +1245,8 @@ static int ext4_fc_snapshot_inodes(journal_t *journal, struct inode **inodes,
> alloc_ctx = ext4_fc_lock(sb);
> list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
> if (i >= inodes_size) {
> + ext4_fc_set_snap_err(snap_err,
> + EXT4_FC_SNAP_ERR_INODES_CAP);
> ret = -E2BIG;
> goto unlock;
> }
> @@ -1244,6 +1270,8 @@ static int ext4_fc_snapshot_inodes(journal_t *journal, struct inode **inodes,
> continue;
>
> if (i >= inodes_size) {
> + ext4_fc_set_snap_err(snap_err,
> + EXT4_FC_SNAP_ERR_INODES_CAP);
> ret = -E2BIG;
> goto unlock;
> }
> @@ -1268,16 +1296,20 @@ static int ext4_fc_snapshot_inodes(journal_t *journal, struct inode **inodes,
> unsigned int inode_ranges = 0;
>
> ret = ext4_fc_snapshot_inode(inodes[idx], nr_ranges,
> - &inode_ranges);
> + &inode_ranges, snap_err);
> if (ret)
> break;
> nr_ranges += inode_ranges;
> }
>
> + if (nr_inodesp)
> + *nr_inodesp = i;
> + if (nr_rangesp)
> + *nr_rangesp = nr_ranges;
> return ret;
> }
>
> -static int ext4_fc_perform_commit(journal_t *journal)
> +static int ext4_fc_perform_commit(journal_t *journal, tid_t commit_tid)
> {
> struct super_block *sb = journal->j_private;
> struct ext4_sb_info *sbi = EXT4_SB(sb);
> @@ -1286,10 +1318,15 @@ static int ext4_fc_perform_commit(journal_t *journal)
> struct inode *inode;
> struct inode **inodes;
> unsigned int inodes_size;
> + unsigned int snap_inodes = 0;
> + unsigned int snap_ranges = 0;
> + int snap_err = EXT4_FC_SNAP_ERR_NONE;
> struct blk_plug plug;
> int ret = 0;
> u32 crc = 0;
> int alloc_ctx;
> + ktime_t lock_start;
> + u64 locked_ns;
>
> /*
> * Step 1: Mark all inodes on s_fc_q[MAIN] with
> @@ -1337,13 +1374,13 @@ static int ext4_fc_perform_commit(journal_t *journal)
> if (ret)
> return ret;
>
> -
> ret = ext4_fc_alloc_snapshot_inodes(sb, &inodes, &inodes_size);
> if (ret)
> return ret;
>
> /* Step 4: Mark all inodes as being committed. */
> jbd2_journal_lock_updates(journal);
> + lock_start = ktime_get();
> /*
> * The journal is now locked. No more handles can start and all the
> * previous handles are now drained. Snapshotting happens in this
> @@ -1357,8 +1394,12 @@ static int ext4_fc_perform_commit(journal_t *journal)
> }
> ext4_fc_unlock(sb, alloc_ctx);
>
> - ret = ext4_fc_snapshot_inodes(journal, inodes, inodes_size);
> + ret = ext4_fc_snapshot_inodes(journal, inodes, inodes_size,
> + &snap_inodes, &snap_ranges, &snap_err);
> jbd2_journal_unlock_updates(journal);
> + locked_ns = ktime_to_ns(ktime_sub(ktime_get(), lock_start));
If locked_ns is only used for the tracepoint, it should either be
calculated in the tracepoint, or add:
if (trace_ext4_fc_lock_updates_enabled()) {
locked_ns = ktime_to_ns(ktime_sub(ktime_get(), lock_start));
> + trace_ext4_fc_lock_updates(sb, commit_tid, locked_ns, snap_inodes,
> + snap_ranges, ret, snap_err);
}
Note, we are going to also add a code to call the tracepoint directly, to
remove the double static_branch.
https://lore.kernel.org/all/20260312150523.2054552-1-vineeth@bitbyteword.org/
But that code is still being worked on so you don't need to worry about it
at the moment.
-- Steve
> kvfree(inodes);
> if (ret)
> return ret;
> @@ -1563,7 +1604,7 @@ int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
> journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO;
> set_task_ioprio(current, journal_ioprio);
> fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
> - ret = ext4_fc_perform_commit(journal);
> + ret = ext4_fc_perform_commit(journal, commit_tid);
> if (ret < 0) {
> if (ret == -EAGAIN || ret == -E2BIG || ret == -ECANCELED)
> status = EXT4_FC_STATUS_INELIGIBLE;
> diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
> index fd76d14c2776e..dc084f39b74ad 100644
> --- a/include/trace/events/ext4.h
> +++ b/include/trace/events/ext4.h
> @@ -104,6 +104,26 @@ TRACE_DEFINE_ENUM(EXT4_FC_REASON_INODE_JOURNAL_DATA);
> TRACE_DEFINE_ENUM(EXT4_FC_REASON_ENCRYPTED_FILENAME);
> TRACE_DEFINE_ENUM(EXT4_FC_REASON_MAX);
>
> +#undef EM
> +#undef EMe
> +#define EM(a) TRACE_DEFINE_ENUM(EXT4_FC_SNAP_ERR_##a);
> +#define EMe(a) TRACE_DEFINE_ENUM(EXT4_FC_SNAP_ERR_##a);
> +
> +#define TRACE_SNAP_ERR \
> + EM(NONE) \
> + EM(ES_MISS) \
> + EM(ES_DELAYED) \
> + EM(ES_OTHER) \
> + EM(INODES_CAP) \
> + EM(RANGES_CAP) \
> + EM(NOMEM) \
> + EMe(INODE_LOC)
> +
> +TRACE_SNAP_ERR
> +
> +#undef EM
> +#undef EMe
> +
> #define show_fc_reason(reason) \
> __print_symbolic(reason, \
> { EXT4_FC_REASON_XATTR, "XATTR"}, \
> @@ -2812,6 +2832,47 @@ TRACE_EVENT(ext4_fc_commit_stop,
> __entry->num_fc_ineligible, __entry->nblks_agg, __entry->tid)
> );
>
> +#define EM(a) { EXT4_FC_SNAP_ERR_##a, #a },
> +#define EMe(a) { EXT4_FC_SNAP_ERR_##a, #a }
> +
> +TRACE_EVENT(ext4_fc_lock_updates,
> + TP_PROTO(struct super_block *sb, tid_t commit_tid, u64 locked_ns,
> + unsigned int nr_inodes, unsigned int nr_ranges, int err,
> + int snap_err),
> +
> + TP_ARGS(sb, commit_tid, locked_ns, nr_inodes, nr_ranges, err, snap_err),
> +
> + TP_STRUCT__entry(/* entry */
> + __field(dev_t, dev)
> + __field(tid_t, tid)
> + __field(u64, locked_ns)
> + __field(unsigned int, nr_inodes)
> + __field(unsigned int, nr_ranges)
> + __field(int, err)
> + __field(int, snap_err)
> + ),
> +
> + TP_fast_assign(/* assign */
> + __entry->dev = sb->s_dev;
> + __entry->tid = commit_tid;
> + __entry->locked_ns = locked_ns;
> + __entry->nr_inodes = nr_inodes;
> + __entry->nr_ranges = nr_ranges;
> + __entry->err = err;
> + __entry->snap_err = snap_err;
> + ),
> +
> + TP_printk("dev %d,%d tid %u locked_ns %llu nr_inodes %u nr_ranges %u err %d snap_err %s",
> + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid,
> + __entry->locked_ns, __entry->nr_inodes, __entry->nr_ranges,
> + __entry->err, __print_symbolic(__entry->snap_err,
> + TRACE_SNAP_ERR))
> +);
> +
> +#undef EM
> +#undef EMe
> +#undef TRACE_SNAP_ERR
> +
> #define FC_REASON_NAME_STAT(reason) \
> show_fc_reason(reason), \
> __entry->fc_ineligible_rc[reason]
^ permalink raw reply
* Re: [PATCH v6 16/17] lib/bootconfig: fix sign-compare in xbc_node_compose_key_after()
From: Josh Law @ 2026-03-17 16:15 UTC (permalink / raw)
To: Steven Rostedt, Masami Hiramatsu (Google)
Cc: Andrew Morton, linux-kernel, linux-trace-kernel
In-Reply-To: <20260317121507.30735331@gandalf.local.home>
On 17 March 2026 16:15:07 GMT, Steven Rostedt <rostedt@goodmis.org> wrote:
>On Tue, 17 Mar 2026 16:55:49 +0900
>Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:
>
>> > --- a/lib/bootconfig.c
>> > +++ b/lib/bootconfig.c
>> > @@ -319,10 +319,10 @@ int __init xbc_node_compose_key_after(struct xbc_node *root,
>> > depth ? "." : "");
>> > if (ret < 0)
>> > return ret;
>> > - if (ret >= size) {
>> > + if (ret >= (int)size) {
>>
>> nit:
>>
>> if ((size_t)ret >= size) {
>>
>> because sizeof(size_t) > sizeof(int).
>
>I don't think we need to worry about this. But this does bring up an issue.
>ret comes from:
>
> ret = snprintf(buf, size, "%s%s", xbc_node_get_data(node),
> depth ? "." : "");
>
>Where size is of type size_t
>
>snprintf() takes size_t but returns int.
>
>snprintf() calls vsnprintf() which has:
>
> size_t len, pos;
>
>Where pos is incremented based on fmt, and vsnprintf() returns:
>
> return pos;
>
>Which can overflow.
>
>Now, honestly, we should never have a 2Gig string as that would likely
>cause other horrible things. Does size really need to be size_t?
>
>Perhaps we should have:
>
> if (WARN_ON_ONCE(size > MAX_INT))
> return -EINVAL;
>
>?
>
>-- Steve
Hello Steven! I made a V7 dropping that since masami nitted it anyway, and I was too busy to fix it.
V/R
If you would like to review V7, go right ahead!
^ permalink raw reply
* Re: [PATCH v6 16/17] lib/bootconfig: fix sign-compare in xbc_node_compose_key_after()
From: Steven Rostedt @ 2026-03-17 16:15 UTC (permalink / raw)
To: Masami Hiramatsu (Google)
Cc: Josh Law, Andrew Morton, linux-kernel, linux-trace-kernel
In-Reply-To: <20260317165549.99ea4171d7672f83ec3b6fc4@kernel.org>
On Tue, 17 Mar 2026 16:55:49 +0900
Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:
> > --- a/lib/bootconfig.c
> > +++ b/lib/bootconfig.c
> > @@ -319,10 +319,10 @@ int __init xbc_node_compose_key_after(struct xbc_node *root,
> > depth ? "." : "");
> > if (ret < 0)
> > return ret;
> > - if (ret >= size) {
> > + if (ret >= (int)size) {
>
> nit:
>
> if ((size_t)ret >= size) {
>
> because sizeof(size_t) > sizeof(int).
I don't think we need to worry about this. But this does bring up an issue.
ret comes from:
ret = snprintf(buf, size, "%s%s", xbc_node_get_data(node),
depth ? "." : "");
Where size is of type size_t
snprintf() takes size_t but returns int.
snprintf() calls vsnprintf() which has:
size_t len, pos;
Where pos is incremented based on fmt, and vsnprintf() returns:
return pos;
Which can overflow.
Now, honestly, we should never have a 2Gig string as that would likely
cause other horrible things. Does size really need to be size_t?
Perhaps we should have:
if (WARN_ON_ONCE(size > MAX_INT))
return -EINVAL;
?
-- Steve
^ permalink raw reply
* [PATCH v7 06/15] bootconfig: constify xbc_calc_checksum() data parameter
From: Josh Law @ 2026-03-17 16:09 UTC (permalink / raw)
To: Masami Hiramatsu, Andrew Morton
Cc: Steven Rostedt, linux-kernel, linux-trace-kernel
In-Reply-To: <20260317160916.33576-1-objecting@objecting.org>
xbc_calc_checksum() only reads the data buffer, so mark the parameter
as const void * and the internal pointer as const unsigned char *.
Signed-off-by: Josh Law <objecting@objecting.org>
---
include/linux/bootconfig.h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h
index 25df9260d206..23a96c5edcf3 100644
--- a/include/linux/bootconfig.h
+++ b/include/linux/bootconfig.h
@@ -36,9 +36,9 @@ bool __init cmdline_has_extra_options(void);
* The checksum will be used with the BOOTCONFIG_MAGIC and the size for
* embedding the bootconfig in the initrd image.
*/
-static inline __init uint32_t xbc_calc_checksum(void *data, uint32_t size)
+static inline __init uint32_t xbc_calc_checksum(const void *data, uint32_t size)
{
- unsigned char *p = data;
+ const unsigned char *p = data;
uint32_t ret = 0;
while (size--)
--
2.34.1
^ permalink raw reply related
* [PATCH v7 05/15] lib/bootconfig: drop redundant memset of xbc_nodes
From: Josh Law @ 2026-03-17 16:09 UTC (permalink / raw)
To: Masami Hiramatsu, Andrew Morton
Cc: Steven Rostedt, linux-kernel, linux-trace-kernel
In-Reply-To: <20260317160916.33576-1-objecting@objecting.org>
memblock_alloc() already returns zeroed memory, so the explicit memset
in xbc_init() is redundant. Switch the userspace xbc_alloc_mem() from
malloc() to calloc() so both paths return zeroed memory, and remove
the separate memset call.
Signed-off-by: Josh Law <objecting@objecting.org>
---
lib/bootconfig.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/lib/bootconfig.c b/lib/bootconfig.c
index 6b899e24189c..69486bd56fea 100644
--- a/lib/bootconfig.c
+++ b/lib/bootconfig.c
@@ -71,7 +71,7 @@ static inline void __init xbc_free_mem(void *addr, size_t size, bool early)
static inline void *xbc_alloc_mem(size_t size)
{
- return malloc(size);
+ return calloc(1, size);
}
static inline void xbc_free_mem(void *addr, size_t size, bool early)
@@ -982,7 +982,6 @@ int __init xbc_init(const char *data, size_t size, const char **emsg, int *epos)
_xbc_exit(true);
return -ENOMEM;
}
- memset(xbc_nodes, 0, sizeof(struct xbc_node) * XBC_NODE_MAX);
ret = xbc_parse_tree();
if (!ret)
--
2.34.1
^ permalink raw reply related
* [PATCH v7 04/15] lib/bootconfig: increment xbc_node_num after node init succeeds
From: Josh Law @ 2026-03-17 16:09 UTC (permalink / raw)
To: Masami Hiramatsu, Andrew Morton
Cc: Steven Rostedt, linux-kernel, linux-trace-kernel
In-Reply-To: <20260317160916.33576-1-objecting@objecting.org>
Move the xbc_node_num increment to after xbc_init_node() so a failed
init does not leave a partially initialized node counted in the array.
If xbc_init_node() fails on a data offset at the boundary of a
maximum-size bootconfig, the pre-incremented count causes subsequent
tree verification and traversal to consider the uninitialized node as
valid, potentially leading to an out-of-bounds read or unpredictable
boot behavior.
Signed-off-by: Josh Law <objecting@objecting.org>
---
lib/bootconfig.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/lib/bootconfig.c b/lib/bootconfig.c
index ca668ead1db6..6b899e24189c 100644
--- a/lib/bootconfig.c
+++ b/lib/bootconfig.c
@@ -429,9 +429,10 @@ static struct xbc_node * __init xbc_add_node(char *data, uint16_t flag)
if (xbc_node_num == XBC_NODE_MAX)
return NULL;
- node = &xbc_nodes[xbc_node_num++];
+ node = &xbc_nodes[xbc_node_num];
if (xbc_init_node(node, data, flag) < 0)
return NULL;
+ xbc_node_num++;
return node;
}
--
2.34.1
^ permalink raw reply related
* [PATCH v7 09/15] lib/bootconfig: check xbc_init_node() return in override path
From: Josh Law @ 2026-03-17 16:09 UTC (permalink / raw)
To: Masami Hiramatsu, Andrew Morton
Cc: Steven Rostedt, linux-kernel, linux-trace-kernel
In-Reply-To: <20260317160916.33576-1-objecting@objecting.org>
The ':=' override path in xbc_parse_kv() calls xbc_init_node() to
re-initialize an existing value node but does not check the return
value. If xbc_init_node() fails (data offset out of range), parsing
silently continues with stale node data.
Add the missing error check to match the xbc_add_node() call path
which already checks for failure.
In practice, a bootconfig using ':=' to override a value near the
32KB data limit could silently retain the old value, meaning a
security-relevant boot parameter override (e.g., a trace filter or
debug setting) would not take effect as intended.
Fixes: e5efaeb8a8f5 ("bootconfig: Support mixing a value and subkeys under a key")
Signed-off-by: Josh Law <objecting@objecting.org>
---
lib/bootconfig.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/lib/bootconfig.c b/lib/bootconfig.c
index 1adf592cc038..ecc4e8d93547 100644
--- a/lib/bootconfig.c
+++ b/lib/bootconfig.c
@@ -728,7 +728,8 @@ static int __init xbc_parse_kv(char **k, char *v, int op)
if (op == ':') {
unsigned short nidx = child->next;
- xbc_init_node(child, v, XBC_VALUE);
+ if (xbc_init_node(child, v, XBC_VALUE) < 0)
+ return xbc_parse_error("Failed to override value", v);
child->next = nidx; /* keep subkeys */
goto array;
}
--
2.34.1
^ permalink raw reply related
* [PATCH v7 08/15] lib/bootconfig: validate child node index in xbc_verify_tree()
From: Josh Law @ 2026-03-17 16:09 UTC (permalink / raw)
To: Masami Hiramatsu, Andrew Morton
Cc: Steven Rostedt, linux-kernel, linux-trace-kernel
In-Reply-To: <20260317160916.33576-1-objecting@objecting.org>
xbc_verify_tree() validates that each node's next index is within
bounds, but does not check the child index. Add the same bounds
check for the child field.
Without this check, a corrupt bootconfig that passes next-index
validation could still trigger an out-of-bounds memory access via an
invalid child index when xbc_node_get_child() is called during tree
traversal at boot time.
Signed-off-by: Josh Law <objecting@objecting.org>
---
lib/bootconfig.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/lib/bootconfig.c b/lib/bootconfig.c
index d7e2395b7e83..1adf592cc038 100644
--- a/lib/bootconfig.c
+++ b/lib/bootconfig.c
@@ -823,6 +823,10 @@ static int __init xbc_verify_tree(void)
return xbc_parse_error("No closing brace",
xbc_node_get_data(xbc_nodes + i));
}
+ if (xbc_nodes[i].child >= xbc_node_num) {
+ return xbc_parse_error("Broken child node",
+ xbc_node_get_data(xbc_nodes + i));
+ }
}
/* Key tree limitation check */
--
2.34.1
^ permalink raw reply related
* [PATCH v7 14/15] lib/bootconfig: use size_t for key length tracking in xbc_verify_tree()
From: Josh Law @ 2026-03-17 16:09 UTC (permalink / raw)
To: Masami Hiramatsu, Andrew Morton
Cc: Steven Rostedt, linux-kernel, linux-trace-kernel
In-Reply-To: <20260317160916.33576-1-objecting@objecting.org>
lib/bootconfig.c:839:24: warning: conversion from 'size_t' to 'int'
may change value [-Wconversion]
lib/bootconfig.c:860:32: warning: conversion from 'size_t' to 'int'
may change value [-Wconversion]
lib/bootconfig.c:860:29: warning: conversion to 'size_t' from 'int'
may change the sign of the result [-Wsign-conversion]
The key length variables len and wlen accumulate strlen() results but
were declared as int, causing truncation and sign-conversion warnings.
Change both to size_t to match the strlen() return type and avoid
mixed-sign arithmetic.
Signed-off-by: Josh Law <objecting@objecting.org>
---
lib/bootconfig.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/lib/bootconfig.c b/lib/bootconfig.c
index 119c3d429c1f..272d9427e879 100644
--- a/lib/bootconfig.c
+++ b/lib/bootconfig.c
@@ -803,7 +803,8 @@ static int __init xbc_close_brace(char **k, char *n)
static int __init xbc_verify_tree(void)
{
- int i, depth, len, wlen;
+ int i, depth;
+ size_t len, wlen;
struct xbc_node *n, *m;
/* Brace closing */
--
2.34.1
^ permalink raw reply related
* [PATCH v7 03/15] lib/bootconfig: fix off-by-one in xbc_verify_tree() next node check
From: Josh Law @ 2026-03-17 16:09 UTC (permalink / raw)
To: Masami Hiramatsu, Andrew Morton
Cc: Steven Rostedt, linux-kernel, linux-trace-kernel
In-Reply-To: <20260317160916.33576-1-objecting@objecting.org>
Valid node indices are 0 to xbc_node_num-1, so a next value equal to
xbc_node_num is out of bounds. Use >= instead of > to catch this.
A malformed or corrupt bootconfig could pass tree verification with
an out-of-bounds next index. On subsequent tree traversal at boot
time, xbc_node_get_next() would return a pointer past the allocated
xbc_nodes array, causing an out-of-bounds read of kernel memory.
Signed-off-by: Josh Law <objecting@objecting.org>
---
lib/bootconfig.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/lib/bootconfig.c b/lib/bootconfig.c
index d69ec95d6062..ca668ead1db6 100644
--- a/lib/bootconfig.c
+++ b/lib/bootconfig.c
@@ -816,7 +816,7 @@ static int __init xbc_verify_tree(void)
}
for (i = 0; i < xbc_node_num; i++) {
- if (xbc_nodes[i].next > xbc_node_num) {
+ if (xbc_nodes[i].next >= xbc_node_num) {
return xbc_parse_error("No closing brace",
xbc_node_get_data(xbc_nodes + i));
}
--
2.34.1
^ permalink raw reply related
* [PATCH v7 07/15] lib/bootconfig: replace linux/kernel.h with specific includes
From: Josh Law @ 2026-03-17 16:09 UTC (permalink / raw)
To: Masami Hiramatsu, Andrew Morton
Cc: Steven Rostedt, linux-kernel, linux-trace-kernel
In-Reply-To: <20260317160916.33576-1-objecting@objecting.org>
linux/kernel.h is a legacy catch-all header. Replace it with the
specific headers actually needed: linux/cache.h for SMP_CACHE_BYTES,
linux/compiler.h for unlikely(), and linux/sprintf.h for snprintf().
Signed-off-by: Josh Law <objecting@objecting.org>
---
lib/bootconfig.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/lib/bootconfig.c b/lib/bootconfig.c
index 69486bd56fea..d7e2395b7e83 100644
--- a/lib/bootconfig.c
+++ b/lib/bootconfig.c
@@ -17,7 +17,9 @@
#include <linux/bug.h>
#include <linux/ctype.h>
#include <linux/errno.h>
-#include <linux/kernel.h>
+#include <linux/cache.h>
+#include <linux/compiler.h>
+#include <linux/sprintf.h>
#include <linux/memblock.h>
#include <linux/string.h>
--
2.34.1
^ permalink raw reply related
* [PATCH v7 02/15] lib/bootconfig: narrow flag parameter type from uint32_t to uint16_t
From: Josh Law @ 2026-03-17 16:09 UTC (permalink / raw)
To: Masami Hiramatsu, Andrew Morton
Cc: Steven Rostedt, linux-kernel, linux-trace-kernel
In-Reply-To: <20260317160916.33576-1-objecting@objecting.org>
The flag parameter in the node creation helpers only ever carries
XBC_KEY (0) or XBC_VALUE (0x8000), both of which fit in uint16_t.
Using uint16_t matches the width of xbc_node.data where the flag is
ultimately stored.
Signed-off-by: Josh Law <objecting@objecting.org>
---
lib/bootconfig.c | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/lib/bootconfig.c b/lib/bootconfig.c
index d77b6533ac00..d69ec95d6062 100644
--- a/lib/bootconfig.c
+++ b/lib/bootconfig.c
@@ -408,7 +408,7 @@ const char * __init xbc_node_find_next_key_value(struct xbc_node *root,
/* XBC parse and tree build */
-static int __init xbc_init_node(struct xbc_node *node, char *data, uint32_t flag)
+static int __init xbc_init_node(struct xbc_node *node, char *data, uint16_t flag)
{
unsigned long offset = data - xbc_data;
@@ -422,7 +422,7 @@ static int __init xbc_init_node(struct xbc_node *node, char *data, uint32_t flag
return 0;
}
-static struct xbc_node * __init xbc_add_node(char *data, uint32_t flag)
+static struct xbc_node * __init xbc_add_node(char *data, uint16_t flag)
{
struct xbc_node *node;
@@ -452,7 +452,7 @@ static inline __init struct xbc_node *xbc_last_child(struct xbc_node *node)
return node;
}
-static struct xbc_node * __init __xbc_add_sibling(char *data, uint32_t flag, bool head)
+static struct xbc_node * __init __xbc_add_sibling(char *data, uint16_t flag, bool head)
{
struct xbc_node *sib, *node = xbc_add_node(data, flag);
@@ -480,17 +480,17 @@ static struct xbc_node * __init __xbc_add_sibling(char *data, uint32_t flag, boo
return node;
}
-static inline struct xbc_node * __init xbc_add_sibling(char *data, uint32_t flag)
+static inline struct xbc_node * __init xbc_add_sibling(char *data, uint16_t flag)
{
return __xbc_add_sibling(data, flag, false);
}
-static inline struct xbc_node * __init xbc_add_head_sibling(char *data, uint32_t flag)
+static inline struct xbc_node * __init xbc_add_head_sibling(char *data, uint16_t flag)
{
return __xbc_add_sibling(data, flag, true);
}
-static inline __init struct xbc_node *xbc_add_child(char *data, uint32_t flag)
+static inline __init struct xbc_node *xbc_add_child(char *data, uint16_t flag)
{
struct xbc_node *node = xbc_add_sibling(data, flag);
--
2.34.1
^ permalink raw reply related
* [PATCH v7 10/15] tools/bootconfig: fix fd leak in load_xbc_file() on fstat failure
From: Josh Law @ 2026-03-17 16:09 UTC (permalink / raw)
To: Masami Hiramatsu, Andrew Morton
Cc: Steven Rostedt, linux-kernel, linux-trace-kernel
In-Reply-To: <20260317160916.33576-1-objecting@objecting.org>
If fstat() fails after open() succeeds, the function returns without
closing the file descriptor. Also preserve errno across close(), since
close() may overwrite it before the error is returned.
Fixes: 950313ebf79c ("tools: bootconfig: Add bootconfig command")
Signed-off-by: Josh Law <objecting@objecting.org>
---
tools/bootconfig/main.c | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/tools/bootconfig/main.c b/tools/bootconfig/main.c
index 55d59ed507d5..643f707b8f1d 100644
--- a/tools/bootconfig/main.c
+++ b/tools/bootconfig/main.c
@@ -162,8 +162,11 @@ static int load_xbc_file(const char *path, char **buf)
if (fd < 0)
return -errno;
ret = fstat(fd, &stat);
- if (ret < 0)
- return -errno;
+ if (ret < 0) {
+ ret = -errno;
+ close(fd);
+ return ret;
+ }
ret = load_xbc_fd(fd, buf, stat.st_size);
--
2.34.1
^ permalink raw reply related
* [PATCH v7 00/15] bootconfig: fixes, cleanups, and modernization
From: Josh Law @ 2026-03-17 16:09 UTC (permalink / raw)
To: Masami Hiramatsu, Andrew Morton
Cc: Steven Rostedt, linux-kernel, linux-trace-kernel
This series addresses a collection of issues found during a review of
lib/bootconfig.c, include/linux/bootconfig.h, and tools/bootconfig,
ranging from off-by-one errors and unchecked return values to coding
style, signedness/type cleanup, and API modernization.
Changes since v6:
- Dropped "add missing __init annotations to static helpers"
(v6 patch 1).
- Dropped "fix sign-compare in xbc_node_compose_key_after()"
(v6 patch 16).
- Updated "fix fd leak in load_xbc_file() on fstat failure" to save
errno before close(), since close() may overwrite it before the
error is returned (patch 10).
- Updated "fix signed comparison in xbc_node_get_data()" to use
size_t for the local offset variable, matching the warning and
xbc_data_size (patch 11).
- Updated "use signed type for offset in xbc_init_node()" to use a
signed long and explicitly check offset < 0 in WARN_ON(), making
the pre-base pointer case explicit instead of relying on unsigned
wraparound (patch 13).
Changes since v5:
- Folded typo fixes, kerneldoc blank line, and inconsistent bracing
patches (v5 02-05, 07) into a single patch (patch 1).
- Dropped "use __packed macro for struct xbc_node" (v5 11) and
"add __packed definition to tools/bootconfig shim header" (v5 14)
per review feedback.
- Added Fixes: tag to "check xbc_init_node() return in override
path" (patch 9).
- Added Fixes: tag to "fix fd leak in load_xbc_file() on fstat
failure" (patch 10).
Changes since v4:
- Added six follow-up patches found via static analysis with strict
GCC warnings (patches 11-15, plus the now-dropped v6 patch 16).
- Added "fix signed comparison in xbc_node_get_data()" to match the
local offset type to xbc_data_size and eliminate the sign-compare
warning (patch 11).
- Added "use size_t for strlen result in xbc_node_match_prefix()"
and "use size_t for key length tracking in xbc_verify_tree()" to
match strlen() return types (patches 12, 14).
- Added "use signed type for offset in xbc_init_node()" to make the
offset bounds check explicit and avoid sign-conversion warnings
from pointer subtraction (patch 13).
- Added "change xbc_node_index() return type to uint16_t" to match
the 16-bit storage fields and XBC_NODE_MAX bounds (patch 15).
Changes since v3:
- Added commit descriptions to all patches that were missing them.
- Added real-world impact statements to all bug-fix patches.
Changes since v2:
- Added "validate child node index in xbc_verify_tree()" (patch 8).
- Added "check xbc_init_node() return in override path" (patch 9).
- Added "fix fd leak in load_xbc_file() on fstat failure" (patch 10).
Changes since v1:
- Dropped "return empty string instead of NULL from
xbc_node_get_data()" -- returning "" causes false matches in
xbc_node_match_prefix() because strncmp(..., "", 0) always
returns 0.
Bug fixes:
- Fix off-by-one in xbc_verify_tree() where a next-node index equal
to xbc_node_num passes the bounds check despite being out of range;
a malformed bootconfig could cause an out-of-bounds read of kernel
memory during tree traversal at boot time (patch 3).
- Move xbc_node_num increment to after xbc_init_node() validation so
a failed init does not leave a partially initialized node counted
in the array; on a maximum-size bootconfig, the uninitialized node
could be traversed leading to unpredictable boot behavior (patch 4).
- Validate child node indices in xbc_verify_tree() alongside the
existing next-node check; without this, a corrupt bootconfig could
trigger an out-of-bounds memory access via an invalid child index
during tree traversal (patch 8).
- Check xbc_init_node() return value in the ':=' override path; a
bootconfig using ':=' near the 32KB data limit could silently
retain the old value, meaning a security-relevant boot parameter
override would not take effect (patch 9).
- Fix file descriptor leak in tools/bootconfig load_xbc_file() when
fstat() fails, and preserve errno across close() on that error path
(patch 10).
Correctness:
- Narrow the flag parameter in node creation helpers from uint32_t to
uint16_t to match the xbc_node.data field width (patch 2).
- Constify the xbc_calc_checksum() data parameter since it only reads
the buffer (patch 6).
- Fix strict-GCC signedness and narrowing warnings by aligning local
types with strlen() APIs and the node index/data storage in
xbc_node_get_data(), xbc_node_match_prefix(), xbc_init_node(),
xbc_verify_tree(), and xbc_node_index() (patches 11-15).
Cleanups:
- Fix comment typos, missing blank line before kerneldoc, and
inconsistent if/else bracing (patch 1).
- Drop redundant memset after memblock_alloc which already returns
zeroed memory; switch the userspace path from malloc to calloc to
match (patch 5).
Modernization:
- Replace the catch-all linux/kernel.h include with the specific
headers needed: linux/cache.h, linux/compiler.h, and
linux/sprintf.h (patch 7).
Build-tested with both the in-kernel build (lib/bootconfig.o,
init/main.o) and the userspace tools/bootconfig build. All 70
tools/bootconfig test cases pass.
Josh Law (15):
lib/bootconfig: clean up comment typos and bracing
lib/bootconfig: narrow flag parameter type from uint32_t to uint16_t
lib/bootconfig: fix off-by-one in xbc_verify_tree() next node check
lib/bootconfig: increment xbc_node_num after node init succeeds
lib/bootconfig: drop redundant memset of xbc_nodes
bootconfig: constify xbc_calc_checksum() data parameter
lib/bootconfig: replace linux/kernel.h with specific includes
lib/bootconfig: validate child node index in xbc_verify_tree()
lib/bootconfig: check xbc_init_node() return in override path
tools/bootconfig: fix fd leak in load_xbc_file() on fstat failure
lib/bootconfig: fix signed comparison in xbc_node_get_data()
lib/bootconfig: use size_t for strlen result in
xbc_node_match_prefix()
lib/bootconfig: use signed type for offset in xbc_init_node()
lib/bootconfig: use size_t for key length tracking in
xbc_verify_tree()
lib/bootconfig: change xbc_node_index() return type to uint16_t
include/linux/bootconfig.h | 6 ++--
lib/bootconfig.c | 65 ++++++++++++++++++++++----------------
tools/bootconfig/main.c | 7 ++--
3 files changed, 46 insertions(+), 32 deletions(-)
--
2.34.1
^ permalink raw reply
* [PATCH v7 15/15] lib/bootconfig: change xbc_node_index() return type to uint16_t
From: Josh Law @ 2026-03-17 16:09 UTC (permalink / raw)
To: Masami Hiramatsu, Andrew Morton
Cc: Steven Rostedt, linux-kernel, linux-trace-kernel
In-Reply-To: <20260317160916.33576-1-objecting@objecting.org>
lib/bootconfig.c:136:21: warning: conversion from 'long int' to
'int' may change value [-Wconversion]
lib/bootconfig.c:308:33: warning: conversion from 'int' to 'uint16_t'
may change value [-Wconversion]
lib/bootconfig.c:467:37: warning: conversion from 'int' to 'uint16_t'
may change value [-Wconversion]
lib/bootconfig.c:469:40: warning: conversion from 'int' to 'uint16_t'
may change value [-Wconversion]
lib/bootconfig.c:472:54: warning: conversion from 'int' to 'uint16_t'
may change value [-Wconversion]
lib/bootconfig.c:476:45: warning: conversion from 'int' to 'uint16_t'
may change value [-Wconversion]
xbc_node_index() returns the position of a node in the xbc_nodes array,
which has at most XBC_NODE_MAX (8192) entries, well within uint16_t
range. Every caller stores the result in a uint16_t field (node->parent,
node->child, node->next, or the keys[] array in compose_key_after), so
the int return type causes narrowing warnings at all six call sites.
Change the return type to uint16_t and add an explicit cast on the
pointer subtraction to match the storage width and eliminate the
warnings.
Signed-off-by: Josh Law <objecting@objecting.org>
---
include/linux/bootconfig.h | 2 +-
lib/bootconfig.c | 4 ++--
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h
index 23a96c5edcf3..692a5acc2ffc 100644
--- a/include/linux/bootconfig.h
+++ b/include/linux/bootconfig.h
@@ -66,7 +66,7 @@ struct xbc_node {
/* Node tree access raw APIs */
struct xbc_node * __init xbc_root_node(void);
-int __init xbc_node_index(struct xbc_node *node);
+uint16_t __init xbc_node_index(struct xbc_node *node);
struct xbc_node * __init xbc_node_get_parent(struct xbc_node *node);
struct xbc_node * __init xbc_node_get_child(struct xbc_node *node);
struct xbc_node * __init xbc_node_get_next(struct xbc_node *node);
diff --git a/lib/bootconfig.c b/lib/bootconfig.c
index 272d9427e879..57f07e33868e 100644
--- a/lib/bootconfig.c
+++ b/lib/bootconfig.c
@@ -131,9 +131,9 @@ struct xbc_node * __init xbc_root_node(void)
*
* Return the index number of @node in XBC node list.
*/
-int __init xbc_node_index(struct xbc_node *node)
+uint16_t __init xbc_node_index(struct xbc_node *node)
{
- return node - &xbc_nodes[0];
+ return (uint16_t)(node - &xbc_nodes[0]);
}
/**
--
2.34.1
^ permalink raw reply related
* [PATCH v7 01/15] lib/bootconfig: clean up comment typos and bracing
From: Josh Law @ 2026-03-17 16:09 UTC (permalink / raw)
To: Masami Hiramatsu, Andrew Morton
Cc: Steven Rostedt, linux-kernel, linux-trace-kernel
In-Reply-To: <20260317160916.33576-1-objecting@objecting.org>
Fixes kerneldoc typos ("initiized" and "uder") and adds a missing blank line.
Also fixes inconsistent if/else bracing in __xbc_add_key() and elsewhere.
Signed-off-by: Josh Law <objecting@objecting.org>
---
lib/bootconfig.c | 19 +++++++++++--------
1 file changed, 11 insertions(+), 8 deletions(-)
diff --git a/lib/bootconfig.c b/lib/bootconfig.c
index b0ef1e74e98a..d77b6533ac00 100644
--- a/lib/bootconfig.c
+++ b/lib/bootconfig.c
@@ -79,6 +79,7 @@ static inline void xbc_free_mem(void *addr, size_t size, bool early)
free(addr);
}
#endif
+
/**
* xbc_get_info() - Get the information of loaded boot config
* @node_size: A pointer to store the number of nodes.
@@ -112,7 +113,7 @@ static int __init xbc_parse_error(const char *msg, const char *p)
* xbc_root_node() - Get the root node of extended boot config
*
* Return the address of root node of extended boot config. If the
- * extended boot config is not initiized, return NULL.
+ * extended boot config is not initialized, return NULL.
*/
struct xbc_node * __init xbc_root_node(void)
{
@@ -364,7 +365,7 @@ struct xbc_node * __init xbc_node_find_next_leaf(struct xbc_node *root,
node = xbc_node_get_parent(node);
if (node == root)
return NULL;
- /* User passed a node which is not uder parent */
+ /* User passed a node which is not under parent */
if (WARN_ON(!node))
return NULL;
}
@@ -472,8 +473,9 @@ static struct xbc_node * __init __xbc_add_sibling(char *data, uint32_t flag, boo
sib->next = xbc_node_index(node);
}
}
- } else
+ } else {
xbc_parse_error("Too many nodes", data);
+ }
return node;
}
@@ -655,9 +657,9 @@ static int __init __xbc_add_key(char *k)
if (unlikely(xbc_node_num == 0))
goto add_node;
- if (!last_parent) /* the first level */
+ if (!last_parent) { /* the first level */
node = find_match_node(xbc_nodes, k);
- else {
+ } else {
child = xbc_node_get_child(last_parent);
/* Since the value node is the first child, skip it. */
if (child && xbc_node_is_value(child))
@@ -665,9 +667,9 @@ static int __init __xbc_add_key(char *k)
node = find_match_node(child, k);
}
- if (node)
+ if (node) {
last_parent = node;
- else {
+ } else {
add_node:
node = xbc_add_child(k, XBC_KEY);
if (!node)
@@ -991,8 +993,9 @@ int __init xbc_init(const char *data, size_t size, const char **emsg, int *epos)
if (emsg)
*emsg = xbc_err_msg;
_xbc_exit(true);
- } else
+ } else {
ret = xbc_node_num;
+ }
return ret;
}
--
2.34.1
^ permalink raw reply related
* [PATCH v7 11/15] lib/bootconfig: fix signed comparison in xbc_node_get_data()
From: Josh Law @ 2026-03-17 16:09 UTC (permalink / raw)
To: Masami Hiramatsu, Andrew Morton
Cc: Steven Rostedt, linux-kernel, linux-trace-kernel
In-Reply-To: <20260317160916.33576-1-objecting@objecting.org>
lib/bootconfig.c:188:28: warning: comparison of integer expressions
of different signedness: 'int' and 'size_t' [-Wsign-compare]
The local variable 'offset' is declared as int, but xbc_data_size is
size_t. Change the type to size_t to match and eliminate the warning.
Signed-off-by: Josh Law <objecting@objecting.org>
---
lib/bootconfig.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/lib/bootconfig.c b/lib/bootconfig.c
index ecc4e8d93547..dd639046912c 100644
--- a/lib/bootconfig.c
+++ b/lib/bootconfig.c
@@ -183,7 +183,7 @@ struct xbc_node * __init xbc_node_get_next(struct xbc_node *node)
*/
const char * __init xbc_node_get_data(struct xbc_node *node)
{
- int offset = node->data & ~XBC_VALUE;
+ size_t offset = node->data & ~XBC_VALUE;
if (WARN_ON(offset >= xbc_data_size))
return NULL;
--
2.34.1
^ permalink raw reply related
* [PATCH v7 13/15] lib/bootconfig: use signed type for offset in xbc_init_node()
From: Josh Law @ 2026-03-17 16:09 UTC (permalink / raw)
To: Masami Hiramatsu, Andrew Morton
Cc: Steven Rostedt, linux-kernel, linux-trace-kernel
In-Reply-To: <20260317160916.33576-1-objecting@objecting.org>
lib/bootconfig.c:415:32: warning: conversion to 'long unsigned int'
from 'long int' may change the sign of the result [-Wsign-conversion]
Pointer subtraction yields ptrdiff_t (signed), which was stored in
unsigned long. The original unsigned type implicitly caught a negative
offset (data < xbc_data) because the wrapped value would exceed
XBC_DATA_MAX. Make this intent explicit by using a signed long and
adding an offset < 0 check to the WARN_ON condition.
Signed-off-by: Josh Law <objecting@objecting.org>
---
lib/bootconfig.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/lib/bootconfig.c b/lib/bootconfig.c
index 3c5b6ad32a54..119c3d429c1f 100644
--- a/lib/bootconfig.c
+++ b/lib/bootconfig.c
@@ -412,9 +412,9 @@ const char * __init xbc_node_find_next_key_value(struct xbc_node *root,
static int __init xbc_init_node(struct xbc_node *node, char *data, uint16_t flag)
{
- unsigned long offset = data - xbc_data;
+ long offset = data - xbc_data;
- if (WARN_ON(offset >= XBC_DATA_MAX))
+ if (WARN_ON(offset < 0 || offset >= XBC_DATA_MAX))
return -EINVAL;
node->data = (uint16_t)offset | flag;
--
2.34.1
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox