* [PATCH mm-unstable v19 06/14] mm/khugepaged: generalize collapse_huge_page for mTHP collapse
From: Nico Pache @ 2026-06-05 16:14 UTC (permalink / raw)
To: linux-doc, linux-kernel, linux-mm, linux-trace-kernel
Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
byungchul, catalin.marinas, cl, corbet, dave.hansen, david,
dev.jain, gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
joshua.hahnjy, kas, lance.yang, liam, ljs, mathieu.desnoyers,
matthew.brost, mhiramat, mhocko, npache, peterx, pfalcato,
rakie.kim, raquini, rdunlap, richard.weiyang, rientjes, rostedt,
rppt, ryan.roberts, shivankg, sunnanyong, surenb,
thomas.hellstrom, tiwai, usamaarif642, vbabka, vishal.moola,
wangkefeng.wang, will, willy, yang, ying.huang, ziy, zokeefe
In-Reply-To: <20260605161422.213817-1-npache@redhat.com>
Pass an order to collapse_huge_page to support collapsing anon memory to
arbitrary orders within a PMD. order indicates what mTHP size we are
attempting to collapse to.
For non-PMD collapse we must leave the anon VMA write locked until after
we collapse the mTHP-- in the PMD case all the pages are isolated, but in
the mTHP case this is not true, and we must keep the lock to prevent
access/changes to the page tables. This can happen if the rmap walkers hit
a pmd_none while the PMD entry is currently unavailable due to being
temporarily removed during the collapse phase.
To properly establish the page table hierarchy without violating any
expectations from certain architectures (e.g. MIPS), we must make sure to
have the PMD reinstalled before the PTEs, and hold both PTE/PMD locks
before calling update_mmu_cache_range() (if they are distinct locks).
Signed-off-by: Nico Pache <npache@redhat.com>
---
mm/khugepaged.c | 105 ++++++++++++++++++++++++++++++------------------
1 file changed, 67 insertions(+), 38 deletions(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index e4b2ca77ecf6..c2769d82a719 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1228,34 +1228,36 @@ static enum scan_result alloc_charge_folio(struct folio **foliop, struct mm_stru
* while allocating a THP, as that could trigger direct reclaim/compaction.
* Note that the VMA must be rechecked after grabbing the mmap_lock again.
*/
-static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long address,
- int referenced, int unmapped, struct collapse_control *cc)
+static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long start_addr,
+ int referenced, int unmapped, struct collapse_control *cc,
+ unsigned int order)
{
+ const unsigned long pmd_addr = start_addr & HPAGE_PMD_MASK;
+ const unsigned long end_addr = start_addr + (PAGE_SIZE << order);
LIST_HEAD(compound_pagelist);
pmd_t *pmd, _pmd;
- pte_t *pte;
+ pte_t *pte = NULL;
pgtable_t pgtable;
struct folio *folio;
spinlock_t *pmd_ptl, *pte_ptl;
enum scan_result result = SCAN_FAIL;
struct vm_area_struct *vma;
struct mmu_notifier_range range;
+ bool anon_vma_locked = false;
- VM_BUG_ON(address & ~HPAGE_PMD_MASK);
-
- result = alloc_charge_folio(&folio, mm, cc, HPAGE_PMD_ORDER);
+ result = alloc_charge_folio(&folio, mm, cc, order);
if (result != SCAN_SUCCEED)
goto out_nolock;
mmap_read_lock(mm);
- result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
- HPAGE_PMD_ORDER);
+ result = hugepage_vma_revalidate(mm, pmd_addr, /*expect_anon=*/ true,
+ &vma, cc, order);
if (result != SCAN_SUCCEED) {
mmap_read_unlock(mm);
goto out_nolock;
}
- result = find_pmd_or_thp_or_none(mm, address, &pmd);
+ result = find_pmd_or_thp_or_none(mm, pmd_addr, &pmd);
if (result != SCAN_SUCCEED) {
mmap_read_unlock(mm);
goto out_nolock;
@@ -1267,8 +1269,8 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
* released when it fails. So we jump out_nolock directly in
* that case. Continuing to collapse causes inconsistency.
*/
- result = __collapse_huge_page_swapin(mm, vma, address, pmd,
- referenced, HPAGE_PMD_ORDER);
+ result = __collapse_huge_page_swapin(mm, vma, start_addr, pmd,
+ referenced, order);
if (result != SCAN_SUCCEED)
goto out_nolock;
}
@@ -1283,20 +1285,28 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
* mmap_lock.
*/
mmap_write_lock(mm);
- result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
- HPAGE_PMD_ORDER);
+ result = hugepage_vma_revalidate(mm, pmd_addr, /*expect_anon=*/ true,
+ &vma, cc, order);
if (result != SCAN_SUCCEED)
goto out_up_write;
/* check if the pmd is still valid */
vma_start_write(vma);
- result = check_pmd_still_valid(mm, address, pmd);
+ result = check_pmd_still_valid(mm, pmd_addr, pmd);
if (result != SCAN_SUCCEED)
goto out_up_write;
anon_vma_lock_write(vma->anon_vma);
+ anon_vma_locked = true;
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
- address + HPAGE_PMD_SIZE);
+ /*
+ * Only notify about the PTE range we will actually modify. While we
+ * temporarily unmap the whole PTE table for mTHP collapse, we'll remap
+ * it later, leaving other PTEs effectively unmodified. The locks we
+ * hold prevent anybody from stumbling over such temporarily unmapped
+ * PTE tables.
+ */
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, start_addr,
+ end_addr);
mmu_notifier_invalidate_range_start(&range);
pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
@@ -1308,26 +1318,23 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
* Parallel GUP-fast is fine since GUP-fast will back off when
* it detects PMD is changed.
*/
- _pmd = pmdp_collapse_flush(vma, address, pmd);
+ _pmd = pmdp_collapse_flush(vma, pmd_addr, pmd);
spin_unlock(pmd_ptl);
mmu_notifier_invalidate_range_end(&range);
tlb_remove_table_sync_one();
- pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
+ pte = pte_offset_map_lock(mm, &_pmd, start_addr, &pte_ptl);
if (pte) {
- result = __collapse_huge_page_isolate(vma, address, pte, cc,
- HPAGE_PMD_ORDER,
- &compound_pagelist);
+ result = __collapse_huge_page_isolate(vma, start_addr, pte, cc,
+ order, &compound_pagelist);
spin_unlock(pte_ptl);
} else {
result = SCAN_NO_PTE_TABLE;
}
if (unlikely(result != SCAN_SUCCEED)) {
- if (pte)
- pte_unmap(pte);
spin_lock(pmd_ptl);
- BUG_ON(!pmd_none(*pmd));
+ VM_WARN_ON_ONCE(!pmd_none(*pmd));
/*
* We can only use set_pmd_at when establishing
* hugepmds and never for establishing regular pmds that
@@ -1335,21 +1342,24 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
*/
pmd_populate(mm, pmd, pmd_pgtable(_pmd));
spin_unlock(pmd_ptl);
- anon_vma_unlock_write(vma->anon_vma);
goto out_up_write;
}
/*
- * All pages are isolated and locked so anon_vma rmap
- * can't run anymore.
+ * For PMD collapse all pages are isolated and locked so anon_vma
+ * rmap can't run anymore. For mTHP collapse the PMD entry has been
+ * removed and not all pages are isolated and locked, so we must hold
+ * the lock to prevent neighboring folios from attempting to access
+ * this PMD until it's reinstalled.
*/
- anon_vma_unlock_write(vma->anon_vma);
+ if (is_pmd_order(order)) {
+ anon_vma_unlock_write(vma->anon_vma);
+ anon_vma_locked = false;
+ }
result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
- vma, address, pte_ptl,
- HPAGE_PMD_ORDER,
- &compound_pagelist);
- pte_unmap(pte);
+ vma, start_addr, pte_ptl,
+ order, &compound_pagelist);
if (unlikely(result != SCAN_SUCCEED))
goto out_up_write;
@@ -1359,18 +1369,37 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
* write.
*/
__folio_mark_uptodate(folio);
- pgtable = pmd_pgtable(_pmd);
-
spin_lock(pmd_ptl);
- BUG_ON(!pmd_none(*pmd));
- pgtable_trans_huge_deposit(mm, pmd, pgtable);
- map_anon_folio_pmd_nopf(folio, pmd, vma, address);
+ VM_WARN_ON_ONCE(!pmd_none(*pmd));
+ if (is_pmd_order(order)) {
+ pgtable = pmd_pgtable(_pmd);
+ pgtable_trans_huge_deposit(mm, pmd, pgtable);
+ map_anon_folio_pmd_nopf(folio, pmd, vma, pmd_addr);
+ } else {
+ /*
+ * Some architectures (e.g. MIPS) walk the live page table in
+ * their implementation. update_mmu_cache_range() must be called
+ * with a valid page table hierarchy and the PTE lock held.
+ * Acquire it nested inside pmd_ptl when they are distinct locks.
+ */
+ if (pte_ptl != pmd_ptl)
+ spin_lock_nested(pte_ptl, SINGLE_DEPTH_NESTING);
+ pmd_populate(mm, pmd, pmd_pgtable(_pmd));
+ map_anon_folio_pte_nopf(folio, pte, vma, start_addr,
+ /*uffd_wp=*/ false);
+ if (pte_ptl != pmd_ptl)
+ spin_unlock(pte_ptl);
+ }
spin_unlock(pmd_ptl);
folio = NULL;
result = SCAN_SUCCEED;
out_up_write:
+ if (anon_vma_locked)
+ anon_vma_unlock_write(vma->anon_vma);
+ if (pte)
+ pte_unmap(pte);
mmap_write_unlock(mm);
out_nolock:
if (folio)
@@ -1550,7 +1579,7 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
/* collapse_huge_page expects the lock to be dropped before calling */
mmap_read_unlock(mm);
result = collapse_huge_page(mm, start_addr, referenced,
- unmapped, cc);
+ unmapped, cc, HPAGE_PMD_ORDER);
/* collapse_huge_page will return with the mmap_lock released */
*lock_dropped = true;
}
--
2.54.0
^ permalink raw reply related
* [PATCH mm-unstable v19 07/14] mm/khugepaged: skip collapsing mTHP to smaller orders
From: Nico Pache @ 2026-06-05 16:14 UTC (permalink / raw)
To: linux-doc, linux-kernel, linux-mm, linux-trace-kernel
Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
byungchul, catalin.marinas, cl, corbet, dave.hansen, david,
dev.jain, gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
joshua.hahnjy, kas, lance.yang, liam, ljs, mathieu.desnoyers,
matthew.brost, mhiramat, mhocko, npache, peterx, pfalcato,
rakie.kim, raquini, rdunlap, richard.weiyang, rientjes, rostedt,
rppt, ryan.roberts, shivankg, sunnanyong, surenb,
thomas.hellstrom, tiwai, usamaarif642, vbabka, vishal.moola,
wangkefeng.wang, will, willy, yang, ying.huang, ziy, zokeefe,
Usama Arif
In-Reply-To: <20260605161422.213817-1-npache@redhat.com>
khugepaged may try to collapse a mTHP to a folio of equal or smaller size,
possibly resulting in a partially mapped source folio, which is undesired.
Skip these cases until we have a way to check if it's OK to collapse to a
smaller mTHP size (like in the case of a partially mapped folio). This
check is not done during the scan phase as the current collapse order is
unknown at that time.
This patch is inspired by Dev Jain's work on khugepaged mTHP support [1].
[1] https://lore.kernel.org/lkml/20241216165105.56185-11-dev.jain@arm.com/
Reviewed-by: Lorenzo Stoakes <ljs@kernel.org>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Acked-by: David Hildenbrand (arm) <david@kernel.org>
Acked-by: Usama Arif <usama.arif@linux.dev>
Co-developed-by: Dev Jain <dev.jain@arm.com>
Signed-off-by: Dev Jain <dev.jain@arm.com>
Signed-off-by: Nico Pache <npache@redhat.com>
---
mm/khugepaged.c | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index c2769d82a719..191e529c185c 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -697,6 +697,14 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
goto out;
}
}
+ /*
+ * TODO: In some cases of partially-mapped folios, we'd actually
+ * want to collapse.
+ */
+ if (!is_pmd_order(order) && folio_order(folio) >= order) {
+ result = SCAN_PTE_MAPPED_HUGEPAGE;
+ goto out;
+ }
if (folio_test_large(folio)) {
struct folio *f;
--
2.54.0
^ permalink raw reply related
* [PATCH mm-unstable v19 08/14] mm/khugepaged: add per-order mTHP collapse failure statistics
From: Nico Pache @ 2026-06-05 16:14 UTC (permalink / raw)
To: linux-doc, linux-kernel, linux-mm, linux-trace-kernel
Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
byungchul, catalin.marinas, cl, corbet, dave.hansen, david,
dev.jain, gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
joshua.hahnjy, kas, lance.yang, liam, ljs, mathieu.desnoyers,
matthew.brost, mhiramat, mhocko, npache, peterx, pfalcato,
rakie.kim, raquini, rdunlap, richard.weiyang, rientjes, rostedt,
rppt, ryan.roberts, shivankg, sunnanyong, surenb,
thomas.hellstrom, tiwai, usamaarif642, vbabka, vishal.moola,
wangkefeng.wang, will, willy, yang, ying.huang, ziy, zokeefe
In-Reply-To: <20260605161422.213817-1-npache@redhat.com>
Add three new mTHP statistics to track collapse failures for different
orders when encountering swap PTEs, excessive none PTEs, and shared PTEs:
- collapse_exceed_swap_pte: Counts when mTHP collapse fails due to
encountering a swap PTE.
- collapse_exceed_none_pte: Counts when mTHP collapse fails due to
exceeding the none PTE threshold for the given order.
- collapse_exceed_shared_pte: Counts when mTHP collapse fails due to
encountering a shared PTE.
These statistics complement the existing THP_SCAN_EXCEED_* events by
providing per-order granularity for mTHP collapse attempts. The stats are
exposed via sysfs under
`/sys/kernel/mm/transparent_hugepage/hugepages-*/stats/` for each
supported hugepage size.
As we currently do not support collapsing mTHPs that contain a swap or
shared entry, those statistics keep track of how often we are
encountering failed mTHP collapses due to these restrictions.
We will add support for mTHP collapse for anonymous pages next; let's also
track when this happens at the PMD level within the per-mTHP stats.
Reviewed-by: Lorenzo Stoakes <ljs@kernel.org>
Acked-by: David Hildenbrand (Arm) <david@kernel.org>
Signed-off-by: Nico Pache <npache@redhat.com>
---
Documentation/admin-guide/mm/transhuge.rst | 14 ++++++++++++++
include/linux/huge_mm.h | 3 +++
mm/huge_memory.c | 7 +++++++
mm/khugepaged.c | 15 +++++++++++++--
4 files changed, 37 insertions(+), 2 deletions(-)
diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index a74844e01f1e..b98e18c80185 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -714,6 +714,20 @@ nr_anon_partially_mapped
an anonymous THP as "partially mapped" and count it here, even though it
is not actually partially mapped anymore.
+collapse_exceed_none_pte
+ The number of collapse attempts that failed due to exceeding the
+ max_ptes_none threshold.
+
+collapse_exceed_swap_pte
+ The number of collapse attempts that failed due to exceeding the
+ max_ptes_swap threshold. For non-PMD orders this occurs if a mTHP range
+ contains at least one swap PTE.
+
+collapse_exceed_shared_pte
+ The number of collapse attempts that failed due to exceeding the
+ max_ptes_shared threshold. For non-PMD orders this occurs if a mTHP range
+ contains at least one shared PTE.
+
As the system ages, allocating huge pages may be expensive as the
system uses memory compaction to copy data around memory to free a
huge page for use. There are some counters in ``/proc/vmstat`` to help
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 443852423790..148109ebd08a 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -144,6 +144,9 @@ enum mthp_stat_item {
MTHP_STAT_SPLIT_DEFERRED,
MTHP_STAT_NR_ANON,
MTHP_STAT_NR_ANON_PARTIALLY_MAPPED,
+ MTHP_STAT_COLLAPSE_EXCEED_SWAP,
+ MTHP_STAT_COLLAPSE_EXCEED_NONE,
+ MTHP_STAT_COLLAPSE_EXCEED_SHARED,
__MTHP_STAT_COUNT
};
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index eea83da9114a..222e421d9e8e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -717,6 +717,10 @@ DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED);
DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED);
DEFINE_MTHP_STAT_ATTR(nr_anon, MTHP_STAT_NR_ANON);
DEFINE_MTHP_STAT_ATTR(nr_anon_partially_mapped, MTHP_STAT_NR_ANON_PARTIALLY_MAPPED);
+DEFINE_MTHP_STAT_ATTR(collapse_exceed_swap_pte, MTHP_STAT_COLLAPSE_EXCEED_SWAP);
+DEFINE_MTHP_STAT_ATTR(collapse_exceed_none_pte, MTHP_STAT_COLLAPSE_EXCEED_NONE);
+DEFINE_MTHP_STAT_ATTR(collapse_exceed_shared_pte, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
+
static struct attribute *anon_stats_attrs[] = {
&anon_fault_alloc_attr.attr,
@@ -733,6 +737,9 @@ static struct attribute *anon_stats_attrs[] = {
&split_deferred_attr.attr,
&nr_anon_attr.attr,
&nr_anon_partially_mapped_attr.attr,
+ &collapse_exceed_swap_pte_attr.attr,
+ &collapse_exceed_none_pte_attr.attr,
+ &collapse_exceed_shared_pte_attr.attr,
NULL,
};
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 191e529c185c..ac4731addafa 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -651,7 +651,9 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
if (pte_none_or_zero(pteval)) {
if (++none_or_zero > max_ptes_none) {
result = SCAN_EXCEED_NONE_PTE;
- count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
+ if (is_pmd_order(order))
+ count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
+ count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_NONE);
goto out;
}
continue;
@@ -693,7 +695,9 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
*/
if (++shared > max_ptes_shared) {
result = SCAN_EXCEED_SHARED_PTE;
- count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
+ if (is_pmd_order(order))
+ count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
+ count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
goto out;
}
}
@@ -1152,6 +1156,7 @@ static enum scan_result __collapse_huge_page_swapin(struct mm_struct *mm,
* range.
*/
if (!is_pmd_order(order)) {
+ count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SWAP);
pte_unmap(pte);
mmap_read_unlock(mm);
result = SCAN_EXCEED_SWAP_PTE;
@@ -1459,6 +1464,8 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
if (++none_or_zero > max_ptes_none) {
result = SCAN_EXCEED_NONE_PTE;
count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
+ count_mthp_stat(HPAGE_PMD_ORDER,
+ MTHP_STAT_COLLAPSE_EXCEED_NONE);
goto out_unmap;
}
continue;
@@ -1467,6 +1474,8 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
if (++unmapped > max_ptes_swap) {
result = SCAN_EXCEED_SWAP_PTE;
count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
+ count_mthp_stat(HPAGE_PMD_ORDER,
+ MTHP_STAT_COLLAPSE_EXCEED_SWAP);
goto out_unmap;
}
/*
@@ -1524,6 +1533,8 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
if (++shared > max_ptes_shared) {
result = SCAN_EXCEED_SHARED_PTE;
count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
+ count_mthp_stat(HPAGE_PMD_ORDER,
+ MTHP_STAT_COLLAPSE_EXCEED_SHARED);
goto out_unmap;
}
}
--
2.54.0
^ permalink raw reply related
* [PATCH mm-unstable v19 09/14] mm/khugepaged: improve tracepoints for mTHP orders
From: Nico Pache @ 2026-06-05 16:14 UTC (permalink / raw)
To: linux-doc, linux-kernel, linux-mm, linux-trace-kernel
Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
byungchul, catalin.marinas, cl, corbet, dave.hansen, david,
dev.jain, gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
joshua.hahnjy, kas, lance.yang, liam, ljs, mathieu.desnoyers,
matthew.brost, mhiramat, mhocko, npache, peterx, pfalcato,
rakie.kim, raquini, rdunlap, richard.weiyang, rientjes, rostedt,
rppt, ryan.roberts, shivankg, sunnanyong, surenb,
thomas.hellstrom, tiwai, usamaarif642, vbabka, vishal.moola,
wangkefeng.wang, will, willy, yang, ying.huang, ziy, zokeefe
In-Reply-To: <20260605161422.213817-1-npache@redhat.com>
Add the order to the mm_collapse_huge_page<_swapin,_isolate> tracepoints to
give better insight into which order is being operated on.
Reviewed-by: Lorenzo Stoakes <ljs@kernel.org>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Acked-by: David Hildenbrand (Arm) <david@kernel.org>
Signed-off-by: Nico Pache <npache@redhat.com>
---
include/trace/events/huge_memory.h | 34 +++++++++++++++++++-----------
mm/khugepaged.c | 9 ++++----
2 files changed, 27 insertions(+), 16 deletions(-)
diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
index bcdc57eea270..291fae364c62 100644
--- a/include/trace/events/huge_memory.h
+++ b/include/trace/events/huge_memory.h
@@ -89,40 +89,44 @@ TRACE_EVENT(mm_khugepaged_scan_pmd,
TRACE_EVENT(mm_collapse_huge_page,
- TP_PROTO(struct mm_struct *mm, int isolated, int status),
+ TP_PROTO(struct mm_struct *mm, int isolated, int status, unsigned int order),
- TP_ARGS(mm, isolated, status),
+ TP_ARGS(mm, isolated, status, order),
TP_STRUCT__entry(
__field(struct mm_struct *, mm)
__field(int, isolated)
__field(int, status)
+ __field(unsigned int, order)
),
TP_fast_assign(
__entry->mm = mm;
__entry->isolated = isolated;
__entry->status = status;
+ __entry->order = order;
),
- TP_printk("mm=%p, isolated=%d, status=%s",
+ TP_printk("mm=%p, isolated=%d, status=%s, order=%u",
__entry->mm,
__entry->isolated,
- __print_symbolic(__entry->status, SCAN_STATUS))
+ __print_symbolic(__entry->status, SCAN_STATUS),
+ __entry->order)
);
TRACE_EVENT(mm_collapse_huge_page_isolate,
TP_PROTO(struct folio *folio, int none_or_zero,
- int referenced, int status),
+ int referenced, int status, unsigned int order),
- TP_ARGS(folio, none_or_zero, referenced, status),
+ TP_ARGS(folio, none_or_zero, referenced, status, order),
TP_STRUCT__entry(
__field(unsigned long, pfn)
__field(int, none_or_zero)
__field(int, referenced)
__field(int, status)
+ __field(unsigned int, order)
),
TP_fast_assign(
@@ -130,26 +134,30 @@ TRACE_EVENT(mm_collapse_huge_page_isolate,
__entry->none_or_zero = none_or_zero;
__entry->referenced = referenced;
__entry->status = status;
+ __entry->order = order;
),
- TP_printk("scan_pfn=0x%lx, none_or_zero=%d, referenced=%d, status=%s",
+ TP_printk("scan_pfn=0x%lx, none_or_zero=%d, referenced=%d, status=%s, order=%u",
__entry->pfn,
__entry->none_or_zero,
__entry->referenced,
- __print_symbolic(__entry->status, SCAN_STATUS))
+ __print_symbolic(__entry->status, SCAN_STATUS),
+ __entry->order)
);
TRACE_EVENT(mm_collapse_huge_page_swapin,
- TP_PROTO(struct mm_struct *mm, int swapped_in, int referenced, int ret),
+ TP_PROTO(struct mm_struct *mm, int swapped_in, int referenced, int ret,
+ unsigned int order),
- TP_ARGS(mm, swapped_in, referenced, ret),
+ TP_ARGS(mm, swapped_in, referenced, ret, order),
TP_STRUCT__entry(
__field(struct mm_struct *, mm)
__field(int, swapped_in)
__field(int, referenced)
__field(int, ret)
+ __field(unsigned int, order)
),
TP_fast_assign(
@@ -157,13 +165,15 @@ TRACE_EVENT(mm_collapse_huge_page_swapin,
__entry->swapped_in = swapped_in;
__entry->referenced = referenced;
__entry->ret = ret;
+ __entry->order = order;
),
- TP_printk("mm=%p, swapped_in=%d, referenced=%d, ret=%d",
+ TP_printk("mm=%p, swapped_in=%d, referenced=%d, ret=%d, order=%u",
__entry->mm,
__entry->swapped_in,
__entry->referenced,
- __entry->ret)
+ __entry->ret,
+ __entry->order)
);
TRACE_EVENT(mm_khugepaged_scan_file,
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index ac4731addafa..26c343a6fa3d 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -785,13 +785,13 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
} else {
result = SCAN_SUCCEED;
trace_mm_collapse_huge_page_isolate(folio, none_or_zero,
- referenced, result);
+ referenced, result, order);
return result;
}
out:
release_pte_pages(pte, _pte, compound_pagelist);
trace_mm_collapse_huge_page_isolate(folio, none_or_zero,
- referenced, result);
+ referenced, result, order);
return result;
}
@@ -1197,7 +1197,8 @@ static enum scan_result __collapse_huge_page_swapin(struct mm_struct *mm,
result = SCAN_SUCCEED;
out:
- trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, result);
+ trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, result,
+ order);
return result;
}
@@ -1417,7 +1418,7 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long s
out_nolock:
if (folio)
folio_put(folio);
- trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
+ trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result, order);
return result;
}
--
2.54.0
^ permalink raw reply related
* [PATCH mm-unstable v19 10/14] mm/khugepaged: introduce collapse_possible_orders helper functions
From: Nico Pache @ 2026-06-05 16:14 UTC (permalink / raw)
To: linux-doc, linux-kernel, linux-mm, linux-trace-kernel
Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
byungchul, catalin.marinas, cl, corbet, dave.hansen, david,
dev.jain, gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
joshua.hahnjy, kas, lance.yang, liam, ljs, mathieu.desnoyers,
matthew.brost, mhiramat, mhocko, npache, peterx, pfalcato,
rakie.kim, raquini, rdunlap, richard.weiyang, rientjes, rostedt,
rppt, ryan.roberts, shivankg, sunnanyong, surenb,
thomas.hellstrom, tiwai, usamaarif642, vbabka, vishal.moola,
wangkefeng.wang, will, willy, yang, ying.huang, ziy, zokeefe
In-Reply-To: <20260605161422.213817-1-npache@redhat.com>
Add collapse_possible_orders() to generalize THP order eligibility. The
function determines which THP orders are permitted based on collapse
context (khugepaged vs madv_collapse). We also add collapse_possible()
as a thin wrapper around collapse_possible_orders() that returns a bool
rather than the whole bitmap.
This consolidates collapse configuration logic and provides a clean
interface for future mTHP collapse support where the orders may be
different.
Acked-by: David Hildenbrand (Arm) <david@kernel.org>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Signed-off-by: Nico Pache <npache@redhat.com>
---
mm/khugepaged.c | 24 +++++++++++++++++++++---
1 file changed, 21 insertions(+), 3 deletions(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 26c343a6fa3d..ec886a031952 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -554,12 +554,30 @@ void __khugepaged_enter(struct mm_struct *mm)
wake_up_interruptible(&khugepaged_wait);
}
+/*
+ * Check what orders are possible based on the vma and collapse type.
+ * This is used to determine if mTHP collapse is a viable option.
+ */
+static unsigned long collapse_possible_orders(struct vm_area_struct *vma,
+ vm_flags_t vm_flags, enum tva_type tva_flags)
+{
+ const unsigned long orders = BIT(HPAGE_PMD_ORDER);
+
+ return thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders);
+}
+
+static bool collapse_possible(struct vm_area_struct *vma,
+ vm_flags_t vm_flags, enum tva_type tva_flags)
+{
+ return collapse_possible_orders(vma, vm_flags, tva_flags);
+}
+
void khugepaged_enter_vma(struct vm_area_struct *vma,
vm_flags_t vm_flags)
{
if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) &&
hugepage_pmd_enabled()) {
- if (thp_vma_allowable_order(vma, vm_flags, TVA_KHUGEPAGED, PMD_ORDER))
+ if (collapse_possible(vma, vm_flags, TVA_KHUGEPAGED))
__khugepaged_enter(vma->vm_mm);
}
}
@@ -2700,7 +2718,7 @@ static void collapse_scan_mm_slot(unsigned int progress_max,
cc->progress++;
break;
}
- if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) {
+ if (!collapse_possible(vma, vma->vm_flags, TVA_KHUGEPAGED)) {
cc->progress++;
continue;
}
@@ -3010,7 +3028,7 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
BUG_ON(vma->vm_start > start);
BUG_ON(vma->vm_end < end);
- if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER))
+ if (!collapse_possible(vma, vma->vm_flags, TVA_FORCED_COLLAPSE))
return -EINVAL;
cc = kmalloc_obj(*cc);
--
2.54.0
^ permalink raw reply related
* [PATCH mm-unstable v19 11/14] mm/khugepaged: Introduce mTHP collapse support
From: Nico Pache @ 2026-06-05 16:14 UTC (permalink / raw)
To: linux-doc, linux-kernel, linux-mm, linux-trace-kernel
Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
byungchul, catalin.marinas, cl, corbet, dave.hansen, david,
dev.jain, gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
joshua.hahnjy, kas, lance.yang, liam, ljs, mathieu.desnoyers,
matthew.brost, mhiramat, mhocko, npache, peterx, pfalcato,
rakie.kim, raquini, rdunlap, richard.weiyang, rientjes, rostedt,
rppt, ryan.roberts, shivankg, sunnanyong, surenb,
thomas.hellstrom, tiwai, usamaarif642, vbabka, vishal.moola,
wangkefeng.wang, will, willy, yang, ying.huang, ziy, zokeefe
In-Reply-To: <20260605161422.213817-1-npache@redhat.com>
Enable khugepaged to collapse to mTHP orders. This patch implements the
main scanning logic using a bitmap to track occupied pages and the
algorithm to find optimal collapse sizes.
Prior to this patch, PMD collapse had 3 main phases: a lightweight
scanning phase (mmap_read_lock) that determines a potential PMD
collapse, an alloc phase (mmap unlocked), and finally a heavier collapse
phase (mmap_write_lock).
To enable mTHP collapse we make the following changes:
During PMD scan phase, track occupied pages in a bitmap. When mTHP
orders are enabled, we remove the restriction of max_ptes_none during the
scan phase to avoid missing potential mTHP collapse candidates. Once we
have scanned the full PMD range and updated the bitmap to track occupied
pages, we use the bitmap to find the optimal mTHP size.
Implement mthp_collapse() to walk forward through the bitmap and
determine the best eligible order for each naturally-aligned region. The
algorithm starts at the beginning of the PMD range and, for each offset,
tries the highest order that fits the alignment. If the number of
occupied PTEs in that region satisfies the max_ptes_none threshold for
that order, a collapse is attempted. On failure, the order is
decremented and the same offset is retried at the next smaller size. Once
the smallest enabled order is exhausted (or a collapse succeeds), the
offset advances past the region just processed, and the next attempt
starts at the highest order permitted by the new offset's natural
alignment.
The algorithm works as follows:
1) set offset=0 and order=HPAGE_PMD_ORDER
2) if the order is not enabled, go to step (5)
3) count occupied PTEs in the (offset, order) range using
bitmap_weight_from()
4) if the count satisfies the max_ptes_none threshold, attempt
collapse; on success, advance to step (6)
5) if a smaller enabled order exists, decrement order and retry
from step (2) at the same offset
6) advance offset past the current region and compute the next
order from the new offset's natural alignment via __ffs(offset),
capped at HPAGE_PMD_ORDER
7) repeat from step (2) until the full PMD range is covered
mTHP collapses reject regions containing swapped out or shared pages.
This is because adding new entries can lead to new none pages, and these
may lead to constant promotion into a higher order mTHP. A similar
issue can occur with "max_ptes_none > HPAGE_PMD_NR/2" due to a collapse
introducing at least 2x the number of pages, and on a future scan will
satisfy the promotion condition once again. This issue is prevented via
the collapse_max_ptes_none() function which imposes the max_ptes_none
restrictions above.
We currently only support mTHP collapse for max_ptes_none values of 0
and HPAGE_PMD_NR - 1, resulting in the following behavior:
- max_ptes_none=0: Never introduce new empty pages during collapse
- max_ptes_none=HPAGE_PMD_NR-1: Always try collapse to the highest
available mTHP order
Any other max_ptes_none value will emit a warning and default mTHP
collapse to max_ptes_none=0. There should be no behavior change for PMD
collapse.
Once we determine which mTHP size fits best in that PMD range, a collapse
is attempted. A minimum collapse order of 2 is used as this is the lowest
order supported by anon memory as defined by THP_ORDERS_ALL_ANON.
Currently madv_collapse is not supported and will only attempt PMD
collapse.
We can also remove the check for is_khugepaged inside the PMD scan as
the collapse_max_ptes_none() function handles this logic now.
Signed-off-by: Nico Pache <npache@redhat.com>
---
mm/khugepaged.c | 146 +++++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 138 insertions(+), 8 deletions(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index ec886a031952..430047316f43 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -99,6 +99,8 @@ static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
static struct kmem_cache *mm_slot_cache __ro_after_init;
+#define KHUGEPAGED_MIN_MTHP_ORDER 2
+
struct collapse_control {
bool is_khugepaged;
@@ -110,6 +112,9 @@ struct collapse_control {
/* nodemask for allocation fallback */
nodemask_t alloc_nmask;
+
+ /* Each bit represents a single occupied (!none/zero) page. */
+ DECLARE_BITMAP(mthp_present_ptes, MAX_PTRS_PER_PTE);
};
/**
@@ -1440,20 +1445,130 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long s
return result;
}
+/* Return the highest naturally aligned order that fits at @offset within a PMD. */
+static unsigned int max_order_from_offset(unsigned int offset)
+{
+ if (offset == 0)
+ return HPAGE_PMD_ORDER;
+
+ return min_t(unsigned int, __ffs(offset), HPAGE_PMD_ORDER);
+}
+
+/*
+ * mthp_collapse() consumes the bitmap that is generated during
+ * collapse_scan_pmd() to determine what regions and mTHP orders fit best.
+ *
+ * Each bit in cc->mthp_present_ptes represents a single occupied (!none/zero)
+ * page. We start at the PMD order and check if it is eligible for collapse;
+ * if not, we check the left and right halves of the PTE page table we are
+ * examining at a lower order.
+ *
+ * For each of these, we determine how many PTE entries are occupied in the
+ * range of PTE entries we propose to collapse, then we compare this to a
+ * threshold number of PTE entries which would need to be occupied for a
+ * collapse to be permitted at that order (accounting for max_ptes_none).
+ *
+ * If a collapse is permitted, we attempt to collapse the PTE range into a
+ * mTHP.
+ */
+static enum scan_result mthp_collapse(struct mm_struct *mm,
+ unsigned long address, int referenced, int unmapped,
+ struct collapse_control *cc, unsigned long enabled_orders)
+{
+ unsigned int nr_occupied_ptes, nr_ptes, max_ptes_none;
+ enum scan_result last_result = SCAN_FAIL;
+ int collapsed = 0;
+ bool alloc_failed = false;
+ unsigned long collapse_address;
+ unsigned int offset = 0;
+ unsigned int order = HPAGE_PMD_ORDER;
+
+ while (offset < HPAGE_PMD_NR) {
+ nr_ptes = 1UL << order;
+
+ if (!test_bit(order, &enabled_orders))
+ goto next_order;
+
+ max_ptes_none = collapse_max_ptes_none(cc, NULL, order);
+ nr_occupied_ptes = bitmap_weight_from(cc->mthp_present_ptes, offset,
+ offset + nr_ptes);
+
+ if (nr_occupied_ptes >= nr_ptes - max_ptes_none) {
+ enum scan_result ret;
+
+ collapse_address = address + offset * PAGE_SIZE;
+ ret = collapse_huge_page(mm, collapse_address, referenced,
+ unmapped, cc, order);
+ switch (ret) {
+ /* Cases where we continue to next collapse candidate */
+ case SCAN_SUCCEED:
+ collapsed += nr_ptes;
+ fallthrough;
+ case SCAN_PTE_MAPPED_HUGEPAGE:
+ goto next_offset;
+ /* Cases where lower orders might still succeed */
+ case SCAN_ALLOC_HUGE_PAGE_FAIL:
+ alloc_failed = true;
+ last_result = ret;
+ goto next_order;
+ /* Cases where no further collapse is possible */
+ case SCAN_PMD_MAPPED:
+ fallthrough;
+ default:
+ last_result = ret;
+ goto done;
+ }
+ }
+
+next_order:
+ /*
+ * Continue with the next smaller order if there is still
+ * any smaller order enabled. When at the smallest order
+ * we must always move to the next offset.
+ */
+ if (order > KHUGEPAGED_MIN_MTHP_ORDER &&
+ (enabled_orders & GENMASK(order - 1, 0))) {
+ order--;
+ continue;
+ }
+next_offset:
+ /*
+ * Advance past the region we just processed and determine the
+ * highest order we can attempt next. Since huge pages must be
+ * naturally aligned, the max order we can attempt next is
+ * limited by the alignment of the new offset.
+ * E.g. if we collapsed a order-2 mTHP at offset 0, offset
+ * becomes 4 and __ffs(4) == 2, so the next attempt starts at
+ * order 2.
+ */
+ offset += nr_ptes;
+ order = max_order_from_offset(offset);
+ }
+done:
+ if (collapsed)
+ return SCAN_SUCCEED;
+ if (alloc_failed)
+ return SCAN_ALLOC_HUGE_PAGE_FAIL;
+ return last_result;
+}
+
static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long start_addr,
bool *lock_dropped, struct collapse_control *cc)
{
- const unsigned int max_ptes_none = collapse_max_ptes_none(cc, vma, HPAGE_PMD_ORDER);
const unsigned int max_ptes_shared = collapse_max_ptes_shared(cc, HPAGE_PMD_ORDER);
const unsigned int max_ptes_swap = collapse_max_ptes_swap(cc, HPAGE_PMD_ORDER);
+ unsigned int max_ptes_none = collapse_max_ptes_none(cc, vma, HPAGE_PMD_ORDER);
+ enum tva_type tva_flags = cc->is_khugepaged ? TVA_KHUGEPAGED : TVA_FORCED_COLLAPSE;
pmd_t *pmd;
- pte_t *pte, *_pte;
+ pte_t *pte, *_pte, pteval;
+ int i;
int none_or_zero = 0, shared = 0, referenced = 0;
enum scan_result result = SCAN_FAIL;
struct page *page = NULL;
struct folio *folio = NULL;
unsigned long addr;
+ unsigned long enabled_orders;
spinlock_t *ptl;
int node = NUMA_NO_NODE, unmapped = 0;
@@ -1465,8 +1580,19 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
goto out;
}
+ bitmap_zero(cc->mthp_present_ptes, MAX_PTRS_PER_PTE);
memset(cc->node_load, 0, sizeof(cc->node_load));
nodes_clear(cc->alloc_nmask);
+
+ enabled_orders = collapse_possible_orders(vma, vma->vm_flags, tva_flags);
+
+ /*
+ * If PMD is the only enabled order, enforce max_ptes_none, otherwise
+ * scan all pages to populate the bitmap for mTHP collapse.
+ */
+ if (enabled_orders != BIT(HPAGE_PMD_ORDER))
+ max_ptes_none = KHUGEPAGED_MAX_PTES_LIMIT;
+
pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl);
if (!pte) {
cc->progress++;
@@ -1474,11 +1600,13 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
goto out;
}
- for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR;
- _pte++, addr += PAGE_SIZE) {
+ for (i = 0; i < HPAGE_PMD_NR; i++) {
+ _pte = pte + i;
+ addr = start_addr + i * PAGE_SIZE;
+ pteval = ptep_get(_pte);
+
cc->progress++;
- pte_t pteval = ptep_get(_pte);
if (pte_none_or_zero(pteval)) {
if (++none_or_zero > max_ptes_none) {
result = SCAN_EXCEED_NONE_PTE;
@@ -1558,6 +1686,8 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
}
}
+ /* Set bit for occupied pages */
+ __set_bit(i, cc->mthp_present_ptes);
/*
* Record which node the original page is from and save this
* information to cc->node_load[].
@@ -1616,9 +1746,9 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
if (result == SCAN_SUCCEED) {
/* collapse_huge_page expects the lock to be dropped before calling */
mmap_read_unlock(mm);
- result = collapse_huge_page(mm, start_addr, referenced,
- unmapped, cc, HPAGE_PMD_ORDER);
- /* collapse_huge_page will return with the mmap_lock released */
+ result = mthp_collapse(mm, start_addr, referenced,
+ unmapped, cc, enabled_orders);
+ /* mmap_lock was released above, set lock_dropped */
*lock_dropped = true;
}
out:
--
2.54.0
^ permalink raw reply related
* [PATCH mm-unstable v19 12/14] mm/khugepaged: avoid unnecessary mTHP collapse attempts
From: Nico Pache @ 2026-06-05 16:14 UTC (permalink / raw)
To: linux-doc, linux-kernel, linux-mm, linux-trace-kernel
Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
byungchul, catalin.marinas, cl, corbet, dave.hansen, david,
dev.jain, gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
joshua.hahnjy, kas, lance.yang, liam, ljs, mathieu.desnoyers,
matthew.brost, mhiramat, mhocko, npache, peterx, pfalcato,
rakie.kim, raquini, rdunlap, richard.weiyang, rientjes, rostedt,
rppt, ryan.roberts, shivankg, sunnanyong, surenb,
thomas.hellstrom, tiwai, usamaarif642, vbabka, vishal.moola,
wangkefeng.wang, will, willy, yang, ying.huang, ziy, zokeefe,
Usama Arif
In-Reply-To: <20260605161422.213817-1-npache@redhat.com>
There are cases where, if an attempted collapse fails, all subsequent
orders are guaranteed to also fail. Avoid these collapse attempts by
bailing out early.
Reviewed-by: Lorenzo Stoakes <ljs@kernel.org>
Acked-by: Usama Arif <usama.arif@linux.dev>
Acked-by: David Hildenbrand (Arm) <david@kernel.org>
Signed-off-by: Nico Pache <npache@redhat.com>
---
mm/khugepaged.c | 13 +++++++++++++
1 file changed, 13 insertions(+)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 430047316f43..7de92b28dd30 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1499,6 +1499,7 @@ static enum scan_result mthp_collapse(struct mm_struct *mm,
collapse_address = address + offset * PAGE_SIZE;
ret = collapse_huge_page(mm, collapse_address, referenced,
unmapped, cc, order);
+
switch (ret) {
/* Cases where we continue to next collapse candidate */
case SCAN_SUCCEED:
@@ -1509,6 +1510,18 @@ static enum scan_result mthp_collapse(struct mm_struct *mm,
/* Cases where lower orders might still succeed */
case SCAN_ALLOC_HUGE_PAGE_FAIL:
alloc_failed = true;
+ fallthrough;
+ case SCAN_LACK_REFERENCED_PAGE:
+ case SCAN_EXCEED_NONE_PTE:
+ case SCAN_EXCEED_SWAP_PTE:
+ case SCAN_EXCEED_SHARED_PTE:
+ case SCAN_PAGE_LOCK:
+ case SCAN_PAGE_COUNT:
+ case SCAN_PAGE_NULL:
+ case SCAN_DEL_PAGE_LRU:
+ case SCAN_PTE_NON_PRESENT:
+ case SCAN_PTE_UFFD_WP:
+ case SCAN_PAGE_LAZYFREE:
last_result = ret;
goto next_order;
/* Cases where no further collapse is possible */
--
2.54.0
^ permalink raw reply related
* [PATCH mm-unstable v19 13/14] mm/khugepaged: run khugepaged for all orders
From: Nico Pache @ 2026-06-05 16:14 UTC (permalink / raw)
To: linux-doc, linux-kernel, linux-mm, linux-trace-kernel
Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
byungchul, catalin.marinas, cl, corbet, dave.hansen, david,
dev.jain, gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
joshua.hahnjy, kas, lance.yang, liam, ljs, mathieu.desnoyers,
matthew.brost, mhiramat, mhocko, npache, peterx, pfalcato,
rakie.kim, raquini, rdunlap, richard.weiyang, rientjes, rostedt,
rppt, ryan.roberts, shivankg, sunnanyong, surenb,
thomas.hellstrom, tiwai, usamaarif642, vbabka, vishal.moola,
wangkefeng.wang, will, willy, yang, ying.huang, ziy, zokeefe,
Usama Arif
In-Reply-To: <20260605161422.213817-1-npache@redhat.com>
From: Baolin Wang <baolin.wang@linux.alibaba.com>
If any order (m)THP is enabled we should allow running khugepaged to
attempt scanning and collapsing mTHPs. In order for khugepaged to operate
when only mTHP sizes are specified in sysfs, we must modify the predicate
function that determines whether it ought to run to do so.
This function is currently called hugepage_pmd_enabled(); this patch
renames it to hugepage_enabled() and updates its logic to determine
whether any valid orders exist which would justify khugepaged running.
We must also update collapse_possible_orders() to check all orders if
the vma is anonymous and the collapse is khugepaged.
After this patch khugepaged mTHP collapse is fully enabled.
Reviewed-by: Lorenzo Stoakes <ljs@kernel.org>
Reviewed-by: Lance Yang <lance.yang@linux.dev>
Acked-by: Usama Arif <usama.arif@linux.dev>
Acked-by: David Hildenbrand (Arm) <david@kernel.org>
Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Signed-off-by: Nico Pache <npache@redhat.com>
---
mm/khugepaged.c | 36 ++++++++++++++++++++----------------
1 file changed, 20 insertions(+), 16 deletions(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 7de92b28dd30..996e014a03d3 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -503,23 +503,23 @@ static inline int collapse_test_exit_or_disable(struct mm_struct *mm)
mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm);
}
-static bool hugepage_pmd_enabled(void)
+static bool hugepage_enabled(void)
{
/*
* We cover the anon, shmem and the file-backed case here; file-backed
* hugepages, when configured in, are determined by the global control.
- * Anon pmd-sized hugepages are determined by the pmd-size control.
+ * Anon hugepages are determined by its per-size mTHP control.
* Shmem pmd-sized hugepages are also determined by its pmd-size control,
* except when the global shmem_huge is set to SHMEM_HUGE_DENY.
*/
if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
hugepage_global_enabled())
return true;
- if (test_bit(PMD_ORDER, &huge_anon_orders_always))
+ if (READ_ONCE(huge_anon_orders_always))
return true;
- if (test_bit(PMD_ORDER, &huge_anon_orders_madvise))
+ if (READ_ONCE(huge_anon_orders_madvise))
return true;
- if (test_bit(PMD_ORDER, &huge_anon_orders_inherit) &&
+ if (READ_ONCE(huge_anon_orders_inherit) &&
hugepage_global_enabled())
return true;
if (IS_ENABLED(CONFIG_SHMEM) && shmem_hpage_pmd_enabled())
@@ -566,7 +566,13 @@ void __khugepaged_enter(struct mm_struct *mm)
static unsigned long collapse_possible_orders(struct vm_area_struct *vma,
vm_flags_t vm_flags, enum tva_type tva_flags)
{
- const unsigned long orders = BIT(HPAGE_PMD_ORDER);
+ unsigned long orders;
+
+ /* If khugepaged is scanning an anonymous vma, allow mTHP collapse */
+ if ((tva_flags == TVA_KHUGEPAGED) && vma_is_anonymous(vma))
+ orders = THP_ORDERS_ALL_ANON;
+ else
+ orders = BIT(HPAGE_PMD_ORDER);
return thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders);
}
@@ -580,11 +586,9 @@ static bool collapse_possible(struct vm_area_struct *vma,
void khugepaged_enter_vma(struct vm_area_struct *vma,
vm_flags_t vm_flags)
{
- if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) &&
- hugepage_pmd_enabled()) {
- if (collapse_possible(vma, vm_flags, TVA_KHUGEPAGED))
- __khugepaged_enter(vma->vm_mm);
- }
+ if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) && hugepage_enabled()
+ && collapse_possible(vma, vm_flags, TVA_KHUGEPAGED))
+ __khugepaged_enter(vma->vm_mm);
}
void __khugepaged_exit(struct mm_struct *mm)
@@ -2936,7 +2940,7 @@ static void collapse_scan_mm_slot(unsigned int progress_max,
static int khugepaged_has_work(void)
{
- return !list_empty(&khugepaged_scan.mm_head) && hugepage_pmd_enabled();
+ return !list_empty(&khugepaged_scan.mm_head) && hugepage_enabled();
}
static int khugepaged_wait_event(void)
@@ -3009,7 +3013,7 @@ static void khugepaged_wait_work(void)
return;
}
- if (hugepage_pmd_enabled())
+ if (hugepage_enabled())
wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
}
@@ -3040,7 +3044,7 @@ void set_recommended_min_free_kbytes(void)
int nr_zones = 0;
unsigned long recommended_min;
- if (!hugepage_pmd_enabled()) {
+ if (!hugepage_enabled()) {
calculate_min_free_kbytes();
goto update_wmarks;
}
@@ -3090,7 +3094,7 @@ int start_stop_khugepaged(void)
int err = 0;
mutex_lock(&khugepaged_mutex);
- if (hugepage_pmd_enabled()) {
+ if (hugepage_enabled()) {
if (!khugepaged_thread)
khugepaged_thread = kthread_run(khugepaged, NULL,
"khugepaged");
@@ -3116,7 +3120,7 @@ int start_stop_khugepaged(void)
void khugepaged_min_free_kbytes_update(void)
{
mutex_lock(&khugepaged_mutex);
- if (hugepage_pmd_enabled() && khugepaged_thread)
+ if (hugepage_enabled() && khugepaged_thread)
set_recommended_min_free_kbytes();
mutex_unlock(&khugepaged_mutex);
}
--
2.54.0
^ permalink raw reply related
* [PATCH mm-unstable v19 14/14] Documentation: mm: update the admin guide for mTHP collapse
From: Nico Pache @ 2026-06-05 16:14 UTC (permalink / raw)
To: linux-doc, linux-kernel, linux-mm, linux-trace-kernel
Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
byungchul, catalin.marinas, cl, corbet, dave.hansen, david,
dev.jain, gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
joshua.hahnjy, kas, lance.yang, liam, ljs, mathieu.desnoyers,
matthew.brost, mhiramat, mhocko, npache, peterx, pfalcato,
rakie.kim, raquini, rdunlap, richard.weiyang, rientjes, rostedt,
rppt, ryan.roberts, shivankg, sunnanyong, surenb,
thomas.hellstrom, tiwai, usamaarif642, vbabka, vishal.moola,
wangkefeng.wang, will, willy, yang, ying.huang, ziy, zokeefe,
Bagas Sanjaya
In-Reply-To: <20260605161422.213817-1-npache@redhat.com>
Now that we can collapse to mTHPs, let's update the admin guide to
reflect these changes and provide proper guidance on how to utilize it.
Reviewed-by: Lorenzo Stoakes <ljs@kernel.org>
Reviewed-by: Bagas Sanjaya <bagasdotme@gmail.com>
Signed-off-by: Nico Pache <npache@redhat.com>
---
Documentation/admin-guide/mm/transhuge.rst | 49 ++++++++++++++--------
1 file changed, 32 insertions(+), 17 deletions(-)
diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index b98e18c80185..23f8d13c2629 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -63,7 +63,8 @@ often.
THP can be enabled system wide or restricted to certain tasks or even
memory ranges inside task's address space. Unless THP is completely
disabled, there is ``khugepaged`` daemon that scans memory and
-collapses sequences of basic pages into PMD-sized huge pages.
+collapses sequences of basic pages into huge pages of either PMD size
+or mTHP sizes, if the system is configured to do so.
The THP behaviour is controlled via :ref:`sysfs <thp_sysfs>`
interface and using madvise(2) and prctl(2) system calls.
@@ -219,10 +220,10 @@ this behaviour by writing 0 to shrink_underused, and enable it by writing
echo 0 > /sys/kernel/mm/transparent_hugepage/shrink_underused
echo 1 > /sys/kernel/mm/transparent_hugepage/shrink_underused
-khugepaged will be automatically started when PMD-sized THP is enabled
+khugepaged will be automatically started when any THP size is enabled
(either of the per-size anon control or the top-level control are set
to "always" or "madvise"), and it'll be automatically shutdown when
-PMD-sized THP is disabled (when both the per-size anon control and the
+all THP sizes are disabled (when both the per-size anon control and the
top-level control are "never")
process THP controls
@@ -265,8 +266,8 @@ Khugepaged controls
-------------------
.. note::
- khugepaged currently only searches for opportunities to collapse to
- PMD-sized THP and no attempt is made to collapse to other THP
+ khugepaged currently only searches for opportunities to collapse file/shmem
+ to PMD-sized THP. Only anonymous memory will attempt to collapse to other THP
sizes.
khugepaged runs usually at low frequency so while one may not want to
@@ -296,11 +297,11 @@ allocation failure to throttle the next allocation attempt::
The khugepaged progress can be seen in the number of pages collapsed (note
that this counter may not be an exact count of the number of pages
collapsed, since "collapsed" could mean multiple things: (1) A PTE mapping
-being replaced by a PMD mapping, or (2) All 4K physical pages replaced by
-one 2M hugepage. Each may happen independently, or together, depending on
-the type of memory and the failures that occur. As such, this value should
-be interpreted roughly as a sign of progress, and counters in /proc/vmstat
-consulted for more accurate accounting)::
+being replaced by a PMD mapping, or (2) physical pages replaced by one
+hugepage of various sizes (PMD-sized or mTHP). Each may happen independently,
+or together, depending on the type of memory and the failures that occur.
+As such, this value should be interpreted roughly as a sign of progress,
+and counters in /proc/vmstat consulted for more accurate accounting)::
/sys/kernel/mm/transparent_hugepage/khugepaged/pages_collapsed
@@ -308,16 +309,21 @@ for each pass::
/sys/kernel/mm/transparent_hugepage/khugepaged/full_scans
-``max_ptes_none`` specifies how many extra small pages (that are
-not already mapped) can be allocated when collapsing a group
-of small pages into one large page::
+``max_ptes_none`` specifies how many empty (none/zero) pages are allowed
+when collapsing a group of small pages into one large page::
/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none
-A higher value leads to use additional memory for programs.
-A lower value leads to gain less thp performance. Value of
-max_ptes_none can waste cpu time very little, you can
-ignore it.
+For PMD-sized THP collapse, this directly limits the number of empty pages
+allowed in the 2MB region.
+
+For mTHP collapse, only 0 or (HPAGE_PMD_NR - 1) are supported. At
+HPAGE_PMD_NR - 1, we collapse to the highest possible order. Any intermediate
+value will emit a warning and mTHP collapse will default to max_ptes_none=0.
+
+A higher value allows more empty pages, potentially leading to more memory
+usage but better THP performance. A lower value is more conservative and
+may result in fewer THP collapses.
``max_ptes_swap`` specifies how many pages can be brought in from
swap when collapsing a group of pages into a transparent huge page::
@@ -337,6 +343,15 @@ that THP is shared. Exceeding the number would block the collapse::
A higher value may increase memory footprint for some workloads.
+.. note::
+ For mTHP collapse, khugepaged does not support collapsing regions that
+ contain shared or swapped out pages, as this could lead to continuous
+ promotion to higher orders. The collapse will fail if any shared or
+ swapped PTEs are encountered during the scan.
+
+ Currently, madvise_collapse only supports collapsing to PMD-sized THPs
+ and does not attempt mTHP collapses.
+
Boot parameters
===============
--
2.54.0
^ permalink raw reply related
* [PATCH v2 00/18] tracing/remotes: Add printk, dump_on_panic and boot parameters
From: Vincent Donnefort @ 2026-06-05 16:38 UTC (permalink / raw)
To: rostedt, mhiramat, mathieu.desnoyers, linux-trace-kernel
Cc: kernel-team, linux-kernel, Vincent Donnefort
This series extends the recently introduced trace remotes
infrastructure, bringing useful features for developers:
* dump_on_panic: Dump the trace remote buffer on system panic.
* dmesg: Redirect remote events to dmesg.
* trace_remote=: Configure a trace_remote from the commandline.
* poll_ms: Modify the polling period.
It also brings a couple of optimisations:
* In-header compressed length support for small events.
* Single work thread for remote polling.
And some misc improvements:
* Use kstrtobool where possible
* Free resources on remote registration failure
v2:
Sashiko had a few spot-on comments on the v1 which forced me to heavily
re-arrange the series, so I thought it'd be alright to send a v2
already including those changes.
* dump_on_oops -> dump_on_panic
* printk -> dmesg
* Allow to configure poll_ms
* Make the ring_buffer_iter functions panic-friendly
* Gate tracefs .open on remote registration (Sashiko)
* Use irqsave for reader lock (Sashiko)
* Fix iter locking for TRI_DMESG (Sashiko)
* Yield in the dmesg work (Sashiko)
v1 (https://lore.kernel.org/all/20260602171146.2238998-1-vdonnefort@google.com/)
Vincent Donnefort (18):
tracing/remotes: Gate tracefs files opening on trace remote
registration
tracing/remotes: Release tracefs,eventfs on registration failure
tracing/remotes: Use kstrtobool for boolean tracefs files
tracing/remotes: Use a single per-remote polling work
tracing/simple_ring_buffer: Add support for compressed length
tracing/remotes: Add dmesg tracefs file
tracing/remotes: selftests: Add a test for the dmesg tracefs file
tracing/remotes: selftests: Prefix hypervisor folder
ring-buffer: Use irqsave for the reader lock in
ring_buffer_poll_remote
ring-buffer: Use panic-friendly locking in ring_buffer_iter interface
ring-buffer: Add ring_buffer_read_remote_meta_page()
ring-buffer: Add kerneldoc for ring_buffer_poll_remote
tracing/remotes: Add dump_on_panic tracefs file
tracing/remotes: selftests: Add a test for the dump_on_panic tracefs
file
tracing/remotes: Add poll_ms tracefs file
tracing/remotes: Add trace_remote cmdline options
Documentation: tracing/remotes: Add detailed tracefs layout
Documentation/kernel-parameters: Add trace_remote
.../admin-guide/kernel-parameters.txt | 19 +
Documentation/trace/remotes.rst | 66 +-
include/linux/ring_buffer.h | 1 +
kernel/trace/ring_buffer.c | 86 +-
kernel/trace/simple_ring_buffer.c | 25 +-
kernel/trace/trace_remote.c | 823 +++++++++++++++---
.../buffer_size.tc | 0
.../test.d/remotes/00hypervisor/dmesg.tc | 11 +
.../remotes/00hypervisor/dump_on_panic.tc | 11 +
.../{hypervisor => 00hypervisor}/hotplug.tc | 0
.../{hypervisor => 00hypervisor}/reset.tc | 0
.../{hypervisor => 00hypervisor}/trace.tc | 0
.../trace_pipe.tc | 0
.../{hypervisor => 00hypervisor}/unloading.tc | 0
.../selftests/ftrace/test.d/remotes/dmesg.tc | 72 ++
.../ftrace/test.d/remotes/dump_on_panic.tc | 51 ++
.../selftests/ftrace/test.d/remotes/functions | 2 +
17 files changed, 1028 insertions(+), 139 deletions(-)
rename tools/testing/selftests/ftrace/test.d/remotes/{hypervisor => 00hypervisor}/buffer_size.tc (100%)
create mode 100644 tools/testing/selftests/ftrace/test.d/remotes/00hypervisor/dmesg.tc
create mode 100644 tools/testing/selftests/ftrace/test.d/remotes/00hypervisor/dump_on_panic.tc
rename tools/testing/selftests/ftrace/test.d/remotes/{hypervisor => 00hypervisor}/hotplug.tc (100%)
rename tools/testing/selftests/ftrace/test.d/remotes/{hypervisor => 00hypervisor}/reset.tc (100%)
rename tools/testing/selftests/ftrace/test.d/remotes/{hypervisor => 00hypervisor}/trace.tc (100%)
rename tools/testing/selftests/ftrace/test.d/remotes/{hypervisor => 00hypervisor}/trace_pipe.tc (100%)
rename tools/testing/selftests/ftrace/test.d/remotes/{hypervisor => 00hypervisor}/unloading.tc (100%)
create mode 100644 tools/testing/selftests/ftrace/test.d/remotes/dmesg.tc
create mode 100644 tools/testing/selftests/ftrace/test.d/remotes/dump_on_panic.tc
base-commit: e43ffb69e0438cddd72aaa30898b4dc446f664f8
--
2.54.0.1032.g2f8565e1d1-goog
^ permalink raw reply
* [PATCH v2 01/18] tracing/remotes: Gate tracefs files opening on trace remote registration
From: Vincent Donnefort @ 2026-06-05 16:38 UTC (permalink / raw)
To: rostedt, mhiramat, mathieu.desnoyers, linux-trace-kernel
Cc: kernel-team, linux-kernel, Vincent Donnefort
In-Reply-To: <20260605163825.1762953-1-vdonnefort@google.com>
Currently, if a remote fails to register, its resources will leak and
will not be properly torn down.
To prevent a user accessing a remote tracefs that is about to be
destroyed, keep track of the registered remotes in a global list,
similarly to trace instances. Gate the tracefs open function based on
the presence of the remote in that list.
Signed-off-by: Vincent Donnefort <vdonnefort@google.com>
diff --git a/kernel/trace/trace_remote.c b/kernel/trace/trace_remote.c
index d6c3f94d67cd..9f1669d433d5 100644
--- a/kernel/trace/trace_remote.c
+++ b/kernel/trace/trace_remote.c
@@ -39,6 +39,7 @@ struct trace_remote_iterator {
};
struct trace_remote {
+ struct list_head node;
struct trace_remote_callbacks *cbs;
void *priv;
struct trace_buffer *trace_buffer;
@@ -57,6 +58,9 @@ struct trace_remote {
bool tracing_on;
};
+static DEFINE_MUTEX(trace_remotes_lock);
+static LIST_HEAD(trace_remotes);
+
static bool trace_remote_loaded(struct trace_remote *remote)
{
return !!remote->trace_buffer;
@@ -170,6 +174,60 @@ static void trace_remote_reset(struct trace_remote *remote, int cpu)
trace_remote_try_unload(remote);
}
+static int trace_remote_tracefs_open(struct inode *inode, struct file *filp)
+{
+ void *i_private = inode->i_private;
+ struct trace_remote *r;
+
+ if (!i_private)
+ return -ENODEV;
+
+ guard(mutex)(&trace_remotes_lock);
+
+ /* i_private is either a struct trace_remote or a struct remote_event */
+ list_for_each_entry(r, &trace_remotes, node) {
+ if (r == i_private)
+ return 0;
+ if (!r->events)
+ continue;
+ if (i_private >= (void *)r->events &&
+ i_private < (void *)(r->events + r->nr_events))
+ return 0;
+ }
+
+ return -ENODEV;
+}
+
+#define DEFINE_TRACE_REMOTE_ATTRIBUTE_FUNCS(__name) \
+static int __name ## _open(struct inode *inode, struct file *file) \
+{ \
+ int ret = trace_remote_tracefs_open(inode, file); \
+ \
+ if (ret) \
+ return ret; \
+ \
+ return single_open(file, __name ## _show, inode->i_private); \
+}
+
+#define DEFINE_TRACE_REMOTE_ATTRIBUTE(__name) \
+DEFINE_TRACE_REMOTE_ATTRIBUTE_FUNCS(__name) \
+static const struct file_operations __name ## _fops = { \
+ .open = __name ## _open, \
+ .read = seq_read, \
+ .write = __name ## _write, \
+ .llseek = seq_lseek, \
+ .release = single_release, \
+}
+
+#define DEFINE_TRACE_REMOTE_SHOW_ATTRIBUTE(__name) \
+DEFINE_TRACE_REMOTE_ATTRIBUTE_FUNCS(__name) \
+static const struct file_operations __name ## _fops = { \
+ .open = __name ## _open, \
+ .read = seq_read, \
+ .llseek = seq_lseek, \
+ .release = single_release, \
+}
+
static ssize_t
tracing_on_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos)
{
@@ -198,7 +256,7 @@ static int tracing_on_show(struct seq_file *s, void *unused)
return 0;
}
-DEFINE_SHOW_STORE_ATTRIBUTE(tracing_on);
+DEFINE_TRACE_REMOTE_ATTRIBUTE(tracing_on);
static ssize_t buffer_size_kb_write(struct file *filp, const char __user *ubuf, size_t cnt,
loff_t *ppos)
@@ -235,7 +293,7 @@ static int buffer_size_kb_show(struct seq_file *s, void *unused)
return 0;
}
-DEFINE_SHOW_STORE_ATTRIBUTE(buffer_size_kb);
+DEFINE_TRACE_REMOTE_ATTRIBUTE(buffer_size_kb);
static int trace_remote_get(struct trace_remote *remote, int cpu)
{
@@ -593,6 +651,11 @@ static int trace_pipe_open(struct inode *inode, struct file *filp)
struct trace_remote *remote = inode->i_private;
struct trace_remote_iterator *iter;
int cpu = tracing_get_cpu(inode);
+ int ret;
+
+ ret = trace_remote_tracefs_open(inode, filp);
+ if (ret)
+ return ret;
guard(mutex)(&remote->lock);
@@ -602,7 +665,7 @@ static int trace_pipe_open(struct inode *inode, struct file *filp)
filp->private_data = iter;
- return IS_ERR(iter) ? PTR_ERR(iter) : 0;
+ return 0;
}
static int trace_pipe_release(struct inode *inode, struct file *filp)
@@ -734,22 +797,26 @@ static int trace_open(struct inode *inode, struct file *filp)
int cpu = tracing_get_cpu(inode);
int ret;
+ ret = trace_remote_tracefs_open(inode, filp);
+ if (ret)
+ return ret;
+
if (!(filp->f_mode & FMODE_READ))
return 0;
+ ret = seq_open(filp, &trace_sops);
+ if (ret)
+ return ret;
+
guard(mutex)(&remote->lock);
iter = trace_remote_iter(remote, cpu, TRI_NONCONSUMING);
- if (IS_ERR(iter))
+ if (IS_ERR(iter)) {
+ seq_release(inode, filp);
return PTR_ERR(iter);
-
- ret = seq_open(filp, &trace_sops);
- if (ret) {
- trace_remote_iter_free(iter);
- return ret;
}
- ((struct seq_file *)filp->private_data)->private = (void *)iter;
+ ((struct seq_file *)filp->private_data)->private = iter;
return 0;
}
@@ -932,8 +999,12 @@ int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs,
}
ret = cbs->init ? cbs->init(remote->dentry, priv) : 0;
- if (ret)
+ if (ret) {
pr_err("Init failed for trace remote '%s' (%d)\n", name, ret);
+ } else {
+ guard(mutex)(&trace_remotes_lock);
+ list_add(&remote->node, &trace_remotes);
+ }
return ret;
}
@@ -1076,7 +1147,7 @@ static ssize_t remote_event_enable_write(struct file *filp, const char __user *u
return count;
}
-DEFINE_SHOW_STORE_ATTRIBUTE(remote_event_enable);
+DEFINE_TRACE_REMOTE_ATTRIBUTE(remote_event_enable);
static int remote_event_id_show(struct seq_file *s, void *unused)
{
@@ -1086,7 +1157,7 @@ static int remote_event_id_show(struct seq_file *s, void *unused)
return 0;
}
-DEFINE_SHOW_ATTRIBUTE(remote_event_id);
+DEFINE_TRACE_REMOTE_SHOW_ATTRIBUTE(remote_event_id);
static int remote_event_format_show(struct seq_file *s, void *unused)
{
@@ -1115,7 +1186,7 @@ static int remote_event_format_show(struct seq_file *s, void *unused)
return 0;
}
-DEFINE_SHOW_ATTRIBUTE(remote_event_format);
+DEFINE_TRACE_REMOTE_SHOW_ATTRIBUTE(remote_event_format);
static int remote_event_callback(const char *name, umode_t *mode, void **data,
const struct file_operations **fops)
@@ -1190,6 +1261,7 @@ static ssize_t remote_events_dir_enable_read(struct file *filp, char __user *ubu
}
static const struct file_operations remote_events_dir_enable_fops = {
+ .open = trace_remote_tracefs_open,
.write = remote_events_dir_enable_write,
.read = remote_events_dir_enable_read,
};
@@ -1214,6 +1286,7 @@ remote_events_dir_header_page_read(struct file *filp, char __user *ubuf, size_t
}
static const struct file_operations remote_events_dir_header_page_fops = {
+ .open = trace_remote_tracefs_open,
.read = remote_events_dir_header_page_read,
};
@@ -1237,6 +1310,7 @@ remote_events_dir_header_event_read(struct file *filp, char __user *ubuf, size_t
}
static const struct file_operations remote_events_dir_header_event_fops = {
+ .open = trace_remote_tracefs_open,
.read = remote_events_dir_header_event_read,
};
--
2.54.0.1032.g2f8565e1d1-goog
^ permalink raw reply related
* [PATCH v2 02/18] tracing/remotes: Release tracefs,eventfs on registration failure
From: Vincent Donnefort @ 2026-06-05 16:38 UTC (permalink / raw)
To: rostedt, mhiramat, mathieu.desnoyers, linux-trace-kernel
Cc: kernel-team, linux-kernel, Vincent Donnefort
In-Reply-To: <20260605163825.1762953-1-vdonnefort@google.com>
In trace_remote_register(), if registration of events or the init
callback fails, the created tracefs and eventfs directories are leaked.
Release the entire eventfs and tracefs hierarchy on trace_remote
registration failure.
Signed-off-by: Vincent Donnefort <vdonnefort@google.com>
diff --git a/kernel/trace/trace_remote.c b/kernel/trace/trace_remote.c
index 9f1669d433d5..9b27c7bd6040 100644
--- a/kernel/trace/trace_remote.c
+++ b/kernel/trace/trace_remote.c
@@ -45,7 +45,8 @@ struct trace_remote {
struct trace_buffer *trace_buffer;
struct trace_buffer_desc *trace_buffer_desc;
struct dentry *dentry;
- struct eventfs_inode *eventfs;
+ struct eventfs_inode *eventfs_root;
+ struct eventfs_inode *eventfs_subdir;
struct remote_event *events;
unsigned long nr_events;
unsigned long trace_buffer_size;
@@ -60,6 +61,7 @@ struct trace_remote {
static DEFINE_MUTEX(trace_remotes_lock);
static LIST_HEAD(trace_remotes);
+static struct dentry *trace_remotes_root;
static bool trace_remote_loaded(struct trace_remote *remote)
{
@@ -865,23 +867,21 @@ static const struct file_operations trace_fops = {
static int trace_remote_init_tracefs(const char *name, struct trace_remote *remote)
{
struct dentry *remote_d, *percpu_d, *d;
- static struct dentry *root;
- static DEFINE_MUTEX(lock);
bool root_inited = false;
int cpu;
- guard(mutex)(&lock);
+ lockdep_assert_held(&trace_remotes_lock);
- if (!root) {
- root = tracefs_create_dir(TRACEFS_DIR, NULL);
- if (!root) {
+ if (!trace_remotes_root) {
+ trace_remotes_root = tracefs_create_dir(TRACEFS_DIR, NULL);
+ if (!trace_remotes_root) {
pr_err("Failed to create tracefs dir "TRACEFS_DIR"\n");
return -ENOMEM;
}
root_inited = true;
}
- remote_d = tracefs_create_dir(name, root);
+ remote_d = tracefs_create_dir(name, trace_remotes_root);
if (!remote_d) {
pr_err("Failed to create tracefs dir "TRACEFS_DIR"%s/\n", name);
goto err;
@@ -939,8 +939,8 @@ static int trace_remote_init_tracefs(const char *name, struct trace_remote *remo
err:
if (root_inited) {
- tracefs_remove(root);
- root = NULL;
+ tracefs_remove(trace_remotes_root);
+ trace_remotes_root = NULL;
} else {
tracefs_remove(remote_d);
}
@@ -948,8 +948,26 @@ static int trace_remote_init_tracefs(const char *name, struct trace_remote *remo
return -ENOMEM;
}
+static void trace_remote_remove_tracefs(struct trace_remote *remote)
+{
+ lockdep_assert_held(&trace_remotes_lock);
+
+ if (!remote->dentry)
+ return;
+
+ tracefs_remove(remote->dentry);
+ remote->dentry = NULL;
+
+ if (!list_empty(&trace_remotes))
+ return;
+
+ tracefs_remove(trace_remotes_root);
+ trace_remotes_root = NULL;
+}
+
static int trace_remote_register_events(const char *remote_name, struct trace_remote *remote,
struct remote_event *events, size_t nr_events);
+static void trace_remote_unregister_events(struct trace_remote *remote);
/**
* trace_remote_register() - Register a Tracefs remote
@@ -972,10 +990,9 @@ static int trace_remote_register_events(const char *remote_name, struct trace_re
int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs, void *priv,
struct remote_event *events, size_t nr_events)
{
- struct trace_remote *remote;
+ struct trace_remote *remote __free(kfree) = kzalloc_obj(*remote);
int ret;
- remote = kzalloc_obj(*remote);
if (!remote)
return -ENOMEM;
@@ -986,13 +1003,15 @@ int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs,
mutex_init(&remote->lock);
init_rwsem(&remote->reader_lock);
- if (trace_remote_init_tracefs(name, remote)) {
- kfree(remote);
- return -ENOMEM;
- }
+ guard(mutex)(&trace_remotes_lock);
+
+ ret = trace_remote_init_tracefs(name, remote);
+ if (ret)
+ return ret;
ret = trace_remote_register_events(name, remote, events, nr_events);
if (ret) {
+ trace_remote_remove_tracefs(remote);
pr_err("Failed to register events for trace remote '%s' (%d)\n",
name, ret);
return ret;
@@ -1000,13 +1019,16 @@ int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs,
ret = cbs->init ? cbs->init(remote->dentry, priv) : 0;
if (ret) {
+ trace_remote_unregister_events(remote);
+ trace_remote_remove_tracefs(remote);
pr_err("Init failed for trace remote '%s' (%d)\n", name, ret);
- } else {
- guard(mutex)(&trace_remotes_lock);
- list_add(&remote->node, &trace_remotes);
+ return ret;
}
- return ret;
+ list_add(&remote->node, &trace_remotes);
+ retain_and_null_ptr(remote);
+
+ return 0;
}
EXPORT_SYMBOL_GPL(trace_remote_register);
@@ -1341,7 +1363,6 @@ static int remote_events_dir_callback(const char *name, umode_t *mode, void **da
static int trace_remote_init_eventfs(const char *remote_name, struct trace_remote *remote,
struct remote_event *evt)
{
- struct eventfs_inode *eventfs = remote->eventfs;
static struct eventfs_entry dir_entries[] = {
{
.name = "enable",
@@ -1366,35 +1387,37 @@ static int trace_remote_init_eventfs(const char *remote_name, struct trace_remot
.callback = remote_event_callback,
}
};
- bool eventfs_create = false;
+ struct eventfs_inode *eventfs_root, *eventfs_subdir, *e;
- if (!eventfs) {
- eventfs = eventfs_create_events_dir("events", remote->dentry, dir_entries,
- ARRAY_SIZE(dir_entries), remote);
- if (IS_ERR(eventfs))
- return PTR_ERR(eventfs);
+ eventfs_root = remote->eventfs_root;
+ eventfs_subdir = remote->eventfs_subdir;
+ if (!eventfs_root) {
+ eventfs_root = eventfs_create_events_dir("events", remote->dentry, dir_entries,
+ ARRAY_SIZE(dir_entries), remote);
+ if (IS_ERR(eventfs_root))
+ return PTR_ERR(eventfs_root);
/*
* Create similar hierarchy as local events even if a single system is supported at
* the moment
*/
- eventfs = eventfs_create_dir(remote_name, eventfs, NULL, 0, NULL);
- if (IS_ERR(eventfs))
- return PTR_ERR(eventfs);
-
- remote->eventfs = eventfs;
- eventfs_create = true;
+ eventfs_subdir = eventfs_create_dir(remote_name, eventfs_root, NULL, 0, NULL);
+ if (IS_ERR(eventfs_subdir)) {
+ eventfs_remove_events_dir(eventfs_root);
+ return PTR_ERR(eventfs_subdir);
+ }
}
- eventfs = eventfs_create_dir(evt->name, eventfs, entries, ARRAY_SIZE(entries), evt);
- if (IS_ERR(eventfs)) {
- if (eventfs_create) {
- eventfs_remove_events_dir(remote->eventfs);
- remote->eventfs = NULL;
- }
- return PTR_ERR(eventfs);
+ e = eventfs_create_dir(evt->name, eventfs_subdir, entries, ARRAY_SIZE(entries), evt);
+ if (IS_ERR(e)) {
+ if (!remote->eventfs_root)
+ eventfs_remove_events_dir(eventfs_root);
+ return PTR_ERR(e);
}
+ remote->eventfs_root = eventfs_root;
+ remote->eventfs_subdir = eventfs_subdir;
+
return 0;
}
@@ -1409,11 +1432,11 @@ static int trace_remote_attach_events(struct trace_remote *remote, struct remote
if (evt->remote)
return -EEXIST;
- evt->remote = remote;
-
/* We need events to be sorted for efficient lookup */
if (i && evt->id <= events[i - 1].id)
return -EINVAL;
+
+ evt->remote = remote;
}
remote->events = events;
@@ -1422,14 +1445,33 @@ static int trace_remote_attach_events(struct trace_remote *remote, struct remote
return 0;
}
+static void trace_remote_detach_events(struct trace_remote *remote, struct remote_event *events,
+ size_t nr_events)
+{
+ int i;
+
+ for (i = 0; i < nr_events; i++) {
+ struct remote_event *evt = &events[i];
+
+ if (evt->remote == remote)
+ evt->remote = NULL;
+ }
+
+ remote->events = NULL;
+ remote->nr_events = 0;
+}
+
static int trace_remote_register_events(const char *remote_name, struct trace_remote *remote,
struct remote_event *events, size_t nr_events)
{
int i, ret;
ret = trace_remote_attach_events(remote, events, nr_events);
- if (ret)
+ if (ret) {
+ /* It is safe to call detach on a half-registered array */
+ trace_remote_detach_events(remote, events, nr_events);
return ret;
+ }
for (i = 0; i < nr_events; i++) {
struct remote_event *evt = &events[i];
@@ -1443,6 +1485,13 @@ static int trace_remote_register_events(const char *remote_name, struct trace_re
return 0;
}
+static void trace_remote_unregister_events(struct trace_remote *remote)
+{
+ trace_remote_detach_events(remote, remote->events, remote->nr_events);
+ if (remote->eventfs_root)
+ eventfs_remove_events_dir(remote->eventfs_root);
+}
+
static int __cmp_events(const void *key, const void *data)
{
const struct remote_event *evt = data;
--
2.54.0.1032.g2f8565e1d1-goog
^ permalink raw reply related
* [PATCH v2 03/18] tracing/remotes: Use kstrtobool for boolean tracefs files
From: Vincent Donnefort @ 2026-06-05 16:38 UTC (permalink / raw)
To: rostedt, mhiramat, mathieu.desnoyers, linux-trace-kernel
Cc: kernel-team, linux-kernel, Vincent Donnefort
In-Reply-To: <20260605163825.1762953-1-vdonnefort@google.com>
Use kstrtobool in trace_remote.c where possible. This is more user-friendly
as it accepts a wider variety of input strings.
Signed-off-by: Vincent Donnefort <vdonnefort@google.com>
diff --git a/kernel/trace/trace_remote.c b/kernel/trace/trace_remote.c
index 9b27c7bd6040..71f6cda0fbd4 100644
--- a/kernel/trace/trace_remote.c
+++ b/kernel/trace/trace_remote.c
@@ -235,10 +235,10 @@ tracing_on_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t
{
struct seq_file *seq = filp->private_data;
struct trace_remote *remote = seq->private;
- unsigned long val;
+ bool val;
int ret;
- ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+ ret = kstrtobool_from_user(ubuf, cnt, &val);
if (ret)
return ret;
@@ -1154,10 +1154,10 @@ static ssize_t remote_event_enable_write(struct file *filp, const char __user *u
struct seq_file *seq = filp->private_data;
struct remote_event *evt = seq->private;
struct trace_remote *remote = evt->remote;
- u8 enable;
+ bool enable;
int ret;
- ret = kstrtou8_from_user(ubuf, count, 10, &enable);
+ ret = kstrtobool_from_user(ubuf, count, &enable);
if (ret)
return ret;
@@ -1238,10 +1238,10 @@ static ssize_t remote_events_dir_enable_write(struct file *filp, const char __us
size_t count, loff_t *ppos)
{
struct trace_remote *remote = file_inode(filp)->i_private;
+ bool enable;
int i, ret;
- u8 enable;
- ret = kstrtou8_from_user(ubuf, count, 10, &enable);
+ ret = kstrtobool_from_user(ubuf, count, &enable);
if (ret)
return ret;
--
2.54.0.1032.g2f8565e1d1-goog
^ permalink raw reply related
* [PATCH v2 04/18] tracing/remotes: Use a single per-remote polling work
From: Vincent Donnefort @ 2026-06-05 16:38 UTC (permalink / raw)
To: rostedt, mhiramat, mathieu.desnoyers, linux-trace-kernel
Cc: kernel-team, linux-kernel, Vincent Donnefort
In-Reply-To: <20260605163825.1762953-1-vdonnefort@google.com>
Having a per-iterator polling work is wasteful when logging several
trace_remote per_cpu/trace_pipe files in parallel. This results in one
work running per-CPU, where only one would suffice.
Transition to a single per-remote polling work, scheduled on the first
consumer creation and stopped when the last consuming iterator is freed.
This blanket polls all CPUs, regardless of which ones are actually being
read. This is acceptable because the poll consists of reading the
meta-page, which is a fast operation. Also, it is more common to log all
CPUs in the system than only one, so this use-case should be favoured.
Signed-off-by: Vincent Donnefort <vdonnefort@google.com>
diff --git a/kernel/trace/trace_remote.c b/kernel/trace/trace_remote.c
index 71f6cda0fbd4..2271d54eb3dd 100644
--- a/kernel/trace/trace_remote.c
+++ b/kernel/trace/trace_remote.c
@@ -26,7 +26,6 @@ enum tri_type {
struct trace_remote_iterator {
struct trace_remote *remote;
struct trace_seq seq;
- struct delayed_work poll_work;
unsigned long lost_events;
u64 ts;
struct ring_buffer_iter *rb_iter;
@@ -56,6 +55,8 @@ struct trace_remote {
struct rw_semaphore *pcpu_reader_locks;
unsigned int nr_readers;
unsigned int poll_ms;
+ struct delayed_work poll_work;
+ unsigned int poll_cnt;
bool tracing_on;
};
@@ -350,17 +351,6 @@ static bool trace_remote_has_cpu(struct trace_remote *remote, int cpu)
return ring_buffer_poll_remote(remote->trace_buffer, cpu) == 0;
}
-static void __poll_remote(struct work_struct *work)
-{
- struct delayed_work *dwork = to_delayed_work(work);
- struct trace_remote_iterator *iter;
-
- iter = container_of(dwork, struct trace_remote_iterator, poll_work);
- ring_buffer_poll_remote(iter->remote->trace_buffer, iter->cpu);
- schedule_delayed_work((struct delayed_work *)work,
- msecs_to_jiffies(iter->remote->poll_ms));
-}
-
static void __free_ring_buffer_iter(struct trace_remote_iterator *iter, int cpu)
{
if (cpu != RING_BUFFER_ALL_CPUS) {
@@ -404,6 +394,36 @@ static int __alloc_ring_buffer_iter(struct trace_remote_iterator *iter, int cpu)
return 0;
}
+static void trace_remote_do_poll(struct trace_remote *remote)
+{
+ ring_buffer_poll_remote(remote->trace_buffer, RING_BUFFER_ALL_CPUS);
+ schedule_delayed_work(&remote->poll_work, msecs_to_jiffies(remote->poll_ms));
+}
+
+static void __poll_remote(struct work_struct *work)
+{
+ struct delayed_work *dwork = to_delayed_work(work);
+
+ trace_remote_do_poll(container_of(dwork, struct trace_remote, poll_work));
+}
+
+static void trace_remote_inc_poll(struct trace_remote *remote)
+{
+ /* poll_cnt <= nr_readers, inherits its overflow protection */
+ if (!remote->poll_cnt++)
+ trace_remote_do_poll(remote);
+}
+
+static void trace_remote_dec_poll(struct trace_remote *remote)
+{
+ if (WARN_ON_ONCE(!remote->poll_cnt))
+ return;
+
+ remote->poll_cnt--;
+ if (!remote->poll_cnt)
+ cancel_delayed_work_sync(&remote->poll_work);
+}
+
static struct trace_remote_iterator
*trace_remote_iter(struct trace_remote *remote, int cpu, enum tri_type type)
{
@@ -433,9 +453,7 @@ static struct trace_remote_iterator
switch (type) {
case TRI_CONSUMING:
- ring_buffer_poll_remote(remote->trace_buffer, cpu);
- INIT_DELAYED_WORK(&iter->poll_work, __poll_remote);
- schedule_delayed_work(&iter->poll_work, msecs_to_jiffies(remote->poll_ms));
+ trace_remote_inc_poll(remote);
break;
case TRI_NONCONSUMING:
ret = __alloc_ring_buffer_iter(iter, cpu);
@@ -469,7 +487,7 @@ static void trace_remote_iter_free(struct trace_remote_iterator *iter)
switch (iter->type) {
case TRI_CONSUMING:
- cancel_delayed_work_sync(&iter->poll_work);
+ trace_remote_dec_poll(remote);
break;
case TRI_NONCONSUMING:
__free_ring_buffer_iter(iter, iter->cpu);
@@ -1002,6 +1020,7 @@ int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs,
remote->poll_ms = 100;
mutex_init(&remote->lock);
init_rwsem(&remote->reader_lock);
+ INIT_DELAYED_WORK(&remote->poll_work, __poll_remote);
guard(mutex)(&trace_remotes_lock);
--
2.54.0.1032.g2f8565e1d1-goog
^ permalink raw reply related
* [PATCH v2 05/18] tracing/simple_ring_buffer: Add support for compressed length
From: Vincent Donnefort @ 2026-06-05 16:38 UTC (permalink / raw)
To: rostedt, mhiramat, mathieu.desnoyers, linux-trace-kernel
Cc: kernel-team, linux-kernel, Vincent Donnefort
In-Reply-To: <20260605163825.1762953-1-vdonnefort@google.com>
The array length is the total size in bytes of the data for the current
event. It is possible to compress this value into the event header type,
which has 28 unused types, saving 32 bits for sufficiently small events.
The compressed length is expressed as a multiple of the ring-buffer
alignment, 4 bytes by default. Enforce this alignment.
Signed-off-by: Vincent Donnefort <vdonnefort@google.com>
diff --git a/kernel/trace/simple_ring_buffer.c b/kernel/trace/simple_ring_buffer.c
index f4642f5adda3..1a97d17cca24 100644
--- a/kernel/trace/simple_ring_buffer.c
+++ b/kernel/trace/simple_ring_buffer.c
@@ -207,7 +207,15 @@ static unsigned long rb_event_size(unsigned long length)
{
struct ring_buffer_event *event;
- return length + RB_EVNT_HDR_SIZE + sizeof(event->array[0]);
+ if (!length)
+ length++;
+
+ length = ALIGN(length, RB_ALIGNMENT);
+
+ if (length > RB_MAX_SMALL_DATA)
+ length += sizeof(event->array[0]);
+
+ return length + RB_EVNT_HDR_SIZE;
}
static struct ring_buffer_event *
@@ -223,12 +231,15 @@ rb_event_add_ts_extend(struct ring_buffer_event *event, u64 delta)
static struct ring_buffer_event *
simple_rb_reserve_next(struct simple_rb_per_cpu *cpu_buffer, unsigned long length, u64 timestamp)
{
- unsigned long ts_ext_size = 0, event_size = rb_event_size(length);
struct simple_buffer_page *tail = cpu_buffer->tail_page;
+ unsigned long event_size, array_size, ts_ext_size = 0;
struct ring_buffer_event *event;
u32 write, prev_write;
u64 time_delta;
+ event_size = rb_event_size(length);
+ array_size = event_size - RB_EVNT_HDR_SIZE;
+
time_delta = timestamp - cpu_buffer->write_stamp;
if (test_time_stamp(time_delta))
@@ -259,9 +270,13 @@ simple_rb_reserve_next(struct simple_rb_per_cpu *cpu_buffer, unsigned long lengt
time_delta = 0;
}
- event->type_len = 0;
+ if (length > RB_MAX_SMALL_DATA) {
+ event->type_len = 0;
+ event->array[0] = array_size;
+ } else {
+ event->type_len = DIV_ROUND_UP(array_size, RB_ALIGNMENT);
+ }
event->time_delta = time_delta;
- event->array[0] = event_size - RB_EVNT_HDR_SIZE;
return event;
}
@@ -284,7 +299,7 @@ void *simple_ring_buffer_reserve(struct simple_rb_per_cpu *cpu_buffer, unsigned
rb_event = simple_rb_reserve_next(cpu_buffer, length, timestamp);
- return &rb_event->array[1];
+ return rb_event->type_len ? &rb_event->array[0] : &rb_event->array[1];
}
EXPORT_SYMBOL_GPL(simple_ring_buffer_reserve);
--
2.54.0.1032.g2f8565e1d1-goog
^ permalink raw reply related
* [PATCH v2 06/18] tracing/remotes: Add dmesg tracefs file
From: Vincent Donnefort @ 2026-06-05 16:38 UTC (permalink / raw)
To: rostedt, mhiramat, mathieu.desnoyers, linux-trace-kernel
Cc: kernel-team, linux-kernel, Vincent Donnefort
In-Reply-To: <20260605163825.1762953-1-vdonnefort@google.com>
When enabled, the dmesg tracefs file enables the redirection of all
events to dmesg. This is similar to tp_printk.
Signed-off-by: Vincent Donnefort <vdonnefort@google.com>
diff --git a/kernel/trace/trace_remote.c b/kernel/trace/trace_remote.c
index 2271d54eb3dd..19dfa355b7f3 100644
--- a/kernel/trace/trace_remote.c
+++ b/kernel/trace/trace_remote.c
@@ -21,6 +21,7 @@
enum tri_type {
TRI_CONSUMING,
TRI_NONCONSUMING,
+ TRI_DMESG,
};
struct trace_remote_iterator {
@@ -43,6 +44,7 @@ struct trace_remote {
void *priv;
struct trace_buffer *trace_buffer;
struct trace_buffer_desc *trace_buffer_desc;
+ struct trace_remote_iterator *dmesg;
struct dentry *dentry;
struct eventfs_inode *eventfs_root;
struct eventfs_inode *eventfs_subdir;
@@ -394,10 +396,15 @@ static int __alloc_ring_buffer_iter(struct trace_remote_iterator *iter, int cpu)
return 0;
}
+static bool trace_remote_do_dmesg(struct trace_remote *remote);
+
static void trace_remote_do_poll(struct trace_remote *remote)
{
+ bool yield;
+
ring_buffer_poll_remote(remote->trace_buffer, RING_BUFFER_ALL_CPUS);
- schedule_delayed_work(&remote->poll_work, msecs_to_jiffies(remote->poll_ms));
+ yield = trace_remote_do_dmesg(remote);
+ schedule_delayed_work(&remote->poll_work, yield ? 0 : msecs_to_jiffies(remote->poll_ms));
}
static void __poll_remote(struct work_struct *work)
@@ -452,6 +459,14 @@ static struct trace_remote_iterator
trace_seq_init(&iter->seq);
switch (type) {
+ case TRI_DMESG:
+ /* only one printk iter allowed */
+ if (WARN_ON_ONCE(remote->dmesg)) {
+ ret = -EBUSY;
+ break;
+ }
+ smp_store_release(&remote->dmesg, iter);
+ fallthrough;
case TRI_CONSUMING:
trace_remote_inc_poll(remote);
break;
@@ -486,6 +501,11 @@ static void trace_remote_iter_free(struct trace_remote_iterator *iter)
lockdep_assert_held(&remote->lock);
switch (iter->type) {
+ case TRI_DMESG:
+ WARN_ON_ONCE(remote->dmesg != iter);
+ smp_store_release(&remote->dmesg, NULL);
+ flush_delayed_work(&remote->poll_work);
+ fallthrough;
case TRI_CONSUMING:
trace_remote_dec_poll(remote);
break;
@@ -498,13 +518,24 @@ static void trace_remote_iter_free(struct trace_remote_iterator *iter)
trace_remote_put(remote);
}
+static bool trace_remote_iter_is_consuming(struct trace_remote_iterator *iter)
+{
+ switch (iter->type) {
+ case TRI_CONSUMING:
+ case TRI_DMESG:
+ return true;
+ default:
+ return false;
+ }
+}
+
static void trace_remote_iter_read_start(struct trace_remote_iterator *iter)
{
struct trace_remote *remote = iter->remote;
int cpu = iter->cpu;
/* Acquire global reader lock */
- if (cpu == RING_BUFFER_ALL_CPUS && iter->type == TRI_CONSUMING)
+ if (cpu == RING_BUFFER_ALL_CPUS && trace_remote_iter_is_consuming(iter))
down_write(&remote->reader_lock);
else
down_read(&remote->reader_lock);
@@ -521,7 +552,7 @@ static void trace_remote_iter_read_start(struct trace_remote_iterator *iter)
if (WARN_ON_ONCE(!remote->pcpu_reader_locks))
return;
- if (iter->type == TRI_CONSUMING)
+ if (trace_remote_iter_is_consuming(iter))
down_write(&remote->pcpu_reader_locks[cpu]);
else
down_read(&remote->pcpu_reader_locks[cpu]);
@@ -538,14 +569,14 @@ static void trace_remote_iter_read_finished(struct trace_remote_iterator *iter)
* No need for the remote lock here, iter holds a reference on
* remote->nr_readers
*/
- if (iter->type == TRI_CONSUMING)
+ if (trace_remote_iter_is_consuming(iter))
up_write(&remote->pcpu_reader_locks[cpu]);
else
up_read(&remote->pcpu_reader_locks[cpu]);
}
/* Release global reader lock */
- if (cpu == RING_BUFFER_ALL_CPUS && iter->type == TRI_CONSUMING)
+ if (cpu == RING_BUFFER_ALL_CPUS && trace_remote_iter_is_consuming(iter))
up_write(&remote->reader_lock);
else
up_read(&remote->reader_lock);
@@ -562,10 +593,9 @@ __peek_event(struct trace_remote_iterator *iter, int cpu, u64 *ts, unsigned long
struct ring_buffer_event *rb_evt;
struct ring_buffer_iter *rb_iter;
- switch (iter->type) {
- case TRI_CONSUMING:
+ if (trace_remote_iter_is_consuming(iter)) {
return ring_buffer_peek(iter->remote->trace_buffer, cpu, ts, lost_events);
- case TRI_NONCONSUMING:
+ } else {
rb_iter = __get_rb_iter(iter, cpu);
if (!rb_iter)
return NULL;
@@ -629,14 +659,10 @@ static void trace_remote_iter_move(struct trace_remote_iterator *iter)
{
struct trace_buffer *trace_buffer = iter->remote->trace_buffer;
- switch (iter->type) {
- case TRI_CONSUMING:
+ if (trace_remote_iter_is_consuming(iter))
ring_buffer_consume(trace_buffer, iter->evt_cpu, NULL, NULL);
- break;
- case TRI_NONCONSUMING:
+ else
ring_buffer_iter_advance(__get_rb_iter(iter, iter->evt_cpu));
- break;
- }
}
static struct remote_event *trace_remote_find_event(struct trace_remote *remote, unsigned short id);
@@ -882,6 +908,86 @@ static const struct file_operations trace_fops = {
.release = trace_release,
};
+static bool trace_remote_do_dmesg(struct trace_remote *remote)
+{
+ struct trace_remote_iterator *iter = smp_load_acquire(&remote->dmesg);
+ unsigned int max_events = 1000;
+
+ if (!iter)
+ return false;
+
+ trace_remote_iter_read_start(iter);
+
+ while (trace_remote_iter_read_event(iter)) {
+ trace_seq_init(&iter->seq);
+
+ trace_remote_iter_print_event(iter);
+ if (!pr_emerg("%s", iter->seq.buffer))
+ break;
+
+ trace_remote_iter_move(iter);
+
+ if (!(--max_events))
+ break;
+ }
+
+ trace_remote_iter_read_finished(iter);
+
+ return !max_events;
+}
+
+static int trace_remote_enable_dmesg(struct trace_remote *remote, bool enable)
+{
+ struct trace_remote_iterator *iter = remote->dmesg;
+
+ lockdep_assert_held(&remote->lock);
+
+ if (enable == !!iter)
+ return 0;
+
+ if (enable) {
+ iter = trace_remote_iter(remote, RING_BUFFER_ALL_CPUS, TRI_DMESG);
+ if (IS_ERR(iter))
+ return PTR_ERR(iter);
+ } else {
+ trace_remote_iter_free(remote->dmesg);
+ /* trace_remote_iter_free has reset remote->dmesg */
+ }
+
+ return 0;
+}
+
+static ssize_t
+dmesg_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+ struct seq_file *seq = filp->private_data;
+ struct trace_remote *remote = seq->private;
+ bool val;
+ int ret;
+
+ ret = kstrtobool_from_user(ubuf, cnt, &val);
+ if (ret)
+ return ret;
+
+ guard(mutex)(&remote->lock);
+
+ ret = trace_remote_enable_dmesg(remote, val);
+ if (ret)
+ return ret;
+
+ return cnt;
+}
+
+static int dmesg_show(struct seq_file *s, void *unused)
+{
+ struct trace_remote *remote = s->private;
+
+ seq_printf(s, "%d\n", !!remote->dmesg);
+
+ return 0;
+}
+DEFINE_TRACE_REMOTE_ATTRIBUTE(dmesg);
+
static int trace_remote_init_tracefs(const char *name, struct trace_remote *remote)
{
struct dentry *remote_d, *percpu_d, *d;
@@ -922,6 +1028,10 @@ static int trace_remote_init_tracefs(const char *name, struct trace_remote *remo
if (!d)
goto err;
+ d = trace_create_file("dmesg", TRACEFS_MODE_WRITE, remote_d, remote, &dmesg_fops);
+ if (!d)
+ goto err;
+
percpu_d = tracefs_create_dir("per_cpu", remote_d);
if (!percpu_d) {
pr_err("Failed to create tracefs dir "TRACEFS_DIR"%s/per_cpu/\n", name);
--
2.54.0.1032.g2f8565e1d1-goog
^ permalink raw reply related
* [PATCH v2 07/18] tracing/remotes: selftests: Add a test for the dmesg tracefs file
From: Vincent Donnefort @ 2026-06-05 16:38 UTC (permalink / raw)
To: rostedt, mhiramat, mathieu.desnoyers, linux-trace-kernel
Cc: kernel-team, linux-kernel, Vincent Donnefort
In-Reply-To: <20260605163825.1762953-1-vdonnefort@google.com>
Exercise the newly introduced dmesg tracefs file that turns on and off
the dmesg redirection.
Signed-off-by: Vincent Donnefort <vdonnefort@google.com>
diff --git a/tools/testing/selftests/ftrace/test.d/remotes/dmesg.tc b/tools/testing/selftests/ftrace/test.d/remotes/dmesg.tc
new file mode 100644
index 000000000000..aebeea9dbab6
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/remotes/dmesg.tc
@@ -0,0 +1,72 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Test trace remote dmesg redirection
+# requires: remotes/test
+
+. $TEST_DIR/remotes/functions
+
+test_dmesg()
+{
+ echo 0 > tracing_on
+ assert_unloaded
+
+ #
+ # Test dmesg on/off when tracing is disabled
+ #
+ echo 1 > dmesg
+ test $(cat dmesg) -eq 1
+ assert_loaded
+
+ echo 0 > dmesg
+ test $(cat dmesg) -eq 0
+ assert_unloaded
+
+ #
+ # Test events are logged to dmesg
+ #
+ dmesg -c > /dev/null
+
+ echo 1 > tracing_on
+ assert_loaded
+ echo 1 > dmesg
+ test $(cat dmesg) -eq 1
+
+ nr_events=128
+ for i in $(seq 1 $nr_events); do
+ echo $i > write_event
+ done
+
+ sleep 1
+ output=$(mktemp $TMPDIR/remote_test.XXXXXX)
+ dmesg | grep "selftest id=" | sed 's/^[^]]*] //'> $output
+
+ check_trace 1 $nr_events $output
+
+ rm $output
+
+ #
+ # Disable dmesg and Test events were not consumed by dmesg
+ #
+ echo 0 > dmesg
+ test $(cat dmesg) -eq 0
+
+ start_id=$(($nr_events + 1))
+ end_id=$(($start_id + $nr_events))
+
+ for i in $(seq $start_id $end_id); do
+ echo $i > write_event
+ done
+
+ sleep 1
+
+ output=$(dump_trace_pipe)
+ check_trace $start_id $end_id $output
+ rm $output
+}
+
+if [ -z "$SOURCE_REMOTE_TEST" ]; then
+ set -e
+
+ setup_remote_test
+ test_dmesg
+fi
diff --git a/tools/testing/selftests/ftrace/test.d/remotes/functions b/tools/testing/selftests/ftrace/test.d/remotes/functions
index 05224fac3653..4a14aa72fdf0 100644
--- a/tools/testing/selftests/ftrace/test.d/remotes/functions
+++ b/tools/testing/selftests/ftrace/test.d/remotes/functions
@@ -8,6 +8,7 @@ setup_remote()
cd remotes/$name/
echo 0 > tracing_on
+ echo 0 > dmesg
clear_trace
echo 7 > buffer_size_kb
echo 0 > events/enable
diff --git a/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/dmesg.tc b/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/dmesg.tc
new file mode 100644
index 000000000000..bf4a3c145e7a
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/dmesg.tc
@@ -0,0 +1,11 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Test the hypervisor trace dmesg redirection
+# requires: remotes/hypervisor/write_event
+
+SOURCE_REMOTE_TEST=1
+. $TEST_DIR/remotes/dmesg.tc
+
+set -e
+setup_remote "hypervisor"
+test_dmesg
--
2.54.0.1032.g2f8565e1d1-goog
^ permalink raw reply related
* [PATCH v2 08/18] tracing/remotes: selftests: Prefix hypervisor folder
From: Vincent Donnefort @ 2026-06-05 16:38 UTC (permalink / raw)
To: rostedt, mhiramat, mathieu.desnoyers, linux-trace-kernel
Cc: kernel-team, linux-kernel, Vincent Donnefort
In-Reply-To: <20260605163825.1762953-1-vdonnefort@google.com>
Avoid interleaving test runs by prefixing the hypervisor folder with "00".
Signed-off-by: Vincent Donnefort <vdonnefort@google.com>
diff --git a/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/buffer_size.tc b/tools/testing/selftests/ftrace/test.d/remotes/00hypervisor/buffer_size.tc
similarity index 100%
rename from tools/testing/selftests/ftrace/test.d/remotes/hypervisor/buffer_size.tc
rename to tools/testing/selftests/ftrace/test.d/remotes/00hypervisor/buffer_size.tc
diff --git a/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/dmesg.tc b/tools/testing/selftests/ftrace/test.d/remotes/00hypervisor/dmesg.tc
similarity index 100%
rename from tools/testing/selftests/ftrace/test.d/remotes/hypervisor/dmesg.tc
rename to tools/testing/selftests/ftrace/test.d/remotes/00hypervisor/dmesg.tc
diff --git a/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/hotplug.tc b/tools/testing/selftests/ftrace/test.d/remotes/00hypervisor/hotplug.tc
similarity index 100%
rename from tools/testing/selftests/ftrace/test.d/remotes/hypervisor/hotplug.tc
rename to tools/testing/selftests/ftrace/test.d/remotes/00hypervisor/hotplug.tc
diff --git a/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/reset.tc b/tools/testing/selftests/ftrace/test.d/remotes/00hypervisor/reset.tc
similarity index 100%
rename from tools/testing/selftests/ftrace/test.d/remotes/hypervisor/reset.tc
rename to tools/testing/selftests/ftrace/test.d/remotes/00hypervisor/reset.tc
diff --git a/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/trace.tc b/tools/testing/selftests/ftrace/test.d/remotes/00hypervisor/trace.tc
similarity index 100%
rename from tools/testing/selftests/ftrace/test.d/remotes/hypervisor/trace.tc
rename to tools/testing/selftests/ftrace/test.d/remotes/00hypervisor/trace.tc
diff --git a/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/trace_pipe.tc b/tools/testing/selftests/ftrace/test.d/remotes/00hypervisor/trace_pipe.tc
similarity index 100%
rename from tools/testing/selftests/ftrace/test.d/remotes/hypervisor/trace_pipe.tc
rename to tools/testing/selftests/ftrace/test.d/remotes/00hypervisor/trace_pipe.tc
diff --git a/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/unloading.tc b/tools/testing/selftests/ftrace/test.d/remotes/00hypervisor/unloading.tc
similarity index 100%
rename from tools/testing/selftests/ftrace/test.d/remotes/hypervisor/unloading.tc
rename to tools/testing/selftests/ftrace/test.d/remotes/00hypervisor/unloading.tc
--
2.54.0.1032.g2f8565e1d1-goog
^ permalink raw reply
* [PATCH v2 09/18] ring-buffer: Use irqsave for the reader lock in ring_buffer_poll_remote
From: Vincent Donnefort @ 2026-06-05 16:38 UTC (permalink / raw)
To: rostedt, mhiramat, mathieu.desnoyers, linux-trace-kernel
Cc: kernel-team, linux-kernel, Vincent Donnefort, Sashiko
In-Reply-To: <20260605163825.1762953-1-vdonnefort@google.com>
Calling rb_wakeups with the reader lock held but interrupts enabled can lead
to a deadlock: the irq_work might run on the same CPU, but will
block when acquiring that very same reader spinlock.
First, rb_wakeups doesn't even need to be called under the reader lock.
Move the function outside of the reader lock scope.
Second, the reader lock must be taken with IRQs disabled anyway. Use
the irqsave variant of the spinlock.
Fixes: 2e67fabd8b77 ("ring-buffer: Introduce ring-buffer remotes")
Reported-by: Sashiko <sashiko-bot@kernel.org>
Signed-off-by: Vincent Donnefort <vdonnefort@google.com>
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 7b07d2004cc6..183326633037 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -6628,13 +6628,17 @@ int ring_buffer_poll_remote(struct trace_buffer *buffer, int cpu)
struct ring_buffer_per_cpu *cpu_buffer;
if (cpu != RING_BUFFER_ALL_CPUS) {
+ bool wakeup;
+
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return -EINVAL;
cpu_buffer = buffer->buffers[cpu];
- guard(raw_spinlock)(&cpu_buffer->reader_lock);
- if (rb_read_remote_meta_page(cpu_buffer))
+ scoped_guard(raw_spinlock_irqsave, &cpu_buffer->reader_lock)
+ wakeup = rb_read_remote_meta_page(cpu_buffer);
+
+ if (wakeup)
rb_wakeups(buffer, cpu_buffer);
return 0;
@@ -6649,7 +6653,7 @@ int ring_buffer_poll_remote(struct trace_buffer *buffer, int cpu)
for_each_buffer_cpu(buffer, cpu) {
cpu_buffer = buffer->buffers[cpu];
- guard(raw_spinlock)(&cpu_buffer->reader_lock);
+ guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock);
rb_read_remote_meta_page(cpu_buffer);
}
--
2.54.0.1032.g2f8565e1d1-goog
^ permalink raw reply related
* [PATCH v2 10/18] ring-buffer: Use panic-friendly locking in ring_buffer_iter interface
From: Vincent Donnefort @ 2026-06-05 16:38 UTC (permalink / raw)
To: rostedt, mhiramat, mathieu.desnoyers, linux-trace-kernel
Cc: kernel-team, linux-kernel, Vincent Donnefort
In-Reply-To: <20260605163825.1762953-1-vdonnefort@google.com>
In preparation for allowing trace_remote to dump the buffer on panic,
make the non-consuming iterator functions panic-friendly.
Signed-off-by: Vincent Donnefort <vdonnefort@google.com>
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 183326633037..88ef44e2da53 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -5444,6 +5444,9 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
}
}
+static inline bool rb_reader_lock(struct ring_buffer_per_cpu *cpu_buffer);
+static inline void rb_reader_unlock(struct ring_buffer_per_cpu *cpu_buffer, bool locked);
+
/**
* ring_buffer_iter_reset - reset an iterator
* @iter: The iterator to reset
@@ -5455,15 +5458,18 @@ void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
{
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long flags;
+ bool dolock;
if (!iter)
return;
cpu_buffer = iter->cpu_buffer;
- raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ local_irq_save(flags);
+ dolock = rb_reader_lock(cpu_buffer);
rb_iter_reset(iter);
- raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+ rb_reader_unlock(cpu_buffer, dolock);
+ local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(ring_buffer_iter_reset);
@@ -6127,11 +6133,14 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
struct ring_buffer_event *event;
unsigned long flags;
+ bool dolock;
again:
- raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ local_irq_save(flags);
+ dolock = rb_reader_lock(cpu_buffer);
event = rb_iter_peek(iter, ts);
- raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+ rb_reader_unlock(cpu_buffer, dolock);
+ local_irq_restore(flags);
if (event && event->type_len == RINGBUF_TYPE_PADDING)
goto again;
@@ -6269,12 +6278,15 @@ void ring_buffer_iter_advance(struct ring_buffer_iter *iter)
{
struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
unsigned long flags;
+ bool dolock;
- raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ local_irq_save(flags);
+ dolock = rb_reader_lock(cpu_buffer);
iter->missed_events = 0;
rb_advance_iter(iter);
- raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+ rb_reader_unlock(cpu_buffer, dolock);
+ local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(ring_buffer_iter_advance);
--
2.54.0.1032.g2f8565e1d1-goog
^ permalink raw reply related
* [PATCH v2 11/18] ring-buffer: Add ring_buffer_read_remote_meta_page()
From: Vincent Donnefort @ 2026-06-05 16:38 UTC (permalink / raw)
To: rostedt, mhiramat, mathieu.desnoyers, linux-trace-kernel
Cc: kernel-team, linux-kernel, Vincent Donnefort
In-Reply-To: <20260605163825.1762953-1-vdonnefort@google.com>
In preparation for the introduction of a panic handler for trace
remotes, add ring_buffer_read_remote_meta_page(). This is similar to
ring_buffer_poll_remote(), but it doesn't try to wake up readers and,
in the !RING_BUFFER_ALL_CPUS case, it uses panic-friendly
locking.
While at it, update trace_remote_has_cpu() to use this new function
instead of ring_buffer_poll_remote(), avoiding unnecessary wakeups when
verifying if a CPU buffer is active.
Signed-off-by: Vincent Donnefort <vdonnefort@google.com>
diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 994f52b34344..6e008a548063 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -298,6 +298,7 @@ struct ring_buffer_remote {
void *priv;
};
+int ring_buffer_read_remote_meta_page(struct trace_buffer *buffer, int cpu);
int ring_buffer_poll_remote(struct trace_buffer *buffer, int cpu);
struct trace_buffer *
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 88ef44e2da53..efae14e3ba80 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -6635,6 +6635,47 @@ bool ring_buffer_empty_cpu(struct trace_buffer *buffer, int cpu)
}
EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu);
+/**
+ * ring_buffer_read_remote_meta_page - read the meta page of a remote ring buffer
+ * @buffer: The ring buffer
+ * @cpu: The CPU buffer to read (or RING_BUFFER_ALL_CPUS)
+ *
+ * Returns:
+ * 0 on success, or -EINVAL if the CPU is not in the buffer's cpumask.
+ */
+int ring_buffer_read_remote_meta_page(struct trace_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ if (cpu != RING_BUFFER_ALL_CPUS) {
+ unsigned long flags;
+ bool dolock;
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return -EINVAL;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ local_irq_save(flags);
+ dolock = rb_reader_lock(cpu_buffer);
+ rb_read_remote_meta_page(cpu_buffer);
+ rb_reader_unlock(cpu_buffer, dolock);
+ local_irq_restore(flags);
+ return 0;
+ }
+
+ guard(cpus_read_lock)();
+
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+
+ guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock);
+ rb_read_remote_meta_page(cpu_buffer);
+ }
+
+ return 0;
+}
+
int ring_buffer_poll_remote(struct trace_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
diff --git a/kernel/trace/trace_remote.c b/kernel/trace/trace_remote.c
index 19dfa355b7f3..b0404ad8981d 100644
--- a/kernel/trace/trace_remote.c
+++ b/kernel/trace/trace_remote.c
@@ -350,7 +350,7 @@ static bool trace_remote_has_cpu(struct trace_remote *remote, int cpu)
if (cpu == RING_BUFFER_ALL_CPUS)
return true;
- return ring_buffer_poll_remote(remote->trace_buffer, cpu) == 0;
+ return ring_buffer_read_remote_meta_page(remote->trace_buffer, cpu) == 0;
}
static void __free_ring_buffer_iter(struct trace_remote_iterator *iter, int cpu)
--
2.54.0.1032.g2f8565e1d1-goog
^ permalink raw reply related
* [PATCH v2 12/18] ring-buffer: Add kerneldoc for ring_buffer_poll_remote
From: Vincent Donnefort @ 2026-06-05 16:38 UTC (permalink / raw)
To: rostedt, mhiramat, mathieu.desnoyers, linux-trace-kernel
Cc: kernel-team, linux-kernel, Vincent Donnefort
In-Reply-To: <20260605163825.1762953-1-vdonnefort@google.com>
Document ring_buffer_poll_remote().
Signed-off-by: Vincent Donnefort <vdonnefort@google.com>
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index efae14e3ba80..2f99f77a039b 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -6676,6 +6676,17 @@ int ring_buffer_read_remote_meta_page(struct trace_buffer *buffer, int cpu)
return 0;
}
+/**
+ * ring_buffer_poll_remote - poll a remote ring buffer for new data
+ * @buffer: The ring buffer
+ * @cpu: The CPU buffer to poll (or RING_BUFFER_ALL_CPUS)
+ *
+ * This function polls the specified remote CPU buffer (or all of them)
+ * by reading its meta page to update the local reader's view. If new
+ * entries are detected, it triggers wakeups for any waiting readers.
+ * Returns:
+ * 0 on success, or -EINVAL if the CPU is not in the buffer's cpumask.
+ */
int ring_buffer_poll_remote(struct trace_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
--
2.54.0.1032.g2f8565e1d1-goog
^ permalink raw reply related
* [PATCH v2 13/18] tracing/remotes: Add dump_on_panic tracefs file
From: Vincent Donnefort @ 2026-06-05 16:38 UTC (permalink / raw)
To: rostedt, mhiramat, mathieu.desnoyers, linux-trace-kernel
Cc: kernel-team, linux-kernel, Vincent Donnefort
In-Reply-To: <20260605163825.1762953-1-vdonnefort@google.com>
When enabled, dump_on_panic will dump the content of the trace remote
buffer if the system panics.
Signed-off-by: Vincent Donnefort <vdonnefort@google.com>
diff --git a/kernel/trace/trace_remote.c b/kernel/trace/trace_remote.c
index b0404ad8981d..cf99752e1cd5 100644
--- a/kernel/trace/trace_remote.c
+++ b/kernel/trace/trace_remote.c
@@ -7,6 +7,7 @@
#include <linux/kstrtox.h>
#include <linux/lockdep.h>
#include <linux/mutex.h>
+#include <linux/panic_notifier.h>
#include <linux/tracefs.h>
#include <linux/trace_remote.h>
#include <linux/trace_seq.h>
@@ -22,6 +23,7 @@ enum tri_type {
TRI_CONSUMING,
TRI_NONCONSUMING,
TRI_DMESG,
+ TRI_PANIC,
};
struct trace_remote_iterator {
@@ -60,6 +62,9 @@ struct trace_remote {
struct delayed_work poll_work;
unsigned int poll_cnt;
bool tracing_on;
+ bool panic_on;
+ struct notifier_block panic_notifier;
+ struct trace_remote_iterator *panic_iter;
};
static DEFINE_MUTEX(trace_remotes_lock);
@@ -71,10 +76,15 @@ static bool trace_remote_loaded(struct trace_remote *remote)
return !!remote->trace_buffer;
}
+static void trace_remote_unload(struct trace_remote *remote);
+static int trace_remote_panic_load(struct trace_remote *remote);
+static void trace_remote_panic_unload(struct trace_remote *remote);
+
static int trace_remote_load(struct trace_remote *remote)
{
struct ring_buffer_remote *rb_remote = &remote->rb_remote;
struct trace_buffer_desc *desc;
+ int ret = 0;
lockdep_assert_held(&remote->lock);
@@ -89,15 +99,28 @@ static int trace_remote_load(struct trace_remote *remote)
rb_remote->swap_reader_page = remote->cbs->swap_reader_page;
rb_remote->priv = remote->priv;
rb_remote->reset = remote->cbs->reset;
+ remote->trace_buffer_desc = desc;
remote->trace_buffer = ring_buffer_alloc_remote(rb_remote);
if (!remote->trace_buffer) {
remote->cbs->unload_trace_buffer(desc, remote->priv);
return -ENOMEM;
}
- remote->trace_buffer_desc = desc;
+ if (remote->panic_on) {
+ ret = trace_remote_panic_load(remote);
+ if (ret)
+ trace_remote_unload(remote);
+ }
- return 0;
+ return ret;
+}
+
+static void trace_remote_unload(struct trace_remote *remote)
+{
+ trace_remote_panic_unload(remote);
+ ring_buffer_free(remote->trace_buffer);
+ remote->trace_buffer = NULL;
+ remote->cbs->unload_trace_buffer(remote->trace_buffer_desc, remote->priv);
}
static void trace_remote_try_unload(struct trace_remote *remote)
@@ -115,9 +138,7 @@ static void trace_remote_try_unload(struct trace_remote *remote)
if (!ring_buffer_empty(remote->trace_buffer))
return;
- ring_buffer_free(remote->trace_buffer);
- remote->trace_buffer = NULL;
- remote->cbs->unload_trace_buffer(remote->trace_buffer_desc, remote->priv);
+ trace_remote_unload(remote);
}
static int trace_remote_enable_tracing(struct trace_remote *remote)
@@ -434,58 +455,68 @@ static void trace_remote_dec_poll(struct trace_remote *remote)
static struct trace_remote_iterator
*trace_remote_iter(struct trace_remote *remote, int cpu, enum tri_type type)
{
- struct trace_remote_iterator *iter = NULL;
+ struct trace_remote_iterator *iter __free(kfree) = kzalloc_obj(*iter);
int ret;
lockdep_assert_held(&remote->lock);
- if (type == TRI_NONCONSUMING && !trace_remote_loaded(remote))
- return NULL;
+ if (!iter)
+ return ERR_PTR(-ENOMEM);
- ret = trace_remote_get(remote, cpu);
- if (ret)
- return ERR_PTR(ret);
+ switch (type) {
+ case TRI_NONCONSUMING:
+ if (!trace_remote_loaded(remote))
+ return NULL;
+ fallthrough;
+ case TRI_CONSUMING:
+ case TRI_DMESG:
+ ret = trace_remote_get(remote, cpu);
+ if (ret)
+ return ERR_PTR(ret);
+ break;
+ case TRI_PANIC:
+ break;
+ }
if (!trace_remote_has_cpu(remote, cpu)) {
ret = -ENODEV;
goto err;
}
- iter = kzalloc_obj(*iter);
- if (iter) {
- iter->remote = remote;
- iter->cpu = cpu;
- iter->type = type;
- trace_seq_init(&iter->seq);
+ iter->remote = remote;
+ iter->cpu = cpu;
+ iter->type = type;
+ trace_seq_init(&iter->seq);
- switch (type) {
- case TRI_DMESG:
- /* only one printk iter allowed */
- if (WARN_ON_ONCE(remote->dmesg)) {
- ret = -EBUSY;
- break;
- }
- smp_store_release(&remote->dmesg, iter);
- fallthrough;
- case TRI_CONSUMING:
- trace_remote_inc_poll(remote);
- break;
- case TRI_NONCONSUMING:
- ret = __alloc_ring_buffer_iter(iter, cpu);
- break;
+ switch (type) {
+ case TRI_DMESG:
+ /* only one dmesg iter allowed */
+ if (WARN_ON_ONCE(remote->dmesg)) {
+ ret = -EBUSY;
+ goto err;
}
-
+ smp_store_release(&remote->dmesg, iter);
+ fallthrough;
+ case TRI_CONSUMING:
+ trace_remote_inc_poll(remote);
+ break;
+ case TRI_PANIC:
+ case TRI_NONCONSUMING:
+ ret = __alloc_ring_buffer_iter(iter, cpu);
if (ret)
goto err;
-
- return iter;
+ break;
}
- ret = -ENOMEM;
-err:
- kfree(iter);
- trace_remote_put(remote);
+ return no_free_ptr(iter);
+err:
+ switch (type) {
+ case TRI_PANIC:
+ break;
+ default:
+ trace_remote_put(remote);
+ }
return ERR_PTR(ret);
}
@@ -508,14 +539,18 @@ static void trace_remote_iter_free(struct trace_remote_iterator *iter)
fallthrough;
case TRI_CONSUMING:
trace_remote_dec_poll(remote);
+ trace_remote_put(remote);
break;
case TRI_NONCONSUMING:
+ trace_remote_put(remote);
+ __free_ring_buffer_iter(iter, iter->cpu);
+ break;
+ case TRI_PANIC:
__free_ring_buffer_iter(iter, iter->cpu);
break;
}
kfree(iter);
- trace_remote_put(remote);
}
static bool trace_remote_iter_is_consuming(struct trace_remote_iterator *iter)
@@ -988,6 +1023,116 @@ static int dmesg_show(struct seq_file *s, void *unused)
}
DEFINE_TRACE_REMOTE_ATTRIBUTE(dmesg);
+static int trace_remote_panic_handler(struct notifier_block *self, unsigned long ev, void *v)
+{
+ struct trace_remote *remote = container_of(self, struct trace_remote, panic_notifier);
+ struct trace_remote_iterator *iter = smp_load_acquire(&remote->panic_iter);
+ int cpu;
+
+ if (!iter) {
+ pr_warn("Unexpected error: no panic iterator for the trace remote\n");
+ return NOTIFY_DONE;
+ }
+
+ for_each_possible_cpu(cpu) {
+ if (iter->rb_iters[cpu]) {
+ /* No RING_BUFFER_ALL_CPUS to avoid taking cpus_read_lock() */
+ ring_buffer_read_remote_meta_page(remote->trace_buffer, cpu);
+ ring_buffer_iter_reset(iter->rb_iters[cpu]);
+ }
+ }
+
+ while (trace_remote_iter_read_event(iter)) {
+ trace_seq_init(&iter->seq);
+
+ trace_remote_iter_print_event(iter);
+ pr_emerg("%s", iter->seq.buffer);
+
+ trace_remote_iter_move(iter);
+ }
+
+ return NOTIFY_DONE;
+}
+
+static int trace_remote_panic_load(struct trace_remote *remote)
+{
+ struct notifier_block *notifier = &remote->panic_notifier;
+ struct trace_remote_iterator *iter;
+
+ lockdep_assert_held(&remote->lock);
+
+ if (remote->panic_iter)
+ return 0;
+
+ iter = trace_remote_iter(remote, RING_BUFFER_ALL_CPUS, TRI_PANIC);
+ if (IS_ERR(iter))
+ return PTR_ERR(iter);
+
+ smp_store_release(&remote->panic_iter, iter);
+
+ notifier->notifier_call = trace_remote_panic_handler;
+ notifier->priority = INT_MAX - 1;
+ atomic_notifier_chain_register(&panic_notifier_list, notifier);
+
+ return 0;
+}
+
+static void trace_remote_panic_unload(struct trace_remote *remote)
+{
+ struct trace_remote_iterator *iter = remote->panic_iter;
+
+ lockdep_assert_held(&remote->lock);
+
+ if (!iter)
+ return;
+
+ atomic_notifier_chain_unregister(&panic_notifier_list, &remote->panic_notifier);
+ smp_store_release(&remote->panic_iter, NULL);
+ trace_remote_iter_free(iter);
+}
+
+static ssize_t dump_on_panic_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct seq_file *seq = filp->private_data;
+ struct trace_remote *remote = seq->private;
+ bool enable;
+ int ret;
+
+ ret = kstrtobool_from_user(ubuf, cnt, &enable);
+ if (ret)
+ return ret;
+
+ guard(mutex)(&remote->lock);
+
+ if (enable == remote->panic_on)
+ return cnt;
+
+ if (trace_remote_loaded(remote)) {
+ if (enable) {
+ ret = trace_remote_panic_load(remote);
+ if (ret)
+ return ret;
+ } else {
+ trace_remote_panic_unload(remote);
+ }
+ }
+
+ remote->panic_on = enable;
+
+ return cnt;
+}
+
+static int dump_on_panic_show(struct seq_file *s, void *unused)
+{
+ struct trace_remote *remote = s->private;
+
+ seq_printf(s, "%d\n", remote->panic_on);
+
+ return 0;
+}
+DEFINE_TRACE_REMOTE_ATTRIBUTE(dump_on_panic);
+
static int trace_remote_init_tracefs(const char *name, struct trace_remote *remote)
{
struct dentry *remote_d, *percpu_d, *d;
@@ -1015,6 +1160,11 @@ static int trace_remote_init_tracefs(const char *name, struct trace_remote *remo
if (!d)
goto err;
+ d = trace_create_file("dump_on_panic", TRACEFS_MODE_WRITE, remote_d, remote,
+ &dump_on_panic_fops);
+ if (!d)
+ goto err;
+
d = trace_create_file("buffer_size_kb", TRACEFS_MODE_WRITE, remote_d, remote,
&buffer_size_kb_fops);
if (!d)
--
2.54.0.1032.g2f8565e1d1-goog
^ permalink raw reply related
* [PATCH v2 14/18] tracing/remotes: selftests: Add a test for the dump_on_panic tracefs file
From: Vincent Donnefort @ 2026-06-05 16:38 UTC (permalink / raw)
To: rostedt, mhiramat, mathieu.desnoyers, linux-trace-kernel
Cc: kernel-team, linux-kernel, Vincent Donnefort
In-Reply-To: <20260605163825.1762953-1-vdonnefort@google.com>
Exercise the newly introduced dump_on_panic tracefs file that turns on
or off the trace remote buffer dump on system panic.
Signed-off-by: Vincent Donnefort <vdonnefort@google.com>
diff --git a/tools/testing/selftests/ftrace/test.d/remotes/00hypervisor/dump_on_panic.tc b/tools/testing/selftests/ftrace/test.d/remotes/00hypervisor/dump_on_panic.tc
new file mode 100644
index 000000000000..5e3d3c412ecd
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/remotes/00hypervisor/dump_on_panic.tc
@@ -0,0 +1,11 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Test hypervisor trace dump_on_panic
+# requires: remotes/hypervisor/write_event
+
+SOURCE_REMOTE_TEST=1
+. $TEST_DIR/remotes/dump_on_panic.tc
+
+set -e
+setup_remote "hypervisor"
+test_dump_on_panic
diff --git a/tools/testing/selftests/ftrace/test.d/remotes/dump_on_panic.tc b/tools/testing/selftests/ftrace/test.d/remotes/dump_on_panic.tc
new file mode 100644
index 000000000000..defc6f3a07ca
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/remotes/dump_on_panic.tc
@@ -0,0 +1,51 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Test trace remote dump_on_panic
+# requires: remotes/test
+
+. $TEST_DIR/remotes/functions
+
+test_dump_on_panic()
+{
+ #
+ # Toggle when the buffer is unloaded
+ #
+ echo 1 > dump_on_panic
+ echo 0 > dump_on_panic
+
+ #
+ # Toggle when the buffer is loaded
+ #
+ echo 1 > tracing_on
+ assert_loaded
+
+ echo 1 > dump_on_panic
+ echo 0 > dump_on_panic
+
+ #
+ # Load and unload buffer while dump_on_panic is enabled
+ #
+ echo 0 > tracing_on
+ assert_unloaded
+
+ echo 1 > dump_on_panic
+ echo 1 > tracing_on
+ echo 0 > tracing_on
+
+ # REMOVE ME FOR A PROPER OOPS TEST
+ return
+
+ echo 1 > tracing_on
+
+ for i in $(seq 1 32); do
+ echo $i > write_event
+ done
+
+ echo c > /proc/sysrq-trigger
+}
+
+if [ -z "$SOURCE_REMOTE_TEST" ]; then
+ set -e
+ setup_remote_test
+ test_dump_on_panic
+fi
diff --git a/tools/testing/selftests/ftrace/test.d/remotes/functions b/tools/testing/selftests/ftrace/test.d/remotes/functions
index 4a14aa72fdf0..bdd28b5b8596 100644
--- a/tools/testing/selftests/ftrace/test.d/remotes/functions
+++ b/tools/testing/selftests/ftrace/test.d/remotes/functions
@@ -9,6 +9,7 @@ setup_remote()
cd remotes/$name/
echo 0 > tracing_on
echo 0 > dmesg
+ echo 0 > dump_on_panic
clear_trace
echo 7 > buffer_size_kb
echo 0 > events/enable
--
2.54.0.1032.g2f8565e1d1-goog
^ permalink raw reply related
* [PATCH v2 15/18] tracing/remotes: Add poll_ms tracefs file
From: Vincent Donnefort @ 2026-06-05 16:38 UTC (permalink / raw)
To: rostedt, mhiramat, mathieu.desnoyers, linux-trace-kernel
Cc: kernel-team, linux-kernel, Vincent Donnefort
In-Reply-To: <20260605163825.1762953-1-vdonnefort@google.com>
Add a tracefs file to configure the trace remote polling period. Keep
the default value at 100ms.
Signed-off-by: Vincent Donnefort <vdonnefort@google.com>
diff --git a/kernel/trace/trace_remote.c b/kernel/trace/trace_remote.c
index cf99752e1cd5..f72fc862ae7f 100644
--- a/kernel/trace/trace_remote.c
+++ b/kernel/trace/trace_remote.c
@@ -1133,6 +1133,40 @@ static int dump_on_panic_show(struct seq_file *s, void *unused)
}
DEFINE_TRACE_REMOTE_ATTRIBUTE(dump_on_panic);
+static ssize_t poll_ms_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+ struct seq_file *seq = filp->private_data;
+ struct trace_remote *remote = seq->private;
+ unsigned int val;
+ int ret;
+
+ ret = kstrtouint_from_user(ubuf, cnt, 10, &val);
+ if (ret)
+ return ret;
+
+ if (!val)
+ return -EINVAL;
+
+ guard(mutex)(&remote->lock);
+
+ if (val < remote->poll_ms && remote->poll_cnt)
+ mod_delayed_work(system_percpu_wq, &remote->poll_work, msecs_to_jiffies(val));
+
+ remote->poll_ms = val;
+
+ return cnt;
+}
+
+static int poll_ms_show(struct seq_file *s, void *unused)
+{
+ struct trace_remote *remote = s->private;
+
+ seq_printf(s, "%u\n", remote->poll_ms);
+
+ return 0;
+}
+DEFINE_TRACE_REMOTE_ATTRIBUTE(poll_ms);
+
static int trace_remote_init_tracefs(const char *name, struct trace_remote *remote)
{
struct dentry *remote_d, *percpu_d, *d;
@@ -1165,6 +1199,10 @@ static int trace_remote_init_tracefs(const char *name, struct trace_remote *remo
if (!d)
goto err;
+ d = trace_create_file("poll_ms", TRACEFS_MODE_WRITE, remote_d, remote, &poll_ms_fops);
+ if (!d)
+ goto err;
+
d = trace_create_file("buffer_size_kb", TRACEFS_MODE_WRITE, remote_d, remote,
&buffer_size_kb_fops);
if (!d)
--
2.54.0.1032.g2f8565e1d1-goog
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox