* [PATCH mm-unstable v18 06/14] mm/khugepaged: generalize collapse_huge_page for mTHP collapse
From: Nico Pache @ 2026-05-22 15:00 UTC (permalink / raw)
To: linux-doc, linux-kernel, linux-mm, linux-trace-kernel
Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
byungchul, catalin.marinas, cl, corbet, dave.hansen, david,
dev.jain, gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
joshua.hahnjy, kas, lance.yang, liam, ljs, mathieu.desnoyers,
matthew.brost, mhiramat, mhocko, npache, peterx, pfalcato,
rakie.kim, raquini, rdunlap, richard.weiyang, rientjes, rostedt,
rppt, ryan.roberts, shivankg, sunnanyong, surenb,
thomas.hellstrom, tiwai, usamaarif642, vbabka, vishal.moola,
wangkefeng.wang, will, willy, yang, ying.huang, ziy, zokeefe,
Usama Arif
In-Reply-To: <20260522150009.121603-1-npache@redhat.com>
Pass an order and offset to collapse_huge_page to support collapsing anon
memory to arbitrary orders within a PMD. order indicates what mTHP size we
are attempting to collapse to, and offset indicates where in the PMD to
start the collapse attempt.
For non-PMD collapse we must leave the anon VMA write locked until after
we collapse the mTHP-- in the PMD case all the pages are isolated, but in
the mTHP case this is not true, and we must keep the lock to prevent
access/changes to the page tables. This can happen if the rmap walkers hit
a pmd_none while the PMD entry is currently unavailable due to being
temporarily removed during the collapse phase.
Acked-by: Usama Arif <usama.arif@linux.dev>
Signed-off-by: Nico Pache <npache@redhat.com>
---
mm/khugepaged.c | 93 +++++++++++++++++++++++++++++--------------------
1 file changed, 55 insertions(+), 38 deletions(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index fab35d318641..d64f42f66236 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1214,34 +1214,36 @@ static enum scan_result alloc_charge_folio(struct folio **foliop, struct mm_stru
* while allocating a THP, as that could trigger direct reclaim/compaction.
* Note that the VMA must be rechecked after grabbing the mmap_lock again.
*/
-static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long address,
- int referenced, int unmapped, struct collapse_control *cc)
+static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long start_addr,
+ int referenced, int unmapped, struct collapse_control *cc,
+ unsigned int order)
{
+ const unsigned long pmd_addr = start_addr & HPAGE_PMD_MASK;
+ const unsigned long end_addr = start_addr + (PAGE_SIZE << order);
LIST_HEAD(compound_pagelist);
pmd_t *pmd, _pmd;
- pte_t *pte;
+ pte_t *pte = NULL;
pgtable_t pgtable;
struct folio *folio;
spinlock_t *pmd_ptl, *pte_ptl;
enum scan_result result = SCAN_FAIL;
struct vm_area_struct *vma;
struct mmu_notifier_range range;
+ bool anon_vma_locked = false;
- VM_BUG_ON(address & ~HPAGE_PMD_MASK);
-
- result = alloc_charge_folio(&folio, mm, cc, HPAGE_PMD_ORDER);
+ result = alloc_charge_folio(&folio, mm, cc, order);
if (result != SCAN_SUCCEED)
goto out_nolock;
mmap_read_lock(mm);
- result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
- HPAGE_PMD_ORDER);
+ result = hugepage_vma_revalidate(mm, pmd_addr, /*expect_anon=*/ true,
+ &vma, cc, order);
if (result != SCAN_SUCCEED) {
mmap_read_unlock(mm);
goto out_nolock;
}
- result = find_pmd_or_thp_or_none(mm, address, &pmd);
+ result = find_pmd_or_thp_or_none(mm, pmd_addr, &pmd);
if (result != SCAN_SUCCEED) {
mmap_read_unlock(mm);
goto out_nolock;
@@ -1253,8 +1255,8 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
* released when it fails. So we jump out_nolock directly in
* that case. Continuing to collapse causes inconsistency.
*/
- result = __collapse_huge_page_swapin(mm, vma, address, pmd,
- referenced, HPAGE_PMD_ORDER);
+ result = __collapse_huge_page_swapin(mm, vma, start_addr, pmd,
+ referenced, order);
if (result != SCAN_SUCCEED)
goto out_nolock;
}
@@ -1269,20 +1271,21 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
* mmap_lock.
*/
mmap_write_lock(mm);
- result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
- HPAGE_PMD_ORDER);
+ result = hugepage_vma_revalidate(mm, pmd_addr, /*expect_anon=*/ true,
+ &vma, cc, order);
if (result != SCAN_SUCCEED)
goto out_up_write;
/* check if the pmd is still valid */
vma_start_write(vma);
- result = check_pmd_still_valid(mm, address, pmd);
+ result = check_pmd_still_valid(mm, pmd_addr, pmd);
if (result != SCAN_SUCCEED)
goto out_up_write;
anon_vma_lock_write(vma->anon_vma);
+ anon_vma_locked = true;
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
- address + HPAGE_PMD_SIZE);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, start_addr,
+ end_addr);
mmu_notifier_invalidate_range_start(&range);
pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
@@ -1294,26 +1297,23 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
* Parallel GUP-fast is fine since GUP-fast will back off when
* it detects PMD is changed.
*/
- _pmd = pmdp_collapse_flush(vma, address, pmd);
+ _pmd = pmdp_collapse_flush(vma, pmd_addr, pmd);
spin_unlock(pmd_ptl);
mmu_notifier_invalidate_range_end(&range);
tlb_remove_table_sync_one();
- pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
+ pte = pte_offset_map_lock(mm, &_pmd, start_addr, &pte_ptl);
if (pte) {
- result = __collapse_huge_page_isolate(vma, address, pte, cc,
- HPAGE_PMD_ORDER,
- &compound_pagelist);
+ result = __collapse_huge_page_isolate(vma, start_addr, pte, cc,
+ order, &compound_pagelist);
spin_unlock(pte_ptl);
} else {
result = SCAN_NO_PTE_TABLE;
}
if (unlikely(result != SCAN_SUCCEED)) {
- if (pte)
- pte_unmap(pte);
spin_lock(pmd_ptl);
- BUG_ON(!pmd_none(*pmd));
+ WARN_ON_ONCE(!pmd_none(*pmd));
/*
* We can only use set_pmd_at when establishing
* hugepmds and never for establishing regular pmds that
@@ -1321,21 +1321,24 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
*/
pmd_populate(mm, pmd, pmd_pgtable(_pmd));
spin_unlock(pmd_ptl);
- anon_vma_unlock_write(vma->anon_vma);
goto out_up_write;
}
/*
- * All pages are isolated and locked so anon_vma rmap
- * can't run anymore.
+ * For PMD collapse all pages are isolated and locked so anon_vma
+ * rmap can't run anymore. For mTHP collapse the PMD entry has been
+ * removed and not all pages are isolated and locked, so we must hold
+ * the lock to prevent neighboring folios from attempting to access
+ * this PMD until its reinstalled.
*/
- anon_vma_unlock_write(vma->anon_vma);
+ if (is_pmd_order(order)) {
+ anon_vma_unlock_write(vma->anon_vma);
+ anon_vma_locked = false;
+ }
result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
- vma, address, pte_ptl,
- HPAGE_PMD_ORDER,
- &compound_pagelist);
- pte_unmap(pte);
+ vma, start_addr, pte_ptl,
+ order, &compound_pagelist);
if (unlikely(result != SCAN_SUCCEED))
goto out_up_write;
@@ -1345,18 +1348,32 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
* write.
*/
__folio_mark_uptodate(folio);
- pgtable = pmd_pgtable(_pmd);
-
spin_lock(pmd_ptl);
- BUG_ON(!pmd_none(*pmd));
- pgtable_trans_huge_deposit(mm, pmd, pgtable);
- map_anon_folio_pmd_nopf(folio, pmd, vma, address);
+ WARN_ON_ONCE(!pmd_none(*pmd));
+ if (is_pmd_order(order)) {
+ pgtable = pmd_pgtable(_pmd);
+ pgtable_trans_huge_deposit(mm, pmd, pgtable);
+ map_anon_folio_pmd_nopf(folio, pmd, vma, pmd_addr);
+ } else {
+ /*
+ * set_ptes is called in map_anon_folio_pte_nopf with the
+ * pmd_ptl lock still held; this is safe as the PMD is expected
+ * to be none. The pmd entry is then repopulated below.
+ */
+ map_anon_folio_pte_nopf(folio, pte, vma, start_addr, /*uffd_wp=*/ false);
+ smp_wmb(); /* make PTEs visible before PMD. See pmd_install() */
+ pmd_populate(mm, pmd, pmd_pgtable(_pmd));
+ }
spin_unlock(pmd_ptl);
folio = NULL;
result = SCAN_SUCCEED;
out_up_write:
+ if (anon_vma_locked)
+ anon_vma_unlock_write(vma->anon_vma);
+ if (pte)
+ pte_unmap(pte);
mmap_write_unlock(mm);
out_nolock:
if (folio)
@@ -1536,7 +1553,7 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
/* collapse_huge_page expects the lock to be dropped before calling */
mmap_read_unlock(mm);
result = collapse_huge_page(mm, start_addr, referenced,
- unmapped, cc);
+ unmapped, cc, HPAGE_PMD_ORDER);
/* collapse_huge_page will return with the mmap_lock released */
*lock_dropped = true;
}
--
2.54.0
^ permalink raw reply related
* [PATCH mm-unstable v18 05/14] mm/khugepaged: require collapse_huge_page to enter/exit with the lock dropped
From: Nico Pache @ 2026-05-22 15:00 UTC (permalink / raw)
To: linux-doc, linux-kernel, linux-mm, linux-trace-kernel
Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
byungchul, catalin.marinas, cl, corbet, dave.hansen, david,
dev.jain, gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
joshua.hahnjy, kas, lance.yang, liam, ljs, mathieu.desnoyers,
matthew.brost, mhiramat, mhocko, npache, peterx, pfalcato,
rakie.kim, raquini, rdunlap, richard.weiyang, rientjes, rostedt,
rppt, ryan.roberts, shivankg, sunnanyong, surenb,
thomas.hellstrom, tiwai, usamaarif642, vbabka, vishal.moola,
wangkefeng.wang, will, willy, yang, ying.huang, ziy, zokeefe
In-Reply-To: <20260522150009.121603-1-npache@redhat.com>
Currently the collapse_huge_page function must be entered with the
mmap_read_lock held, and exits with it dropped. This patch moves the
unlock into its parent caller, and changes the semantics so that the
function is always entered and exited with the lock dropped.
Future patches need this expectation because, in mTHP collapse, we may
have already dropped the lock, and we do not want to conditionally
check for this by passing through the lock_dropped variable.
No functional change is expected as one of the first things the
collapse_huge_page function does is drop this lock before allocating the
hugepage.
Acked-by: David Hildenbrand (Arm) <david@kernel.org>
Signed-off-by: Nico Pache <npache@redhat.com>
---
mm/khugepaged.c | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index e98ba5b15163..fab35d318641 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1208,6 +1208,12 @@ static enum scan_result alloc_charge_folio(struct folio **foliop, struct mm_stru
return SCAN_SUCCEED;
}
+/*
+ * collapse_huge_page expects the mmap_lock to be unlocked before entering and
+ * will always return with the lock unlocked, to avoid holding the mmap_lock
+ * while allocating a THP, as that could trigger direct reclaim/compaction.
+ * Note that the VMA must be rechecked after grabbing the mmap_lock again.
+ */
static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long address,
int referenced, int unmapped, struct collapse_control *cc)
{
@@ -1223,14 +1229,6 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
- /*
- * Before allocating the hugepage, release the mmap_lock read lock.
- * The allocation can take potentially a long time if it involves
- * sync compaction, and we do not need to hold the mmap_lock during
- * that. We will recheck the vma after taking it again in write mode.
- */
- mmap_read_unlock(mm);
-
result = alloc_charge_folio(&folio, mm, cc, HPAGE_PMD_ORDER);
if (result != SCAN_SUCCEED)
goto out_nolock;
@@ -1535,6 +1533,8 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
out_unmap:
pte_unmap_unlock(pte, ptl);
if (result == SCAN_SUCCEED) {
+ /* collapse_huge_page expects the lock to be dropped before calling */
+ mmap_read_unlock(mm);
result = collapse_huge_page(mm, start_addr, referenced,
unmapped, cc);
/* collapse_huge_page will return with the mmap_lock released */
--
2.54.0
^ permalink raw reply related
* [PATCH mm-unstable v18 04/14] mm/khugepaged: generalize __collapse_huge_page_* for mTHP support
From: Nico Pache @ 2026-05-22 14:59 UTC (permalink / raw)
To: linux-doc, linux-kernel, linux-mm, linux-trace-kernel
Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
byungchul, catalin.marinas, cl, corbet, dave.hansen, david,
dev.jain, gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
joshua.hahnjy, kas, lance.yang, liam, ljs, mathieu.desnoyers,
matthew.brost, mhiramat, mhocko, npache, peterx, pfalcato,
rakie.kim, raquini, rdunlap, richard.weiyang, rientjes, rostedt,
rppt, ryan.roberts, shivankg, sunnanyong, surenb,
thomas.hellstrom, tiwai, usamaarif642, vbabka, vishal.moola,
wangkefeng.wang, will, willy, yang, ying.huang, ziy, zokeefe
In-Reply-To: <20260522150009.121603-1-npache@redhat.com>
Generalize the __collapse_huge_page_* and collapse_max_* functions to
take an order parameter, in support of future mTHP collapse.
The current mechanism for determining collapse with the
khugepaged_max_ptes_none value is not designed with mTHP in mind. This
raises a key design issue: if we support user defined max_ptes_none values
(even those scaled by order), a collapse of a lower order can introduce
a feedback loop, or "creep", when max_ptes_none is set to a value greater
than HPAGE_PMD_NR / 2. [1]
With this configuration, a successful collapse to order N will populate
enough pages to satisfy the collapse condition on order N+1 on the next
scan. This leads to unnecessary work and memory churn.
To fix this issue introduce a helper function that will limit mTHP
collapse support to two max_ptes_none values, 0 and HPAGE_PMD_NR - 1.
This effectively supports two modes: [2]
- max_ptes_none=0: never collapses if it encounters an empty PTE or a PTE
that maps the shared zeropage. Consequently, no memory bloat.
- max_ptes_none=511 (on 4k pagesz): Always collapse to the highest
available mTHP order.
This removes the possibility of "creep", and a warning will be emitted if
any non-supported max_ptes_none value is configured with mTHP enabled.
Any intermediate value will default mTHP collapse to max_ptes_none=0.
mTHP collapse will not honor the khugepaged_max_ptes_shared or
khugepaged_max_ptes_swap parameters, and will fail if it encounters a
shared or swapped entry.
No functional changes in this patch; however it defines future behavior
for mTHP collapse.
[1] - https://lore.kernel.org/all/e46ab3ab-a3d7-4fb7-9970-d0704bd5d05a@arm.com
[2] - https://lore.kernel.org/all/37375ace-5601-4d6c-9dac-d1c8268698e9@redhat.com
Reviewed-by: Lance Yang <lance.yang@linux.dev>
Co-developed-by: Dev Jain <dev.jain@arm.com>
Signed-off-by: Dev Jain <dev.jain@arm.com>
Signed-off-by: Nico Pache <npache@redhat.com>
---
mm/khugepaged.c | 121 +++++++++++++++++++++++++++++++++++-------------
1 file changed, 88 insertions(+), 33 deletions(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 116f39518948..e98ba5b15163 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -353,30 +353,52 @@ static bool pte_none_or_zero(pte_t pte)
* the shared zeropage for the given collapse operation.
* @cc: The collapse control struct
* @vma: The vma to check for userfaultfd
+ * @order: The folio order being collapsed to
*
* Return: Maximum number of empty/shared zeropage PTEs for the collapse operation
*/
static unsigned int collapse_max_ptes_none(struct collapse_control *cc,
- struct vm_area_struct *vma)
+ struct vm_area_struct *vma, unsigned int order)
{
+ unsigned int max_ptes_none = khugepaged_max_ptes_none;
+
if (vma && userfaultfd_armed(vma))
return 0;
/* for MADV_COLLAPSE, allow any empty/shared zeropage PTEs */
if (!cc->is_khugepaged)
return HPAGE_PMD_NR;
- /* For all other cases respect the user defined maximum */
- return khugepaged_max_ptes_none;
+ /* for PMD collapse, respect the user defined maximum */
+ if (is_pmd_order(order))
+ return max_ptes_none;
+ /*
+ * for mTHP collapse with the sysctl value set to KHUGEPAGED_MAX_PTES_LIMIT,
+ * scale the maximum number of PTEs to the order of the collapse.
+ */
+ if (max_ptes_none == KHUGEPAGED_MAX_PTES_LIMIT)
+ return (1 << order) - 1;
+ if (!max_ptes_none)
+ return 0;
+ /*
+ * For mTHP collapse of values other than 0 or KHUGEPAGED_MAX_PTES_LIMIT,
+ * emit a warning and return 0.
+ */
+ pr_warn_once("mTHP collapse does not support max_ptes_none values"
+ " other than 0 or %u, defaulting to 0.\n",
+ KHUGEPAGED_MAX_PTES_LIMIT);
+ return 0;
}
/**
* collapse_max_ptes_shared - Calculate maximum allowed PTEs that map shared
* anonymous pages for the given collapse operation.
* @cc: The collapse control struct
+ * @order: The folio order being collapsed to
*
* Return: Maximum number of PTEs that map shared anonymous pages for the
* collapse operation
*/
-static unsigned int collapse_max_ptes_shared(struct collapse_control *cc)
+static unsigned int collapse_max_ptes_shared(struct collapse_control *cc,
+ unsigned int order)
{
/*
* For MADV_COLLAPSE, do not restrict the number of PTEs that map shared
@@ -384,6 +406,13 @@ static unsigned int collapse_max_ptes_shared(struct collapse_control *cc)
*/
if (!cc->is_khugepaged)
return HPAGE_PMD_NR;
+ /*
+ * for mTHP collapse do not allow collapsing anonymous memory pages that
+ * are shared between processes.
+ */
+ if (!is_pmd_order(order))
+ return 0;
+ /* for PMD collapse, respect the user defined maximum */
return khugepaged_max_ptes_shared;
}
@@ -391,11 +420,13 @@ static unsigned int collapse_max_ptes_shared(struct collapse_control *cc)
* collapse_max_ptes_swap - Calculate the maximum allowed non-present PTEs or the
* maximum allowed non-present pagecache entries for the given collapse operation.
* @cc: The collapse control struct
+ * @order: The folio order being collapsed to
*
* Return: Maximum number of non-present PTEs or the maximum allowed non-present
* pagecache entries for the collapse operation.
*/
-static unsigned int collapse_max_ptes_swap(struct collapse_control *cc)
+static unsigned int collapse_max_ptes_swap(struct collapse_control *cc,
+ unsigned int order)
{
/*
* For MADV_COLLAPSE, do not restrict the number PTEs entries or
@@ -403,6 +434,10 @@ static unsigned int collapse_max_ptes_swap(struct collapse_control *cc)
*/
if (!cc->is_khugepaged)
return HPAGE_PMD_NR;
+ /* for mTHP collapse do not allow any non-present PTEs or pagecache entries */
+ if (!is_pmd_order(order))
+ return 0;
+ /* for PMD collapse, respect the user defined maximum */
return khugepaged_max_ptes_swap;
}
@@ -596,10 +631,11 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte,
static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
unsigned long start_addr, pte_t *pte, struct collapse_control *cc,
- struct list_head *compound_pagelist)
+ unsigned int order, struct list_head *compound_pagelist)
{
- const unsigned int max_ptes_none = collapse_max_ptes_none(cc, vma);
- const unsigned int max_ptes_shared = collapse_max_ptes_shared(cc);
+ const unsigned int max_ptes_none = collapse_max_ptes_none(cc, vma, order);
+ const unsigned int max_ptes_shared = collapse_max_ptes_shared(cc, order);
+ const unsigned long nr_pages = 1UL << order;
struct page *page = NULL;
struct folio *folio = NULL;
unsigned long addr = start_addr;
@@ -607,7 +643,7 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
int none_or_zero = 0, shared = 0, referenced = 0;
enum scan_result result = SCAN_FAIL;
- for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
+ for (_pte = pte; _pte < pte + nr_pages;
_pte++, addr += PAGE_SIZE) {
pte_t pteval = ptep_get(_pte);
if (pte_none_or_zero(pteval)) {
@@ -740,18 +776,18 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
}
static void __collapse_huge_page_copy_succeeded(pte_t *pte,
- struct vm_area_struct *vma,
- unsigned long address,
- spinlock_t *ptl,
- struct list_head *compound_pagelist)
+ struct vm_area_struct *vma, unsigned long address,
+ spinlock_t *ptl, unsigned int order,
+ struct list_head *compound_pagelist)
{
- unsigned long end = address + HPAGE_PMD_SIZE;
+ const unsigned long nr_pages = 1UL << order;
+ unsigned long end = address + (PAGE_SIZE << order);
struct folio *src, *tmp;
pte_t pteval;
pte_t *_pte;
unsigned int nr_ptes;
- for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte += nr_ptes,
+ for (_pte = pte; _pte < pte + nr_pages; _pte += nr_ptes,
address += nr_ptes * PAGE_SIZE) {
nr_ptes = 1;
pteval = ptep_get(_pte);
@@ -804,11 +840,10 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
}
static void __collapse_huge_page_copy_failed(pte_t *pte,
- pmd_t *pmd,
- pmd_t orig_pmd,
- struct vm_area_struct *vma,
- struct list_head *compound_pagelist)
+ pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma,
+ unsigned int order, struct list_head *compound_pagelist)
{
+ const unsigned long nr_pages = 1UL << order;
spinlock_t *pmd_ptl;
/*
@@ -824,7 +859,7 @@ static void __collapse_huge_page_copy_failed(pte_t *pte,
* Release both raw and compound pages isolated
* in __collapse_huge_page_isolate.
*/
- release_pte_pages(pte, pte + HPAGE_PMD_NR, compound_pagelist);
+ release_pte_pages(pte, pte + nr_pages, compound_pagelist);
}
/*
@@ -844,16 +879,17 @@ static void __collapse_huge_page_copy_failed(pte_t *pte,
*/
static enum scan_result __collapse_huge_page_copy(pte_t *pte, struct folio *folio,
pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma,
- unsigned long address, spinlock_t *ptl,
+ unsigned long address, spinlock_t *ptl, unsigned int order,
struct list_head *compound_pagelist)
{
+ const unsigned long nr_pages = 1UL << order;
unsigned int i;
enum scan_result result = SCAN_SUCCEED;
/*
* Copying pages' contents is subject to memory poison at any iteration.
*/
- for (i = 0; i < HPAGE_PMD_NR; i++) {
+ for (i = 0; i < nr_pages; i++) {
pte_t pteval = ptep_get(pte + i);
struct page *page = folio_page(folio, i);
unsigned long src_addr = address + i * PAGE_SIZE;
@@ -872,10 +908,10 @@ static enum scan_result __collapse_huge_page_copy(pte_t *pte, struct folio *foli
if (likely(result == SCAN_SUCCEED))
__collapse_huge_page_copy_succeeded(pte, vma, address, ptl,
- compound_pagelist);
+ order, compound_pagelist);
else
__collapse_huge_page_copy_failed(pte, pmd, orig_pmd, vma,
- compound_pagelist);
+ order, compound_pagelist);
return result;
}
@@ -1042,16 +1078,20 @@ static enum scan_result check_pmd_still_valid(struct mm_struct *mm,
* Bring missing pages in from swap, to complete THP collapse.
* Only done if khugepaged_scan_pmd believes it is worthwhile.
*
+ * For mTHP orders the function bails on the first swap entry, because
+ * faulting pages back in during collapse could re-populate PTEs that
+ * push a later scan over the threshold for a higher-order collapse.
+ *
* Called and returns without pte mapped or spinlocks held.
* Returns result: if not SCAN_SUCCEED, mmap_lock has been released.
*/
static enum scan_result __collapse_huge_page_swapin(struct mm_struct *mm,
- struct vm_area_struct *vma, unsigned long start_addr, pmd_t *pmd,
- int referenced)
+ struct vm_area_struct *vma, unsigned long start_addr,
+ pmd_t *pmd, int referenced, unsigned int order)
{
int swapped_in = 0;
vm_fault_t ret = 0;
- unsigned long addr, end = start_addr + (HPAGE_PMD_NR * PAGE_SIZE);
+ unsigned long addr, end = start_addr + (PAGE_SIZE << order);
enum scan_result result;
pte_t *pte = NULL;
spinlock_t *ptl;
@@ -1083,6 +1123,19 @@ static enum scan_result __collapse_huge_page_swapin(struct mm_struct *mm,
pte_present(vmf.orig_pte))
continue;
+ /*
+ * TODO: Support swapin without leading to further mTHP
+ * collapses. Currently bringing in new pages via swapin may
+ * cause a future higher order collapse on a rescan of the same
+ * range.
+ */
+ if (!is_pmd_order(order)) {
+ pte_unmap(pte);
+ mmap_read_unlock(mm);
+ result = SCAN_EXCEED_SWAP_PTE;
+ goto out;
+ }
+
vmf.pte = pte;
vmf.ptl = ptl;
ret = do_swap_page(&vmf);
@@ -1203,7 +1256,7 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
* that case. Continuing to collapse causes inconsistency.
*/
result = __collapse_huge_page_swapin(mm, vma, address, pmd,
- referenced);
+ referenced, HPAGE_PMD_ORDER);
if (result != SCAN_SUCCEED)
goto out_nolock;
}
@@ -1251,6 +1304,7 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
if (pte) {
result = __collapse_huge_page_isolate(vma, address, pte, cc,
+ HPAGE_PMD_ORDER,
&compound_pagelist);
spin_unlock(pte_ptl);
} else {
@@ -1281,6 +1335,7 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
vma, address, pte_ptl,
+ HPAGE_PMD_ORDER,
&compound_pagelist);
pte_unmap(pte);
if (unlikely(result != SCAN_SUCCEED))
@@ -1316,9 +1371,9 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long start_addr,
bool *lock_dropped, struct collapse_control *cc)
{
- const unsigned int max_ptes_none = collapse_max_ptes_none(cc, vma);
- const unsigned int max_ptes_shared = collapse_max_ptes_shared(cc);
- const unsigned int max_ptes_swap = collapse_max_ptes_swap(cc);
+ const unsigned int max_ptes_none = collapse_max_ptes_none(cc, vma, HPAGE_PMD_ORDER);
+ const unsigned int max_ptes_shared = collapse_max_ptes_shared(cc, HPAGE_PMD_ORDER);
+ const unsigned int max_ptes_swap = collapse_max_ptes_swap(cc, HPAGE_PMD_ORDER);
pmd_t *pmd;
pte_t *pte, *_pte;
int none_or_zero = 0, shared = 0, referenced = 0;
@@ -2372,8 +2427,8 @@ static enum scan_result collapse_scan_file(struct mm_struct *mm,
unsigned long addr, struct file *file, pgoff_t start,
struct collapse_control *cc)
{
- const unsigned int max_ptes_none = collapse_max_ptes_none(cc, NULL);
- const unsigned int max_ptes_swap = collapse_max_ptes_swap(cc);
+ const unsigned int max_ptes_none = collapse_max_ptes_none(cc, NULL, HPAGE_PMD_ORDER);
+ const unsigned int max_ptes_swap = collapse_max_ptes_swap(cc, HPAGE_PMD_ORDER);
struct folio *folio = NULL;
struct address_space *mapping = file->f_mapping;
XA_STATE(xas, &mapping->i_pages, start);
--
2.54.0
^ permalink raw reply related
* Re: [PATCH] unwind: Add sframe_(un)register() system calls
From: Steven Rostedt @ 2026-05-22 15:01 UTC (permalink / raw)
To: Thomas Weißschuh
Cc: LKML, Linux Trace Kernel, bpf, Masami Hiramatsu,
Mathieu Desnoyers, Jens Remus, Josh Poimboeuf, Peter Zijlstra,
Ingo Molnar, Jiri Olsa, Arnaldo Carvalho de Melo, Namhyung Kim,
Thomas Gleixner, Andrii Nakryiko, Indu Bhagat, Jose E. Marchesi,
Beau Belgrave, Linus Torvalds, Andrew Morton, Florian Weimer,
Kees Cook, Carlos O'Donell, Sam James, Dylan Hatch,
Borislav Petkov, Dave Hansen, David Hildenbrand, H. Peter Anvin,
Liam R. Howlett, Lorenzo Stoakes, Michal Hocko, Mike Rapoport,
Suren Baghdasaryan, Vlastimil Babka, Heiko Carstens,
Vasily Gorbik
In-Reply-To: <7923f815-a8ce-4b64-8cbf-2b90e57cbd24@t-8ch.de>
On Fri, 22 May 2026 16:36:56 +0200
Thomas Weißschuh <thomas@t-8ch.de> wrote:
> On 2026-05-21 18:35:32-0400, Steven Rostedt wrote:
> > From: Steven Rostedt <rostedt@goodmis.org>
> >
> > Add system calls to register and unregister sframes that can be used by
> > dynamic linkers to tell the kernel where the sframe section is in memory
> > for libraries it loads.
>
> How is this system call related to the prctl() with the same
> functionality from Jens' series? I guess it will replace it,
> but some explanation would be nice.
I thought the patch with the prctl() stated it was for debug purposes only.
From the change log:
[
This adds an interface for prctl() for testing loading of sframes for
libraries. But this interface should really be a system call. This patch
is for testing purposes only and should not be applied to mainline.
]
Hence I didn't think there needs to be any explanation. The prctl() patch
should never be applied upstream.
>
> (...)
>
> > diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> > index f5639d5ac331..992ccc401c5e 100644
> > --- a/include/linux/syscalls.h
> > +++ b/include/linux/syscalls.h
> > @@ -999,6 +999,8 @@ asmlinkage long sys_lsm_get_self_attr(unsigned int attr, struct lsm_ctx __user *
> > asmlinkage long sys_lsm_set_self_attr(unsigned int attr, struct lsm_ctx __user *ctx,
> > u32 size, u32 flags);
> > asmlinkage long sys_lsm_list_modules(u64 __user *ids, u32 __user *size, u32 flags);
> > +asmlinkage long sys_sframe_register(void *data, unsigned int size);
> > +asmlinkage long sys_sframe_unregister(void *data, unsigned int size);
>
> Why not use the actual structure here?
Yeah, I was somewhat lazy here to make sure that this was the direction we
want to go. I just need to add a structure pointer reference at the top of
that file.
Will update in v2.
>
> > /*
> > * Architecture-specific system calls
>
> (...)
>
> > diff --git a/include/uapi/linux/sframe.h b/include/uapi/linux/sframe.h
> > new file mode 100644
> > index 000000000000..137a2ebf91f4
> > --- /dev/null
> > +++ b/include/uapi/linux/sframe.h
> > @@ -0,0 +1,12 @@
> > +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
> > +#ifndef _UAPI_LINUX_SFRAME_H
> > +#define _UAPI_LINUX_SFRAME_H
> > +
> > +struct sframe_setup {
> > + unsigned long sframe_start;
> > + unsigned long sframe_size;
> > + unsigned long text_start;
> > + unsigned long text_size;
> > +};
>
> This will break for compat processes, as they use a different 'unsigned
> long' than the host kernel. Maybe just use __u64.
I'll update it. I was thinking we wouldn't support compat, but in case we
decide we should, forcing the size is better than being architecture
specific.
>
> > +
> > +#endif /* _UAPI_LINUX_SFRAME_H */
>
> (...)
>
> > +/**
> > + * sys_sframe_register - register an address for user space stacktrace walking.
> > + * @data: Structure of sframe data used to register the sframe section
> > + * @size: The size of the given structure.
> > + *
> > + * This system call is used by dynamic library utilities to inform the kernel
> > + * of meta data that it loaded that can be used by the kernel to know how
> > + * to stack walk the given text locations.
> > + *
> > + * Return: 0 if successful, otherwise a negative error.
> > + */
> > +SYSCALL_DEFINE2(sframe_register, __user struct sframe_setup *, data, unsigned int, size)
>
> AFAIK the normal place for the '__user' is right before '*':
>
> struct sframe_setup __user *, data,
Will update.
>
> Use __kernel_size_t for 'size'?
Looking at the history of the accept() system call that started with int
and then wanted size_t, then changed to socklen_t, I guess there's
precedent to use __kernel_size_t.
Will update.
Thanks!
-- Steve
^ permalink raw reply
* [PATCH mm-unstable v18 03/14] mm/khugepaged: rework max_ptes_* handling with helper functions
From: Nico Pache @ 2026-05-22 14:59 UTC (permalink / raw)
To: linux-doc, linux-kernel, linux-mm, linux-trace-kernel
Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
byungchul, catalin.marinas, cl, corbet, dave.hansen, david,
dev.jain, gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
joshua.hahnjy, kas, lance.yang, liam, ljs, mathieu.desnoyers,
matthew.brost, mhiramat, mhocko, npache, peterx, pfalcato,
rakie.kim, raquini, rdunlap, richard.weiyang, rientjes, rostedt,
rppt, ryan.roberts, shivankg, sunnanyong, surenb,
thomas.hellstrom, tiwai, usamaarif642, vbabka, vishal.moola,
wangkefeng.wang, will, willy, yang, ying.huang, ziy, zokeefe,
Usama Arif
In-Reply-To: <20260522150009.121603-1-npache@redhat.com>
The following cleanup reworks all the max_ptes_* handling into helper
functions. This increases the code readability and will later be used to
implement the mTHP handling of these variables.
With these changes we abstract all the madvise_collapse() special casing
(do not respect the sysctls) away from the functions that utilize them.
The helpers will also be used later in this series to cleanly restrict
the mTHP collapse behavior.
No functional change is intended; however, we are now only reading the
sysfs variables once per scan, whereas before these variables were being
read on each loop iteration.
Reviewed-by: Lance Yang <lance.yang@linux.dev>
Suggested-by: David Hildenbrand <david@kernel.org>
Acked-by: David Hildenbrand (Arm) <david@kernel.org>
Acked-by: Usama Arif <usama.arif@linux.dev>
Signed-off-by: Nico Pache <npache@redhat.com>
---
mm/khugepaged.c | 120 +++++++++++++++++++++++++++++++++---------------
1 file changed, 84 insertions(+), 36 deletions(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 13d82993755f..116f39518948 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -348,6 +348,64 @@ static bool pte_none_or_zero(pte_t pte)
return pte_present(pte) && is_zero_pfn(pte_pfn(pte));
}
+/**
+ * collapse_max_ptes_none - Calculate maximum allowed empty PTEs or PTEs mapping
+ * the shared zeropage for the given collapse operation.
+ * @cc: The collapse control struct
+ * @vma: The vma to check for userfaultfd
+ *
+ * Return: Maximum number of empty/shared zeropage PTEs for the collapse operation
+ */
+static unsigned int collapse_max_ptes_none(struct collapse_control *cc,
+ struct vm_area_struct *vma)
+{
+ if (vma && userfaultfd_armed(vma))
+ return 0;
+ /* for MADV_COLLAPSE, allow any empty/shared zeropage PTEs */
+ if (!cc->is_khugepaged)
+ return HPAGE_PMD_NR;
+ /* For all other cases respect the user defined maximum */
+ return khugepaged_max_ptes_none;
+}
+
+/**
+ * collapse_max_ptes_shared - Calculate maximum allowed PTEs that map shared
+ * anonymous pages for the given collapse operation.
+ * @cc: The collapse control struct
+ *
+ * Return: Maximum number of PTEs that map shared anonymous pages for the
+ * collapse operation
+ */
+static unsigned int collapse_max_ptes_shared(struct collapse_control *cc)
+{
+ /*
+ * For MADV_COLLAPSE, do not restrict the number of PTEs that map shared
+ * anonymous pages.
+ */
+ if (!cc->is_khugepaged)
+ return HPAGE_PMD_NR;
+ return khugepaged_max_ptes_shared;
+}
+
+/**
+ * collapse_max_ptes_swap - Calculate the maximum allowed non-present PTEs or the
+ * maximum allowed non-present pagecache entries for the given collapse operation.
+ * @cc: The collapse control struct
+ *
+ * Return: Maximum number of non-present PTEs or the maximum allowed non-present
+ * pagecache entries for the collapse operation.
+ */
+static unsigned int collapse_max_ptes_swap(struct collapse_control *cc)
+{
+ /*
+ * For MADV_COLLAPSE, do not restrict the number of PTE entries or
+ * pagecache entries that are non-present.
+ */
+ if (!cc->is_khugepaged)
+ return HPAGE_PMD_NR;
+ return khugepaged_max_ptes_swap;
+}
+
int hugepage_madvise(struct vm_area_struct *vma,
vm_flags_t *vm_flags, int advice)
{
@@ -540,6 +598,8 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
unsigned long start_addr, pte_t *pte, struct collapse_control *cc,
struct list_head *compound_pagelist)
{
+ const unsigned int max_ptes_none = collapse_max_ptes_none(cc, vma);
+ const unsigned int max_ptes_shared = collapse_max_ptes_shared(cc);
struct page *page = NULL;
struct folio *folio = NULL;
unsigned long addr = start_addr;
@@ -551,16 +611,12 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
_pte++, addr += PAGE_SIZE) {
pte_t pteval = ptep_get(_pte);
if (pte_none_or_zero(pteval)) {
- ++none_or_zero;
- if (!userfaultfd_armed(vma) &&
- (!cc->is_khugepaged ||
- none_or_zero <= khugepaged_max_ptes_none)) {
- continue;
- } else {
+ if (++none_or_zero > max_ptes_none) {
result = SCAN_EXCEED_NONE_PTE;
count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
goto out;
}
+ continue;
}
if (!pte_present(pteval)) {
result = SCAN_PTE_NON_PRESENT;
@@ -591,9 +647,7 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
/* See collapse_scan_pmd(). */
if (folio_maybe_mapped_shared(folio)) {
- ++shared;
- if (cc->is_khugepaged &&
- shared > khugepaged_max_ptes_shared) {
+ if (++shared > max_ptes_shared) {
result = SCAN_EXCEED_SHARED_PTE;
count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
goto out;
@@ -1262,6 +1316,9 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long start_addr,
bool *lock_dropped, struct collapse_control *cc)
{
+ const unsigned int max_ptes_none = collapse_max_ptes_none(cc, vma);
+ const unsigned int max_ptes_shared = collapse_max_ptes_shared(cc);
+ const unsigned int max_ptes_swap = collapse_max_ptes_swap(cc);
pmd_t *pmd;
pte_t *pte, *_pte;
int none_or_zero = 0, shared = 0, referenced = 0;
@@ -1295,36 +1352,29 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
pte_t pteval = ptep_get(_pte);
if (pte_none_or_zero(pteval)) {
- ++none_or_zero;
- if (!userfaultfd_armed(vma) &&
- (!cc->is_khugepaged ||
- none_or_zero <= khugepaged_max_ptes_none)) {
- continue;
- } else {
+ if (++none_or_zero > max_ptes_none) {
result = SCAN_EXCEED_NONE_PTE;
count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
goto out_unmap;
}
+ continue;
}
if (!pte_present(pteval)) {
- ++unmapped;
- if (!cc->is_khugepaged ||
- unmapped <= khugepaged_max_ptes_swap) {
- /*
- * Always be strict with uffd-wp
- * enabled swap entries. Please see
- * comment below for pte_uffd_wp().
- */
- if (pte_swp_uffd_wp_any(pteval)) {
- result = SCAN_PTE_UFFD_WP;
- goto out_unmap;
- }
- continue;
- } else {
+ if (++unmapped > max_ptes_swap) {
result = SCAN_EXCEED_SWAP_PTE;
count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
goto out_unmap;
}
+ /*
+ * Always be strict with uffd-wp
+ * enabled swap entries. Please see
+ * comment below for pte_uffd_wp().
+ */
+ if (pte_swp_uffd_wp_any(pteval)) {
+ result = SCAN_PTE_UFFD_WP;
+ goto out_unmap;
+ }
+ continue;
}
if (pte_uffd_wp(pteval)) {
/*
@@ -1367,9 +1417,7 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
* is shared.
*/
if (folio_maybe_mapped_shared(folio)) {
- ++shared;
- if (cc->is_khugepaged &&
- shared > khugepaged_max_ptes_shared) {
+ if (++shared > max_ptes_shared) {
result = SCAN_EXCEED_SHARED_PTE;
count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
goto out_unmap;
@@ -2324,6 +2372,8 @@ static enum scan_result collapse_scan_file(struct mm_struct *mm,
unsigned long addr, struct file *file, pgoff_t start,
struct collapse_control *cc)
{
+ const unsigned int max_ptes_none = collapse_max_ptes_none(cc, NULL);
+ const unsigned int max_ptes_swap = collapse_max_ptes_swap(cc);
struct folio *folio = NULL;
struct address_space *mapping = file->f_mapping;
XA_STATE(xas, &mapping->i_pages, start);
@@ -2342,8 +2392,7 @@ static enum scan_result collapse_scan_file(struct mm_struct *mm,
if (xa_is_value(folio)) {
swap += 1 << xas_get_order(&xas);
- if (cc->is_khugepaged &&
- swap > khugepaged_max_ptes_swap) {
+ if (swap > max_ptes_swap) {
result = SCAN_EXCEED_SWAP_PTE;
count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
break;
@@ -2414,8 +2463,7 @@ static enum scan_result collapse_scan_file(struct mm_struct *mm,
cc->progress += HPAGE_PMD_NR;
if (result == SCAN_SUCCEED) {
- if (cc->is_khugepaged &&
- present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
+ if (present < HPAGE_PMD_NR - max_ptes_none) {
result = SCAN_EXCEED_NONE_PTE;
count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
} else {
--
2.54.0
^ permalink raw reply related
* [PATCH mm-unstable v18 02/14] mm/khugepaged: generalize alloc_charge_folio()
From: Nico Pache @ 2026-05-22 14:59 UTC (permalink / raw)
To: linux-doc, linux-kernel, linux-mm, linux-trace-kernel
Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
byungchul, catalin.marinas, cl, corbet, dave.hansen, david,
dev.jain, gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
joshua.hahnjy, kas, lance.yang, liam, ljs, mathieu.desnoyers,
matthew.brost, mhiramat, mhocko, npache, peterx, pfalcato,
rakie.kim, raquini, rdunlap, richard.weiyang, rientjes, rostedt,
rppt, ryan.roberts, shivankg, sunnanyong, surenb,
thomas.hellstrom, tiwai, usamaarif642, vbabka, vishal.moola,
wangkefeng.wang, will, willy, yang, ying.huang, ziy, zokeefe,
Usama Arif
In-Reply-To: <20260522150009.121603-1-npache@redhat.com>
From: Dev Jain <dev.jain@arm.com>
Pass order to alloc_charge_folio() and update mTHP statistics.
Reviewed-by: Wei Yang <richard.weiyang@gmail.com>
Reviewed-by: Lance Yang <lance.yang@linux.dev>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Reviewed-by: Lorenzo Stoakes <ljs@kernel.org>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Acked-by: Usama Arif <usama.arif@linux.dev>
Acked-by: David Hildenbrand (Arm) <david@kernel.org>
Signed-off-by: Dev Jain <dev.jain@arm.com>
Co-developed-by: Nico Pache <npache@redhat.com>
Signed-off-by: Nico Pache <npache@redhat.com>
---
Documentation/admin-guide/mm/transhuge.rst | 8 ++++++++
include/linux/huge_mm.h | 2 ++
mm/huge_memory.c | 4 ++++
mm/khugepaged.c | 20 +++++++++++++-------
4 files changed, 27 insertions(+), 7 deletions(-)
diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index 5fbc3d89bb07..c51932e6275d 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -639,6 +639,14 @@ anon_fault_fallback_charge
instead falls back to using huge pages with lower orders or
small pages even though the allocation was successful.
+collapse_alloc
+ is incremented every time a huge page is successfully allocated for a
+ khugepaged collapse.
+
+collapse_alloc_failed
+ is incremented every time a huge page allocation fails during a
+ khugepaged collapse.
+
zswpout
is incremented every time a huge page is swapped out to zswap in one
piece without splitting.
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 2949e5acff35..ba7ae6808544 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -128,6 +128,8 @@ enum mthp_stat_item {
MTHP_STAT_ANON_FAULT_ALLOC,
MTHP_STAT_ANON_FAULT_FALLBACK,
MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
+ MTHP_STAT_COLLAPSE_ALLOC,
+ MTHP_STAT_COLLAPSE_ALLOC_FAILED,
MTHP_STAT_ZSWPOUT,
MTHP_STAT_SWPIN,
MTHP_STAT_SWPIN_FALLBACK,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 970e077019b7..345c54133c83 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -685,6 +685,8 @@ static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
+DEFINE_MTHP_STAT_ATTR(collapse_alloc, MTHP_STAT_COLLAPSE_ALLOC);
+DEFINE_MTHP_STAT_ATTR(collapse_alloc_failed, MTHP_STAT_COLLAPSE_ALLOC_FAILED);
DEFINE_MTHP_STAT_ATTR(zswpout, MTHP_STAT_ZSWPOUT);
DEFINE_MTHP_STAT_ATTR(swpin, MTHP_STAT_SWPIN);
DEFINE_MTHP_STAT_ATTR(swpin_fallback, MTHP_STAT_SWPIN_FALLBACK);
@@ -750,6 +752,8 @@ static struct attribute *any_stats_attrs[] = {
#endif
&split_attr.attr,
&split_failed_attr.attr,
+ &collapse_alloc_attr.attr,
+ &collapse_alloc_failed_attr.attr,
NULL,
};
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 53e7e4be172d..13d82993755f 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1068,28 +1068,34 @@ static enum scan_result __collapse_huge_page_swapin(struct mm_struct *mm,
}
static enum scan_result alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
- struct collapse_control *cc)
+ struct collapse_control *cc, unsigned int order)
{
gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() :
GFP_TRANSHUGE);
int node = collapse_find_target_node(cc);
struct folio *folio;
- folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, node, &cc->alloc_nmask);
+ folio = __folio_alloc(gfp, order, node, &cc->alloc_nmask);
if (!folio) {
*foliop = NULL;
- count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
+ if (is_pmd_order(order))
+ count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
+ count_mthp_stat(order, MTHP_STAT_COLLAPSE_ALLOC_FAILED);
return SCAN_ALLOC_HUGE_PAGE_FAIL;
}
- count_vm_event(THP_COLLAPSE_ALLOC);
+ if (is_pmd_order(order))
+ count_vm_event(THP_COLLAPSE_ALLOC);
+ count_mthp_stat(order, MTHP_STAT_COLLAPSE_ALLOC);
+
if (unlikely(mem_cgroup_charge(folio, mm, gfp))) {
folio_put(folio);
*foliop = NULL;
return SCAN_CGROUP_CHARGE_FAIL;
}
- count_memcg_folio_events(folio, THP_COLLAPSE_ALLOC, 1);
+ if (is_pmd_order(order))
+ count_memcg_folio_events(folio, THP_COLLAPSE_ALLOC, 1);
*foliop = folio;
return SCAN_SUCCEED;
@@ -1118,7 +1124,7 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
*/
mmap_read_unlock(mm);
- result = alloc_charge_folio(&folio, mm, cc);
+ result = alloc_charge_folio(&folio, mm, cc, HPAGE_PMD_ORDER);
if (result != SCAN_SUCCEED)
goto out_nolock;
@@ -1899,7 +1905,7 @@ static enum scan_result collapse_file(struct mm_struct *mm, unsigned long addr,
VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
- result = alloc_charge_folio(&new_folio, mm, cc);
+ result = alloc_charge_folio(&new_folio, mm, cc, HPAGE_PMD_ORDER);
if (result != SCAN_SUCCEED)
goto out;
--
2.54.0
^ permalink raw reply related
* [PATCH mm-unstable v18 01/14] mm/khugepaged: generalize hugepage_vma_revalidate for mTHP support
From: Nico Pache @ 2026-05-22 14:59 UTC (permalink / raw)
To: linux-doc, linux-kernel, linux-mm, linux-trace-kernel
Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
byungchul, catalin.marinas, cl, corbet, dave.hansen, david,
dev.jain, gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
joshua.hahnjy, kas, lance.yang, liam, ljs, mathieu.desnoyers,
matthew.brost, mhiramat, mhocko, npache, peterx, pfalcato,
rakie.kim, raquini, rdunlap, richard.weiyang, rientjes, rostedt,
rppt, ryan.roberts, shivankg, sunnanyong, surenb,
thomas.hellstrom, tiwai, usamaarif642, vbabka, vishal.moola,
wangkefeng.wang, will, willy, yang, ying.huang, ziy, zokeefe,
Usama Arif
In-Reply-To: <20260522150009.121603-1-npache@redhat.com>
For khugepaged to support different mTHP orders, we must generalize
hugepage_vma_revalidate() to check that the PMD is not shared by another
VMA and that the order is enabled.
No functional change in this patch. Also correct a comment about the
functionality of the revalidation and fix a double space issue.
Reviewed-by: Wei Yang <richard.weiyang@gmail.com>
Reviewed-by: Lance Yang <lance.yang@linux.dev>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Reviewed-by: Lorenzo Stoakes <ljs@kernel.org>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Acked-by: Usama Arif <usama.arif@linux.dev>
Acked-by: David Hildenbrand (Arm) <david@kernel.org>
Co-developed-by: Dev Jain <dev.jain@arm.com>
Signed-off-by: Dev Jain <dev.jain@arm.com>
Signed-off-by: Nico Pache <npache@redhat.com>
---
mm/khugepaged.c | 20 ++++++++++++--------
1 file changed, 12 insertions(+), 8 deletions(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index b8452dbdb043..53e7e4be172d 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -902,12 +902,13 @@ static int collapse_find_target_node(struct collapse_control *cc)
/*
* If mmap_lock temporarily dropped, revalidate vma
- * before taking mmap_lock.
+ * after taking the mmap_lock again.
* Returns enum scan_result value.
*/
static enum scan_result hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
- bool expect_anon, struct vm_area_struct **vmap, struct collapse_control *cc)
+ bool expect_anon, struct vm_area_struct **vmap,
+ struct collapse_control *cc, unsigned int order)
{
struct vm_area_struct *vma;
enum tva_type type = cc->is_khugepaged ? TVA_KHUGEPAGED :
@@ -920,15 +921,16 @@ static enum scan_result hugepage_vma_revalidate(struct mm_struct *mm, unsigned l
if (!vma)
return SCAN_VMA_NULL;
+ /* Always check the PMD order to ensure it's not shared by another VMA */
if (!thp_vma_suitable_order(vma, address, PMD_ORDER))
return SCAN_ADDRESS_RANGE;
- if (!thp_vma_allowable_order(vma, vma->vm_flags, type, PMD_ORDER))
+ if (!thp_vma_allowable_orders(vma, vma->vm_flags, type, BIT(order)))
return SCAN_VMA_CHECK;
/*
* Anon VMA expected, the address may be unmapped then
* remapped to file after khugepaged reaquired the mmap_lock.
*
- * thp_vma_allowable_order may return true for qualified file
+ * thp_vma_allowable_orders may return true for qualified file
* vmas.
*/
if (expect_anon && (!(*vmap)->anon_vma || !vma_is_anonymous(*vmap)))
@@ -1121,7 +1123,8 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
goto out_nolock;
mmap_read_lock(mm);
- result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
+ result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
+ HPAGE_PMD_ORDER);
if (result != SCAN_SUCCEED) {
mmap_read_unlock(mm);
goto out_nolock;
@@ -1155,7 +1158,8 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
* mmap_lock.
*/
mmap_write_lock(mm);
- result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
+ result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
+ HPAGE_PMD_ORDER);
if (result != SCAN_SUCCEED)
goto out_up_write;
/* check if the pmd is still valid */
@@ -2857,8 +2861,8 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
mmap_unlocked = false;
*lock_dropped = true;
result = hugepage_vma_revalidate(mm, addr, false, &vma,
- cc);
- if (result != SCAN_SUCCEED) {
+ cc, HPAGE_PMD_ORDER);
+ if (result != SCAN_SUCCEED) {
last_fail = result;
goto out_nolock;
}
--
2.54.0
^ permalink raw reply related
* [PATCH mm-hotfixes-unstable v18 00/14] khugepaged: add mTHP collapse support
From: Nico Pache @ 2026-05-22 14:59 UTC (permalink / raw)
To: linux-doc, linux-kernel, linux-mm, linux-trace-kernel
Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
byungchul, catalin.marinas, cl, corbet, dave.hansen, david,
dev.jain, gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
joshua.hahnjy, kas, lance.yang, liam, ljs, mathieu.desnoyers,
matthew.brost, mhiramat, mhocko, npache, peterx, pfalcato,
rakie.kim, raquini, rdunlap, richard.weiyang, rientjes, rostedt,
rppt, ryan.roberts, shivankg, sunnanyong, surenb,
thomas.hellstrom, tiwai, usamaarif642, vbabka, vishal.moola,
wangkefeng.wang, will, willy, yang, ying.huang, ziy, zokeefe
The following series provides khugepaged with the capability to collapse
anonymous memory regions to mTHPs.
To achieve this we generalize the khugepaged functions to no longer depend
on PMD_ORDER. Then during the PMD scan, we use a bitmap to track individual
pages that are occupied (!none/zero). After the PMD scan is done, we use
the bitmap to find the optimal mTHP sizes for the PMD range. The
restriction on max_ptes_none is removed during the scan, to make sure we
account for the whole PMD range in the bitmap. When no mTHP size is
enabled, the legacy behavior of khugepaged is maintained.
We currently only support max_ptes_none values of 0 or HPAGE_PMD_NR - 1
(i.e. 511). If any other value is specified, the kernel will emit a warning
and mTHP collapse will default to max_ptes_none=0. If an mTHP collapse is
attempted but the range contains swapped-out or shared pages, we don't
perform the collapse.
It is now also possible to collapse to mTHPs without requiring the PMD THP
size to be enabled. These limitations are to prevent collapse "creep"
behavior. This prevents constantly promoting mTHPs to the next available
size, which would occur because a collapse introduces more non-zero pages
that would satisfy the promotion condition on subsequent scans.
Patch 1-2: Generalize hugepage_vma_revalidate and alloc_charge_folio
for arbitrary orders.
Patch 3: Rework max_ptes_* handling into helper functions
Patch 4: Generalize __collapse_huge_page_* for mTHP support
Patch 5: Require collapse_huge_page to enter/exit with the lock dropped
Patch 6: Generalize collapse_huge_page for mTHP collapse
Patch 7: Skip collapsing mTHP to smaller orders
Patch 8-9: Add per-order mTHP statistics and tracepoints
Patch 10: Introduce collapse_allowable_orders helper function
Patch 11-13: Introduce bitmap and mTHP collapse support, fully enabled
Patch 14: Documentation
Testing:
- Built for x86_64, aarch64, ppc64le, and s390x
- ran all arches on test suites provided by the kernel-tests project
- internal testing suites: functional testing and performance testing
- selftests mm
- I created a test script that I used to push khugepaged to its limits
while monitoring a number of stats and tracepoints. The code is
available here[1] (Run in legacy mode for these changes and set mthp
sizes to inherit)
The summary from my testing was that no significant regression was
noticed through this test. In some cases my changes had
better collapse latencies, and were able to scan more pages in the same
amount of time/work, but for the most part the results were consistent.
- redis testing. I did some testing with these changes along with my defer
changes (see followup [2] post for more details). We've decided to get
the mTHP changes merged first before attempting the defer series.
- some basic testing on 64k page size.
- lots of general use.
[1] - https://gitlab.com/npache/khugepaged_mthp_test
[2] - https://lore.kernel.org/lkml/20250515033857.132535-1-npache@redhat.com/
V18 Changes:
- Added RBs/Acks
- [patch 02] Guard count_memcg_folio_events with is_pmd_order() to keep
THP_COLLAPSE_ALLOC PMD-only (Usama, Lance)
- [patch 03] Convert C++ comments to C-style; fix "none-page" terminology
to "empty PTEs or PTEs mapping the shared zeropage"; drop unnecessary
userfaultfd comment; add const to local max_ptes_* variables; fix
"repect" typo (Lance, David)
- [patch 04] collapse_max_ptes_none() now returns 0 instead of -EINVAL for
unsupported values; remove SCAN_INVALID_PTES_NONE; change return type
from int to unsigned int and propagate to all callers; add comment above
__collapse_huge_page_swapin explaining mTHP swap bail-out (David,
Lorenzo, Lance, Wei Yang, Usama)
- [patch 05] Rewrite collapse_huge_page lock comment to David's suggested
wording (David)
- [patch 11] Propagate unsigned int return type for max_ptes_none; remove
the now-unnecessary negative return check (consequence of patch 04);
Add optimization to the next_order goto that will prevent unnecessary
iterations if there are no lower orders enabled (Vernon); update locking
comment; pass VMA to mthp_collapse to improve uffd-armed detection, and
prevent unnecessary work. (Wei)
- [patch 14] Update documentation to reflect fallback-to-0 behavior
V17: https://lore.kernel.org/all/20260511185817.686831-1-npache@redhat.com
V16: https://lore.kernel.org/all/20260419185750.260784-1-npache@redhat.com
V15: https://lore.kernel.org/all/20260226031741.230674-1-npache@redhat.com
V14: https://lore.kernel.org/all/20260122192841.128719-1-npache@redhat.com
V13: https://lore.kernel.org/all/20251201174627.23295-1-npache@redhat.com
V12: https://lore.kernel.org/all/20251022183717.70829-1-npache@redhat.com
V11: https://lore.kernel.org/all/20250912032810.197475-1-npache@redhat.com
V10: https://lore.kernel.org/all/20250819134205.622806-1-npache@redhat.com
V9 : https://lore.kernel.org/all/20250714003207.113275-1-npache@redhat.com
V8 : https://lore.kernel.org/all/20250702055742.102808-1-npache@redhat.com
V7 : https://lore.kernel.org/all/20250515032226.128900-1-npache@redhat.com
V6 : https://lore.kernel.org/all/20250515030312.125567-1-npache@redhat.com
V5 : https://lore.kernel.org/all/20250428181218.85925-1-npache@redhat.com
V4 : https://lore.kernel.org/all/20250417000238.74567-1-npache@redhat.com
V3 : https://lore.kernel.org/all/20250414220557.35388-1-npache@redhat.com
V2 : https://lore.kernel.org/all/20250211003028.213461-1-npache@redhat.com
V1 : https://lore.kernel.org/all/20250108233128.14484-1-npache@redhat.com
Baolin Wang (1):
mm/khugepaged: run khugepaged for all orders
Dev Jain (1):
mm/khugepaged: generalize alloc_charge_folio()
Nico Pache (12):
mm/khugepaged: generalize hugepage_vma_revalidate for mTHP support
mm/khugepaged: rework max_ptes_* handling with helper functions
mm/khugepaged: generalize __collapse_huge_page_* for mTHP support
mm/khugepaged: require collapse_huge_page to enter/exit with the lock
dropped
mm/khugepaged: generalize collapse_huge_page for mTHP collapse
mm/khugepaged: skip collapsing mTHP to smaller orders
mm/khugepaged: add per-order mTHP collapse failure statistics
mm/khugepaged: improve tracepoints for mTHP orders
mm/khugepaged: introduce collapse_allowable_orders helper function
mm/khugepaged: Introduce mTHP collapse support
mm/khugepaged: avoid unnecessary mTHP collapse attempts
Documentation: mm: update the admin guide for mTHP collapse
Documentation/admin-guide/mm/transhuge.rst | 72 ++-
include/linux/huge_mm.h | 5 +
include/trace/events/huge_memory.h | 34 +-
mm/huge_memory.c | 11 +
mm/khugepaged.c | 634 ++++++++++++++++-----
5 files changed, 584 insertions(+), 172 deletions(-)
base-commit: 6c8cb505a5634594b3ea159fd1c71bce2acf3346
--
2.54.0
^ permalink raw reply
* Re: [PATCH v6] tracing/eprobes: Allow use of BTF names to dereference pointers
From: Steven Rostedt @ 2026-05-22 14:45 UTC (permalink / raw)
To: Masami Hiramatsu
Cc: LKML, Linux trace kernel, Mathieu Desnoyers, Mark Rutland,
Peter Zijlstra, Namhyung Kim, Takaya Saeki, Douglas Raillard,
Tom Zanussi, Andrew Morton, Thomas Gleixner, Ian Rogers,
Jiri Olsa, sashiko-bot@kernel.org,
sashiko-reviews@lists.linux.dev
In-Reply-To: <20260522072322.18aa72dd@gandalf.local.home>
On Fri, 22 May 2026 07:23:22 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:
> > > @@ -653,6 +686,20 @@ static int parse_btf_arg(char *varname,
> > > return -EOPNOTSUPP;
> > > }
> > >
> > > + if (ctx->flags & TPARG_FL_TEVENT) {
> > > + int ret;
> > > +
> > > + ret = parse_trace_event(varname, code, ctx);
> > > + if (ret < 0)
> > > + return ret;
>
> > When parse_trace_event() returns a negative error code (such as -EINVAL or
> > -ENOENT) because a field name is invalid, the error is propagated back up
> > the stack. Does this path miss calling trace_probe_log_err()?
> > If so, users might receive a generic failure without context or a caret
> > pointing to the specific syntax error.
>
> Hmm, there's a comment in the parse_trace_event() that sets ctx->offset for
> backward compatibility. I'll investigate to see if we can fix that now.
Masami,
I looked at the code for parse_trace_event() that has:
/* backward compatibility */
ctx->offset = 0;
return -EINVAL;
And it was originally introduced by commit 1b8b0cd754cd ("tracing/probes:
Move event parameter fetching code to common parser"), with:
+ ret = parse_trace_event_arg(arg, code, ctx);
+ if (!ret)
+ return 0;
+ if (strcmp(arg, "comm") == 0 || strcmp(arg, "COMM") == 0) {
+ code->op = FETCH_OP_COMM;
+ return 0;
+ }
+ /* backward compatibility */
+ ctx->offset = 0;
+ goto inval;
+ }
+
What was the reason for the "backward compatibility"? Can we make it a real
error now?
-- Steve
^ permalink raw reply
* Re: [PATCH] unwind: Add sframe_(un)register() system calls
From: Thomas Weißschuh @ 2026-05-22 14:36 UTC (permalink / raw)
To: Steven Rostedt
Cc: LKML, Linux Trace Kernel, bpf, Masami Hiramatsu,
Mathieu Desnoyers, Jens Remus, Josh Poimboeuf, Peter Zijlstra,
Ingo Molnar, Jiri Olsa, Arnaldo Carvalho de Melo, Namhyung Kim,
Thomas Gleixner, Andrii Nakryiko, Indu Bhagat, Jose E. Marchesi,
Beau Belgrave, Linus Torvalds, Andrew Morton, Florian Weimer,
Kees Cook, Carlos O'Donell, Sam James, Dylan Hatch,
Borislav Petkov, Dave Hansen, David Hildenbrand, H. Peter Anvin,
Liam R. Howlett, Lorenzo Stoakes, Michal Hocko, Mike Rapoport,
Suren Baghdasaryan, Vlastimil Babka, Heiko Carstens,
Vasily Gorbik
In-Reply-To: <20260521183532.7a145c8a@gandalf.local.home>
On 2026-05-21 18:35:32-0400, Steven Rostedt wrote:
> From: Steven Rostedt <rostedt@goodmis.org>
>
> Add system calls to register and unregister sframes that can be used by
> dynamic linkers to tell the kernel where the sframe section is in memory
> for libraries it loads.
How is this system call related to the prctl() with the same
functionality from Jens' series? I guess it will replace it,
but some explanation would be nice.
(...)
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index f5639d5ac331..992ccc401c5e 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -999,6 +999,8 @@ asmlinkage long sys_lsm_get_self_attr(unsigned int attr, struct lsm_ctx __user *
> asmlinkage long sys_lsm_set_self_attr(unsigned int attr, struct lsm_ctx __user *ctx,
> u32 size, u32 flags);
> asmlinkage long sys_lsm_list_modules(u64 __user *ids, u32 __user *size, u32 flags);
> +asmlinkage long sys_sframe_register(void *data, unsigned int size);
> +asmlinkage long sys_sframe_unregister(void *data, unsigned int size);
Why not use the actual structure here?
> /*
> * Architecture-specific system calls
(...)
> diff --git a/include/uapi/linux/sframe.h b/include/uapi/linux/sframe.h
> new file mode 100644
> index 000000000000..137a2ebf91f4
> --- /dev/null
> +++ b/include/uapi/linux/sframe.h
> @@ -0,0 +1,12 @@
> +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
> +#ifndef _UAPI_LINUX_SFRAME_H
> +#define _UAPI_LINUX_SFRAME_H
> +
> +struct sframe_setup {
> + unsigned long sframe_start;
> + unsigned long sframe_size;
> + unsigned long text_start;
> + unsigned long text_size;
> +};
This will break for compat processes, as they use a different 'unsigned
long' than the host kernel. Maybe just use __u64.
> +
> +#endif /* _UAPI_LINUX_SFRAME_H */
(...)
> +/**
> + * sys_sframe_register - register an address for user space stacktrace walking.
> + * @data: Structure of sframe data used to register the sframe section
> + * @size: The size of the given structure.
> + *
> + * This system call is used by dynamic library utilities to inform the kernel
> + * of meta data that it loaded that can be used by the kernel to know how
> + * to stack walk the given text locations.
> + *
> + * Return: 0 if successful, otherwise a negative error.
> + */
> +SYSCALL_DEFINE2(sframe_register, __user struct sframe_setup *, data, unsigned int, size)
AFAIK the normal place for the '__user' is right before '*':
struct sframe_setup __user *, data,
Use __kernel_size_t for 'size'?
> +{
> + struct sframe_setup sframe;
(...)
^ permalink raw reply
* Re: [PATCH v6 21/43] KVM: SEV: Make 'uaddr' parameter optional for KVM_SEV_SNP_LAUNCH_UPDATE
From: Sean Christopherson @ 2026-05-22 13:08 UTC (permalink / raw)
To: Ackerley Tng
Cc: Fuad Tabba, aik, andrew.jones, binbin.wu, brauner, chao.p.peng,
david, ira.weiny, jmattson, jthoughton, michael.roth, oupton,
pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
steven.price, willy, wyihan, yan.y.zhao, forkloop, pratyush,
suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
H. Peter Anvin, Steven Rostedt, Masami Hiramatsu,
Mathieu Desnoyers, Jonathan Corbet, Shuah Khan, Shuah Khan,
Vishal Annapurve, Andrew Morton, Chris Li, Kairui Song,
Kemeng Shi, Nhat Pham, Baoquan He, Barry Song, Axel Rasmussen,
Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng, Shakeel Butt,
Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka, kvm,
linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
linux-mm, linux-coco
In-Reply-To: <CAEvNRgFB8ydih9JTmsH06H32j38tH-iViZqN_eZ_gQAmXpw+Dw@mail.gmail.com>
On Thu, May 21, 2026, Ackerley Tng wrote:
> Sean Christopherson <seanjc@google.com> writes:
>
> > On Thu, May 21, 2026, Fuad Tabba wrote:
> >> Hi,
> >>
> >> On Thu, 7 May 2026 at 21:22, Ackerley Tng via B4 Relay
> > diff --git include/linux/kvm_host.h include/linux/kvm_host.h
> > index 61a3430957f2..b83cda2870ba 100644
> > --- include/linux/kvm_host.h
> > +++ include/linux/kvm_host.h
> > @@ -2596,7 +2596,8 @@ int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_ord
> > typedef int (*kvm_gmem_populate_cb)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
> > struct page *page, void *opaque);
> >
> > -long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages,
> > +long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src,
> > + long npages, bool writable,
>
> What do you think of need_writable_src instead of just writable for the
> variable name?
How about "may_write_src" or "may_writeback_src"?
> > kvm_gmem_populate_cb post_populate, void *opaque);
> > #endif
> >
> > diff --git virt/kvm/guest_memfd.c virt/kvm/guest_memfd.c
> > index a35a55571a2d..6553d4e032ce 100644
> > --- virt/kvm/guest_memfd.c
> > +++ virt/kvm/guest_memfd.c
> > @@ -858,7 +858,8 @@ static long __kvm_gmem_populate(struct kvm *kvm, struct kvm_memory_slot *slot,
> > return ret;
> > }
> >
> > -long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages,
> > +long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src,
> > + long npages, bool writable,
> > kvm_gmem_populate_cb post_populate, void *opaque)
> > {
> > struct kvm_memory_slot *slot;
> > @@ -892,8 +893,9 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
> >
> > if (src) {
> > unsigned long uaddr = (unsigned long)src + i * PAGE_SIZE;
> > + unsigned int flags = writable ? FOLL_WRITE : 0;
>
> How about using FOLL_WRITE | FOLL_NOFAULT so if it weren't writable to
> start with, don't CoW, just error out?
Eh, I don't see any value in erroring out if userspace is doing something
unusual. If breaking CoW was actually problematic somehow, then sure. But AFAICT
it's overall harmless.
> Like you said above the CPUID page provided as src_page would have been
> written to before, so it should have been mapped as writable.
^ permalink raw reply
* Re: [PATCH mm-unstable v17 11/14] mm/khugepaged: Introduce mTHP collapse support
From: Nico Pache @ 2026-05-22 12:39 UTC (permalink / raw)
To: Vernon Yang
Cc: linux-doc, linux-kernel, linux-mm, linux-trace-kernel, aarcange,
akpm, anshuman.khandual, apopple, baohua, baolin.wang, byungchul,
catalin.marinas, cl, corbet, dave.hansen, david, dev.jain, gourry,
hannes, hughd, jack, jackmanb, jannh, jglisse, joshua.hahnjy, kas,
lance.yang, liam, ljs, mathieu.desnoyers, matthew.brost, mhiramat,
mhocko, peterx, pfalcato, rakie.kim, raquini, rdunlap,
richard.weiyang, rientjes, rostedt, rppt, ryan.roberts, shivankg,
sunnanyong, surenb, thomas.hellstrom, tiwai, usamaarif642, vbabka,
vishal.moola, wangkefeng.wang, will, willy, yang, ying.huang, ziy,
zokeefe
In-Reply-To: <8f9834db-8981-4eb1-ae46-94908943da3d@gmail.com>
On Wed, May 20, 2026 at 8:36 PM Vernon Yang <vernon2gm@gmail.com> wrote:
>
> On Mon, May 11, 2026 at 12:58:11PM -0600, Nico Pache wrote:
> > Enable khugepaged to collapse to mTHP orders. This patch implements the
> > main scanning logic using a bitmap to track occupied pages and a stack
> > structure that allows us to find optimal collapse sizes.
> >
> > Previous to this patch, PMD collapse had 3 main phases, a light weight
> > scanning phase (mmap_read_lock) that determines a potential PMD
> > collapse, an alloc phase (mmap unlocked), then finally heavier collapse
> > phase (mmap_write_lock).
> >
> > To enable mTHP collapse we make the following changes:
> >
> > During PMD scan phase, track occupied pages in a bitmap. When mTHP
> > orders are enabled, we remove the restriction of max_ptes_none during the
> > scan phase to avoid missing potential mTHP collapse candidates. Once we
> > have scanned the full PMD range and updated the bitmap to track occupied
> > pages, we use the bitmap to find the optimal mTHP size.
> >
> > Implement collapse_scan_bitmap() to perform binary recursion on the bitmap
> > and determine the best eligible order for the collapse. A stack structure
> > is used instead of traditional recursion to manage the search. This also
> > prevents a traditional recursive approach when the kernel stack struct is
> > limited. The algorithm recursively splits the bitmap into smaller chunks to
> > find the highest order mTHPs that satisfy the collapse criteria. We start
> > by attempting the PMD order, then move on to consecutively lower orders
> > (mTHP collapse). The stack maintains a pair of variables (offset, order),
> > indicating the number of PTEs from the start of the PMD, and the order of
> > the potential collapse candidate.
> >
> > The algorithm for consuming the bitmap works as such:
> > 1) push (0, HPAGE_PMD_ORDER) onto the stack
> > 2) pop the stack
> > 3) check if the number of set bits in that (offset,order) pair
> > satisfy the max_ptes_none threshold for that order
> > 4) if yes, attempt collapse
> > 5) if no (or collapse fails), push two new stack items representing
> > the left and right halves of the current bitmap range, at the
> > next lower order
> > 6) repeat at step (2) until stack is empty.
> >
> > Below is a diagram representing the algorithm and stack items:
> >
> > offset mid_offset
> > | |
> > | |
> > v v
> > ____________________________________
> > | PTE Page Table |
> > --------------------------------------
> > <-------><------->
> > order-1 order-1
> >
> > mTHP collapses reject regions containing swapped out or shared pages.
> > This is because adding new entries can lead to new none pages, and these
> > may lead to constant promotion into a higher order mTHP. A similar
> > issue can occur with "max_ptes_none > HPAGE_PMD_NR/2" due to a collapse
> > introducing at least 2x the number of pages, and on a future scan will
> > satisfy the promotion condition once again. This issue is prevented via
> > the collapse_max_ptes_none() function which imposes the max_ptes_none
> > restrictions above.
> >
> > We currently only support mTHP collapse for max_ptes_none values of 0
> > and HPAGE_PMD_NR - 1, resulting in the following behavior:
> >
> > - max_ptes_none=0: Never introduce new empty pages during collapse
> > - max_ptes_none=HPAGE_PMD_NR-1: Always try collapse to the highest
> > available mTHP order
> >
> > Any other max_ptes_none value will emit a warning and skip mTHP collapse
> > attempts. There should be no behavior change for PMD collapse.
> >
> > Once we determine which mTHP size fits best in that PMD range, a collapse
> > is attempted. A minimum collapse order of 2 is used as this is the lowest
> > order supported by anon memory as defined by THP_ORDERS_ALL_ANON.
> >
> > Currently madv_collapse is not supported and will only attempt PMD
> > collapse.
> >
> > We can also remove the check for is_khugepaged inside the PMD scan as
> > the collapse_max_ptes_none() function handles this logic now.
> >
> > Signed-off-by: Nico Pache <npache@redhat.com>
> > ---
> > mm/khugepaged.c | 182 +++++++++++++++++++++++++++++++++++++++++++++---
> > 1 file changed, 174 insertions(+), 8 deletions(-)
> >
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index 3492b135d667..39bf7ea8a6e8 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -100,6 +100,30 @@ static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
> >
> > static struct kmem_cache *mm_slot_cache __ro_after_init;
> >
> > +#define KHUGEPAGED_MIN_MTHP_ORDER 2
> > +/*
> > + * mthp_collapse() does an iterative DFS over a binary tree, from
> > + * HPAGE_PMD_ORDER down to KHUGEPAGED_MIN_MTHP_ORDER. The max stack
> > + * size needed for a DFS on a binary tree is height + 1, where
> > + * height = HPAGE_PMD_ORDER - KHUGEPAGED_MIN_MTHP_ORDER.
> > + *
> > + * ilog2 is used in place of HPAGE_PMD_ORDER because some architectures
> > + * (e.g. ppc64le) do not define HPAGE_PMD_ORDER until after build time.
> > + */
> > +#define MTHP_STACK_SIZE (ilog2(MAX_PTRS_PER_PTE) - KHUGEPAGED_MIN_MTHP_ORDER + 1)
> > +
> > +/*
> > + * Defines a range of PTE entries in a PTE page table which are being
> > + * considered for mTHP collapse.
> > + *
> > + * @offset: the offset of the first PTE entry in a PMD range.
> > + * @order: the order of the PTE entries being considered for collapse.
> > + */
> > +struct mthp_range {
> > + u16 offset;
> > + u8 order;
> > +};
> > +
> > struct collapse_control {
> > bool is_khugepaged;
> >
> > @@ -111,6 +135,12 @@ struct collapse_control {
> >
> > /* nodemask for allocation fallback */
> > nodemask_t alloc_nmask;
> > +
> > + /* Each bit represents a single occupied (!none/zero) page. */
> > + DECLARE_BITMAP(mthp_bitmap, MAX_PTRS_PER_PTE);
> > + /* A mask of the current range being considered for mTHP collapse. */
> > + DECLARE_BITMAP(mthp_bitmap_mask, MAX_PTRS_PER_PTE);
> > + struct mthp_range mthp_bitmap_stack[MTHP_STACK_SIZE];
> > };
> >
> > /**
> > @@ -1404,20 +1434,140 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long s
> > return result;
> > }
> >
> > +static void collapse_mthp_stack_push(struct collapse_control *cc, int *stack_size,
> > + u16 offset, u8 order)
> > +{
> > + const int size = *stack_size;
> > + struct mthp_range *stack = &cc->mthp_bitmap_stack[size];
> > +
> > + VM_WARN_ON_ONCE(size >= MTHP_STACK_SIZE);
> > + stack->order = order;
> > + stack->offset = offset;
> > + (*stack_size)++;
> > +}
> > +
> > +static struct mthp_range collapse_mthp_stack_pop(struct collapse_control *cc,
> > + int *stack_size)
> > +{
> > + const int size = *stack_size;
> > +
> > + VM_WARN_ON_ONCE(size <= 0);
> > + (*stack_size)--;
> > + return cc->mthp_bitmap_stack[size - 1];
> > +}
> > +
> > +static unsigned int collapse_mthp_count_present(struct collapse_control *cc,
> > + u16 offset, unsigned int nr_ptes)
> > +{
> > + bitmap_zero(cc->mthp_bitmap_mask, MAX_PTRS_PER_PTE);
> > + bitmap_set(cc->mthp_bitmap_mask, offset, nr_ptes);
> > + return bitmap_weight_and(cc->mthp_bitmap, cc->mthp_bitmap_mask, MAX_PTRS_PER_PTE);
> > +}
> > +
> > +/*
> > + * mthp_collapse() consumes the bitmap that is generated during
> > + * collapse_scan_pmd() to determine what regions and mTHP orders fit best.
> > + *
> > + * Each bit in cc->mthp_bitmap represents a single occupied (!none/zero) page.
> > + * A stack structure cc->mthp_bitmap_stack is used to check different regions
> > + * of the bitmap for collapse eligibility. The stack maintains a pair of
> > + * variables (offset, order), indicating the number of PTEs from the start of
> > + * the PMD, and the order of the potential collapse candidate respectively. We
> > + * start at the PMD order and check if it is eligible for collapse; if not, we
> > + * add two entries to the stack at a lower order to represent the left and right
> > + * halves of the PTE page table we are examining.
> > + *
> > + * offset mid_offset
> > + * | |
> > + * | |
> > + * v v
> > + * --------------------------------------
> > + * | cc->mthp_bitmap |
> > + * --------------------------------------
> > + * <-------><------->
> > + * order-1 order-1
> > + *
> > + * For each of these, we determine how many PTE entries are occupied in the
> > + * range of PTE entries we propose to collapse, then we compare this to a
> > + * threshold number of PTE entries which would need to be occupied for a
> > + * collapse to be permitted at that order (accounting for max_ptes_none).
> > + *
> > + * If a collapse is permitted, we attempt to collapse the PTE range into a
> > + * mTHP.
> > + */
> > +static int mthp_collapse(struct mm_struct *mm, unsigned long address,
> > + int referenced, int unmapped, struct collapse_control *cc,
> > + unsigned long enabled_orders)
> > +{
> > + unsigned int nr_occupied_ptes, nr_ptes;
> > + int max_ptes_none, collapsed = 0, stack_size = 0;
> > + unsigned long collapse_address;
> > + struct mthp_range range;
> > + u16 offset;
> > + u8 order;
> > +
> > + collapse_mthp_stack_push(cc, &stack_size, 0, HPAGE_PMD_ORDER);
> > +
> > + while (stack_size) {
> > + range = collapse_mthp_stack_pop(cc, &stack_size);
> > + order = range.order;
> > + offset = range.offset;
> > + nr_ptes = 1UL << order;
> > +
> > + if (!test_bit(order, &enabled_orders))
> > + goto next_order;
> > +
> > + max_ptes_none = collapse_max_ptes_none(cc, NULL, order);
> > +
> > + if (max_ptes_none < 0)
> > + return collapsed;
> > +
> > + nr_occupied_ptes = collapse_mthp_count_present(cc, offset,
> > + nr_ptes);
> > +
> > + if (nr_occupied_ptes >= nr_ptes - max_ptes_none) {
> > + int ret;
> > +
> > + collapse_address = address + offset * PAGE_SIZE;
> > + ret = collapse_huge_page(mm, collapse_address, referenced,
> > + unmapped, cc, order);
> > + if (ret == SCAN_SUCCEED) {
> > + collapsed += nr_ptes;
> > + continue;
> > + }
> > + }
> > +
> > +next_order:
> > + if (order > KHUGEPAGED_MIN_MTHP_ORDER) {
>
> Hi Nico, thank you very much for your contributions to this series.
>
> I found a minor issue, for MADV_COLLAPSE, if collapse_huge_page() fails
> for some reason (e.g. allocate folio), it goes to next_order and
> continues splitting to the next small order. However, enabled_orders
> only supports HPAGE_PMD_ORDER, so it keeps running the split operations
> without any effective work until KHUGEPAGED_MIN_MTHP_ORDER is reached
> before exiting. The same happens for khugepaged, e.g. when only the 2MB
> size is set to always.
>
> This does not affect the overall functionality of mthp collapse, just
> redundant.
>
> The redundant operations can be easily skipped with the following
> modification. If I missed something, please let me know. Thanks!
Hi Vernon!
Thank you for the report and very clean solution :) I will implement
your optimization into this commit.
Cheers,
-- Nico
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 1a25af3d6d0f..fa407cce525c 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1574,7 +1574,7 @@ static int mthp_collapse(struct mm_struct *mm, unsigned long address,
> }
>
> next_order:
> - if (order > KHUGEPAGED_MIN_MTHP_ORDER) {
> + if ((BIT(order) - 1) & enabled_orders) {
> const u8 next_order = order - 1;
> const u16 mid_offset = offset + (nr_ptes / 2);
>
> --
> Cheers,
> Vernon
>
> > + const u8 next_order = order - 1;
> > + const u16 mid_offset = offset + (nr_ptes / 2);
> > +
> > + collapse_mthp_stack_push(cc, &stack_size, mid_offset,
> > + next_order);
> > + collapse_mthp_stack_push(cc, &stack_size, offset,
> > + next_order);
> > + }
> > + }
> > + return collapsed;
> > +}
> > +
> > static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
> > struct vm_area_struct *vma, unsigned long start_addr,
> > bool *lock_dropped, struct collapse_control *cc)
> > {
> > - const int max_ptes_none = collapse_max_ptes_none(cc, vma, HPAGE_PMD_ORDER);
> > + int max_ptes_none = collapse_max_ptes_none(cc, vma, HPAGE_PMD_ORDER);
> > const unsigned int max_ptes_shared = collapse_max_ptes_shared(cc, HPAGE_PMD_ORDER);
> > const unsigned int max_ptes_swap = collapse_max_ptes_swap(cc, HPAGE_PMD_ORDER);
> > + enum tva_type tva_flags = cc->is_khugepaged ? TVA_KHUGEPAGED : TVA_FORCED_COLLAPSE;
> > pmd_t *pmd;
> > - pte_t *pte, *_pte;
> > - int none_or_zero = 0, shared = 0, referenced = 0;
> > + pte_t *pte, *_pte, pteval;
> > + int i;
> > + int none_or_zero = 0, shared = 0, nr_collapsed = 0, referenced = 0;
> > enum scan_result result = SCAN_FAIL;
> > struct page *page = NULL;
> > struct folio *folio = NULL;
> > unsigned long addr;
> > + unsigned long enabled_orders;
> > spinlock_t *ptl;
> > int node = NUMA_NO_NODE, unmapped = 0;
> >
> > @@ -1429,8 +1579,19 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
> > goto out;
> > }
> >
> > + bitmap_zero(cc->mthp_bitmap, MAX_PTRS_PER_PTE);
> > memset(cc->node_load, 0, sizeof(cc->node_load));
> > nodes_clear(cc->alloc_nmask);
> > +
> > + enabled_orders = collapse_allowable_orders(vma, vma->vm_flags, tva_flags);
> > +
> > + /*
> > + * If PMD is the only enabled order, enforce max_ptes_none, otherwise
> > + * scan all pages to populate the bitmap for mTHP collapse.
> > + */
> > + if (enabled_orders != BIT(HPAGE_PMD_ORDER))
> > + max_ptes_none = KHUGEPAGED_MAX_PTES_LIMIT;
> > +
> > pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl);
> > if (!pte) {
> > cc->progress++;
> > @@ -1438,11 +1599,13 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
> > goto out;
> > }
> >
> > - for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR;
> > - _pte++, addr += PAGE_SIZE) {
> > + for (i = 0; i < HPAGE_PMD_NR; i++) {
> > + _pte = pte + i;
> > + addr = start_addr + i * PAGE_SIZE;
> > + pteval = ptep_get(_pte);
> > +
> > cc->progress++;
> >
> > - pte_t pteval = ptep_get(_pte);
> > if (pte_none_or_zero(pteval)) {
> > if (++none_or_zero > max_ptes_none) {
> > result = SCAN_EXCEED_NONE_PTE;
> > @@ -1522,6 +1685,8 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
> > }
> > }
> >
> > + /* Set bit for occupied pages */
> > + __set_bit(i, cc->mthp_bitmap);
> > /*
> > * Record which node the original page is from and save this
> > * information to cc->node_load[].
> > @@ -1580,10 +1745,11 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
> > if (result == SCAN_SUCCEED) {
> > /* collapse_huge_page expects the lock to be dropped before calling */
> > mmap_read_unlock(mm);
> > - result = collapse_huge_page(mm, start_addr, referenced,
> > - unmapped, cc, HPAGE_PMD_ORDER);
> > + nr_collapsed = mthp_collapse(mm, start_addr, referenced, unmapped,
> > + cc, enabled_orders);
> > /* collapse_huge_page will return with the mmap_lock released */
> > *lock_dropped = true;
> > + result = nr_collapsed ? SCAN_SUCCEED : SCAN_FAIL;
> > }
> > out:
> > trace_mm_khugepaged_scan_pmd(mm, folio, referenced,
> > --
> > 2.54.0
> >
> >
>
^ permalink raw reply
* Re: [LSF/MM/BPF TOPIC][RFC PATCH v4 00/27] Private Memory Nodes (w/ Compressed RAM)
From: Arun George/Arun George @ 2026-05-22 8:40 UTC (permalink / raw)
To: Gregory Price
Cc: lsf-pc, linux-kernel, linux-cxl, cgroups, linux-mm,
linux-trace-kernel, damon, kernel-team, gregkh, rafael, dakr,
dave, dave.jiang, alison.schofield, vishal.l.verma, ira.weiny,
longman, akpm, david, lorenzo.stoakes, Liam.Howlett, vbabka, rppt,
surenb, mhocko, osalvador, ziy, matthew.brost, joshua.hahnjy,
rakie.kim, byungchul, ying.huang, apopple, axelrasmussen, yuanchu,
weixugc, yury.norov, linux, mhiramat, mathieu.desnoyers, tj,
hannes, mkoutny, jackmanb, sj, baolin.wang, npache, ryan.roberts,
dev.jain, baohua, lance.yang, muchun.song, xu.xin16,
chengming.zhou, jannh, linmiaohe, nao.horiguchi, pfalcato,
rientjes, shakeel.butt, riel, harry.yoo, cl, roman.gushchin,
chrisl, kasong, shikemeng, nphamcs, bhe, zhengqi.arch,
terry.bowman, gost.dev, arungeorge05, cpgs
In-Reply-To: <afmgJcFUjQLYxkb5@gourry-fedora-PF4VCD3F>
Thanks.
On 05-05-2026 01:15 pm, Gregory Price wrote:
> In the scenario i'm talking about, a "write budget" is defined as a
> number of pages that are allowed to be mapped writable in the page
> tables at any given time.
Agree. I was also in the same context.
I am trying to bring the device perspective here, and would like to
discuss a few corner cases and possible solutions.
As I see, solving the compressed memory problem statement has these
aspects mainly:
1) Allocation control: private/managed memory concept.
2) Write control: write-protected PTEs, write-controlled use cases like
ZSWAP
3) Proactive reclaims: optional methods to ease back-pressure using
memory shrinkers, ballooning, kswapd, promotion etc. These methods will
be triggered based on notifications/interrupts from the device.
Maybe they are not enough to cover some corner cases for cram!
I believe that this thin-provisioned memory infra is susceptible to
'writes-above-media-capacity corner cases' (because of not handling
device back-pressure notifications in time) whichever methods we use in
the kernel. Even if we use write-controlled methods like ZSWAP and
pro-active reclaims, there could be corner cases where the communication
with the device could be broken and the write path is not aware of it
immediately. Note that OCP spec [1] says the device should mark the
memory location as 'poisoned' in 'over-capacity' writes.
So I have the following proposals / options for this scenario.
Option 1: Poisoned data management - This is about accepting that
poisoning of memory locations can happen with much greater frequency
here than with regular memory, and we need to figure out potential recovery
mechanisms in host (not recovery of data; but recovery from the poison
situation). But I guess folks will not be okay with it in general, and I
am not aware of any workloads where data poisoning is tolerated (maybe
caching workloads?).
Option 2 (preferred): Device assisted write budgeting - This is
about a device aware / assisted mechanism for the write-controlled
use-cases (Ex: ZSWAP) to know the 'safe number of writes' that can be
performed to the device (or allowed to be mapped writable in the page
tables). This could be like a 'token bucket' algorithm, where the device
provides a 'budget / set of tokens' to the host. And it needs to be
replenished periodically in the device communication code path; and if
the host does not find the token, writes cannot go ahead.
In short, the communication with the device has to be maintained to make
pages mapped writable. For MVP, this could be a simple constraint of
checking actual device capacity periodically to replenish write-budget
for CRAM. For other users of private nodes (GPU memory?), this
constraint may not be needed at all.
We are planning to send an RFC code which will fit into your CRAM infra
to discuss this poison management approach further.
[1]:
https://www.opencompute.org/documents/hyperscale-tiered-memory-expander-specification-for-compute-express-link-cxl-1-pdf
~Arun George
^ permalink raw reply
* Re: [PATCH v6] tracing/eprobes: Allow use of BTF names to dereference pointers
From: Steven Rostedt @ 2026-05-22 11:23 UTC (permalink / raw)
To: LKML, Linux trace kernel
Cc: Masami Hiramatsu, Mathieu Desnoyers, Mark Rutland, Peter Zijlstra,
Namhyung Kim, Takaya Saeki, Douglas Raillard, Tom Zanussi,
Andrew Morton, Thomas Gleixner, Ian Rogers, Jiri Olsa,
sashiko-bot@kernel.org, sashiko-reviews@lists.linux.dev
In-Reply-To: <20260521225033.56458336@fedora>
[ Replying to Sashiko: https://sashiko.dev/?list=org.kernel.vger.linux-trace-kernel#/patchset/20260521225033.56458336%40fedora ]
> > @@ -653,6 +686,20 @@ static int parse_btf_arg(char *varname,
> > return -EOPNOTSUPP;
> > }
> >
> > + if (ctx->flags & TPARG_FL_TEVENT) {
> > + int ret;
> > +
> > + ret = parse_trace_event(varname, code, ctx);
> > + if (ret < 0)
> > + return ret;
> When parse_trace_event() returns a negative error code (such as -EINVAL or
> -ENOENT) because a field name is invalid, the error is propagated back up
> the stack. Does this path miss calling trace_probe_log_err()?
> If so, users might receive a generic failure without context or a caret
> pointing to the specific syntax error.
Hmm, there's a comment in the parse_trace_event() that sets ctx->offset for
backward compatibility. I'll investigate to see if we can fix that now.
> > +
> > + if (ctx->flags & TPARG_FL_TYPECAST) {
> > + type = ctx->last_struct;
> > + goto found_type;
> > + }
> > + return 0;
> > + }
> ...
> > @@ -758,7 +806,70 @@ static int parse_btf_bitfield(struct fetch_insn **pcode,
> > return 0;
> > }
> >
> > +static int handle_typecast(char *arg, struct fetch_insn **pcode,
> > + struct fetch_insn *end,
> > + struct traceprobe_parse_context *ctx)
> > +{
> ...
> > + ctx->flags |= TPARG_FL_TYPECAST;
> > + tmp++;
> > +
> > + ctx->offset += tmp - arg;
> > + ret = parse_btf_arg(tmp, pcode, end, ctx);
> > + ctx->flags &= ~TPARG_FL_TYPECAST;
> Since TPARG_FL_TYPECAST is cleared here, will it break automatic type
> inference later?
> The traceprobe_parse_context is used for subsequent steps like
> find_fetch_type_from_btf_type() and check_prepare_btf_string_fetch(),
> which execute in traceprobe_parse_probe_arg() after this function returns.
> These functions rely on ctx_btf(), which needs TPARG_FL_TYPECAST to be
> set to return ctx->struct_btf.
> > + ctx->last_struct = NULL;
> > +out_put:
> > + btf_put(ctx->struct_btf);
> Because ctx->struct_btf is not set to NULL after calling btf_put(),
> can this cause a refcount underflow and use-after-free?
> If a user creates an eprobe with multiple typecasted arguments, the shared
> traceprobe_parse_context means the second argument will enter
> query_btf_struct() and see ctx->struct_btf is not NULL. It will skip
> acquiring a new reference but still use the pointer. At the end of parsing
> the second argument, btf_put(ctx->struct_btf) will be called again
> unconditionally.
Oops, I forgot to do:
ctx->struct_btf = NULL;
here.
Will fix.
Thanks,
-- Steve
> > + return ret;
> > +}
^ permalink raw reply
* Re: [PATCH] unwind: Add sframe_(un)register() system calls
From: Steven Rostedt @ 2026-05-22 11:18 UTC (permalink / raw)
To: Jens Remus
Cc: LKML, Linux Trace Kernel, bpf, Masami Hiramatsu,
Mathieu Desnoyers, Josh Poimboeuf, Peter Zijlstra, Ingo Molnar,
Jiri Olsa, Arnaldo Carvalho de Melo, Namhyung Kim,
Thomas Gleixner, Andrii Nakryiko, Indu Bhagat, Jose E. Marchesi,
Beau Belgrave, Linus Torvalds, Andrew Morton, Florian Weimer,
Kees Cook, Carlos O'Donell, Sam James, Dylan Hatch,
Borislav Petkov, Dave Hansen, David Hildenbrand, H. Peter Anvin,
Liam R. Howlett, Lorenzo Stoakes, Michal Hocko, Mike Rapoport,
Suren Baghdasaryan, Vlastimil Babka, Heiko Carstens,
Vasily Gorbik
In-Reply-To: <d82ea328-758e-4e41-a58e-46f3137feca7@linux.ibm.com>
On Fri, 22 May 2026 11:43:06 +0200
Jens Remus <jremus@linux.ibm.com> wrote:
> On 5/22/2026 12:35 AM, Steven Rostedt wrote:
> > From: Steven Rostedt <rostedt@goodmis.org>
> >
> > Add system calls to register and unregister sframes that can be used by
> > dynamic linkers to tell the kernel where the sframe section is in memory
> > for libraries it loads.
>
> Why two separate system calls? Can't that be one single stacktracectl?
> Could they at least be non-sframe specific, e.g. stacktrace_register
> and stacktrace_unregister, so that if one would implement e.g. unwind
> user dwarf/eh_frame in the future one could pass ehframe_start and
> ehframe_end in addition to sframe_start and sframe_end?
Talking with everyone at LSF/MM/BPF the consensus was to avoid an ioctl
like system call. Everyone hates them. They told me that a system call
should do one thing. They wanted a separate system call to register and to
unregister.
Note this also helps to see what the user is doing when monitoring via
ftrace and strace, and security-wise via LSMs and seccomp.
>
> >
> > Both system calls take a pointer to a new structure:
> >
> > struct sframe_setup {
> > unsigned long sframe_start;
> > unsigned long sframe_size;
> > unsigned long text_start;
> > unsigned long text_size;
> > };
> >
> > and a size of the passed in structure. If the system call needs to be
> > extended, then the structure could be changed and the size of that
> > structure will tell the kernel that it is the new version. If the kernel
> > does not recognize the structure size, it will return -EINVAL.
> >
> > sframe_start - The virtual address of the sframe section
> > sframe_size - The length of the sframe section
> > text_start - the text section the sframe represents
> > text_size - the length of the text section
> >
> > If other stack tracing functionality is added, it will require a new
> > system call.
> >
> > The unregister only needs the sframe_start and requires all the rest of
> > the fields to be 0. In the future, if more can be done, then user space
> > can update the other values and check the return code to see if the kernel
> > supports it.
> >
> > Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
> > ---
> >
> > Based on top of Jens patches here:
> >
> > https://lore.kernel.org/linux-trace-kernel/20260520154004.3845823-1-jremus@linux.ibm.com/
> >
> > [ Note, I tested this with the same program from the RFC patch ]
> >
> > Changes from RFC: https://patch.msgid.link/20260429114355.6c712e6a@gandalf.local.home
> >
> > - Remove the ioctl() like system call for a unique system call for each
> > functionality. Right now there's two functionalities:
> > 1. register sframe section
> > 2. unregister sframe sections
> >
> > - Added taking a lock around the mtree logic in __sframe_remove_section()
> > as Sashiko mentioned that there could be races from user space
> > registering and unregistering sframe sections at the same time.
>
> Doesn't sframe_add_section() then also need likewise?
Ah, I saw the lock grabbed on the vma lookup. It should also be done for the
mtree_insert_range(). Thanks, will fix.
>
> >
> > - Removed [RFC] from subject as I believe this is more likely the way
> > this system call will be done.
>
> > diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
>
> > @@ -999,6 +999,8 @@ asmlinkage long sys_lsm_get_self_attr(unsigned int attr, struct lsm_ctx __user *
> > asmlinkage long sys_lsm_set_self_attr(unsigned int attr, struct lsm_ctx __user *ctx,
> > u32 size, u32 flags);
> > asmlinkage long sys_lsm_list_modules(u64 __user *ids, u32 __user *size, u32 flags);
> > +asmlinkage long sys_sframe_register(void *data, unsigned int size);
> > +asmlinkage long sys_sframe_unregister(void *data, unsigned int size);
> >
> > /*
> > * Architecture-specific system calls
>
>
> > diff --git a/include/uapi/linux/sframe.h b/include/uapi/linux/sframe.h
>
> > @@ -0,0 +1,12 @@
> > +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
> > +#ifndef _UAPI_LINUX_SFRAME_H
> > +#define _UAPI_LINUX_SFRAME_H
> > +
> > +struct sframe_setup {
> > + unsigned long sframe_start;
> > + unsigned long sframe_size;
> > + unsigned long text_start;
> > + unsigned long text_size;
> > +};
> > +
> > +#endif /* _UAPI_LINUX_SFRAME_H */
>
> > diff --git a/kernel/unwind/sframe.c b/kernel/unwind/sframe.c
>
> > @@ -842,9 +844,11 @@ static void sframe_free_srcu(struct rcu_head *rcu)
> > static int __sframe_remove_section(struct mm_struct *mm,
> > struct sframe_section *sec)
> > {
> > - if (!mtree_erase(&mm->sframe_mt, sec->text_start)) {
> > - dbg_sec("mtree_erase failed: text=%lx\n", sec->text_start);
> > - return -EINVAL;
> > + scoped_guard(mmap_read_lock, mm) {
>
> Why is a read lock sufficient? Doesn't that allow multiple readers?
> How does that prevent a concurrent modification of the mm->sframe_mt?
That was a cut and paste error. I meant to change it to a write lock, but
got distracted :-p Thanks, will fix.
>
> > + if (!mtree_erase(&mm->sframe_mt, sec->text_start)) {
> > + dbg_sec("mtree_erase failed: text=%lx\n", sec->text_start);
> > + return -EINVAL;
> > + }
>
> Is (or why not) likewise required in sframe_add_section() for the
> mtree_insert_range()?
>
> Wasn't the reported issue that while mt_for_each() in
> sframe_remove_section() there could be concurrent mtree_erase() in
> __sframe_remove_section() followed by mtree_insert_range() in
> sframe_add_section(), so that the mt_for_each() could get confused?
I'll take a closer look. But let me fix the obvious bugs first.
-- Steve
>
> > }
> >
> > call_srcu(&sframe_srcu, &sec->rcu, sframe_free_srcu);
> > @@ -936,3 +940,56 @@ void sframe_free_mm(struct mm_struct *mm)
> >
> > mtree_destroy(&mm->sframe_mt);
> > }
^ permalink raw reply
* [PATCH v11 2/6] x86/asm: Avoid emitting DWARF CFI for non-VDSO
From: Jens Remus @ 2026-05-22 11:04 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, x86, Steven Rostedt,
Josh Poimboeuf, Indu Bhagat, Peter Zijlstra, Dylan Hatch,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
H. Peter Anvin, Mathieu Desnoyers, Kees Cook, Sam James,
Andy Lutomirski
Cc: Jens Remus, bpf, linux-mm, Namhyung Kim, Andrii Nakryiko,
Jose E. Marchesi, Beau Belgrave, Florian Weimer,
Carlos O'Donell, Masami Hiramatsu, Jiri Olsa,
Arnaldo Carvalho de Melo, Andrew Morton, David Hildenbrand,
Lorenzo Stoakes, Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
Suren Baghdasaryan, Michal Hocko, Heiko Carstens, Vasily Gorbik,
Ilya Leoshkevich, Steven Rostedt (Google)
In-Reply-To: <20260522110427.2816637-1-jremus@linux.ibm.com>
From: Josh Poimboeuf <jpoimboe@kernel.org>
It was decided years ago that .cfi_* annotations aren't maintainable in
the kernel. They were replaced by objtool unwind hints. For the kernel
proper, ensure the CFI_* macros don't do anything.
On the other hand the VDSO library *does* use them, so user space can
unwind through it.
Make sure these macros only work for VDSO. They aren't actually being
used outside of VDSO anyway, so there's no functional change.
[ Jens Remus: Define CFI_SIGNAL_FRAME for !BUILD_VDSO. ]
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Acked-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Signed-off-by: Jens Remus <jremus@linux.ibm.com>
---
arch/x86/include/asm/dwarf2.h | 52 ++++++++++++++++++++++++-----------
1 file changed, 36 insertions(+), 16 deletions(-)
diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h
index 09c9684d3ad6..13e2e64ef265 100644
--- a/arch/x86/include/asm/dwarf2.h
+++ b/arch/x86/include/asm/dwarf2.h
@@ -6,6 +6,15 @@
#warning "asm/dwarf2.h should be only included in pure assembly files"
#endif
+#ifdef BUILD_VDSO
+
+ /*
+ * For the vDSO, emit both runtime unwind information and debug
+ * symbols for the .dbg file.
+ */
+
+ .cfi_sections .eh_frame, .debug_frame
+
#define CFI_STARTPROC .cfi_startproc
#define CFI_ENDPROC .cfi_endproc
#define CFI_DEF_CFA .cfi_def_cfa
@@ -22,21 +31,32 @@
#define CFI_ESCAPE .cfi_escape
#define CFI_SIGNAL_FRAME .cfi_signal_frame
-#ifndef BUILD_VDSO
- /*
- * Emit CFI data in .debug_frame sections, not .eh_frame sections.
- * The latter we currently just discard since we don't do DWARF
- * unwinding at runtime. So only the offline DWARF information is
- * useful to anyone. Note we should not use this directive if we
- * ever decide to enable DWARF unwinding at runtime.
- */
- .cfi_sections .debug_frame
-#else
- /*
- * For the vDSO, emit both runtime unwind information and debug
- * symbols for the .dbg file.
- */
- .cfi_sections .eh_frame, .debug_frame
-#endif
+#else /* !BUILD_VDSO */
+
+/*
+ * On x86, these macros aren't used outside VDSO. As well they shouldn't be:
+ * they're fragile and very difficult to maintain.
+ */
+
+.macro nocfi args:vararg
+.endm
+
+#define CFI_STARTPROC nocfi
+#define CFI_ENDPROC nocfi
+#define CFI_DEF_CFA nocfi
+#define CFI_DEF_CFA_REGISTER nocfi
+#define CFI_DEF_CFA_OFFSET nocfi
+#define CFI_ADJUST_CFA_OFFSET nocfi
+#define CFI_OFFSET nocfi
+#define CFI_REL_OFFSET nocfi
+#define CFI_REGISTER nocfi
+#define CFI_RESTORE nocfi
+#define CFI_REMEMBER_STATE nocfi
+#define CFI_RESTORE_STATE nocfi
+#define CFI_UNDEFINED nocfi
+#define CFI_ESCAPE nocfi
+#define CFI_SIGNAL_FRAME nocfi
+
+#endif /* !BUILD_VDSO */
#endif /* _ASM_X86_DWARF2_H */
--
2.51.0
^ permalink raw reply related
* [PATCH v11 6/6] x86/vdso: Enable sframe generation in VDSO
From: Jens Remus @ 2026-05-22 11:04 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, x86, Steven Rostedt,
Josh Poimboeuf, Indu Bhagat, Peter Zijlstra, Dylan Hatch,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
H. Peter Anvin, Mathieu Desnoyers, Kees Cook, Sam James,
Andy Lutomirski
Cc: Jens Remus, bpf, linux-mm, Namhyung Kim, Andrii Nakryiko,
Jose E. Marchesi, Beau Belgrave, Florian Weimer,
Carlos O'Donell, Masami Hiramatsu, Jiri Olsa,
Arnaldo Carvalho de Melo, Andrew Morton, David Hildenbrand,
Lorenzo Stoakes, Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
Suren Baghdasaryan, Michal Hocko, Heiko Carstens, Vasily Gorbik,
Ilya Leoshkevich, Steven Rostedt (Google)
In-Reply-To: <20260522110427.2816637-1-jremus@linux.ibm.com>
From: Josh Poimboeuf <jpoimboe@kernel.org>
Enable .sframe generation in the VDSO library so kernel and user space
can unwind through it.
Starting with binutils 2.46 both GNU assembler and GNU linker
exclusively support generating and merging .sframe in SFrame V3 format.
On x86, SFrame is only supported for x86-64, not for x86-32 or x32.
Test whether the assembler supports option '--gsframe-3' to explicitly
select SFrame V3 format. Note that testing using Kconfig macro
'as-option' is not sufficient, as GNU assembler will accept the option
for any target, regardless of whether it is actually capable of generating
.sframe for it, as long as the input does not trigger the generation.
Therefore it is necessary to use Kconfig macro 'as-instr' to provide
minimal CFI directives that trigger generation of .sframe.
For x86-64 VDSO, only if supported by the assembler, generate .sframe,
collect it, mark it as KEEP, and generate a GNU_SFRAME program table
entry.
For x86-32 and x32 VDSOs, given SFrame is not supported, do not generate
any .sframe nor GNU_SFRAME program table entry. Instead explicitly
discard any .sframe. The latter is required for x32 VDSO, as it is
built from x86-64 VDSO objects (potentially with .sframe) converted to
x32. In this regard discarding .sframe also prevents potential
issues with linkers, such as GNU linker prior to binutils 2.46 commit
7487c98ff07a ("x32: Allow R_X86_64_PC64 for SFrame V3"), that do not
support R_X86_64_PC64 relocations in x32, like those found in .sframe
in SFrame V3 format.
[ Jens Remus: Add support for SFrame V3. Prevent a GNU_SFRAME program
table entry for an empty .sframe section. Reword commit message. ]
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Jens Remus <jremus@linux.ibm.com>
---
Notes (jremus):
Changes in v9:
- Always define KEEP_SFRAME to either true/false in specific VDSO linker
scripts and use #if instead of #ifdef in common one. (Peter)
- Reword commit message to provide more details.
arch/Kconfig | 7 +++++++
arch/x86/entry/vdso/common/vdso-layout.lds.S | 15 +++++++++++++++
arch/x86/entry/vdso/vdso32/vdso32.lds.S | 3 +++
arch/x86/entry/vdso/vdso64/Makefile | 1 +
arch/x86/entry/vdso/vdso64/vdso64.lds.S | 2 ++
arch/x86/entry/vdso/vdso64/vdsox32.lds.S | 6 ++++++
6 files changed, 34 insertions(+)
diff --git a/arch/Kconfig b/arch/Kconfig
index e86880045158..79aef9b67645 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -479,6 +479,13 @@ config HAVE_HARDLOCKUP_DETECTOR_ARCH
It uses the same command line parameters, and sysctl interface,
as the generic hardlockup detectors.
+config AS_SFRAME
+ bool
+
+config AS_SFRAME3
+ def_bool $(as-instr,.cfi_startproc\n.cfi_endproc,-Wa$(comma)--gsframe-3)
+ select AS_SFRAME
+
config UNWIND_USER
bool
diff --git a/arch/x86/entry/vdso/common/vdso-layout.lds.S b/arch/x86/entry/vdso/common/vdso-layout.lds.S
index 856b8b9d278c..c486b07b195a 100644
--- a/arch/x86/entry/vdso/common/vdso-layout.lds.S
+++ b/arch/x86/entry/vdso/common/vdso-layout.lds.S
@@ -60,6 +60,13 @@ SECTIONS
*(.eh_frame.*)
} :text
+#if KEEP_SFRAME
+ .sframe : {
+ KEEP (*(.sframe))
+ *(.sframe.*)
+ } :text :sframe
+#endif
+
/*
* Text is well-separated from actual data: there's plenty of
* stuff that isn't used at runtime in between.
@@ -80,6 +87,10 @@ SECTIONS
*(.discard)
*(.discard.*)
*(__bug_table)
+#if !KEEP_SFRAME
+ *(.sframe)
+ *(.sframe.*)
+#endif
}
}
@@ -89,6 +100,7 @@ SECTIONS
#define PT_GNU_EH_FRAME 0x6474e550
#define PT_GNU_STACK 0x6474e551
#define PT_GNU_PROPERTY 0x6474e553
+#define PT_GNU_SFRAME 0x6474e554
/*
* We must supply the ELF program headers explicitly to get just one
@@ -104,6 +116,9 @@ PHDRS
dynamic PT_DYNAMIC PF_R;
note PT_NOTE PF_R;
eh_frame_hdr PT_GNU_EH_FRAME PF_R;
+#if KEEP_SFRAME
+ sframe PT_GNU_SFRAME PF_R;
+#endif
gnu_stack PT_GNU_STACK PF_RW;
gnu_property PT_GNU_PROPERTY PF_R;
}
diff --git a/arch/x86/entry/vdso/vdso32/vdso32.lds.S b/arch/x86/entry/vdso/vdso32/vdso32.lds.S
index 55554f80d930..a18b65749ce3 100644
--- a/arch/x86/entry/vdso/vdso32/vdso32.lds.S
+++ b/arch/x86/entry/vdso/vdso32/vdso32.lds.S
@@ -11,6 +11,9 @@
#define BUILD_VDSO32
+/* Discard .sframe if any. SFrame does not support x86-32. */
+#define KEEP_SFRAME 0
+
#include "common/vdso-layout.lds.S"
/* The ELF entry point can be used to set the AT_SYSINFO value. */
diff --git a/arch/x86/entry/vdso/vdso64/Makefile b/arch/x86/entry/vdso/vdso64/Makefile
index bfffaf1aeecc..459f8026531e 100644
--- a/arch/x86/entry/vdso/vdso64/Makefile
+++ b/arch/x86/entry/vdso/vdso64/Makefile
@@ -14,6 +14,7 @@ vobjs-$(CONFIG_X86_SGX) += vsgx.o
# Compilation flags
flags-y := -DBUILD_VDSO64 -m64 -mcmodel=small
+flags-$(CONFIG_AS_SFRAME3) += -Wa,--gsframe-3
# The location of this include matters!
include $(src)/../common/Makefile.include
diff --git a/arch/x86/entry/vdso/vdso64/vdso64.lds.S b/arch/x86/entry/vdso/vdso64/vdso64.lds.S
index 5ce3f2b6373a..6685cf385fc1 100644
--- a/arch/x86/entry/vdso/vdso64/vdso64.lds.S
+++ b/arch/x86/entry/vdso/vdso64/vdso64.lds.S
@@ -9,6 +9,8 @@
#define BUILD_VDSO64
+#define KEEP_SFRAME IS_ENABLED(CONFIG_AS_SFRAME)
+
#include "common/vdso-layout.lds.S"
/*
diff --git a/arch/x86/entry/vdso/vdso64/vdsox32.lds.S b/arch/x86/entry/vdso/vdso64/vdsox32.lds.S
index 3dbd20c8dacc..5270fd0bdd0f 100644
--- a/arch/x86/entry/vdso/vdso64/vdsox32.lds.S
+++ b/arch/x86/entry/vdso/vdso64/vdsox32.lds.S
@@ -9,6 +9,12 @@
#define BUILD_VDSOX32
+/*
+ * Discard .sframe from x86-64 compiles. SFrame does not support x32 and
+ * it contains R_X86_64_PC64 relocations, which linkers may not expect.
+ */
+#define KEEP_SFRAME 0
+
#include "common/vdso-layout.lds.S"
/*
--
2.51.0
^ permalink raw reply related
* [PATCH v11 4/6] x86/vdso: Use SYM_FUNC_{START,END} in __kernel_vsyscall()
From: Jens Remus @ 2026-05-22 11:04 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, x86, Steven Rostedt,
Josh Poimboeuf, Indu Bhagat, Peter Zijlstra, Dylan Hatch,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
H. Peter Anvin, Mathieu Desnoyers, Kees Cook, Sam James,
Andy Lutomirski
Cc: Jens Remus, bpf, linux-mm, Namhyung Kim, Andrii Nakryiko,
Jose E. Marchesi, Beau Belgrave, Florian Weimer,
Carlos O'Donell, Masami Hiramatsu, Jiri Olsa,
Arnaldo Carvalho de Melo, Andrew Morton, David Hildenbrand,
Lorenzo Stoakes, Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
Suren Baghdasaryan, Michal Hocko, Heiko Carstens, Vasily Gorbik,
Ilya Leoshkevich, Steven Rostedt (Google)
In-Reply-To: <20260522110427.2816637-1-jremus@linux.ibm.com>
From: Josh Poimboeuf <jpoimboe@kernel.org>
Use SYM_FUNC_{START,END} instead of all the boilerplate. No functional
change.
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Acked-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Signed-off-by: Jens Remus <jremus@linux.ibm.com>
---
arch/x86/entry/vdso/vdso32/system_call.S | 10 ++--------
1 file changed, 2 insertions(+), 8 deletions(-)
diff --git a/arch/x86/entry/vdso/vdso32/system_call.S b/arch/x86/entry/vdso/vdso32/system_call.S
index 9157cf9c5749..a90f4f7de396 100644
--- a/arch/x86/entry/vdso/vdso32/system_call.S
+++ b/arch/x86/entry/vdso/vdso32/system_call.S
@@ -9,11 +9,7 @@
#include <asm/alternative.h>
.text
- .globl __kernel_vsyscall
- .type __kernel_vsyscall,@function
- ALIGN
-__kernel_vsyscall:
- CFI_STARTPROC
+SYM_FUNC_START(__kernel_vsyscall)
/*
* If using int $0x80, there is no reason to muck about with the
@@ -85,7 +81,5 @@ SYM_INNER_LABEL(int80_landing_pad, SYM_L_GLOBAL)
CFI_RESTORE ecx
CFI_ADJUST_CFA_OFFSET -4
RET
- CFI_ENDPROC
-
- .size __kernel_vsyscall,.-__kernel_vsyscall
+SYM_FUNC_END(__kernel_vsyscall)
.previous
--
2.51.0
^ permalink raw reply related
* [PATCH v11 3/6] x86/asm: Use CFI_* macros in SYM_FUNC_* macros so they can be added to VDSO
From: Jens Remus @ 2026-05-22 11:04 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, x86, Steven Rostedt,
Josh Poimboeuf, Indu Bhagat, Peter Zijlstra, Dylan Hatch,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
H. Peter Anvin, Mathieu Desnoyers, Kees Cook, Sam James,
Andy Lutomirski
Cc: Jens Remus, bpf, linux-mm, Namhyung Kim, Andrii Nakryiko,
Jose E. Marchesi, Beau Belgrave, Florian Weimer,
Carlos O'Donell, Masami Hiramatsu, Jiri Olsa,
Arnaldo Carvalho de Melo, Andrew Morton, David Hildenbrand,
Lorenzo Stoakes, Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
Suren Baghdasaryan, Michal Hocko, Heiko Carstens, Vasily Gorbik,
Ilya Leoshkevich, Steven Rostedt (Google)
In-Reply-To: <20260522110427.2816637-1-jremus@linux.ibm.com>
From: Josh Poimboeuf <jpoimboe@kernel.org>
Add CFI_STARTPROC and CFI_ENDPROC annotations to the SYM_FUNC_* macros
so the VDSO asm functions don't need to add them manually. Note this
only affects VDSO, the CFI_* macros are empty for the kernel proper.
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Acked-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Signed-off-by: Jens Remus <jremus@linux.ibm.com>
---
arch/x86/entry/vdso/common/vdso-layout.lds.S | 2 +-
.../x86/entry/vdso/vdso64/vgetrandom-chacha.S | 2 --
arch/x86/entry/vdso/vdso64/vsgx.S | 4 ---
arch/x86/include/asm/linkage.h | 33 +++++++++++++++----
arch/x86/include/asm/vdso.h | 1 -
5 files changed, 28 insertions(+), 14 deletions(-)
diff --git a/arch/x86/entry/vdso/common/vdso-layout.lds.S b/arch/x86/entry/vdso/common/vdso-layout.lds.S
index a1e30be3e83d..856b8b9d278c 100644
--- a/arch/x86/entry/vdso/common/vdso-layout.lds.S
+++ b/arch/x86/entry/vdso/common/vdso-layout.lds.S
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#include <asm/vdso.h>
+#include <asm/page_types.h>
#include <asm/vdso/vsyscall.h>
#include <vdso/datapage.h>
diff --git a/arch/x86/entry/vdso/vdso64/vgetrandom-chacha.S b/arch/x86/entry/vdso/vdso64/vgetrandom-chacha.S
index cc82da9216fb..a33212594731 100644
--- a/arch/x86/entry/vdso/vdso64/vgetrandom-chacha.S
+++ b/arch/x86/entry/vdso/vdso64/vgetrandom-chacha.S
@@ -22,7 +22,6 @@ CONSTANTS: .octa 0x6b20657479622d323320646e61707865
* rcx: number of 64-byte blocks to write to output
*/
SYM_FUNC_START(__arch_chacha20_blocks_nostack)
- CFI_STARTPROC
.set output, %rdi
.set key, %rsi
.set counter, %rdx
@@ -175,5 +174,4 @@ SYM_FUNC_START(__arch_chacha20_blocks_nostack)
pxor temp,temp
ret
- CFI_ENDPROC
SYM_FUNC_END(__arch_chacha20_blocks_nostack)
diff --git a/arch/x86/entry/vdso/vdso64/vsgx.S b/arch/x86/entry/vdso/vdso64/vsgx.S
index 37a3d4c02366..c0342238c976 100644
--- a/arch/x86/entry/vdso/vdso64/vsgx.S
+++ b/arch/x86/entry/vdso/vdso64/vsgx.S
@@ -24,8 +24,6 @@
.section .text, "ax"
SYM_FUNC_START(__vdso_sgx_enter_enclave)
- /* Prolog */
- .cfi_startproc
push %rbp
.cfi_adjust_cfa_offset 8
.cfi_rel_offset %rbp, 0
@@ -143,8 +141,6 @@ SYM_FUNC_START(__vdso_sgx_enter_enclave)
jle .Lout
jmp .Lenter_enclave
- .cfi_endproc
-
_ASM_VDSO_EXTABLE_HANDLE(.Lenclu_eenter_eresume, .Lhandle_exception)
SYM_FUNC_END(__vdso_sgx_enter_enclave)
diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h
index a7294656ad90..c2ca8117376f 100644
--- a/arch/x86/include/asm/linkage.h
+++ b/arch/x86/include/asm/linkage.h
@@ -40,6 +40,10 @@
#ifdef __ASSEMBLER__
+#ifndef LINKER_SCRIPT
+#include <asm/dwarf2.h>
+#endif
+
#if defined(CONFIG_MITIGATION_RETHUNK) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO)
#define RET jmp __x86_return_thunk
#else /* CONFIG_MITIGATION_RETPOLINE */
@@ -112,34 +116,51 @@
# define SYM_FUNC_ALIAS_MEMFUNC SYM_FUNC_ALIAS
#endif
+#define __SYM_FUNC_START \
+ CFI_STARTPROC ASM_NL
+
+#define __SYM_FUNC_END \
+ CFI_ENDPROC ASM_NL
+
/* SYM_TYPED_FUNC_START -- use for indirectly called globals, w/ CFI type */
#define SYM_TYPED_FUNC_START(name) \
SYM_TYPED_START(name, SYM_L_GLOBAL, SYM_F_ALIGN) \
+ __SYM_FUNC_START \
ENDBR
/* SYM_FUNC_START -- use for global functions */
#define SYM_FUNC_START(name) \
- SYM_START(name, SYM_L_GLOBAL, SYM_F_ALIGN)
+ SYM_START(name, SYM_L_GLOBAL, SYM_F_ALIGN) \
+ __SYM_FUNC_START
/* SYM_FUNC_START_NOALIGN -- use for global functions, w/o alignment */
#define SYM_FUNC_START_NOALIGN(name) \
- SYM_START(name, SYM_L_GLOBAL, SYM_A_NONE)
+ SYM_START(name, SYM_L_GLOBAL, SYM_A_NONE) \
+ __SYM_FUNC_START
/* SYM_FUNC_START_LOCAL -- use for local functions */
#define SYM_FUNC_START_LOCAL(name) \
- SYM_START(name, SYM_L_LOCAL, SYM_F_ALIGN)
+ SYM_START(name, SYM_L_LOCAL, SYM_F_ALIGN) \
+ __SYM_FUNC_START
/* SYM_FUNC_START_LOCAL_NOALIGN -- use for local functions, w/o alignment */
#define SYM_FUNC_START_LOCAL_NOALIGN(name) \
- SYM_START(name, SYM_L_LOCAL, SYM_A_NONE)
+ SYM_START(name, SYM_L_LOCAL, SYM_A_NONE) \
+ __SYM_FUNC_START
/* SYM_FUNC_START_WEAK -- use for weak functions */
#define SYM_FUNC_START_WEAK(name) \
- SYM_START(name, SYM_L_WEAK, SYM_F_ALIGN)
+ SYM_START(name, SYM_L_WEAK, SYM_F_ALIGN) \
+ __SYM_FUNC_START
/* SYM_FUNC_START_WEAK_NOALIGN -- use for weak functions, w/o alignment */
#define SYM_FUNC_START_WEAK_NOALIGN(name) \
- SYM_START(name, SYM_L_WEAK, SYM_A_NONE)
+ SYM_START(name, SYM_L_WEAK, SYM_A_NONE) \
+ __SYM_FUNC_START
+
+#define SYM_FUNC_END(name) \
+ __SYM_FUNC_END \
+ SYM_END(name, SYM_T_FUNC)
/*
* Expose 'sym' to the startup code in arch/x86/boot/startup/, by emitting an
diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
index f2d49212ae90..bbe270483e3e 100644
--- a/arch/x86/include/asm/vdso.h
+++ b/arch/x86/include/asm/vdso.h
@@ -2,7 +2,6 @@
#ifndef _ASM_X86_VDSO_H
#define _ASM_X86_VDSO_H
-#include <asm/page_types.h>
#include <linux/linkage.h>
#include <linux/init.h>
--
2.51.0
^ permalink raw reply related
* [PATCH v11 5/6] x86/vdso: Use CFI macros in __vdso_sgx_enter_enclave()
From: Jens Remus @ 2026-05-22 11:04 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, x86, Steven Rostedt,
Josh Poimboeuf, Indu Bhagat, Peter Zijlstra, Dylan Hatch,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
H. Peter Anvin, Mathieu Desnoyers, Kees Cook, Sam James,
Andy Lutomirski
Cc: Jens Remus, bpf, linux-mm, Namhyung Kim, Andrii Nakryiko,
Jose E. Marchesi, Beau Belgrave, Florian Weimer,
Carlos O'Donell, Masami Hiramatsu, Jiri Olsa,
Arnaldo Carvalho de Melo, Andrew Morton, David Hildenbrand,
Lorenzo Stoakes, Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
Suren Baghdasaryan, Michal Hocko, Heiko Carstens, Vasily Gorbik,
Ilya Leoshkevich, Steven Rostedt (Google)
In-Reply-To: <20260522110427.2816637-1-jremus@linux.ibm.com>
From: Josh Poimboeuf <jpoimboe@kernel.org>
Use the CFI macros instead of the raw .cfi_* directives to be consistent
with the rest of the VDSO asm. It's also easier on the eyes.
No functional changes.
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Acked-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Signed-off-by: Jens Remus <jremus@linux.ibm.com>
---
arch/x86/entry/vdso/vdso64/vsgx.S | 14 ++++++--------
1 file changed, 6 insertions(+), 8 deletions(-)
diff --git a/arch/x86/entry/vdso/vdso64/vsgx.S b/arch/x86/entry/vdso/vdso64/vsgx.S
index c0342238c976..76efbeb1e287 100644
--- a/arch/x86/entry/vdso/vdso64/vsgx.S
+++ b/arch/x86/entry/vdso/vdso64/vsgx.S
@@ -25,12 +25,12 @@
SYM_FUNC_START(__vdso_sgx_enter_enclave)
push %rbp
- .cfi_adjust_cfa_offset 8
- .cfi_rel_offset %rbp, 0
+ CFI_ADJUST_CFA_OFFSET 8
+ CFI_REL_OFFSET %rbp, 0
mov %rsp, %rbp
- .cfi_def_cfa_register %rbp
+ CFI_DEF_CFA_REGISTER %rbp
push %rbx
- .cfi_rel_offset %rbx, -8
+ CFI_REL_OFFSET %rbx, -8
mov %ecx, %eax
.Lenter_enclave:
@@ -77,13 +77,11 @@ SYM_FUNC_START(__vdso_sgx_enter_enclave)
.Lout:
pop %rbx
leave
- .cfi_def_cfa %rsp, 8
+ CFI_DEF_CFA %rsp, 8
RET
- /* The out-of-line code runs with the pre-leave stack frame. */
- .cfi_def_cfa %rbp, 16
-
.Linvalid_input:
+ CFI_DEF_CFA %rbp, 16
mov $(-EINVAL), %eax
jmp .Lout
--
2.51.0
^ permalink raw reply related
* [PATCH v11 1/6] x86/vdso: Fix DWARF generation for getrandom()
From: Jens Remus @ 2026-05-22 11:04 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, x86, Steven Rostedt,
Josh Poimboeuf, Indu Bhagat, Peter Zijlstra, Dylan Hatch,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
H. Peter Anvin, Mathieu Desnoyers, Kees Cook, Sam James,
Andy Lutomirski
Cc: Jens Remus, bpf, linux-mm, Namhyung Kim, Andrii Nakryiko,
Jose E. Marchesi, Beau Belgrave, Florian Weimer,
Carlos O'Donell, Masami Hiramatsu, Jiri Olsa,
Arnaldo Carvalho de Melo, Andrew Morton, David Hildenbrand,
Lorenzo Stoakes, Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
Suren Baghdasaryan, Michal Hocko, Heiko Carstens, Vasily Gorbik,
Ilya Leoshkevich, Steven Rostedt (Google)
In-Reply-To: <20260522110427.2816637-1-jremus@linux.ibm.com>
From: Josh Poimboeuf <jpoimboe@kernel.org>
Add CFI annotations to the VDSO implementation of getrandom() so it will
have valid DWARF unwinding metadata.
Fixes: 33385150ac45 ("x86: vdso: Wire up getrandom() vDSO implementation")
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Acked-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Signed-off-by: Jens Remus <jremus@linux.ibm.com>
---
arch/x86/entry/vdso/vdso64/vgetrandom-chacha.S | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/arch/x86/entry/vdso/vdso64/vgetrandom-chacha.S b/arch/x86/entry/vdso/vdso64/vgetrandom-chacha.S
index bcba5639b8ee..cc82da9216fb 100644
--- a/arch/x86/entry/vdso/vdso64/vgetrandom-chacha.S
+++ b/arch/x86/entry/vdso/vdso64/vgetrandom-chacha.S
@@ -4,7 +4,7 @@
*/
#include <linux/linkage.h>
-#include <asm/frame.h>
+#include <asm/dwarf2.h>
.section .rodata, "a"
.align 16
@@ -22,7 +22,7 @@ CONSTANTS: .octa 0x6b20657479622d323320646e61707865
* rcx: number of 64-byte blocks to write to output
*/
SYM_FUNC_START(__arch_chacha20_blocks_nostack)
-
+ CFI_STARTPROC
.set output, %rdi
.set key, %rsi
.set counter, %rdx
@@ -175,4 +175,5 @@ SYM_FUNC_START(__arch_chacha20_blocks_nostack)
pxor temp,temp
ret
+ CFI_ENDPROC
SYM_FUNC_END(__arch_chacha20_blocks_nostack)
--
2.51.0
^ permalink raw reply related
* [PATCH v11 0/6] x86/vdso: VDSO updates and fixes for sframes
From: Jens Remus @ 2026-05-22 11:04 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, x86, Steven Rostedt,
Josh Poimboeuf, Indu Bhagat, Peter Zijlstra, Dylan Hatch,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
H. Peter Anvin, Mathieu Desnoyers, Kees Cook, Sam James,
Andy Lutomirski
Cc: Jens Remus, bpf, linux-mm, Namhyung Kim, Andrii Nakryiko,
Jose E. Marchesi, Beau Belgrave, Florian Weimer,
Carlos O'Donell, Masami Hiramatsu, Jiri Olsa,
Arnaldo Carvalho de Melo, Andrew Morton, David Hildenbrand,
Lorenzo Stoakes, Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
Suren Baghdasaryan, Michal Hocko, Heiko Carstens, Vasily Gorbik,
Ilya Leoshkevich
This enables generation of SFrame V3 stack trace information for VDSO on
x86-64. It's a continuation of Josh's and Steve's work:
https://lore.kernel.org/all/cover.1737511963.git.jpoimboe@kernel.org/
https://lore.kernel.org/all/20250425023750.669174660@goodmis.org/
This series focuses only on the VDSO code. The patches are helpful fixes
and updates that don't rely on sframes (although the last patch
is sframe related).
This series applies on top of tip:master (b07a332d9cbb):
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git master
Like the unwind user sframe series [1] it depends on the binutils 2.46
release to be used to build the VDSO with SFrame V3 stack trace
information (using the assembler option --gsframe-3).
[1]: [PATCH v14 00/19] unwind_deferred: Implement sframe handling,
https://lore.kernel.org/all/20260505121718.3572346-1-jremus@linux.ibm.com/
Changes in v11:
- Rebased on tip:master (b07a332d9cbb).
- This time with correct version in cover letter.
Changes in v10:
- Rebased on tip:master (a375022d6383).
Changes in v9 (see individual patch notes):
- Always define KEEP_SFRAME to either true/false in specific VDSO linker
scripts and use #if instead of #ifdef in common one. (Peter)
- Reword patch 6 commit message to provide more details.
- Note: Binutils 2.46 with SFrame V3 support has been released.
Changes in v8:
- Discard .sframe for x32 and x86-32 VDSOs. (Josh/Indu)
- Define CFI_SIGNAL_FRAME for !BUILD_VDSO.
- Drop .cfi_sections .sframe in dwarf2.h in favor of the explicitly
specified more specific assembler option --gsframe-3.
- Incorporate missing changes and review feedback from Steven's v6
(I erroneously based my v6 on Steven's v5):
- Reword patch 3 commit subject to Steven's v6 one.
- Remove SYM_F_ALIGN in __vdso_sgx_enter_enclave(). (Josh)
Changes in v7:
- Rebase on H. Peter Anvin's vDSO changes on tip:x86/entry. (Peter)
- Simplify adding assembler option -Wa,--gsframe-3. Add for vdso64
only.
- Align to .eh_frame and mark .sframe as KEEP in vDSO linker script.
Note that GNU linker 2.46 will mark .sframe as KEEP in its default
linker script as well.
Changes in v6:
- SFrame V3 support (SFrame V2 is not supported).
- Prevent a GNU_SFRAME program table entry for an empty .sframe section.
- Integrate v5 review feedback. (Josh)
Regards,
Jens
Josh Poimboeuf (6):
x86/vdso: Fix DWARF generation for getrandom()
x86/asm: Avoid emitting DWARF CFI for non-VDSO
x86/asm: Use CFI_* macros in SYM_FUNC_* macros so they can be added to
VDSO
x86/vdso: Use SYM_FUNC_{START,END} in __kernel_vsyscall()
x86/vdso: Use CFI macros in __vdso_sgx_enter_enclave()
x86/vdso: Enable sframe generation in VDSO
arch/Kconfig | 7 +++
arch/x86/entry/vdso/common/vdso-layout.lds.S | 17 +++++-
arch/x86/entry/vdso/vdso32/system_call.S | 10 +---
arch/x86/entry/vdso/vdso32/vdso32.lds.S | 3 ++
arch/x86/entry/vdso/vdso64/Makefile | 1 +
arch/x86/entry/vdso/vdso64/vdso64.lds.S | 2 +
arch/x86/entry/vdso/vdso64/vdsox32.lds.S | 6 +++
.../x86/entry/vdso/vdso64/vgetrandom-chacha.S | 3 +-
arch/x86/entry/vdso/vdso64/vsgx.S | 18 +++----
arch/x86/include/asm/dwarf2.h | 52 +++++++++++++------
arch/x86/include/asm/linkage.h | 33 +++++++++---
arch/x86/include/asm/vdso.h | 1 -
12 files changed, 107 insertions(+), 46 deletions(-)
--
2.51.0
^ permalink raw reply
* [PATCH v2 3/3] trace: add documentation, selftest and tooling for stackmap
From: Li Pengfei @ 2026-05-22 10:40 UTC (permalink / raw)
To: linux-trace-kernel
Cc: rostedt, mhiramat, linux-kernel, cmllamas, zhangbo56, lipengfei28,
lkp
In-Reply-To: <20260522104017.1668638-1-lipengfei28@xiaomi.com>
From: Pengfei Li <lipengfei28@xiaomi.com>
Add supporting files for the ftrace stackmap feature:
Documentation/trace/ftrace-stackmap.rst:
Documentation covering design, usage, tracefs interface, binary
format, and performance characteristics. Added to the 'Core Tracing
Frameworks' toctree in Documentation/trace/index.rst. Documents:
- Reset requires tracing to be stopped first
- Boot-time activation via trace_options=stackmap
- bits parameter range [10, 18] and worst-case memory usage
- tracefs file modes (0640 / 0440)
- Best-effort snapshot semantics for stack_map_bin
- Counter naming: successes (events served), drops, success_rate
tools/testing/selftests/ftrace/test.d/ftrace/stackmap-basic.tc:
Functional selftest verifying:
- stackmap tracefs nodes exist
- enabling stackmap + stacktrace produces stack_id events
- stack_map_stat shows non-zero successes and zero drops
- reset clears entries when tracing is stopped
- reset is rejected (-EBUSY) while tracing is active
Uses an EXIT trap to restore options/stackmap and options/stacktrace
on any exit path.
tools/tracing/stackmap_dump.py:
Python script to parse the binary stack_map_bin export.
Features:
- Automatic endianness detection via magic number
- Batched addr2line via stdin (avoids ARG_MAX with large stacks)
- JSON output mode
- Top-N filtering by ref_count
Binary format: all fields are native-endian. The parser detects
byte order by reading the magic value (0x464D5342 = 'FSMB').
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202605160010.fakzGVVq-lkp@intel.com/
Signed-off-by: Pengfei Li <lipengfei28@xiaomi.com>
---
Documentation/trace/ftrace-stackmap.rst | 145 +++++++++++++++++
Documentation/trace/index.rst | 1 +
.../ftrace/test.d/ftrace/stackmap-basic.tc | 100 ++++++++++++
tools/tracing/stackmap_dump.py | 150 ++++++++++++++++++
4 files changed, 396 insertions(+)
create mode 100644 Documentation/trace/ftrace-stackmap.rst
create mode 100755 tools/testing/selftests/ftrace/test.d/ftrace/stackmap-basic.tc
create mode 100755 tools/tracing/stackmap_dump.py
diff --git a/Documentation/trace/ftrace-stackmap.rst b/Documentation/trace/ftrace-stackmap.rst
new file mode 100644
index 000000000000..1230d44d1d23
--- /dev/null
+++ b/Documentation/trace/ftrace-stackmap.rst
@@ -0,0 +1,145 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+======================
+Ftrace Stack Map
+======================
+
+:Author: Pengfei Li <lipengfei28@xiaomi.com>
+
+Overview
+========
+
+The ftrace stack map provides stack trace deduplication for the ftrace
+ring buffer. When enabled, instead of storing full kernel stack traces
+(typically 80-160 bytes each) in the ring buffer for every event, ftrace
+stores only a 4-byte ``stack_id``. The full stacks are maintained in a
+separate hash table and exported via tracefs for userspace to resolve.
+
+This is inspired by eBPF's ``BPF_MAP_TYPE_STACK_TRACE`` but integrated
+into ftrace's infrastructure, requiring no userspace daemon.
+
+Configuration
+=============
+
+Enable ``CONFIG_FTRACE_STACKMAP=y`` in the kernel config.
+
+Kernel command line parameters:
+
+- ``ftrace_stackmap.bits=N`` - Set map capacity to 2^N unique stacks
+ (default: 14 → 16384 stacks; valid range: 10-18).
+
+ At ``bits=18`` the kernel reserves roughly 130 MB of vmalloc memory
+ for the element pool. Each ``open()`` of ``stack_map_bin`` may
+ briefly allocate a similar amount for a snapshot. The cap is set
+ intentionally to bound memory usage.
+
+Usage
+=====
+
+Enable stack deduplication::
+
+ echo 1 > /sys/kernel/debug/tracing/options/stackmap
+ echo 1 > /sys/kernel/debug/tracing/options/stacktrace
+ echo function > /sys/kernel/debug/tracing/current_tracer
+
+The trace output will show ``<stack_id N>`` instead of full stack traces::
+
+ sh-1234 [006] d.h.. 123.456789: <stack_id 42>
+
+To view the actual stacks::
+
+ cat /sys/kernel/debug/tracing/stack_map
+
+Output format::
+
+ stack_id 42 [ref 1337, depth 8]
+ [0] schedule+0x48/0xc0
+ [1] schedule_timeout+0x1c/0x30
+ ...
+
+To view statistics::
+
+ cat /sys/kernel/debug/tracing/stack_map_stat
+
+Output::
+
+ entries: 2500 / 16384
+ table_size: 32768
+ successes: 148923
+ drops: 0
+ success_rate: 100%
+
+To reset the stack map (tracing must be stopped first)::
+
+ echo 0 > /sys/kernel/debug/tracing/tracing_on
+ echo 0 > /sys/kernel/debug/tracing/stack_map
+
+Reset returns ``-EBUSY`` if tracing is currently active, or if another
+reset is already in progress.
+
+Boot-time activation
+====================
+
+The stackmap option can be enabled from the kernel command line::
+
+ trace_options=stackmap,stacktrace
+
+Trace events that fire before the tracefs filesystem is initialized
+(``fs_initcall`` time) fall back to recording full stack traces; once
+``ftrace_stackmap_create()`` runs, subsequent events are deduplicated.
+The crossover is automatic and lossless — no events are dropped, but
+early-boot stacks recorded before the crossover are not deduplicated.
+
+Tracefs Nodes
+=============
+
+The stack_map files are owned by root and not world-readable
+(``stack_map``: 0640; ``stack_map_stat`` and ``stack_map_bin``: 0440).
+
+``stack_map``
+ Text export of all deduplicated stacks with symbol resolution.
+ Writing ``0`` or ``reset`` clears all entries (only when tracing
+ is stopped).
+
+``stack_map_stat``
+ Statistics: entry count, table size, successes, drops, and success rate.
+
+``stack_map_bin``
+ Binary export for efficient userspace consumption. Format:
+
+ - Header (16 bytes): magic(u32) + version(u32) + nr_stacks(u32) + reserved(u32)
+ - Per stack: stack_id(u32) + nr(u32) + ref_count(u32) + reserved(u32) + ips(u64 × nr)
+
+ All fields are written in the kernel's native byte order.
+ Userspace tools detect endianness by reading the magic value.
+ Magic: ``0x464D5342`` ('FSMB'), Version: 2.
+
+ The export is a best-effort snapshot allocated at ``open()``;
+ concurrent inserts during the snapshot may be truncated. A
+ bounds check ensures no overflow.
+
+Design
+======
+
+The stack map is modeled after ``tracing_map.c`` (used by hist triggers),
+using a lock-free design based on Dr. Cliff Click's non-blocking hash table
+algorithm:
+
+- **Lookup/Insert**: Lock-free via ``cmpxchg``, safe in NMI/IRQ/any context
+- **Memory**: Pre-allocated element pool, zero allocation on the hot path
+ (no GFP_ATOMIC failures under memory pressure)
+- **Collision**: Linear probing with a 2x over-provisioned table; probe
+ length is bounded so worst-case insert/lookup is O(1)
+- **Scope**: Currently supports the global trace instance
+- **Hash**: 32-bit jhash with a per-instance random seed; full ``memcmp``
+ confirms matches
+
+Performance
+===========
+
+Typical results on ARM64 Android device (function tracer, 2 seconds):
+
+- Unique stacks: ~3000
+- Hit rate: 84-98% (depends on workload diversity)
+- Ring buffer savings: ~80% for stack data
+- Overhead per event: ~50ns (one jhash + hash table lookup)
diff --git a/Documentation/trace/index.rst b/Documentation/trace/index.rst
index 5d9bf4694d5d..ac8b1141c23a 100644
--- a/Documentation/trace/index.rst
+++ b/Documentation/trace/index.rst
@@ -33,6 +33,7 @@ the Linux kernel.
ftrace
ftrace-design
ftrace-uses
+ ftrace-stackmap
kprobes
kprobetrace
fprobetrace
diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/stackmap-basic.tc b/tools/testing/selftests/ftrace/test.d/ftrace/stackmap-basic.tc
new file mode 100755
index 000000000000..34e4e31ff7a1
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/ftrace/stackmap-basic.tc
@@ -0,0 +1,100 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: ftrace - stackmap basic functionality
+# requires: stack_map options/stackmap
+
+# Test that ftrace stackmap deduplication works:
+# 1. Enable stackmap + stacktrace options
+# 2. Run function tracer briefly
+# 3. Verify stack_map has entries
+# 4. Verify stack_map_stat shows successes and zero drops
+# 5. Verify trace contains <stack_id> events
+# 6. Verify reset works when tracing is stopped
+# 7. Verify reset is rejected (-EBUSY) while tracing is active
+
+fail() {
+ echo "FAIL: $1"
+ exit_fail
+}
+
+# Restore state on any exit (success, fail, or interrupt) so a
+# half-finished test does not leave stacktrace/stackmap enabled.
+cleanup() {
+ disable_tracing 2>/dev/null
+ echo nop > current_tracer 2>/dev/null
+ echo 0 > options/stackmap 2>/dev/null
+ echo 0 > options/stacktrace 2>/dev/null
+}
+trap cleanup EXIT
+
+disable_tracing
+clear_trace
+
+# Verify stackmap files exist
+test -f stack_map || fail "stack_map file missing"
+test -f stack_map_stat || fail "stack_map_stat file missing"
+test -f stack_map_bin || fail "stack_map_bin file missing"
+
+# Enable stackmap dedup
+echo 1 > options/stackmap
+echo 1 > options/stacktrace
+
+# Run function tracer briefly
+echo function > current_tracer
+enable_tracing
+sleep 1
+disable_tracing
+
+# Check trace has stack_id events.  This must happen before switching
+# current_tracer back to nop: initializing a tracer resets the ring
+# buffer, so the recorded <stack_id> events would no longer be there.
+count=$(grep -c "stack_id" trace || true)
+: "${count:=0}"
+if [ "$count" -eq 0 ]; then
+	fail "trace has no <stack_id> events"
+fi
+
+# The ring buffer has been inspected; now it is safe to reset the tracer.
+echo nop > current_tracer
+echo 0 > options/stackmap
+
+# Check stack_map_stat has entries (default empty to avoid [: too many args)
+entries=$(cat stack_map_stat | grep "^entries:" | awk '{print $2}')
+: "${entries:=0}"
+if [ "$entries" -eq 0 ]; then
+	fail "stackmap has zero entries after tracing"
+fi
+
+# Check successes > 0
+successes=$(cat stack_map_stat | grep "^successes:" | awk '{print $2}')
+: "${successes:=0}"
+if [ "$successes" -eq 0 ]; then
+	fail "stackmap has zero successes"
+fi
+
+# Check drops == 0 (pool should be large enough for 1s trace)
+drops=$(cat stack_map_stat | grep "^drops:" | awk '{print $2}')
+: "${drops:=0}"
+if [ "$drops" -ne 0 ]; then
+	fail "stackmap had $drops drops (pool exhausted?)"
+fi
+
+# Check stack_map text output is parseable
+first_id=$(cat stack_map | grep "^stack_id" | head -1 | awk '{print $2}')
+if [ -z "$first_id" ]; then
+	fail "stack_map output has no stack_id entries"
+fi
+
+# Test reset (tracing must be stopped — disable_tracing was called above)
+echo 0 > stack_map
+entries_after=$(cat stack_map_stat | grep "^entries:" | awk '{print $2}')
+: "${entries_after:=-1}"
+if [ "$entries_after" -ne 0 ]; then
+ fail "stackmap reset did not clear entries (got $entries_after)"
+fi
+
+# Test that reset is rejected while tracing is active
+enable_tracing
+if echo 0 > stack_map 2>/dev/null; then
+ disable_tracing
+ fail "stackmap reset should fail while tracing is active"
+fi
+disable_tracing
+
+echo "stackmap basic test passed: $entries unique stacks, $successes successes, $drops drops"
+exit 0
diff --git a/tools/tracing/stackmap_dump.py b/tools/tracing/stackmap_dump.py
new file mode 100755
index 000000000000..fc5d0c9cf0af
--- /dev/null
+++ b/tools/tracing/stackmap_dump.py
@@ -0,0 +1,150 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+"""
+stackmap_dump.py - Parse and display ftrace stack_map_bin binary export.
+
+Usage:
+ # Pull from device and parse
+ adb pull /sys/kernel/debug/tracing/stack_map_bin /tmp/stack_map.bin
+ python3 stackmap_dump.py /tmp/stack_map.bin
+
+ # With vmlinux for offline symbol resolution
+ python3 stackmap_dump.py /tmp/stack_map.bin --vmlinux vmlinux
+
+ # JSON output for tooling
+ python3 stackmap_dump.py /tmp/stack_map.bin --json
+"""
+
+import struct
+import sys
+import argparse
+import json
+import subprocess
+
+MAGIC = 0x464D5342 # 'FSMB'
+HEADER_SIZE = 16 # 4 x u32
+ENTRY_SIZE = 16 # 4 x u32
+
+
+def detect_endianness(data):
+ """Detect byte order from magic number in header."""
+ if len(data) < 4:
+ raise ValueError("File too small")
+ magic_le = struct.unpack_from('<I', data, 0)[0]
+ if magic_le == MAGIC:
+ return '<'
+ magic_be = struct.unpack_from('>I', data, 0)[0]
+ if magic_be == MAGIC:
+ return '>'
+ raise ValueError(f"Bad magic: 0x{magic_le:08x} (neither LE nor BE)")
+
+
+def batch_addr2line(vmlinux, addrs):
+    """Resolve multiple addresses in one addr2line invocation.
+
+    Args:
+        vmlinux: path to the vmlinux image passed to ``addr2line -e``.
+        addrs: list of integer addresses to resolve.
+
+    Returns:
+        dict mapping each resolved address to its function name.
+        Addresses reported as ``??`` are omitted.  If addr2line cannot
+        be run (missing, not executable, timeout) a warning is printed
+        to stderr and an empty dict is returned.
+    """
+    if not addrs:
+        return {}
+    try:
+        # Feed addresses on stdin to avoid ARG_MAX limits with large
+        # numbers of addresses (one stack can have 30+ frames; a
+        # snapshot can have thousands of unique stacks).
+        stdin = '\n'.join(hex(a) for a in addrs) + '\n'
+        result = subprocess.run(
+            ['addr2line', '-f', '-e', vmlinux],
+            input=stdin, capture_output=True, text=True, timeout=60
+        )
+    except (subprocess.TimeoutExpired, OSError) as e:
+        # OSError covers FileNotFoundError as well as PermissionError
+        # and other exec failures, which previously escaped as a traceback.
+        print(f"warning: addr2line failed: {e}", file=sys.stderr)
+        return {}
+
+    if result.returncode != 0:
+        # Do not fail silently (e.g. unreadable vmlinux); still parse
+        # whatever was printed so partial results are kept.
+        print(f"warning: addr2line exited with status {result.returncode}: "
+              f"{result.stderr.strip()}", file=sys.stderr)
+
+    lines = result.stdout.split('\n')
+    # addr2line outputs 2 lines per address: function name + source location
+    symbols = {}
+    for i, addr in enumerate(addrs):
+        idx = i * 2
+        if idx < len(lines) and lines[idx] and lines[idx] != '??':
+            symbols[addr] = lines[idx]
+    return symbols
+
+
+def parse_stackmap_bin(data):
+ """Parse binary stackmap data, yield (stack_id, ref_count, [ips])."""
+ if len(data) < HEADER_SIZE:
+ raise ValueError("File too small for header")
+
+ endian = detect_endianness(data)
+ header_fmt = f'{endian}IIII'
+ entry_fmt = f'{endian}IIII'
+
+ magic, version, nr_stacks, _ = struct.unpack_from(header_fmt, data, 0)
+ if version not in (1, 2):
+ raise ValueError(f"Unsupported version: {version}")
+
+ offset = HEADER_SIZE
+ for _ in range(nr_stacks):
+ if offset + ENTRY_SIZE > len(data):
+ break
+ stack_id, nr, ref_count, _ = struct.unpack_from(entry_fmt, data, offset)
+ offset += ENTRY_SIZE
+
+ ips_size = nr * 8
+ if offset + ips_size > len(data):
+ break
+ ips = struct.unpack_from(f'{endian}{nr}Q', data, offset)
+ offset += ips_size
+
+ yield stack_id, ref_count, list(ips)
+
+
+def main():
+ parser = argparse.ArgumentParser(description='Parse ftrace stack_map_bin')
+ parser.add_argument('file', help='Path to stack_map_bin file')
+ parser.add_argument('--vmlinux', help='Path to vmlinux for symbol resolution')
+ parser.add_argument('--json', action='store_true', help='JSON output')
+ parser.add_argument('--top', type=int, default=0,
+ help='Show only top N stacks by ref_count')
+ args = parser.parse_args()
+
+ with open(args.file, 'rb') as f:
+ data = f.read()
+
+ stacks = list(parse_stackmap_bin(data))
+
+ if args.top > 0:
+ stacks.sort(key=lambda x: x[1], reverse=True)
+ stacks = stacks[:args.top]
+
+ # Batch symbol resolution
+ symbols = {}
+ if args.vmlinux:
+ all_addrs = set()
+ for _, _, ips in stacks:
+ all_addrs.update(ips)
+ symbols = batch_addr2line(args.vmlinux, list(all_addrs))
+
+ if args.json:
+ output = []
+ for stack_id, ref_count, ips in stacks:
+ entry = {
+ 'stack_id': stack_id,
+ 'ref_count': ref_count,
+ 'ips': [f'0x{ip:x}' for ip in ips]
+ }
+ if args.vmlinux:
+ entry['symbols'] = [symbols.get(ip, f'0x{ip:x}')
+ for ip in ips]
+ output.append(entry)
+ print(json.dumps(output, indent=2))
+ else:
+ for stack_id, ref_count, ips in stacks:
+ print(f"stack_id {stack_id} [ref {ref_count}, depth {len(ips)}]")
+ for i, ip in enumerate(ips):
+ sym = symbols.get(ip, '')
+ if sym:
+ sym = f' {sym}'
+ print(f" [{i}] 0x{ip:x}{sym}")
+ print()
+
+ print(f"Total: {len(stacks)} unique stacks", file=sys.stderr)
+
+
+if __name__ == '__main__':
+ main()
--
2.34.1
^ permalink raw reply related
* [PATCH v2 2/3] trace: integrate stackmap into ftrace stack recording path
From: Li Pengfei @ 2026-05-22 10:40 UTC (permalink / raw)
To: linux-trace-kernel
Cc: rostedt, mhiramat, linux-kernel, cmllamas, zhangbo56, lipengfei28,
lkp
In-Reply-To: <20260522104017.1668638-1-lipengfei28@xiaomi.com>
From: Pengfei Li <lipengfei28@xiaomi.com>
Add TRACE_STACK_ID event type and integrate ftrace_stackmap into
__ftrace_trace_stack(). When the 'stackmap' trace option is enabled,
the stack recording path stores a 4-byte stack_id in the ring buffer
instead of the full stack trace.
Changes:
- New TRACE_STACK_ID in trace_type enum
- New stack_id_entry in trace_entries.h
- New TRACE_ITER(STACKMAP) trace option flag; when CONFIG_FTRACE_STACKMAP
is disabled, TRACE_ITER_STACKMAP_BIT is defined as -1 so that
TRACE_ITER(STACKMAP) evaluates to 0 (following the existing pattern
used by TRACE_ITER_PROF_TEXT_OFFSET)
- Modified __ftrace_trace_stack() to call ftrace_stackmap_get_id()
when the stackmap option is active
- Stackmap pointer read with smp_load_acquire(), published with
smp_store_release() to ensure proper initialization ordering
- NULL check on tr->stackmap prevents dereference if creation failed
or if used on a secondary trace instance (graceful fallback)
- ftrace_stackmap_create() takes the owning trace_array so the
stackmap can later check tracing state during reset
- Added stack_id print handler in trace_output.c
Fallback behavior: if stackmap returns an error (pool exhausted,
resetting, or NULL pointer), the full stack trace is recorded as
before — no new failure modes introduced.
Note: stackmap is currently initialized only for the global trace
instance. Secondary instances fall back to full stack recording.
Usage:
echo 1 > /sys/kernel/debug/tracing/options/stackmap
echo 1 > /sys/kernel/debug/tracing/options/stacktrace
Signed-off-by: Pengfei Li <lipengfei28@xiaomi.com>
---
kernel/trace/trace.c | 66 ++++++++++++++++++++++++++++++++++++
kernel/trace/trace.h | 16 +++++++++
kernel/trace/trace_entries.h | 15 ++++++++
kernel/trace/trace_output.c | 23 +++++++++++++
4 files changed, 120 insertions(+)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 6eb4d3097a4d..49a675dffad5 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -57,6 +57,7 @@
#include "trace.h"
#include "trace_output.h"
+#include "trace_stackmap.h"
#ifdef CONFIG_FTRACE_STARTUP_TEST
/*
@@ -2184,6 +2185,43 @@ void __ftrace_trace_stack(struct trace_array *tr,
}
#endif
+#ifdef CONFIG_FTRACE_STACKMAP
+ /*
+ * If stackmap dedup is enabled, try to store only the stack_id
+ * in the ring buffer instead of the full stack trace.
+ */
+ if (tr->trace_flags & TRACE_ITER(STACKMAP)) {
+ struct ftrace_stackmap *smap;
+ struct stack_id_entry *sid_entry;
+ int sid;
+
+ smap = smp_load_acquire(&tr->stackmap);
+ if (!smap)
+ goto full_stack;
+
+ sid = ftrace_stackmap_get_id(smap, fstack->calls, nr_entries);
+ if (sid >= 0) {
+ event = __trace_buffer_lock_reserve(buffer,
+ TRACE_STACK_ID,
+ sizeof(*sid_entry), trace_ctx);
+ if (!event)
+ goto out;
+ sid_entry = ring_buffer_event_data(event);
+ sid_entry->stack_id = sid;
+ /*
+ * stack_id is a synthetic side-event attached to a
+ * primary trace event that was already subject to
+ * filtering. No per-event filter is defined for
+ * TRACE_STACK_ID, so commit unconditionally.
+ */
+ __buffer_unlock_commit(buffer, event);
+ goto out;
+ }
+ /* Fall through to full stack on stackmap failure */
+ }
+full_stack:
+#endif
+
event = __trace_buffer_lock_reserve(buffer, TRACE_STACK,
struct_size(entry, caller, nr_entries),
trace_ctx);
@@ -9222,6 +9260,34 @@ static __init void tracer_init_tracefs_work_func(struct work_struct *work)
NULL, &tracing_dyn_info_fops);
#endif
+#ifdef CONFIG_FTRACE_STACKMAP
+ {
+ struct ftrace_stackmap *smap;
+
+ smap = ftrace_stackmap_create(&global_trace);
+ if (!IS_ERR(smap)) {
+ /*
+ * Use smp_store_release to ensure the stackmap
+ * structure is fully initialized before publishing
+ * the pointer to concurrent trace event readers.
+ */
+ smp_store_release(&global_trace.stackmap, smap);
+ trace_create_file("stack_map", TRACE_MODE_WRITE, NULL,
+ smap, &ftrace_stackmap_fops);
+ trace_create_file("stack_map_stat", TRACE_MODE_READ, NULL,
+ smap, &ftrace_stackmap_stat_fops);
+ trace_create_file("stack_map_bin", TRACE_MODE_READ, NULL,
+ smap, &ftrace_stackmap_bin_fops);
+ } else {
+ pr_warn("ftrace stackmap init failed, dedup disabled\n");
+ /*
+ * global_trace is a static object (not kzalloc'ed), so its
+ * stackmap pointer is already NULL; leaving it NULL ensures the
+ * load-acquire in __ftrace_trace_stack falls back to full stack.
+ */
+ }
+ }
+#endif
create_trace_instances(NULL);
update_tracer_options();
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 80fe152af1dd..7e7d5e5a35ff 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -57,6 +57,7 @@ enum trace_type {
TRACE_TIMERLAT,
TRACE_RAW_DATA,
TRACE_FUNC_REPEATS,
+ TRACE_STACK_ID,
__TRACE_LAST_TYPE,
};
@@ -453,6 +454,9 @@ struct trace_array {
struct cond_snapshot *cond_snapshot;
#endif
struct trace_func_repeats __percpu *last_func_repeats;
+#ifdef CONFIG_FTRACE_STACKMAP
+ struct ftrace_stackmap *stackmap;
+#endif
/*
* On boot up, the ring buffer is set to the minimum size, so that
* we do not waste memory on systems that are not using tracing.
@@ -579,6 +583,8 @@ extern void __ftrace_bad_type(void);
TRACE_GRAPH_RET); \
IF_ASSIGN(var, ent, struct func_repeats_entry, \
TRACE_FUNC_REPEATS); \
+ IF_ASSIGN(var, ent, struct stack_id_entry, \
+ TRACE_STACK_ID); \
__ftrace_bad_type(); \
} while (0)
@@ -1449,7 +1455,16 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
# define STACK_FLAGS
#endif
+#ifdef CONFIG_FTRACE_STACKMAP
+# define STACKMAP_FLAGS \
+ C(STACKMAP, "stackmap"),
+#else
+# define STACKMAP_FLAGS
+# define TRACE_ITER_STACKMAP_BIT -1
+#endif
+
#ifdef CONFIG_FUNCTION_PROFILER
+
# define PROFILER_FLAGS \
C(PROF_TEXT_OFFSET, "prof-text-offset"),
# ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -1506,6 +1521,7 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
FUNCTION_FLAGS \
FGRAPH_FLAGS \
STACK_FLAGS \
+ STACKMAP_FLAGS \
BRANCH_FLAGS \
PROFILER_FLAGS \
FPROFILE_FLAGS
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 54417468fdeb..89ed14b7e5fd 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -250,6 +250,21 @@ FTRACE_ENTRY(user_stack, userstack_entry,
(void *)__entry->caller[6], (void *)__entry->caller[7])
);
+/*
+ * Stack ID entry - stores only a stack_id referencing the stackmap.
+ * Used when CONFIG_FTRACE_STACKMAP is enabled to deduplicate stacks.
+ */
+FTRACE_ENTRY(stack_id, stack_id_entry,
+
+ TRACE_STACK_ID,
+
+ F_STRUCT(
+ __field( int, stack_id )
+ ),
+
+ F_printk("<stack_id %d>", __entry->stack_id)
+);
+
/*
* trace_printk entry:
*/
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index a5ad76175d10..68678ea88159 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -1517,6 +1517,28 @@ static struct trace_event trace_user_stack_event = {
.funcs = &trace_user_stack_funcs,
};
+/* TRACE_STACK_ID */
+static enum print_line_t trace_stack_id_print(struct trace_iterator *iter,
+ int flags, struct trace_event *event)
+{
+ struct stack_id_entry *field;
+ struct trace_seq *s = &iter->seq;
+
+ trace_assign_type(field, iter->ent);
+ trace_seq_printf(s, "<stack_id %d>\n", field->stack_id);
+
+ return trace_handle_return(s);
+}
+
+static struct trace_event_functions trace_stack_id_funcs = {
+ .trace = trace_stack_id_print,
+};
+
+static struct trace_event trace_stack_id_event = {
+ .type = TRACE_STACK_ID,
+ .funcs = &trace_stack_id_funcs,
+};
+
/* TRACE_HWLAT */
static enum print_line_t
trace_hwlat_print(struct trace_iterator *iter, int flags,
@@ -1908,6 +1930,7 @@ static struct trace_event *events[] __initdata = {
&trace_wake_event,
&trace_stack_event,
&trace_user_stack_event,
+ &trace_stack_id_event,
&trace_bputs_event,
&trace_bprint_event,
&trace_print_event,
--
2.34.1
^ permalink raw reply related
* [PATCH v2 1/3] trace: add lock-free stackmap for stack trace deduplication
From: Li Pengfei @ 2026-05-22 10:40 UTC (permalink / raw)
To: linux-trace-kernel
Cc: rostedt, mhiramat, linux-kernel, cmllamas, zhangbo56, lipengfei28,
lkp
In-Reply-To: <20260522104017.1668638-1-lipengfei28@xiaomi.com>
From: Pengfei Li <lipengfei28@xiaomi.com>
Add a lock-free hash map (ftrace_stackmap) that deduplicates kernel
stack traces for the ftrace ring buffer. Instead of storing full
stack traces (80-160 bytes each) in the ring buffer for every event,
ftrace can store a 4-byte stack_id when the stackmap option is enabled.
The implementation is modeled after tracing_map.c (used by hist
triggers), using the same lock-free design based on Dr. Cliff Click's
non-blocking hash table algorithm:
- Lock-free insert via cmpxchg, safe in NMI/IRQ/any context
- Pre-allocated element pool (zero allocation on hot path)
- Linear probing with 2x over-provisioned table; probe length is
bounded by FTRACE_STACKMAP_MAX_PROBE so worst-case insert/lookup
is O(1) even when the table is heavily loaded with claimed-but-
empty slots from pool exhaustion
- Single global instance (initialized for the global trace array)
The stackmap is exported via three tracefs nodes:
- stack_map: text export with symbol resolution (mode 0640)
- stack_map_stat: counters (entries, successes, drops, success_rate)
- stack_map_bin: binary export (all fields native-endian)
Counter naming:
- 'successes' counts events that were successfully assigned a
stack_id (covers both first-time inserts and dedup hits).
- 'drops' counts events that fell back to recording the full stack
(pool exhausted or probe limit reached). Events rejected while a
reset is in progress also fall back but are not counted.
- 'success_rate' is successes / (successes + drops).
Reset semantics:
- Reset is a control-path operation only allowed when tracing is
stopped on the owning trace_array. Online reset (with tracing
active) is intentionally not supported to keep the proof
obligations small.
- Reset uses atomic_cmpxchg() to claim the resetting flag, then
verifies tracer_tracing_is_on() returns false. The resetting
flag itself blocks subsequent get_id() callers; userspace
re-enabling tracing after our check still cannot let new
insertions through.
- synchronize_rcu() drains in-flight get_id() callers from the
ftrace callback path, which runs preempt-disabled.
- Reset clears the resetting flag with atomic_set_release() so a
subsequent get_id() observes a fully cleared map.
- Concurrent reset returns -EBUSY; reset while tracing is active
returns -EBUSY.
Concurrency notes:
- entry->val publication uses smp_store_release() paired with
smp_load_acquire() in all dereferencing readers (lookup, seq_show,
bin_open). seq_start/seq_next only check val for NULL and use
READ_ONCE().
- elt->nr is read with READ_ONCE() and clamped to MAX_DEPTH before
use in seq_show and bin_open.
- Pool exhaustion: stackmap_get_elt() short-circuits via
atomic_read() before the contended atomic RMW, avoiding cacheline
contention once the pool is full. Slots that win cmpxchg but
cannot get an elt are left 'claimed but empty'; subsequent
lookups treat val==NULL as a miss and probe past them. The
bounded probe length keeps per-event cost O(1).
Hash key:
- Per-instance random seed stored in the stackmap struct (no
global state), seeded at create time.
- 32-bit jhash is forced to 1 if it lands on 0 (which is the
free-slot sentinel). Full memcmp confirms matches.
Memory:
- Single flat vmalloc for the element pool (no per-elt kzalloc).
- bits parameter clamped to [10, 18]: at the maximum bits=18, the
element pool is ~130 MB and a stack_map_bin snapshot may briefly
allocate another ~130 MB.
- struct stackmap_bin_snapshot uses u64 (not size_t) for its size
field so data[] is 8-byte aligned on both 32-bit and 64-bit
architectures, avoiding alignment faults when writing u64 IPs
on strict-alignment architectures.
Kernel command line parameter:
- ftrace_stackmap.bits=N: set map capacity (2^N unique stacks,
range 10-18, default 14)
Signed-off-by: Pengfei Li <lipengfei28@xiaomi.com>
---
kernel/trace/Kconfig | 21 ++
kernel/trace/Makefile | 1 +
kernel/trace/trace_stackmap.c | 643 ++++++++++++++++++++++++++++++++++
kernel/trace/trace_stackmap.h | 56 +++
4 files changed, 721 insertions(+)
create mode 100644 kernel/trace/trace_stackmap.c
create mode 100644 kernel/trace/trace_stackmap.h
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index e130da35808f..2a63fd2c9a96 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -412,6 +412,27 @@ config STACK_TRACER
Say N if unsure.
+config FTRACE_STACKMAP
+ bool "Ftrace stack map deduplication"
+ depends on TRACING
+ depends on STACKTRACE
+ select KALLSYMS
+ help
+ This enables a global stack trace hash table for ftrace, inspired
+ by eBPF's BPF_MAP_TYPE_STACK_TRACE. When enabled, ftrace can store
+ only a stack_id in the ring buffer instead of the full stack trace,
+ significantly reducing trace buffer usage when the same call stacks
+ appear repeatedly.
+
+ The deduplicated stacks are exported via:
+ /sys/kernel/debug/tracing/stack_map
+
+ Writing to this file resets the stack map. Reading shows all unique
+ stacks with their stack_id and reference count.
+
+ Say Y if you want to reduce ftrace buffer usage for stack traces.
+ Say N if unsure.
+
config TRACE_PREEMPT_TOGGLE
bool
help
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 1decdce8cbef..f1b6175099cc 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -85,6 +85,7 @@ obj-$(CONFIG_HWLAT_TRACER) += trace_hwlat.o
obj-$(CONFIG_OSNOISE_TRACER) += trace_osnoise.o
obj-$(CONFIG_NOP_TRACER) += trace_nop.o
obj-$(CONFIG_STACK_TRACER) += trace_stack.o
+obj-$(CONFIG_FTRACE_STACKMAP) += trace_stackmap.o
obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
diff --git a/kernel/trace/trace_stackmap.c b/kernel/trace/trace_stackmap.c
new file mode 100644
index 000000000000..b23a60e9286c
--- /dev/null
+++ b/kernel/trace/trace_stackmap.c
@@ -0,0 +1,643 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Ftrace Stack Map - Lock-free stack trace deduplication for ftrace
+ *
+ * Modeled after tracing_map.c (used by hist triggers), this provides
+ * a lock-free hash map optimized for the ftrace hot path. The design
+ * is based on Dr. Cliff Click's non-blocking hash table algorithm.
+ *
+ * Key properties:
+ * - Lock-free insert via cmpxchg, safe in NMI/IRQ/any context
+ * - Pre-allocated element pool (zero allocation on hot path)
+ * - Linear probing with 2x over-provisioned table; probe length
+ * bounded by FTRACE_STACKMAP_MAX_PROBE to keep worst-case lookup
+ * cost constant even when the table is heavily loaded
+ * - Single global instance (initialized for the global trace array)
+ *
+ * Reset is a control-path operation, only allowed when tracing is
+ * stopped on the owning trace_array. The protocol is:
+ *
+ * - atomic_cmpxchg(&resetting, 0, 1) atomically claims reset rights
+ * and blocks new get_id() callers (they observe resetting=1 and
+ * return -EINVAL).
+ * - tracer_tracing_is_on() is checked AFTER the cmpxchg, so the
+ * resetting flag itself prevents new insertions even if userspace
+ * re-enables tracing immediately after the check.
+ * - synchronize_rcu() drains in-flight get_id() callers from the
+ * ftrace callback path, which runs with preemption disabled.
+ *
+ * Online reset (with tracing active) is intentionally not supported
+ * to keep the design simple and the proof obligations small.
+ *
+ * The 32-bit jhash of the stack IPs is the hash table key. On hash
+ * collision, linear probing finds the next slot and full memcmp
+ * confirms the match.
+ *
+ * Concurrent userspace readers (cat stack_map / stack_map_bin) get
+ * a best-effort snapshot. They are coherent with the hot path
+ * (smp_load_acquire on entry->val), but they are not coherent with
+ * a concurrent reset; since reset requires tracing to be stopped,
+ * mid-iteration reset can produce truncated or partial output but
+ * never crashes.
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/jhash.h>
+#include <linux/seq_file.h>
+#include <linux/kallsyms.h>
+#include <linux/vmalloc.h>
+#include <linux/atomic.h>
+#include <linux/random.h>
+#include <linux/rcupdate.h>
+#include <linux/log2.h>
+
+#include "trace.h"
+#include "trace_stackmap.h"
+
+/*
+ * Bound the linear-probe scan length. With a 2x over-provisioned table,
+ * a well-distributed hash gives very short probe chains. Capping at 64
+ * keeps worst-case lookup O(1) even when the table is heavily loaded
+ * with claimed-but-empty slots from pool exhaustion.
+ */
+#define FTRACE_STACKMAP_MAX_PROBE 64
+
+/*
+ * Each pre-allocated element holds one unique stack trace.
+ * Fixed size: MAX_DEPTH entries regardless of actual depth.
+ */
+struct stackmap_elt {
+ u32 nr; /* actual number of IPs */
+ atomic_t ref_count;
+ unsigned long ips[FTRACE_STACKMAP_MAX_DEPTH];
+};
+
+/*
+ * Hash table entry: a 32-bit key (jhash of stack) + pointer to elt.
+ * key == 0 means the slot is free.
+ */
+struct stackmap_entry {
+ u32 key; /* 0 = free, non-zero = jhash */
+ struct stackmap_elt *val; /* NULL until fully published */
+};
+
+struct ftrace_stackmap {
+ struct trace_array *tr; /* owning trace_array */
+ unsigned int map_bits;
+ unsigned int map_size; /* 1 << (map_bits + 1) */
+ unsigned int max_elts; /* 1 << map_bits */
+ u32 hash_seed; /* per-instance jhash seed */
+ atomic_t next_elt; /* index into elts pool */
+ struct stackmap_entry *entries; /* hash table */
+ struct stackmap_elt *elts; /* flat element pool */
+ atomic_t resetting;
+ atomic64_t successes; /* events served (hits + new inserts) */
+ atomic64_t drops;
+};
+
+/*
+ * Cap the bits parameter to keep worst-case allocations bounded:
+ * bits=18 → 256K elts, 512K slots, ~130 MB elt pool, ~130 MB bin
+ * export.
+ * Smaller workloads should use the default (14) which gives 16K elts
+ * (~8 MB pool); bump bits via the ftrace_stackmap.bits= kernel
+ * parameter for higher unique-stack capacity.
+ */
+#define FTRACE_STACKMAP_BITS_MIN 10
+#define FTRACE_STACKMAP_BITS_MAX 18
+#define FTRACE_STACKMAP_BITS_DEFAULT 14
+
+static unsigned int stackmap_map_bits = FTRACE_STACKMAP_BITS_DEFAULT;
+static int __init stackmap_bits_setup(char *str)
+{
+ unsigned long val;
+
+ if (kstrtoul(str, 0, &val))
+ return -EINVAL;
+ val = clamp_val(val, FTRACE_STACKMAP_BITS_MIN, FTRACE_STACKMAP_BITS_MAX);
+ stackmap_map_bits = val;
+ return 0;
+}
+early_param("ftrace_stackmap.bits", stackmap_bits_setup);
+
+/* --- Element pool --- */
+
+static struct stackmap_elt *stackmap_get_elt(struct ftrace_stackmap *smap)
+{
+ int idx;
+
+ /*
+ * Fast-path early-out once the pool is fully consumed. Avoids
+ * the contended atomic RMW on next_elt for every traced event
+ * after the pool is exhausted.
+ */
+ if (atomic_read(&smap->next_elt) >= smap->max_elts)
+ return NULL;
+
+ idx = atomic_fetch_add_unless(&smap->next_elt, 1, smap->max_elts);
+ if (idx < smap->max_elts)
+ return &smap->elts[idx];
+ return NULL;
+}
+
+/* --- Create / Destroy / Reset --- */
+
+/**
+ * ftrace_stackmap_create - allocate and initialize a stackmap
+ * @tr: trace_array recorded as the owner of the new stackmap
+ *
+ * The table geometry is derived from stackmap_map_bits, clamped to
+ * [FTRACE_STACKMAP_BITS_MIN, FTRACE_STACKMAP_BITS_MAX]: the element
+ * pool holds 1 << bits stacks and the hash table has twice as many
+ * slots.
+ *
+ * Returns the new stackmap, or ERR_PTR(-ENOMEM) if any allocation fails.
+ */
+struct ftrace_stackmap *ftrace_stackmap_create(struct trace_array *tr)
+{
+ struct ftrace_stackmap *smap;
+ unsigned int bits;
+
+ smap = kzalloc(sizeof(*smap), GFP_KERNEL);
+ if (!smap)
+ return ERR_PTR(-ENOMEM);
+
+ /* Defensive clamp: bound bits even if early_param is bypassed. */
+ bits = clamp_val(stackmap_map_bits,
+ FTRACE_STACKMAP_BITS_MIN,
+ FTRACE_STACKMAP_BITS_MAX);
+
+ smap->tr = tr;
+ smap->map_bits = bits;
+ smap->max_elts = 1U << bits;
+ smap->map_size = 1U << (bits + 1); /* 2x over-provision */
+ BUG_ON(!is_power_of_2(smap->map_size));
+
+ smap->entries = vzalloc(sizeof(*smap->entries) * smap->map_size);
+ if (!smap->entries) {
+ kfree(smap);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ /*
+ * Single large vmalloc of the element pool, indexed flat.
+ * At bits=16 this is 64K * sizeof(struct stackmap_elt). The
+ * struct is ~520 B (4 + 4 + 64*8 on 64-bit), so total ~33 MB.
+ */
+ smap->elts = vzalloc(sizeof(*smap->elts) * (size_t)smap->max_elts);
+ if (!smap->elts) {
+ vfree(smap->entries);
+ kfree(smap);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ smap->hash_seed = get_random_u32();
+ atomic_set(&smap->next_elt, 0);
+ atomic_set(&smap->resetting, 0);
+ atomic64_set(&smap->successes, 0);
+ atomic64_set(&smap->drops, 0);
+
+ return smap;
+}
+
+void ftrace_stackmap_destroy(struct ftrace_stackmap *smap)
+{
+ if (!smap || IS_ERR(smap))
+ return;
+ vfree(smap->elts);
+ vfree(smap->entries);
+ kfree(smap);
+}
+
+/**
+ * ftrace_stackmap_reset - clear all entries in the stackmap
+ * @smap: the stackmap to reset
+ *
+ * Returns 0 on success, -EBUSY if another reset is already in
+ * progress, or if tracing is currently active on the owning
+ * trace_array.
+ *
+ * Online reset (with tracing active) is not supported. Caller must
+ * stop tracing first (echo 0 > tracing_on).
+ *
+ * Caller is process context (typically sysfs write handler).
+ *
+ * Protocol:
+ * 1. Atomically claim reset rights via cmpxchg on @resetting.
+ * 2. Verify tracing is stopped on @smap->tr; if not, release the
+ * claim and return -EBUSY. The resetting flag itself blocks
+ * any subsequent get_id() callers.
+ * 3. synchronize_rcu() drains in-flight get_id() callers from the
+ * ftrace callback path (which runs preempt-disabled).
+ * 4. memset entries, elts, and counters.
+ * 5. Release the resetting flag with release semantics so any new
+ * get_id() observes a fully cleared map.
+ */
+int ftrace_stackmap_reset(struct ftrace_stackmap *smap)
+{
+ if (!smap)
+ return 0;
+
+ if (atomic_cmpxchg(&smap->resetting, 0, 1) != 0)
+ return -EBUSY;
+
+ if (smap->tr && tracer_tracing_is_on(smap->tr)) {
+ atomic_set(&smap->resetting, 0);
+ return -EBUSY;
+ }
+
+ /*
+ * synchronize_rcu() itself is a full barrier; no extra smp_mb()
+ * is needed before it. It drains in-flight ftrace callbacks that
+ * may have already passed the resetting check with the old value.
+ */
+ synchronize_rcu();
+
+ memset(smap->entries, 0, sizeof(*smap->entries) * smap->map_size);
+ memset(smap->elts, 0, sizeof(*smap->elts) * (size_t)smap->max_elts);
+
+ atomic_set(&smap->next_elt, 0);
+ atomic64_set(&smap->successes, 0);
+ atomic64_set(&smap->drops, 0);
+
+ /* Release resetting=0 so new get_id() observes a cleared map. */
+ atomic_set_release(&smap->resetting, 0);
+ return 0;
+}
+
+/* --- Core: get_id (lock-free, NMI-safe) --- */
+
+/**
+ * ftrace_stackmap_get_id - look up or insert a stack and return its id
+ * @smap: the stackmap (may be NULL)
+ * @ips: array of addresses making up the stack
+ * @nr_entries: number of entries in @ips; clamped to
+ *              FTRACE_STACKMAP_MAX_DEPTH
+ *
+ * Takes no locks; relies only on atomics, cmpxchg() and
+ * acquire/release ordering on entry->val.
+ *
+ * Returns the hash-table slot index used as the stack id (>= 0) on
+ * success, -EINVAL if @smap is NULL, @nr_entries is 0 or a reset is in
+ * progress, or -ENOSPC if the element pool is exhausted or no usable
+ * slot was found within FTRACE_STACKMAP_MAX_PROBE probes.
+ */
+int ftrace_stackmap_get_id(struct ftrace_stackmap *smap,
+			   unsigned long *ips, unsigned int nr_entries)
+{
+	u32 key_hash, idx, test_key, trace_len;
+	struct stackmap_entry *entry;
+	struct stackmap_elt *val;
+	int probes = 0;
+
+	if (!smap || !nr_entries || atomic_read(&smap->resetting))
+		return -EINVAL;
+	if (nr_entries > FTRACE_STACKMAP_MAX_DEPTH)
+		nr_entries = FTRACE_STACKMAP_MAX_DEPTH;
+
+	trace_len = nr_entries * sizeof(unsigned long);
+	/*
+	 * jhash2() requires the length in u32 units and the data to be
+	 * u32-aligned. trace_len is a multiple of sizeof(unsigned long),
+	 * which is itself a multiple of 4, and ips[] is naturally
+	 * aligned to sizeof(unsigned long) >= 4, so the cast to u32* is
+	 * safe.
+	 */
+	key_hash = jhash2((const u32 *)ips, trace_len / sizeof(u32),
+			  smap->hash_seed);
+	if (key_hash == 0)
+		key_hash = 1;	/* 0 means free slot */
+
+	idx = key_hash >> (32 - (smap->map_bits + 1));
+
+	while (probes < FTRACE_STACKMAP_MAX_PROBE) {
+		idx &= (smap->map_size - 1);
+		entry = &smap->entries[idx];
+		/*
+		 * Other CPUs claim keys concurrently with cmpxchg();
+		 * READ_ONCE() marks this intentionally racy load.
+		 */
+		test_key = READ_ONCE(entry->key);
+
+		if (test_key == key_hash) {
+			/*
+			 * smp_load_acquire pairs with smp_store_release in
+			 * the publisher below; ensures we see fully-formed
+			 * elt fields (nr, ips, ref_count) before dereference.
+			 */
+			val = smp_load_acquire(&entry->val);
+			if (val && val->nr == nr_entries &&
+			    memcmp(val->ips, ips, trace_len) == 0) {
+				atomic_inc(&val->ref_count);
+				atomic64_inc(&smap->successes);
+				return (int)idx;
+			}
+			/*
+			 * val == NULL: another CPU is mid-insert, or this
+			 * slot is "claimed but empty" (pool exhausted).
+			 * val != NULL but mismatch: 32-bit hash collision
+			 * with a different stack. In both cases, advance.
+			 */
+		} else if (!test_key) {
+			/*
+			 * Free slot. If the pool is already exhausted, bail
+			 * out before claiming it: a claimed slot can never
+			 * be released, so claiming here would permanently
+			 * burn one slot per dropped event.
+			 */
+			if (atomic_read(&smap->next_elt) >= smap->max_elts) {
+				atomic64_inc(&smap->drops);
+				return -ENOSPC;
+			}
+
+			/* Try to claim it */
+			if (cmpxchg(&entry->key, 0, key_hash) == 0) {
+				struct stackmap_elt *elt;
+
+				elt = stackmap_get_elt(smap);
+				if (!elt) {
+					/*
+					 * Pool became exhausted after the
+					 * check above. We claimed this slot
+					 * with cmpxchg but cannot fill it.
+					 * Leave key set so the slot stays
+					 * "claimed but empty" — future
+					 * lookups treat val==NULL as a miss
+					 * and probe past it. Cannot revert
+					 * key=0 without racing other CPUs.
+					 */
+					atomic64_inc(&smap->drops);
+					return -ENOSPC;
+				}
+
+				elt->nr = nr_entries;
+				atomic_set(&elt->ref_count, 1);
+				memcpy(elt->ips, ips, trace_len);
+
+				/*
+				 * Publish elt with release semantics so the
+				 * reader's smp_load_acquire can safely
+				 * dereference val->nr / val->ips.
+				 */
+				smp_store_release(&entry->val, elt);
+				atomic64_inc(&smap->successes);
+				return (int)idx;
+			}
+			/* cmpxchg failed; another CPU claimed this slot. */
+		}
+
+		idx++;
+		probes++;
+	}
+
+	/* Probe limit reached without a match or a free slot. */
+	atomic64_inc(&smap->drops);
+	return -ENOSPC;
+}
+
+/* --- Text export: /sys/kernel/debug/tracing/stack_map --- */
+
+/*
+ * Per-open private data of the text stack_map file. smap is the map being
+ * listed; stackmap_open() fills it in from inode->i_private.
+ */
+struct stackmap_seq_private {
+ struct ftrace_stackmap *smap;
+};
+
+/*
+ * seq_file ->start: return the first populated table slot at or after index
+ * *pos and store its index back into *pos. Returns NULL when there is no
+ * map or no populated slot remains.
+ */
+static void *stackmap_seq_start(struct seq_file *m, loff_t *pos)
+{
+	struct stackmap_seq_private *priv = m->private;
+	struct ftrace_stackmap *smap = priv->smap;
+	struct stackmap_entry *slot;
+	u32 i;
+
+	if (!smap)
+		return NULL;
+
+	for (i = *pos; i < smap->map_size; i++) {
+		slot = &smap->entries[i];
+		/* Skip free slots and claimed slots with no element attached. */
+		if (!slot->key || !READ_ONCE(slot->val))
+			continue;
+		*pos = i;
+		return slot;
+	}
+	return NULL;
+}
+
+/*
+ * seq_file ->next: advance to the next populated table slot after index *pos.
+ *
+ * *pos is moved forward on every call, including the final one that returns
+ * NULL: seq_file treats a ->next that leaves the position untouched as buggy
+ * (it logs a "did not update position index" message and bumps the index
+ * itself).
+ *
+ * Returns the slot found, or NULL once the end of the table is reached.
+ */
+static void *stackmap_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct stackmap_seq_private *priv = m->private;
+	struct ftrace_stackmap *smap = priv->smap;
+	u32 i;
+
+	if (!smap) {
+		++*pos;
+		return NULL;
+	}
+
+	for (i = *pos + 1; i < smap->map_size; i++) {
+		if (smap->entries[i].key && READ_ONCE(smap->entries[i].val))
+			break;
+	}
+
+	/* Either the index of the slot found or one past the last slot. */
+	*pos = i;
+	if (i >= smap->map_size)
+		return NULL;
+	return &smap->entries[i];
+}
+
+/* seq_file ->stop: the start/next callbacks acquire nothing, so nothing to undo. */
+static void stackmap_seq_stop(struct seq_file *m, void *v)
+{
+}
+
+/*
+ * seq_file ->show: print one stack as a header line (ID, reference count,
+ * depth) followed by one symbolized line per frame and a blank separator.
+ * A slot whose element is no longer attached prints nothing.
+ */
+static int stackmap_seq_show(struct seq_file *m, void *v)
+{
+	struct stackmap_seq_private *priv = m->private;
+	struct stackmap_entry *slot = v;
+	struct stackmap_elt *stack;
+	u32 id, depth, frame;
+
+	/* Acquire so the element's fields are read after the pointer. */
+	stack = smp_load_acquire(&slot->val);
+	if (!stack)
+		return 0;
+
+	/* The stack ID is the slot's index in the table. */
+	id = slot - priv->smap->entries;
+
+	/* Never walk past the ips[] limit, whatever nr currently holds. */
+	depth = READ_ONCE(stack->nr);
+	if (depth > FTRACE_STACKMAP_MAX_DEPTH)
+		depth = FTRACE_STACKMAP_MAX_DEPTH;
+
+	seq_printf(m, "stack_id %u [ref %u, depth %u]\n",
+		   id, atomic_read(&stack->ref_count), depth);
+	for (frame = 0; frame < depth; frame++)
+		seq_printf(m, " [%u] %pS\n", frame, (void *)stack->ips[frame]);
+	seq_putc(m, '\n');
+	return 0;
+}
+
+/* Iterator callbacks used by the text stack_map file. */
+static const struct seq_operations stackmap_seq_ops = {
+	.start	= stackmap_seq_start,
+	.next	= stackmap_seq_next,
+	.stop	= stackmap_seq_stop,
+	.show	= stackmap_seq_show,
+};
+
+/*
+ * Open the text stack_map file: set up a seq_file with private iterator
+ * state and point that state at the map stored in inode->i_private.
+ * Returns 0 or the error from seq_open_private().
+ */
+static int stackmap_open(struct inode *inode, struct file *file)
+{
+	struct stackmap_seq_private *priv;
+	struct seq_file *seq;
+	int err;
+
+	err = seq_open_private(file, &stackmap_seq_ops, sizeof(*priv));
+	if (err)
+		return err;
+
+	/* On success the seq_file hangs off file->private_data. */
+	seq = file->private_data;
+	priv = seq->private;
+	priv->smap = inode->i_private;
+	return 0;
+}
+
+/*
+ * Return true when the n bytes at buf spell exactly "0" or "reset". A single
+ * trailing newline, if present, is dropped before the comparison.
+ */
+static bool stackmap_write_is_reset(const char *buf, size_t n)
+{
+	size_t len = n;
+
+	if (len && buf[len - 1] == '\n')
+		len--;
+
+	switch (len) {
+	case 1:
+		return buf[0] == '0';
+	case 5:
+		return !memcmp(buf, "reset", 5);
+	default:
+		return false;
+	}
+}
+
+/*
+ * Write handler for the text stack_map file. The only accepted commands are
+ * "0" and "reset" (see stackmap_write_is_reset()); both reset the map.
+ *
+ * At most sizeof(cmd) - 1 bytes of the user buffer are examined, so longer
+ * writes can never match a command and are rejected.
+ *
+ * Returns count on success, -EINVAL for an empty or unrecognized write,
+ * -EFAULT if the user buffer cannot be read, or the error returned by
+ * ftrace_stackmap_reset().
+ */
+static ssize_t stackmap_write(struct file *file, const char __user *ubuf,
+			      size_t count, loff_t *ppos)
+{
+	struct seq_file *m = file->private_data;
+	struct stackmap_seq_private *priv = m->private;
+	char cmd[8];
+	size_t len = min(count, sizeof(cmd) - 1);
+	int err;
+
+	if (!len)
+		return -EINVAL;
+	if (copy_from_user(cmd, ubuf, len))
+		return -EFAULT;
+	cmd[len] = '\0';
+
+	if (!stackmap_write_is_reset(cmd, len))
+		return -EINVAL;
+
+	/*
+	 * The reset routine decides by itself whether a reset may run now;
+	 * whatever error it reports is handed back to the writer unchanged.
+	 */
+	err = ftrace_stackmap_reset(priv->smap);
+	if (err)
+		return err;
+	return count;
+}
+
+/*
+ * File operations of the text stack_map file: reads are served through
+ * seq_file, writes go to stackmap_write().
+ */
+const struct file_operations ftrace_stackmap_fops = {
+	.open		= stackmap_open,
+	.release	= seq_release_private,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.write		= stackmap_write,
+};
+
+/* --- Stats --- */
+
+/*
+ * Print the stack map counters: element pool usage, table size, number of
+ * successful and dropped get_id calls and, once at least one call has been
+ * counted, the success percentage. Prints a placeholder line when no map is
+ * attached. Always returns 0.
+ */
+static int stackmap_stat_show(struct seq_file *m, void *v)
+{
+	struct ftrace_stackmap *smap = m->private;
+	u64 successes, drops, total;
+	u32 entries;
+
+	if (!smap) {
+		seq_puts(m, "stackmap not initialized\n");
+		return 0;
+	}
+
+	/* Sample each counter once so the printed lines agree with each other. */
+	entries = atomic_read(&smap->next_elt);
+	successes = atomic64_read(&smap->successes);
+	drops = atomic64_read(&smap->drops);
+	total = successes + drops;
+
+	seq_printf(m, "entries: %u / %u\n", entries, smap->max_elts);
+	seq_printf(m, "table_size: %u\n", smap->map_size);
+	seq_printf(m, "successes: %llu\n", successes);
+	seq_printf(m, "drops: %llu\n", drops);
+	/*
+	 * A u64 by u64 division must use div64_u64() (<linux/math64.h>): an
+	 * open-coded '/' makes 32-bit builds call a libgcc helper that the
+	 * kernel does not provide, breaking the link.
+	 */
+	if (total > 0)
+		seq_printf(m, "success_rate: %llu%%\n",
+			   div64_u64(successes * 100, total));
+	return 0;
+}
+
+/* Open the statistics file as a single-record seq_file over the inode's map. */
+static int stackmap_stat_open(struct inode *inode, struct file *file)
+{
+	struct ftrace_stackmap *smap = inode->i_private;
+
+	return single_open(file, stackmap_stat_show, smap);
+}
+
+/* File operations of the read-only statistics file. */
+const struct file_operations ftrace_stackmap_stat_fops = {
+	.open		= stackmap_stat_open,
+	.release	= single_release,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+};
+
+/* --- Binary export --- */
+
+/*
+ * Serialized copy of the map, built once by stackmap_bin_open(), served by
+ * stackmap_bin_read() and released with vfree() in stackmap_bin_release().
+ *
+ * size: number of valid bytes in data[]
+ * data: a struct ftrace_stackmap_bin_header followed by the packed entries
+ */
+struct stackmap_bin_snapshot {
+ /*
+ * Use u64 (not size_t) so data[] is 8-byte aligned on both
+ * 32-bit and 64-bit architectures. The IP array within data[]
+ * is accessed as u64*, which would alignment-fault on strict
+ * architectures (e.g. older ARM, SPARC) if data[] started at
+ * a 4-byte boundary.
+ */
+ u64 size;
+ char data[];
+};
+
+/*
+ * Open the binary export file: serialize every published stack into a
+ * vmalloc'ed snapshot (header, then one ftrace_stackmap_bin_entry plus its
+ * u64 addresses per stack) and attach it to file->private_data.
+ *
+ * Returns 0 on success, -ENODEV when the inode carries no map, or -ENOMEM
+ * when the snapshot buffer cannot be allocated.
+ */
+static int stackmap_bin_open(struct inode *inode, struct file *file)
+{
+	struct ftrace_stackmap *smap = inode->i_private;
+	struct ftrace_stackmap_bin_header *hdr;
+	struct stackmap_bin_snapshot *snap;
+	size_t per_stack, capacity, used;
+	u32 populated, slot, emitted = 0;
+
+	if (!smap)
+		return -ENODEV;
+
+	/*
+	 * Size the buffer for the worst case: every element handed out so
+	 * far holds a full-depth stack, plus one spare record in case an
+	 * insert lands between this sample and the walk below. The walk
+	 * still checks the remaining room before writing each record.
+	 */
+	per_stack = sizeof(struct ftrace_stackmap_bin_entry) +
+		    FTRACE_STACKMAP_MAX_DEPTH * sizeof(u64);
+	populated = atomic_read(&smap->next_elt);
+	capacity = sizeof(*hdr) + (populated + 1) * per_stack;
+
+	snap = vmalloc(sizeof(*snap) + capacity);
+	if (!snap)
+		return -ENOMEM;
+
+	hdr = (struct ftrace_stackmap_bin_header *)snap->data;
+	hdr->magic = FTRACE_STACKMAP_BIN_MAGIC;
+	hdr->version = FTRACE_STACKMAP_BIN_VERSION;
+	hdr->reserved = 0;
+	used = sizeof(*hdr);
+
+	for (slot = 0; slot < smap->map_size; slot++) {
+		struct stackmap_entry *entry = &smap->entries[slot];
+		struct ftrace_stackmap_bin_entry *rec;
+		struct stackmap_elt *elt;
+		u64 *dst;
+		u32 depth, f;
+
+		/* Only slots that are claimed and have an element attached. */
+		if (!entry->key)
+			continue;
+		elt = smp_load_acquire(&entry->val);
+		if (!elt)
+			continue;
+
+		depth = READ_ONCE(elt->nr);
+		if (depth > FTRACE_STACKMAP_MAX_DEPTH)
+			depth = FTRACE_STACKMAP_MAX_DEPTH;
+
+		/* Out of room: export what has been collected so far. */
+		if (used + sizeof(*rec) + depth * sizeof(u64) > capacity)
+			break;
+
+		rec = (struct ftrace_stackmap_bin_entry *)(snap->data + used);
+		rec->stack_id = slot;
+		rec->nr = depth;
+		rec->ref_count = atomic_read(&elt->ref_count);
+		rec->reserved = 0;
+		used += sizeof(*rec);
+
+		/* Addresses are widened to u64 regardless of the native size. */
+		dst = (u64 *)(snap->data + used);
+		for (f = 0; f < depth; f++)
+			dst[f] = (u64)elt->ips[f];
+		used += depth * sizeof(u64);
+		emitted++;
+	}
+
+	hdr->nr_stacks = emitted;
+	snap->size = used;
+	file->private_data = snap;
+	return 0;
+}
+
+/*
+ * Copy part of the snapshot taken at open time to userspace, honouring and
+ * advancing *ppos. Returns -EINVAL when the file carries no snapshot,
+ * otherwise the result of simple_read_from_buffer().
+ */
+static ssize_t stackmap_bin_read(struct file *file, char __user *ubuf,
+				 size_t count, loff_t *ppos)
+{
+	struct stackmap_bin_snapshot *snap = file->private_data;
+
+	if (snap)
+		return simple_read_from_buffer(ubuf, count, ppos,
+					       snap->data, snap->size);
+	return -EINVAL;
+}
+
+/* Free the snapshot attached to the file by stackmap_bin_open(). */
+static int stackmap_bin_release(struct inode *inode, struct file *file)
+{
+	void *snap = file->private_data;
+
+	vfree(snap);
+	return 0;
+}
+
+/* File operations of the binary export file; each open gets its own snapshot. */
+const struct file_operations ftrace_stackmap_bin_fops = {
+	.open		= stackmap_bin_open,
+	.release	= stackmap_bin_release,
+	.read		= stackmap_bin_read,
+	.llseek		= default_llseek,
+};
diff --git a/kernel/trace/trace_stackmap.h b/kernel/trace/trace_stackmap.h
new file mode 100644
index 000000000000..da51ed919e2c
--- /dev/null
+++ b/kernel/trace/trace_stackmap.h
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _TRACE_STACKMAP_H
+#define _TRACE_STACKMAP_H
+
+#include <linux/types.h>
+#include <linux/atomic.h>
+
+/* Maximum number of addresses stored per stack; deeper stacks are clamped. */
+#define FTRACE_STACKMAP_MAX_DEPTH 64
+
+/* Binary export format */
+/*
+ * Read from the most significant byte down, the magic's bytes are the ASCII
+ * characters 'F' 'M' 'S' 'B'.
+ * NOTE(review): the tag "FSMB" would be 0x46534D42 - confirm whether the
+ * value or the tag is the intended one before this becomes ABI.
+ */
+#define FTRACE_STACKMAP_BIN_MAGIC 0x464D5342
+#define FTRACE_STACKMAP_BIN_VERSION 2
+
+/*
+ * Header at the start of the binary export.
+ *
+ * magic:     FTRACE_STACKMAP_BIN_MAGIC
+ * version:   FTRACE_STACKMAP_BIN_VERSION
+ * nr_stacks: number of ftrace_stackmap_bin_entry records that follow
+ * reserved:  written as 0
+ */
+struct ftrace_stackmap_bin_header {
+ u32 magic;
+ u32 version;
+ u32 nr_stacks;
+ u32 reserved;
+};
+
+/*
+ * One exported stack; the record is immediately followed by its addresses.
+ *
+ * stack_id:  ID of the stack (its slot index in the map's table)
+ * nr:        number of u64 addresses following this record
+ * ref_count: reference count of the stack when the export was taken
+ * reserved:  written as 0
+ */
+struct ftrace_stackmap_bin_entry {
+ u32 stack_id;
+ u32 nr;
+ u32 ref_count;
+ u32 reserved;
+ /* followed by u64 ips[nr] */
+};
+
+struct trace_array;
+
+#ifdef CONFIG_FTRACE_STACKMAP
+
+/* Only forward-declared here: users of this header handle it by pointer. */
+struct ftrace_stackmap;
+
+/* Create a stack map for @tr / tear one down again. */
+struct ftrace_stackmap *ftrace_stackmap_create(struct trace_array *tr);
+void ftrace_stackmap_destroy(struct ftrace_stackmap *smap);
+/*
+ * Look up or insert the stack made of the first @nr_entries addresses in
+ * @ips. Returns the stack's non-negative ID or a negative errno.
+ */
+int ftrace_stackmap_get_id(struct ftrace_stackmap *smap,
+ unsigned long *ips, unsigned int nr_entries);
+/* Clear the map. Returns 0 on success, non-zero when the reset did not run. */
+int ftrace_stackmap_reset(struct ftrace_stackmap *smap);
+
+/* File operations of the text, statistics and binary export files. */
+extern const struct file_operations ftrace_stackmap_fops;
+extern const struct file_operations ftrace_stackmap_stat_fops;
+extern const struct file_operations ftrace_stackmap_bin_fops;
+
+#else
+
+/*
+ * Stubs for builds without CONFIG_FTRACE_STACKMAP: creation yields NULL,
+ * destroy and reset do nothing, and get_id reports -ENOSYS.
+ */
+struct ftrace_stackmap;
+static inline struct ftrace_stackmap *ftrace_stackmap_create(struct trace_array *tr) { return NULL; }
+static inline void ftrace_stackmap_destroy(struct ftrace_stackmap *s) { }
+static inline int ftrace_stackmap_get_id(struct ftrace_stackmap *s,
+ unsigned long *ips, unsigned int n)
+{ return -ENOSYS; }
+static inline int ftrace_stackmap_reset(struct ftrace_stackmap *s) { return 0; }
+
+#endif
+#endif /* _TRACE_STACKMAP_H */
--
2.34.1
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox