* [PATCH RESEND] tracing/osnoise: Dump stack on timerlat uret threshold event
From: Crystal Wood @ 2026-05-11 22:31 UTC (permalink / raw)
To: Steven Rostedt
Cc: linux-trace-kernel, John Kacur, Tomas Glozar, Costa Shulyupin,
Wander Lairson Costa, Crystal Wood
Dump the saved IRQ stack trace regardless of whether the event was
THREAD_CONTEXT or THREAD_URET.
In the uret case, the latency presumably had not yet crossed the
threshold at IRQ time (or else it would have dumped the stack at thread
wakeup time, unless we're racing with a change to the threshold), but it
may have at least contributed -- and this is possible with THREAD_CONTEXT
as well.
In any case, it helps with writing reliable rtla tests if we always get
a stack trace on a threshold event.
Signed-off-by: Crystal Wood <crwood@redhat.com>
---
Original: https://lore.kernel.org/all/20251112152529.956778-3-crwood@redhat.com/
kernel/trace/trace_osnoise.c | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index 75678053b21c..62c2667d97fa 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -2544,9 +2544,12 @@ timerlat_fd_read(struct file *file, char __user *ubuf, size_t count,
notify_new_max_latency(diff);
tlat->tracing_thread = false;
- if (osnoise_data.stop_tracing_total)
- if (time_to_us(diff) >= osnoise_data.stop_tracing_total)
+ if (osnoise_data.stop_tracing_total) {
+ if (time_to_us(diff) >= osnoise_data.stop_tracing_total) {
+ timerlat_dump_stack(time_to_us(diff));
osnoise_stop_tracing();
+ }
+ }
} else {
tlat->tracing_thread = false;
tlat->kthread = current;
--
2.54.0
^ permalink raw reply related
* [PATCH v2] tracing/osnoise: Array printk init and cleanup
From: Crystal Wood @ 2026-05-11 22:30 UTC (permalink / raw)
To: Steven Rostedt
Cc: linux-trace-kernel, John Kacur, Tomas Glozar, Costa Shulyupin,
Wander Lairson Costa, Crystal Wood
None of the calls to trace_array_printk_buf() will do anything
if we don't initialize the buffer on instance creation (unless
some other tracer called it), so do that.
Add an osnoise_print() function to facilitate adding debug prints
(without tainting).
Use trace_array_printk() instead of trace_array_printk_buf(), as we're
only writing to the main buffer (of a non-main instance) anyway -- and
trace_array_printk_buf() skips the check to make sure we're not printing
to the global instance.
Signed-off-by: Crystal Wood <crwood@redhat.com>
---
v2: s/macro/function/ in commit message
v1: https://lore.kernel.org/all/20251112152529.956778-4-crwood@redhat.com/
kernel/trace/trace_osnoise.c | 39 ++++++++++++++++++++++--------------
1 file changed, 24 insertions(+), 15 deletions(-)
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index 62c2667d97fa..5e83c4f6f2b4 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -83,6 +83,22 @@ struct osnoise_instance {
static struct list_head osnoise_instances;
+static void osnoise_print(const char *fmt, ...)
+{
+ struct osnoise_instance *inst;
+ struct trace_array *tr;
+ va_list ap;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(inst, &osnoise_instances, list) {
+ tr = inst->tr;
+ va_start(ap, fmt);
+ trace_array_vprintk(tr, _RET_IP_, fmt, ap);
+ va_end(ap);
+ }
+ rcu_read_unlock();
+}
+
static bool osnoise_has_registered_instances(void)
{
return !!list_first_or_null_rcu(&osnoise_instances,
@@ -123,6 +139,7 @@ static int osnoise_register_instance(struct trace_array *tr)
* trace_types_lock.
*/
lockdep_assert_held(&trace_types_lock);
+ trace_array_init_printk(tr);
inst = kmalloc_obj(*inst);
if (!inst)
@@ -471,15 +488,7 @@ static void print_osnoise_headers(struct seq_file *s)
* osnoise_taint - report an osnoise error.
*/
#define osnoise_taint(msg) ({ \
- struct osnoise_instance *inst; \
- struct trace_buffer *buffer; \
- \
- rcu_read_lock(); \
- list_for_each_entry_rcu(inst, &osnoise_instances, list) { \
- buffer = inst->tr->array_buffer.buffer; \
- trace_array_printk_buf(buffer, _THIS_IP_, msg); \
- } \
- rcu_read_unlock(); \
+ osnoise_print(msg); \
osnoise_data.tainted = true; \
})
@@ -1189,10 +1198,10 @@ static __always_inline void osnoise_stop_exception(char *msg, int cpu)
rcu_read_lock();
list_for_each_entry_rcu(inst, &osnoise_instances, list) {
tr = inst->tr;
- trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_,
- "stop tracing hit on cpu %d due to exception: %s\n",
- smp_processor_id(),
- msg);
+ trace_array_printk(tr, _THIS_IP_,
+ "stop tracing hit on cpu %d due to exception: %s\n",
+ smp_processor_id(),
+ msg);
if (test_bit(OSN_PANIC_ON_STOP, &osnoise_options))
panic("tracer hit on cpu %d due to exception: %s\n",
@@ -1362,8 +1371,8 @@ static __always_inline void osnoise_stop_tracing(void)
rcu_read_lock();
list_for_each_entry_rcu(inst, &osnoise_instances, list) {
tr = inst->tr;
- trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_,
- "stop tracing hit on cpu %d\n", smp_processor_id());
+ trace_array_printk(tr, _THIS_IP_,
+ "stop tracing hit on cpu %d\n", smp_processor_id());
if (test_bit(OSN_PANIC_ON_STOP, &osnoise_options))
panic("tracer hit stop condition on CPU %d\n", smp_processor_id());
--
2.54.0
^ permalink raw reply related
* Re: [PATCH mm-unstable v17 00/14] khugepaged: mTHP support
From: Andrew Morton @ 2026-05-11 21:04 UTC (permalink / raw)
To: Nico Pache
Cc: linux-doc, linux-kernel, linux-mm, linux-trace-kernel, aarcange,
anshuman.khandual, apopple, baohua, baolin.wang, byungchul,
catalin.marinas, cl, corbet, dave.hansen, david, dev.jain, gourry,
hannes, hughd, jack, jackmanb, jannh, jglisse, joshua.hahnjy, kas,
lance.yang, liam, ljs, mathieu.desnoyers, matthew.brost, mhiramat,
mhocko, peterx, pfalcato, rakie.kim, raquini, rdunlap,
richard.weiyang, rientjes, rostedt, rppt, ryan.roberts, shivankg,
sunnanyong, surenb, thomas.hellstrom, tiwai, usamaarif642, vbabka,
vishal.moola, wangkefeng.wang, will, willy, yang, ying.huang, ziy,
zokeefe
In-Reply-To: <20260511185817.686831-1-npache@redhat.com>
On Mon, 11 May 2026 12:58:00 -0600 Nico Pache <npache@redhat.com> wrote:
> The following series provides khugepaged with the capability to collapse
> anonymous memory regions to mTHPs.
Thanks, I've updated mm.git's mm-new branch to this version.
> V17 Changes:
> - Added Acks/RB
> - New patch(5): split the mmap_read_unlock() locking contract change out of
> "generalize collapse_huge_page" into its own patch; add a comment
> documenting the enter/exit-with-lock-dropped contract (Usama, David)
> - [patch 03] Add const to max_ptes_none/shared/swap variables; improve the
> three helper docstrings; replace the paragraphs with inline comments;
> note that sysctl values are now snapshotted once per scan (Usama, David)
> - [patch 04] Add SCAN_INVALID_PTES_NONE result code and return it instead
> of SCAN_FAIL when collapse_max_ptes_none() returns -EINVAL (Usama);
> snapshot khugepaged_max_ptes_none into a local variable to fix race on
> the two comparisons (Usama); clean up mTHP docstring paragraphs into
> inline comments; fix commit message wording (David)
> - [patch 06] Remove /* PMD collapse */ and /* mTHP collapse */ comments
> (David); move const declarations to top of variable list (David); add
> comment explaining that map_anon_folio_pte_nopf() calls set_ptes under
> pmd_ptl and is safe because PMD is expected to be none (Usama)
> - [patch 08] Shorten sysfs counter documentation for
> collapse_exceed_swap/shared_pte to concise one-liners; trim
> collapse_exceed_none_pte description; fix "dont" → "do not" (David)
> - [patch 10] Keep vm_flags parameter in khugepaged_enter_vma() and
> collapse_allowable_orders() rather than dropping it and reading
> vma->vm_flags internally; pass vm_flags explicitly at all three
> collapse_allowable_orders() call sites (David, sashskio)
> - [patch 11] Fix MTHP_STACK_SIZE: was exponential (~128); correct formula
> is (height + 1) for a DFS on a binary tree; rewrite comment to explain
> the DFS sizing (sashskio)
> - [patch 12] Replace SCAN_PAGE_LRU with SCAN_PAGE_LAZYFREE in the
> "goto next_order" early-bail cases; non-LRU page failures cannot be
> recovered at any order and belong in the default (return) path
> - [patch 13] Use tva_flags == TVA_KHUGEPAGED (strict equality) instead of
> tva_flags & TVA_KHUGEPAGED; flatten nested if into single condition;
> retain vm_flags parameter; pass vm_flags to collapse_allowable_orders()
Here's how v17 altered mm.git:
Documentation/admin-guide/mm/transhuge.rst | 24 ---
include/linux/khugepaged.h | 6
include/trace/events/huge_memory.h | 3
mm/huge_memory.c | 2
mm/khugepaged.c | 152 ++++++++++---------
mm/vma.c | 6
tools/testing/vma/include/stubs.h | 3
7 files changed, 99 insertions(+), 97 deletions(-)
--- a/Documentation/admin-guide/mm/transhuge.rst~b
+++ a/Documentation/admin-guide/mm/transhuge.rst
@@ -725,27 +725,17 @@ nr_anon_partially_mapped
collapse_exceed_none_pte
The number of collapse attempts that failed due to exceeding the
- max_ptes_none threshold. For mTHP collapse, Currently only max_ptes_none
- values of 0 and (HPAGE_PMD_NR - 1) are supported. Any other value will
- emit a warning and no mTHP collapse will be attempted. khugepaged will
- try to collapse to the largest enabled (m)THP size; if it fails, it will
- try the next lower enabled mTHP size. This counter records the number of
- times a collapse attempt was skipped for exceeding the max_ptes_none
- threshold, and khugepaged will move on to the next available mTHP size.
+ max_ptes_none threshold.
collapse_exceed_swap_pte
- The number of anonymous mTHP PTE ranges which were unable to collapse due
- to containing at least one swap PTE. Currently khugepaged does not
- support collapsing mTHP regions that contain a swap PTE. This counter can
- be used to monitor the number of khugepaged mTHP collapses that failed
- due to the presence of a swap PTE.
+ The number of collapse attempts that failed due to exceeding the
+ max_ptes_swap threshold. For non-PMD orders this occurs if an mTHP range
+ contains at least one swap PTE.
collapse_exceed_shared_pte
- The number of anonymous mTHP PTE ranges which were unable to collapse due
- to containing at least one shared PTE. Currently khugepaged does not
- support collapsing mTHP PTE ranges that contain a shared PTE. This
- counter can be used to monitor the number of khugepaged mTHP collapses
- that failed due to the presence of a shared PTE.
+ The number of collapse attempts that failed due to exceeding the
+ max_ptes_shared threshold. For non-PMD orders this occurs if an mTHP range
+ contains at least one shared PTE.
As the system ages, allocating huge pages may be expensive as the
system uses memory compaction to copy data around memory to free a
--- a/include/linux/khugepaged.h~b
+++ a/include/linux/khugepaged.h
@@ -13,7 +13,8 @@ extern void khugepaged_destroy(void);
extern int start_stop_khugepaged(void);
extern void __khugepaged_enter(struct mm_struct *mm);
extern void __khugepaged_exit(struct mm_struct *mm);
-extern void khugepaged_enter_vma(struct vm_area_struct *vma);
+extern void khugepaged_enter_vma(struct vm_area_struct *vma,
+ vm_flags_t vm_flags);
extern void khugepaged_min_free_kbytes_update(void);
extern bool current_is_khugepaged(void);
void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
@@ -37,7 +38,8 @@ static inline void khugepaged_fork(struc
static inline void khugepaged_exit(struct mm_struct *mm)
{
}
-static inline void khugepaged_enter_vma(struct vm_area_struct *vma)
+static inline void khugepaged_enter_vma(struct vm_area_struct *vma,
+ vm_flags_t vm_flags)
{
}
static inline void collapse_pte_mapped_thp(struct mm_struct *mm,
--- a/include/trace/events/huge_memory.h~b
+++ a/include/trace/events/huge_memory.h
@@ -39,7 +39,8 @@
EM( SCAN_STORE_FAILED, "store_failed") \
EM( SCAN_COPY_MC, "copy_poisoned_page") \
EM( SCAN_PAGE_FILLED, "page_filled") \
- EMe(SCAN_PAGE_DIRTY_OR_WRITEBACK, "page_dirty_or_writeback")
+ EM(SCAN_PAGE_DIRTY_OR_WRITEBACK, "page_dirty_or_writeback") \
+ EMe(SCAN_INVALID_PTES_NONE, "invalid_ptes_none")
#undef EM
#undef EMe
--- a/mm/huge_memory.c~b
+++ a/mm/huge_memory.c
@@ -1571,7 +1571,7 @@ vm_fault_t do_huge_pmd_anonymous_page(st
ret = vmf_anon_prepare(vmf);
if (ret)
return ret;
- khugepaged_enter_vma(vma);
+ khugepaged_enter_vma(vma, vma->vm_flags);
if (!(vmf->flags & FAULT_FLAG_WRITE) &&
!mm_forbids_zeropage(vma->vm_mm) &&
--- a/mm/khugepaged.c~b
+++ a/mm/khugepaged.c
@@ -61,6 +61,7 @@ enum scan_result {
SCAN_COPY_MC,
SCAN_PAGE_FILLED,
SCAN_PAGE_DIRTY_OR_WRITEBACK,
+ SCAN_INVALID_PTES_NONE,
};
#define CREATE_TRACE_POINTS
@@ -101,16 +102,15 @@ static struct kmem_cache *mm_slot_cache
#define KHUGEPAGED_MIN_MTHP_ORDER 2
/*
- * The maximum number of mTHP ranges that can be stored on the stack.
- * This is calculated based on the number of PTE entries in a PTE page table
- * and the minimum mTHP order.
+ * mthp_collapse() does an iterative DFS over a binary tree, from
+ * HPAGE_PMD_ORDER down to KHUGEPAGED_MIN_MTHP_ORDER. The max stack
+ * size needed for a DFS on a binary tree is height + 1, where
+ * height = HPAGE_PMD_ORDER - KHUGEPAGED_MIN_MTHP_ORDER.
*
- * ilog2 is needed in place of HPAGE_PMD_ORDER due to some architectures
- * (ie ppc64le) not defining HPAGE_PMD_ORDER until after build time.
- *
- * At most there will be 1 << (PMD_ORDER - KHUGEPAGED_MIN_MTHP_ORDER) mTHP ranges
+ * ilog2 is used in place of HPAGE_PMD_ORDER because some architectures
+ * (e.g. ppc64le) do not define HPAGE_PMD_ORDER until after build time.
*/
-#define MTHP_STACK_SIZE (1UL << (ilog2(MAX_PTRS_PER_PTE) - KHUGEPAGED_MIN_MTHP_ORDER))
+#define MTHP_STACK_SIZE (ilog2(MAX_PTRS_PER_PTE) - KHUGEPAGED_MIN_MTHP_ORDER + 1)
/*
* Defines a range of PTE entries in a PTE page table which are being
@@ -380,89 +380,87 @@ static bool pte_none_or_zero(pte_t pte)
}
/**
- * collapse_max_ptes_none - Calculate maximum allowed empty PTEs for collapse
+ * collapse_max_ptes_none - Calculate maximum allowed none-page or zero-page
+ * PTEs for the given collapse operation.
* @cc: The collapse control struct
* @vma: The vma to check for userfaultfd
* @order: The folio order being collapsed to
*
- * If we are not in khugepaged mode use HPAGE_PMD_NR to allow any
- * empty page. For PMD-sized collapses (order == HPAGE_PMD_ORDER), use the
- * configured khugepaged_max_ptes_none value.
- *
- * For mTHP collapses, we currently only support khugepaged_max_pte_none values
- * of 0 or (KHUGEPAGED_MAX_PTES_LIMIT). Any other value will emit a warning and
- * no mTHP collapse will be attempted
- *
- * Return: Maximum number of empty PTEs allowed for the collapse operation
+ * Return: Maximum number of none-page or zero-page PTEs allowed for the
+ * collapse operation.
*/
static int collapse_max_ptes_none(struct collapse_control *cc,
struct vm_area_struct *vma, unsigned int order)
{
+ unsigned int max_ptes_none = khugepaged_max_ptes_none;
+ // If the vma is userfaultfd-armed, allow no none-page or zero-page PTEs.
if (vma && userfaultfd_armed(vma))
return 0;
+ // for MADV_COLLAPSE, allow any none-page or zero-page PTEs.
if (!cc->is_khugepaged)
return HPAGE_PMD_NR;
+ // for PMD collapse, respect the user defined maximum.
if (is_pmd_order(order))
- return khugepaged_max_ptes_none;
+ return max_ptes_none;
/* Zero/non-present collapse disabled. */
- if (!khugepaged_max_ptes_none)
+ if (!max_ptes_none)
return 0;
- if (khugepaged_max_ptes_none == KHUGEPAGED_MAX_PTES_LIMIT)
+ // for mTHP collapse with the sysctl value set to KHUGEPAGED_MAX_PTES_LIMIT,
+ // scale the maximum number of PTEs to the order of the collapse.
+ if (max_ptes_none == KHUGEPAGED_MAX_PTES_LIMIT)
return (1 << order) - 1;
+ // We currently only support max_ptes_none values of 0 or KHUGEPAGED_MAX_PTES_LIMIT.
+ // Emit a warning and return -EINVAL.
pr_warn_once("mTHP collapse only supports max_ptes_none values of 0 or %u\n",
KHUGEPAGED_MAX_PTES_LIMIT);
return -EINVAL;
}
/**
- * collapse_max_ptes_shared - Calculate maximum allowed shared PTEs for collapse
+ * collapse_max_ptes_shared - Calculate maximum allowed PTEs that map shared
+ * anonymous pages for the given collapse operation.
* @cc: The collapse control struct
* @order: The folio order being collapsed to
*
- * If we are not in khugepaged mode use HPAGE_PMD_NR to allow any
- * shared page.
- *
- * For mTHP collapses, we currently dont support collapsing memory with
- * shared memory.
- *
- * Return: Maximum number of shared PTEs allowed for the collapse operation
+ * Return: Maximum number of PTEs that map shared anonymous pages for the
+ * collapse operation
*/
static unsigned int collapse_max_ptes_shared(struct collapse_control *cc,
unsigned int order)
{
+ // for MADV_COLLAPSE, do not restrict the number of PTEs that map shared
+ // anonymous pages.
if (!cc->is_khugepaged)
return HPAGE_PMD_NR;
+ // for mTHP collapse do not allow collapsing anonymous memory pages that
+ // are shared between processes.
if (!is_pmd_order(order))
return 0;
-
+ // for PMD collapse, respect the user defined maximum.
return khugepaged_max_ptes_shared;
}
/**
- * collapse_max_ptes_swap - Calculate maximum allowed swap PTEs for collapse
+ * collapse_max_ptes_swap - Calculate the maximum allowed non-present PTEs or the
+ * maximum allowed non-present pagecache entries for the given collapse operation.
* @cc: The collapse control struct
* @order: The folio order being collapsed to
*
- * If we are not in khugepaged mode use HPAGE_PMD_NR to allow any
- * swap page.
- *
- * For PMD-sized collapses (order == HPAGE_PMD_ORDER), use the configured
- * khugepaged_max_ptes_swap value.
- *
- * For mTHP collapses, we currently dont support collapsing memory with
- * swapped out memory.
- *
- * Return: Maximum number of swap PTEs allowed for the collapse operation
+ * Return: Maximum number of non-present PTEs or the maximum allowed non-present
+ * pagecache entries for the collapse operation.
*/
static unsigned int collapse_max_ptes_swap(struct collapse_control *cc,
unsigned int order)
{
+ // for MADV_COLLAPSE, do not restrict the number of PTE entries or
+ // pagecache entries that are non-present.
if (!cc->is_khugepaged)
return HPAGE_PMD_NR;
+ // for mTHP collapse do not allow any non-present PTEs or pagecache entries.
if (!is_pmd_order(order))
return 0;
-
+ // for PMD collapse, respect the user defined maximum.
return khugepaged_max_ptes_swap;
}
@@ -478,7 +476,7 @@ int hugepage_madvise(struct vm_area_stru
* register it here without waiting a page fault that
* may not happen any time soon.
*/
- khugepaged_enter_vma(vma);
+ khugepaged_enter_vma(vma, *vm_flags);
break;
case MADV_NOHUGEPAGE:
*vm_flags &= ~VM_HUGEPAGE;
@@ -579,26 +577,26 @@ void __khugepaged_enter(struct mm_struct
/* Check what orders are allowed based on the vma and collapse type */
static unsigned long collapse_allowable_orders(struct vm_area_struct *vma,
- enum tva_type tva_flags)
+ vm_flags_t vm_flags, enum tva_type tva_flags)
{
unsigned long orders;
/* If khugepaged is scanning an anonymous vma, allow mTHP collapse */
- if ((tva_flags & TVA_KHUGEPAGED) && vma_is_anonymous(vma))
+ if ((tva_flags == TVA_KHUGEPAGED) && vma_is_anonymous(vma))
orders = THP_ORDERS_ALL_ANON;
else
orders = BIT(HPAGE_PMD_ORDER);
- return thp_vma_allowable_orders(vma, vma->vm_flags, tva_flags, orders);
+ return thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders);
}
-void khugepaged_enter_vma(struct vm_area_struct *vma)
+void khugepaged_enter_vma(struct vm_area_struct *vma,
+ vm_flags_t vm_flags)
{
if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) &&
- hugepage_enabled()) {
- if (collapse_allowable_orders(vma, TVA_KHUGEPAGED))
- __khugepaged_enter(vma->vm_mm);
- }
+ collapse_allowable_orders(vma, vm_flags, TVA_KHUGEPAGED) &&
+ hugepage_enabled())
+ __khugepaged_enter(vma->vm_mm);
}
void __khugepaged_exit(struct mm_struct *mm)
@@ -683,7 +681,7 @@ static enum scan_result __collapse_huge_
unsigned int max_ptes_shared = collapse_max_ptes_shared(cc, order);
if (max_ptes_none < 0)
- return result;
+ return SCAN_INVALID_PTES_NONE;
for (_pte = pte; _pte < pte + nr_pages;
_pte++, addr += PAGE_SIZE) {
@@ -905,6 +903,7 @@ static void __collapse_huge_page_copy_fa
{
const unsigned long nr_pages = 1UL << order;
spinlock_t *pmd_ptl;
+
/*
* Re-establish the PMD to point to the original page table
* entry. Restoring PMD needs to be done prior to releasing
@@ -944,6 +943,7 @@ static enum scan_result __collapse_huge_
const unsigned long nr_pages = 1UL << order;
unsigned int i;
enum scan_result result = SCAN_SUCCEED;
+
/*
* Copying pages' contents is subject to memory poison at any iteration.
*/
@@ -1263,10 +1263,20 @@ static enum scan_result alloc_charge_fol
return SCAN_SUCCEED;
}
+/*
+ * collapse_huge_page expects the mmap_read_lock to be dropped before
+ * entering this function. The function will also always return with the lock
+ * dropped. The function starts by allocating a folio, which can potentially
+ * take a long time if it involves sync compaction, and we do not need to hold
+ * the mmap_lock during that. We must recheck the vma after taking it again in
+ * write mode.
+ */
static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long start_addr,
int referenced, int unmapped, struct collapse_control *cc,
unsigned int order)
{
+ const unsigned long pmd_addr = start_addr & HPAGE_PMD_MASK;
+ const unsigned long end_addr = start_addr + (PAGE_SIZE << order);
LIST_HEAD(compound_pagelist);
pmd_t *pmd, _pmd;
pte_t *pte = NULL;
@@ -1277,8 +1287,6 @@ static enum scan_result collapse_huge_pa
struct vm_area_struct *vma;
struct mmu_notifier_range range;
bool anon_vma_locked = false;
- const unsigned long pmd_addr = start_addr & HPAGE_PMD_MASK;
- const unsigned long end_addr = start_addr + (PAGE_SIZE << order);
result = alloc_charge_folio(&folio, mm, cc, order);
if (result != SCAN_SUCCEED)
@@ -1399,11 +1407,16 @@ static enum scan_result collapse_huge_pa
__folio_mark_uptodate(folio);
spin_lock(pmd_ptl);
WARN_ON_ONCE(!pmd_none(*pmd));
- if (is_pmd_order(order)) { /* PMD collapse */
+ if (is_pmd_order(order)) {
pgtable = pmd_pgtable(_pmd);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
map_anon_folio_pmd_nopf(folio, pmd, vma, pmd_addr);
- } else { /* mTHP collapse */
+ } else {
+ /*
+ * set_ptes is called in map_anon_folio_pte_nopf with the
+ * pmd_ptl lock still held; this is safe as the PMD is expected
+ * to be none. The pmd entry is then repopulated below.
+ */
map_anon_folio_pte_nopf(folio, pte, vma, start_addr, /*uffd_wp=*/ false);
smp_wmb(); /* make PTEs visible before PMD. See pmd_install() */
pmd_populate(mm, pmd, pmd_pgtable(_pmd));
@@ -1538,12 +1551,12 @@ static int mthp_collapse(struct mm_struc
case SCAN_EXCEED_SHARED_PTE:
case SCAN_PAGE_LOCK:
case SCAN_PAGE_COUNT:
- case SCAN_PAGE_LRU:
case SCAN_PAGE_NULL:
case SCAN_DEL_PAGE_LRU:
case SCAN_PTE_NON_PRESENT:
case SCAN_PTE_UFFD_WP:
case SCAN_ALLOC_HUGE_PAGE_FAIL:
+ case SCAN_PAGE_LAZYFREE:
goto next_order;
/* Cases where no further collapse is possible */
default:
@@ -1569,6 +1582,10 @@ static enum scan_result collapse_scan_pm
struct vm_area_struct *vma, unsigned long start_addr,
bool *lock_dropped, struct collapse_control *cc)
{
+ int max_ptes_none = collapse_max_ptes_none(cc, vma, HPAGE_PMD_ORDER);
+ const unsigned int max_ptes_shared = collapse_max_ptes_shared(cc, HPAGE_PMD_ORDER);
+ const unsigned int max_ptes_swap = collapse_max_ptes_swap(cc, HPAGE_PMD_ORDER);
+ enum tva_type tva_flags = cc->is_khugepaged ? TVA_KHUGEPAGED : TVA_FORCED_COLLAPSE;
pmd_t *pmd;
pte_t *pte, *_pte, pteval;
int i;
@@ -1580,10 +1597,6 @@ static enum scan_result collapse_scan_pm
unsigned long enabled_orders;
spinlock_t *ptl;
int node = NUMA_NO_NODE, unmapped = 0;
- int max_ptes_none = collapse_max_ptes_none(cc, vma, HPAGE_PMD_ORDER);
- unsigned int max_ptes_shared = collapse_max_ptes_shared(cc, HPAGE_PMD_ORDER);
- unsigned int max_ptes_swap = collapse_max_ptes_swap(cc, HPAGE_PMD_ORDER);
- enum tva_type tva_flags = cc->is_khugepaged ? TVA_KHUGEPAGED : TVA_FORCED_COLLAPSE;
VM_BUG_ON(start_addr & ~HPAGE_PMD_MASK);
@@ -1597,7 +1610,7 @@ static enum scan_result collapse_scan_pm
memset(cc->node_load, 0, sizeof(cc->node_load));
nodes_clear(cc->alloc_nmask);
- enabled_orders = collapse_allowable_orders(vma, tva_flags);
+ enabled_orders = collapse_allowable_orders(vma, vma->vm_flags, tva_flags);
/*
* If PMD is the only enabled order, enforce max_ptes_none, otherwise
@@ -1757,12 +1770,7 @@ static enum scan_result collapse_scan_pm
out_unmap:
pte_unmap_unlock(pte, ptl);
if (result == SCAN_SUCCEED) {
- /*
- * Before allocating the hugepage, release the mmap_lock read lock.
- * The allocation can take potentially a long time if it involves
- * sync compaction, and we do not need to hold the mmap_lock during
- * that. We will recheck the vma after taking it again in write mode.
- */
+ /* collapse_huge_page expects the lock to be dropped before calling */
mmap_read_unlock(mm);
nr_collapsed = mthp_collapse(mm, start_addr, referenced, unmapped,
cc, enabled_orders);
@@ -2657,14 +2665,14 @@ static enum scan_result collapse_scan_fi
unsigned long addr, struct file *file, pgoff_t start,
struct collapse_control *cc)
{
+ const int max_ptes_none = collapse_max_ptes_none(cc, NULL, HPAGE_PMD_ORDER);
+ const unsigned int max_ptes_swap = collapse_max_ptes_swap(cc, HPAGE_PMD_ORDER);
struct folio *folio = NULL;
struct address_space *mapping = file->f_mapping;
XA_STATE(xas, &mapping->i_pages, start);
int present, swap;
int node = NUMA_NO_NODE;
enum scan_result result = SCAN_SUCCEED;
- int max_ptes_none = collapse_max_ptes_none(cc, NULL, HPAGE_PMD_ORDER);
- unsigned int max_ptes_swap = collapse_max_ptes_swap(cc, HPAGE_PMD_ORDER);
present = 0;
swap = 0;
@@ -2867,7 +2875,7 @@ static void collapse_scan_mm_slot(unsign
cc->progress++;
break;
}
- if (!collapse_allowable_orders(vma, TVA_KHUGEPAGED)) {
+ if (!collapse_allowable_orders(vma, vma->vm_flags, TVA_KHUGEPAGED)) {
cc->progress++;
continue;
}
@@ -3177,7 +3185,7 @@ int madvise_collapse(struct vm_area_stru
BUG_ON(vma->vm_start > start);
BUG_ON(vma->vm_end < end);
- if (!collapse_allowable_orders(vma, TVA_FORCED_COLLAPSE))
+ if (!collapse_allowable_orders(vma, vma->vm_flags, TVA_FORCED_COLLAPSE))
return -EINVAL;
cc = kmalloc_obj(*cc);
--- a/mm/vma.c~b
+++ a/mm/vma.c
@@ -989,7 +989,7 @@ static __must_check struct vm_area_struc
goto abort;
vma_set_flags_mask(vmg->target, sticky_flags);
- khugepaged_enter_vma(vmg->target);
+ khugepaged_enter_vma(vmg->target, vmg->vm_flags);
vmg->state = VMA_MERGE_SUCCESS;
return vmg->target;
@@ -1110,7 +1110,7 @@ struct vm_area_struct *vma_merge_new_ran
* following VMA if we have VMAs on both sides.
*/
if (vmg->target && !vma_expand(vmg)) {
- khugepaged_enter_vma(vmg->target);
+ khugepaged_enter_vma(vmg->target, vmg->vm_flags);
vmg->state = VMA_MERGE_SUCCESS;
return vmg->target;
}
@@ -2589,7 +2589,7 @@ static int __mmap_new_vma(struct mmap_st
* call covers the non-merge case.
*/
if (!vma_is_anonymous(vma))
- khugepaged_enter_vma(vma);
+ khugepaged_enter_vma(vma, map->vm_flags);
*vmap = vma;
return 0;
--- a/tools/testing/vma/include/stubs.h~b
+++ a/tools/testing/vma/include/stubs.h
@@ -183,7 +183,8 @@ static inline bool mpol_equal(struct mem
return true;
}
-static inline void khugepaged_enter_vma(struct vm_area_struct *vma)
+static inline void khugepaged_enter_vma(struct vm_area_struct *vma,
+ vm_flags_t vm_flags)
{
}
_
^ permalink raw reply
* [PATCH mm-unstable v17 14/14] Documentation: mm: update the admin guide for mTHP collapse
From: Nico Pache @ 2026-05-11 18:58 UTC (permalink / raw)
To: linux-doc, linux-kernel, linux-mm, linux-trace-kernel
Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
byungchul, catalin.marinas, cl, corbet, dave.hansen, david,
dev.jain, gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
joshua.hahnjy, kas, lance.yang, liam, ljs, mathieu.desnoyers,
matthew.brost, mhiramat, mhocko, npache, peterx, pfalcato,
rakie.kim, raquini, rdunlap, richard.weiyang, rientjes, rostedt,
rppt, ryan.roberts, shivankg, sunnanyong, surenb,
thomas.hellstrom, tiwai, usamaarif642, vbabka, vishal.moola,
wangkefeng.wang, will, willy, yang, ying.huang, ziy, zokeefe,
Bagas Sanjaya
In-Reply-To: <20260511185817.686831-1-npache@redhat.com>
Now that we can collapse to mTHPs, let's update the admin guide to
reflect these changes and provide proper guidance on how to utilize it.
Reviewed-by: Lorenzo Stoakes <ljs@kernel.org>
Reviewed-by: Bagas Sanjaya <bagasdotme@gmail.com>
Signed-off-by: Nico Pache <npache@redhat.com>
---
Documentation/admin-guide/mm/transhuge.rst | 49 +++++++++++++---------
1 file changed, 29 insertions(+), 20 deletions(-)
diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index 80a4d0bed70b..fc0127a36ef6 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -63,7 +63,8 @@ often.
THP can be enabled system wide or restricted to certain tasks or even
memory ranges inside task's address space. Unless THP is completely
disabled, there is ``khugepaged`` daemon that scans memory and
-collapses sequences of basic pages into PMD-sized huge pages.
+collapses sequences of basic pages into huge pages of either PMD size
+or mTHP sizes, if the system is configured to do so.
The THP behaviour is controlled via :ref:`sysfs <thp_sysfs>`
interface and using madvise(2) and prctl(2) system calls.
@@ -219,10 +220,10 @@ this behaviour by writing 0 to shrink_underused, and enable it by writing
echo 0 > /sys/kernel/mm/transparent_hugepage/shrink_underused
echo 1 > /sys/kernel/mm/transparent_hugepage/shrink_underused
-khugepaged will be automatically started when PMD-sized THP is enabled
+khugepaged will be automatically started when any THP size is enabled
(either of the per-size anon control or the top-level control are set
to "always" or "madvise"), and it'll be automatically shutdown when
-PMD-sized THP is disabled (when both the per-size anon control and the
+all THP sizes are disabled (when both the per-size anon control and the
top-level control are "never")
process THP controls
@@ -264,11 +265,6 @@ support the following arguments::
Khugepaged controls
-------------------
-.. note::
- khugepaged currently only searches for opportunities to collapse to
- PMD-sized THP and no attempt is made to collapse to other THP
- sizes.
-
khugepaged runs usually at low frequency so while one may not want to
invoke defrag algorithms synchronously during the page faults, it
should be worth invoking defrag at least in khugepaged. However it's
@@ -296,11 +292,11 @@ allocation failure to throttle the next allocation attempt::
The khugepaged progress can be seen in the number of pages collapsed (note
that this counter may not be an exact count of the number of pages
collapsed, since "collapsed" could mean multiple things: (1) A PTE mapping
-being replaced by a PMD mapping, or (2) All 4K physical pages replaced by
-one 2M hugepage. Each may happen independently, or together, depending on
-the type of memory and the failures that occur. As such, this value should
-be interpreted roughly as a sign of progress, and counters in /proc/vmstat
-consulted for more accurate accounting)::
+being replaced by a PMD mapping, or (2) physical pages replaced by one
+hugepage of various sizes (PMD-sized or mTHP). Each may happen independently,
+or together, depending on the type of memory and the failures that occur.
+As such, this value should be interpreted roughly as a sign of progress,
+and counters in /proc/vmstat consulted for more accurate accounting)::
/sys/kernel/mm/transparent_hugepage/khugepaged/pages_collapsed
@@ -308,16 +304,20 @@ for each pass::
/sys/kernel/mm/transparent_hugepage/khugepaged/full_scans
-``max_ptes_none`` specifies how many extra small pages (that are
-not already mapped) can be allocated when collapsing a group
-of small pages into one large page::
+``max_ptes_none`` specifies how many empty (none/zero) pages are allowed
+when collapsing a group of small pages into one large page::
/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none
-A higher value leads to use additional memory for programs.
-A lower value leads to gain less thp performance. Value of
-max_ptes_none can waste cpu time very little, you can
-ignore it.
+For PMD-sized THP collapse, this directly limits the number of empty pages
+allowed in the 2MB region.
+
+For mTHP collapse, only 0 or (HPAGE_PMD_NR - 1) are supported. Any other value
+will emit a warning and no mTHP collapse will be attempted.
+
+A higher value allows more empty pages, potentially leading to more memory
+usage but better THP performance. A lower value is more conservative and
+may result in fewer THP collapses.
``max_ptes_swap`` specifies how many pages can be brought in from
swap when collapsing a group of pages into a transparent huge page::
@@ -337,6 +337,15 @@ that THP is shared. Exceeding the number would block the collapse::
A higher value may increase memory footprint for some workloads.
+.. note::
+ For mTHP collapse, khugepaged does not support collapsing regions that
+ contain shared or swapped out pages, as this could lead to continuous
+ promotion to higher orders. The collapse will fail if any shared or
+ swapped PTEs are encountered during the scan.
+
+ Currently, madvise_collapse only supports collapsing to PMD-sized THPs
+ and does not attempt mTHP collapses.
+
Boot parameters
===============
--
2.54.0
^ permalink raw reply related
* [PATCH mm-unstable v17 13/14] mm/khugepaged: run khugepaged for all orders
From: Nico Pache @ 2026-05-11 18:58 UTC (permalink / raw)
To: linux-doc, linux-kernel, linux-mm, linux-trace-kernel
Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
byungchul, catalin.marinas, cl, corbet, dave.hansen, david,
dev.jain, gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
joshua.hahnjy, kas, lance.yang, liam, ljs, mathieu.desnoyers,
matthew.brost, mhiramat, mhocko, npache, peterx, pfalcato,
rakie.kim, raquini, rdunlap, richard.weiyang, rientjes, rostedt,
rppt, ryan.roberts, shivankg, sunnanyong, surenb,
thomas.hellstrom, tiwai, usamaarif642, vbabka, vishal.moola,
wangkefeng.wang, will, willy, yang, ying.huang, ziy, zokeefe,
Usama Arif
In-Reply-To: <20260511185817.686831-1-npache@redhat.com>
From: Baolin Wang <baolin.wang@linux.alibaba.com>
If any order (m)THP is enabled we should allow running khugepaged to
attempt scanning and collapsing mTHPs. In order for khugepaged to operate
when only mTHP sizes are specified in sysfs, we must modify the predicate
function that determines whether it ought to run to do so.
This function is currently called hugepage_pmd_enabled(); this patch
renames it to hugepage_enabled() and updates the logic to determine
whether any valid orders may exist which would justify khugepaged
running.
We must also update collapse_allowable_orders() to check all orders if
the vma is anonymous and the collapse is khugepaged.
After this patch khugepaged mTHP collapse is fully enabled.
Reviewed-by: Lorenzo Stoakes <ljs@kernel.org>
Reviewed-by: Lance Yang <lance.yang@linux.dev>
Acked-by: Usama Arif <usama.arif@linux.dev>
Acked-by: David Hildenbrand (Arm) <david@kernel.org>
Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Signed-off-by: Nico Pache <npache@redhat.com>
---
mm/khugepaged.c | 35 ++++++++++++++++++++---------------
1 file changed, 20 insertions(+), 15 deletions(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index f0ae02936638..5ba298d420b7 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -522,23 +522,23 @@ static inline int collapse_test_exit_or_disable(struct mm_struct *mm)
mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm);
}
-static bool hugepage_pmd_enabled(void)
+static bool hugepage_enabled(void)
{
/*
* We cover the anon, shmem and the file-backed case here; file-backed
* hugepages, when configured in, are determined by the global control.
- * Anon pmd-sized hugepages are determined by the pmd-size control.
+ * Anon hugepages are determined by its per-size mTHP control.
* Shmem pmd-sized hugepages are also determined by its pmd-size control,
* except when the global shmem_huge is set to SHMEM_HUGE_DENY.
*/
if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
hugepage_global_enabled())
return true;
- if (test_bit(PMD_ORDER, &huge_anon_orders_always))
+ if (READ_ONCE(huge_anon_orders_always))
return true;
- if (test_bit(PMD_ORDER, &huge_anon_orders_madvise))
+ if (READ_ONCE(huge_anon_orders_madvise))
return true;
- if (test_bit(PMD_ORDER, &huge_anon_orders_inherit) &&
+ if (READ_ONCE(huge_anon_orders_inherit) &&
hugepage_global_enabled())
return true;
if (IS_ENABLED(CONFIG_SHMEM) && shmem_hpage_pmd_enabled())
@@ -579,7 +579,13 @@ void __khugepaged_enter(struct mm_struct *mm)
static unsigned long collapse_allowable_orders(struct vm_area_struct *vma,
vm_flags_t vm_flags, enum tva_type tva_flags)
{
- unsigned long orders = BIT(HPAGE_PMD_ORDER);
+ unsigned long orders;
+
+ /* If khugepaged is scanning an anonymous vma, allow mTHP collapse */
+ if ((tva_flags == TVA_KHUGEPAGED) && vma_is_anonymous(vma))
+ orders = THP_ORDERS_ALL_ANON;
+ else
+ orders = BIT(HPAGE_PMD_ORDER);
return thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders);
}
@@ -588,10 +594,9 @@ void khugepaged_enter_vma(struct vm_area_struct *vma,
vm_flags_t vm_flags)
{
if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) &&
- hugepage_pmd_enabled()) {
- if (collapse_allowable_orders(vma, vm_flags, TVA_KHUGEPAGED))
- __khugepaged_enter(vma->vm_mm);
- }
+ collapse_allowable_orders(vma, vm_flags, TVA_KHUGEPAGED) &&
+ hugepage_enabled())
+ __khugepaged_enter(vma->vm_mm);
}
void __khugepaged_exit(struct mm_struct *mm)
@@ -2945,7 +2950,7 @@ static void collapse_scan_mm_slot(unsigned int progress_max,
static int khugepaged_has_work(void)
{
- return !list_empty(&khugepaged_scan.mm_head) && hugepage_pmd_enabled();
+ return !list_empty(&khugepaged_scan.mm_head) && hugepage_enabled();
}
static int khugepaged_wait_event(void)
@@ -3018,7 +3023,7 @@ static void khugepaged_wait_work(void)
return;
}
- if (hugepage_pmd_enabled())
+ if (hugepage_enabled())
wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
}
@@ -3049,7 +3054,7 @@ void set_recommended_min_free_kbytes(void)
int nr_zones = 0;
unsigned long recommended_min;
- if (!hugepage_pmd_enabled()) {
+ if (!hugepage_enabled()) {
calculate_min_free_kbytes();
goto update_wmarks;
}
@@ -3099,7 +3104,7 @@ int start_stop_khugepaged(void)
int err = 0;
mutex_lock(&khugepaged_mutex);
- if (hugepage_pmd_enabled()) {
+ if (hugepage_enabled()) {
if (!khugepaged_thread)
khugepaged_thread = kthread_run(khugepaged, NULL,
"khugepaged");
@@ -3125,7 +3130,7 @@ int start_stop_khugepaged(void)
void khugepaged_min_free_kbytes_update(void)
{
mutex_lock(&khugepaged_mutex);
- if (hugepage_pmd_enabled() && khugepaged_thread)
+ if (hugepage_enabled() && khugepaged_thread)
set_recommended_min_free_kbytes();
mutex_unlock(&khugepaged_mutex);
}
--
2.54.0
^ permalink raw reply related
* [PATCH mm-unstable v17 12/14] mm/khugepaged: avoid unnecessary mTHP collapse attempts
From: Nico Pache @ 2026-05-11 18:58 UTC (permalink / raw)
To: linux-doc, linux-kernel, linux-mm, linux-trace-kernel
Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
byungchul, catalin.marinas, cl, corbet, dave.hansen, david,
dev.jain, gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
joshua.hahnjy, kas, lance.yang, liam, ljs, mathieu.desnoyers,
matthew.brost, mhiramat, mhocko, npache, peterx, pfalcato,
rakie.kim, raquini, rdunlap, richard.weiyang, rientjes, rostedt,
rppt, ryan.roberts, shivankg, sunnanyong, surenb,
thomas.hellstrom, tiwai, usamaarif642, vbabka, vishal.moola,
wangkefeng.wang, will, willy, yang, ying.huang, ziy, zokeefe,
Usama Arif
In-Reply-To: <20260511185817.686831-1-npache@redhat.com>
There are cases where, if an attempted collapse fails, all subsequent
orders are guaranteed to also fail. Avoid these collapse attempts by
bailing out early.
Reviewed-by: Lorenzo Stoakes <ljs@kernel.org>
Acked-by: Usama Arif <usama.arif@linux.dev>
Acked-by: David Hildenbrand (Arm) <david@kernel.org>
Signed-off-by: Nico Pache <npache@redhat.com>
---
mm/khugepaged.c | 24 +++++++++++++++++++++++-
1 file changed, 23 insertions(+), 1 deletion(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 39bf7ea8a6e8..f0ae02936638 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1531,9 +1531,31 @@ static int mthp_collapse(struct mm_struct *mm, unsigned long address,
collapse_address = address + offset * PAGE_SIZE;
ret = collapse_huge_page(mm, collapse_address, referenced,
unmapped, cc, order);
- if (ret == SCAN_SUCCEED) {
+
+ switch (ret) {
+ /* Cases where we continue to next collapse candidate */
+ case SCAN_SUCCEED:
collapsed += nr_ptes;
+ fallthrough;
+ case SCAN_PTE_MAPPED_HUGEPAGE:
continue;
+ /* Cases where lower orders might still succeed */
+ case SCAN_LACK_REFERENCED_PAGE:
+ case SCAN_EXCEED_NONE_PTE:
+ case SCAN_EXCEED_SWAP_PTE:
+ case SCAN_EXCEED_SHARED_PTE:
+ case SCAN_PAGE_LOCK:
+ case SCAN_PAGE_COUNT:
+ case SCAN_PAGE_NULL:
+ case SCAN_DEL_PAGE_LRU:
+ case SCAN_PTE_NON_PRESENT:
+ case SCAN_PTE_UFFD_WP:
+ case SCAN_ALLOC_HUGE_PAGE_FAIL:
+ case SCAN_PAGE_LAZYFREE:
+ goto next_order;
+ /* Cases where no further collapse is possible */
+ default:
+ return collapsed;
}
}
--
2.54.0
^ permalink raw reply related
* [PATCH mm-unstable v17 11/14] mm/khugepaged: Introduce mTHP collapse support
From: Nico Pache @ 2026-05-11 18:58 UTC (permalink / raw)
To: linux-doc, linux-kernel, linux-mm, linux-trace-kernel
Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
byungchul, catalin.marinas, cl, corbet, dave.hansen, david,
dev.jain, gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
joshua.hahnjy, kas, lance.yang, liam, ljs, mathieu.desnoyers,
matthew.brost, mhiramat, mhocko, npache, peterx, pfalcato,
rakie.kim, raquini, rdunlap, richard.weiyang, rientjes, rostedt,
rppt, ryan.roberts, shivankg, sunnanyong, surenb,
thomas.hellstrom, tiwai, usamaarif642, vbabka, vishal.moola,
wangkefeng.wang, will, willy, yang, ying.huang, ziy, zokeefe
In-Reply-To: <20260511185817.686831-1-npache@redhat.com>
Enable khugepaged to collapse to mTHP orders. This patch implements the
main scanning logic using a bitmap to track occupied pages and a stack
structure that allows us to find optimal collapse sizes.
Previous to this patch, PMD collapse had 3 main phases, a light weight
scanning phase (mmap_read_lock) that determines a potential PMD
collapse, an alloc phase (mmap unlocked), then finally heavier collapse
phase (mmap_write_lock).
To enable mTHP collapse we make the following changes:
During PMD scan phase, track occupied pages in a bitmap. When mTHP
orders are enabled, we remove the restriction of max_ptes_none during the
scan phase to avoid missing potential mTHP collapse candidates. Once we
have scanned the full PMD range and updated the bitmap to track occupied
pages, we use the bitmap to find the optimal mTHP size.
Implement mthp_collapse() to perform binary recursion on the bitmap
and determine the best eligible order for the collapse. A stack structure
is used instead of traditional recursion to manage the search. This also
avoids deep call recursion, which matters because the kernel stack is
limited. The algorithm recursively splits the bitmap into smaller chunks to
find the highest order mTHPs that satisfy the collapse criteria. We start
by attempting the PMD order, then move on to consecutively lower orders
(mTHP collapse). The stack maintains a pair of variables (offset, order),
indicating the number of PTEs from the start of the PMD, and the order of
the potential collapse candidate.
The algorithm for consuming the bitmap works as such:
1) push (0, HPAGE_PMD_ORDER) onto the stack
2) pop the stack
3) check if the number of set bits in that (offset,order) pair
satisfies the max_ptes_none threshold for that order
4) if yes, attempt collapse
5) if no (or collapse fails), push two new stack items representing
the left and right halves of the current bitmap range, at the
next lower order
6) repeat at step (2) until stack is empty.
Below is a diagram representing the algorithm and stack items:
offset mid_offset
| |
| |
v v
____________________________________
| PTE Page Table |
--------------------------------------
<-------><------->
order-1 order-1
mTHP collapses reject regions containing swapped out or shared pages.
This is because adding new entries can lead to new none pages, and these
may lead to constant promotion into a higher order mTHP. A similar
issue can occur with "max_ptes_none > HPAGE_PMD_NR/2" due to a collapse
introducing at least 2x the number of pages, and on a future scan will
satisfy the promotion condition once again. This issue is prevented via
the collapse_max_ptes_none() function which imposes the max_ptes_none
restrictions above.
We currently only support mTHP collapse for max_ptes_none values of 0
and HPAGE_PMD_NR - 1, resulting in the following behavior:
- max_ptes_none=0: Never introduce new empty pages during collapse
- max_ptes_none=HPAGE_PMD_NR-1: Always try collapse to the highest
available mTHP order
Any other max_ptes_none value will emit a warning and skip mTHP collapse
attempts. There should be no behavior change for PMD collapse.
Once we determine which mTHP size fits best in that PMD range, a collapse
is attempted. A minimum collapse order of 2 is used as this is the lowest
order supported by anon memory as defined by THP_ORDERS_ALL_ANON.
Currently madv_collapse is not supported and will only attempt PMD
collapse.
We can also remove the check for is_khugepaged inside the PMD scan as
the collapse_max_ptes_none() function handles this logic now.
Signed-off-by: Nico Pache <npache@redhat.com>
---
mm/khugepaged.c | 182 +++++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 174 insertions(+), 8 deletions(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 3492b135d667..39bf7ea8a6e8 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -100,6 +100,30 @@ static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
static struct kmem_cache *mm_slot_cache __ro_after_init;
+#define KHUGEPAGED_MIN_MTHP_ORDER 2
+/*
+ * mthp_collapse() does an iterative DFS over a binary tree, from
+ * HPAGE_PMD_ORDER down to KHUGEPAGED_MIN_MTHP_ORDER. The max stack
+ * size needed for a DFS on a binary tree is height + 1, where
+ * height = HPAGE_PMD_ORDER - KHUGEPAGED_MIN_MTHP_ORDER.
+ *
+ * ilog2 is used in place of HPAGE_PMD_ORDER because some architectures
+ * (e.g. ppc64le) do not define HPAGE_PMD_ORDER until after build time.
+ */
+#define MTHP_STACK_SIZE (ilog2(MAX_PTRS_PER_PTE) - KHUGEPAGED_MIN_MTHP_ORDER + 1)
+
+/*
+ * Defines a range of PTE entries in a PTE page table which are being
+ * considered for mTHP collapse.
+ *
+ * @offset: the offset of the first PTE entry in a PMD range.
+ * @order: the order of the PTE entries being considered for collapse.
+ */
+struct mthp_range {
+ u16 offset;
+ u8 order;
+};
+
struct collapse_control {
bool is_khugepaged;
@@ -111,6 +135,12 @@ struct collapse_control {
/* nodemask for allocation fallback */
nodemask_t alloc_nmask;
+
+ /* Each bit represents a single occupied (!none/zero) page. */
+ DECLARE_BITMAP(mthp_bitmap, MAX_PTRS_PER_PTE);
+ /* A mask of the current range being considered for mTHP collapse. */
+ DECLARE_BITMAP(mthp_bitmap_mask, MAX_PTRS_PER_PTE);
+ struct mthp_range mthp_bitmap_stack[MTHP_STACK_SIZE];
};
/**
@@ -1404,20 +1434,140 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long s
return result;
}
+static void collapse_mthp_stack_push(struct collapse_control *cc, int *stack_size,
+ u16 offset, u8 order)
+{
+ const int size = *stack_size;
+ struct mthp_range *stack = &cc->mthp_bitmap_stack[size];
+
+ VM_WARN_ON_ONCE(size >= MTHP_STACK_SIZE);
+ stack->order = order;
+ stack->offset = offset;
+ (*stack_size)++;
+}
+
+static struct mthp_range collapse_mthp_stack_pop(struct collapse_control *cc,
+ int *stack_size)
+{
+ const int size = *stack_size;
+
+ VM_WARN_ON_ONCE(size <= 0);
+ (*stack_size)--;
+ return cc->mthp_bitmap_stack[size - 1];
+}
+
+static unsigned int collapse_mthp_count_present(struct collapse_control *cc,
+ u16 offset, unsigned int nr_ptes)
+{
+ bitmap_zero(cc->mthp_bitmap_mask, MAX_PTRS_PER_PTE);
+ bitmap_set(cc->mthp_bitmap_mask, offset, nr_ptes);
+ return bitmap_weight_and(cc->mthp_bitmap, cc->mthp_bitmap_mask, MAX_PTRS_PER_PTE);
+}
+
+/*
+ * mthp_collapse() consumes the bitmap that is generated during
+ * collapse_scan_pmd() to determine what regions and mTHP orders fit best.
+ *
+ * Each bit in cc->mthp_bitmap represents a single occupied (!none/zero) page.
+ * A stack structure cc->mthp_bitmap_stack is used to check different regions
+ * of the bitmap for collapse eligibility. The stack maintains a pair of
+ * variables (offset, order), indicating the number of PTEs from the start of
+ * the PMD, and the order of the potential collapse candidate respectively. We
+ * start at the PMD order and check if it is eligible for collapse; if not, we
+ * add two entries to the stack at a lower order to represent the left and right
+ * halves of the PTE page table we are examining.
+ *
+ * offset mid_offset
+ * | |
+ * | |
+ * v v
+ * --------------------------------------
+ * | cc->mthp_bitmap |
+ * --------------------------------------
+ * <-------><------->
+ * order-1 order-1
+ *
+ * For each of these, we determine how many PTE entries are occupied in the
+ * range of PTE entries we propose to collapse, then we compare this to a
+ * threshold number of PTE entries which would need to be occupied for a
+ * collapse to be permitted at that order (accounting for max_ptes_none).
+ *
+ * If a collapse is permitted, we attempt to collapse the PTE range into a
+ * mTHP.
+ */
+static int mthp_collapse(struct mm_struct *mm, unsigned long address,
+ int referenced, int unmapped, struct collapse_control *cc,
+ unsigned long enabled_orders)
+{
+ unsigned int nr_occupied_ptes, nr_ptes;
+ int max_ptes_none, collapsed = 0, stack_size = 0;
+ unsigned long collapse_address;
+ struct mthp_range range;
+ u16 offset;
+ u8 order;
+
+ collapse_mthp_stack_push(cc, &stack_size, 0, HPAGE_PMD_ORDER);
+
+ while (stack_size) {
+ range = collapse_mthp_stack_pop(cc, &stack_size);
+ order = range.order;
+ offset = range.offset;
+ nr_ptes = 1UL << order;
+
+ if (!test_bit(order, &enabled_orders))
+ goto next_order;
+
+ max_ptes_none = collapse_max_ptes_none(cc, NULL, order);
+
+ if (max_ptes_none < 0)
+ return collapsed;
+
+ nr_occupied_ptes = collapse_mthp_count_present(cc, offset,
+ nr_ptes);
+
+ if (nr_occupied_ptes >= nr_ptes - max_ptes_none) {
+ int ret;
+
+ collapse_address = address + offset * PAGE_SIZE;
+ ret = collapse_huge_page(mm, collapse_address, referenced,
+ unmapped, cc, order);
+ if (ret == SCAN_SUCCEED) {
+ collapsed += nr_ptes;
+ continue;
+ }
+ }
+
+next_order:
+ if (order > KHUGEPAGED_MIN_MTHP_ORDER) {
+ const u8 next_order = order - 1;
+ const u16 mid_offset = offset + (nr_ptes / 2);
+
+ collapse_mthp_stack_push(cc, &stack_size, mid_offset,
+ next_order);
+ collapse_mthp_stack_push(cc, &stack_size, offset,
+ next_order);
+ }
+ }
+ return collapsed;
+}
+
static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long start_addr,
bool *lock_dropped, struct collapse_control *cc)
{
- const int max_ptes_none = collapse_max_ptes_none(cc, vma, HPAGE_PMD_ORDER);
+ int max_ptes_none = collapse_max_ptes_none(cc, vma, HPAGE_PMD_ORDER);
const unsigned int max_ptes_shared = collapse_max_ptes_shared(cc, HPAGE_PMD_ORDER);
const unsigned int max_ptes_swap = collapse_max_ptes_swap(cc, HPAGE_PMD_ORDER);
+ enum tva_type tva_flags = cc->is_khugepaged ? TVA_KHUGEPAGED : TVA_FORCED_COLLAPSE;
pmd_t *pmd;
- pte_t *pte, *_pte;
- int none_or_zero = 0, shared = 0, referenced = 0;
+ pte_t *pte, *_pte, pteval;
+ int i;
+ int none_or_zero = 0, shared = 0, nr_collapsed = 0, referenced = 0;
enum scan_result result = SCAN_FAIL;
struct page *page = NULL;
struct folio *folio = NULL;
unsigned long addr;
+ unsigned long enabled_orders;
spinlock_t *ptl;
int node = NUMA_NO_NODE, unmapped = 0;
@@ -1429,8 +1579,19 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
goto out;
}
+ bitmap_zero(cc->mthp_bitmap, MAX_PTRS_PER_PTE);
memset(cc->node_load, 0, sizeof(cc->node_load));
nodes_clear(cc->alloc_nmask);
+
+ enabled_orders = collapse_allowable_orders(vma, vma->vm_flags, tva_flags);
+
+ /*
+ * If PMD is the only enabled order, enforce max_ptes_none, otherwise
+ * scan all pages to populate the bitmap for mTHP collapse.
+ */
+ if (enabled_orders != BIT(HPAGE_PMD_ORDER))
+ max_ptes_none = KHUGEPAGED_MAX_PTES_LIMIT;
+
pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl);
if (!pte) {
cc->progress++;
@@ -1438,11 +1599,13 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
goto out;
}
- for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR;
- _pte++, addr += PAGE_SIZE) {
+ for (i = 0; i < HPAGE_PMD_NR; i++) {
+ _pte = pte + i;
+ addr = start_addr + i * PAGE_SIZE;
+ pteval = ptep_get(_pte);
+
cc->progress++;
- pte_t pteval = ptep_get(_pte);
if (pte_none_or_zero(pteval)) {
if (++none_or_zero > max_ptes_none) {
result = SCAN_EXCEED_NONE_PTE;
@@ -1522,6 +1685,8 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
}
}
+ /* Set bit for occupied pages */
+ __set_bit(i, cc->mthp_bitmap);
/*
* Record which node the original page is from and save this
* information to cc->node_load[].
@@ -1580,10 +1745,11 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
if (result == SCAN_SUCCEED) {
/* collapse_huge_page expects the lock to be dropped before calling */
mmap_read_unlock(mm);
- result = collapse_huge_page(mm, start_addr, referenced,
- unmapped, cc, HPAGE_PMD_ORDER);
+ nr_collapsed = mthp_collapse(mm, start_addr, referenced, unmapped,
+ cc, enabled_orders);
/* collapse_huge_page will return with the mmap_lock released */
*lock_dropped = true;
+ result = nr_collapsed ? SCAN_SUCCEED : SCAN_FAIL;
}
out:
trace_mm_khugepaged_scan_pmd(mm, folio, referenced,
--
2.54.0
^ permalink raw reply related
* [PATCH mm-unstable v17 10/14] mm/khugepaged: introduce collapse_allowable_orders helper function
From: Nico Pache @ 2026-05-11 18:58 UTC (permalink / raw)
To: linux-doc, linux-kernel, linux-mm, linux-trace-kernel
Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
byungchul, catalin.marinas, cl, corbet, dave.hansen, david,
dev.jain, gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
joshua.hahnjy, kas, lance.yang, liam, ljs, mathieu.desnoyers,
matthew.brost, mhiramat, mhocko, npache, peterx, pfalcato,
rakie.kim, raquini, rdunlap, richard.weiyang, rientjes, rostedt,
rppt, ryan.roberts, shivankg, sunnanyong, surenb,
thomas.hellstrom, tiwai, usamaarif642, vbabka, vishal.moola,
wangkefeng.wang, will, willy, yang, ying.huang, ziy, zokeefe
In-Reply-To: <20260511185817.686831-1-npache@redhat.com>
Add collapse_allowable_orders() to generalize THP order eligibility. The
function determines which THP orders are permitted based on collapse
context (khugepaged vs madv_collapse).
This consolidates collapse configuration logic and provides a clean
interface for future mTHP collapse support where the orders may be
different.
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Signed-off-by: Nico Pache <npache@redhat.com>
---
mm/khugepaged.c | 15 ++++++++++++---
1 file changed, 12 insertions(+), 3 deletions(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index f28066069437..3492b135d667 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -545,12 +545,21 @@ void __khugepaged_enter(struct mm_struct *mm)
wake_up_interruptible(&khugepaged_wait);
}
+/* Check what orders are allowed based on the vma and collapse type */
+static unsigned long collapse_allowable_orders(struct vm_area_struct *vma,
+ vm_flags_t vm_flags, enum tva_type tva_flags)
+{
+ unsigned long orders = BIT(HPAGE_PMD_ORDER);
+
+ return thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders);
+}
+
void khugepaged_enter_vma(struct vm_area_struct *vma,
vm_flags_t vm_flags)
{
if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) &&
hugepage_pmd_enabled()) {
- if (thp_vma_allowable_order(vma, vm_flags, TVA_KHUGEPAGED, PMD_ORDER))
+ if (collapse_allowable_orders(vma, vm_flags, TVA_KHUGEPAGED))
__khugepaged_enter(vma->vm_mm);
}
}
@@ -2673,7 +2682,7 @@ static void collapse_scan_mm_slot(unsigned int progress_max,
cc->progress++;
break;
}
- if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) {
+ if (!collapse_allowable_orders(vma, vma->vm_flags, TVA_KHUGEPAGED)) {
cc->progress++;
continue;
}
@@ -2983,7 +2992,7 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
BUG_ON(vma->vm_start > start);
BUG_ON(vma->vm_end < end);
- if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER))
+ if (!collapse_allowable_orders(vma, vma->vm_flags, TVA_FORCED_COLLAPSE))
return -EINVAL;
cc = kmalloc_obj(*cc);
--
2.54.0
^ permalink raw reply related
* [PATCH mm-unstable v17 09/14] mm/khugepaged: improve tracepoints for mTHP orders
From: Nico Pache @ 2026-05-11 18:58 UTC (permalink / raw)
To: linux-doc, linux-kernel, linux-mm, linux-trace-kernel
Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
byungchul, catalin.marinas, cl, corbet, dave.hansen, david,
dev.jain, gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
joshua.hahnjy, kas, lance.yang, liam, ljs, mathieu.desnoyers,
matthew.brost, mhiramat, mhocko, npache, peterx, pfalcato,
rakie.kim, raquini, rdunlap, richard.weiyang, rientjes, rostedt,
rppt, ryan.roberts, shivankg, sunnanyong, surenb,
thomas.hellstrom, tiwai, usamaarif642, vbabka, vishal.moola,
wangkefeng.wang, will, willy, yang, ying.huang, ziy, zokeefe
In-Reply-To: <20260511185817.686831-1-npache@redhat.com>
Add the order to the mm_collapse_huge_page<_swapin,_isolate> tracepoints to
give better insight into which order is being operated on.
Reviewed-by: Lorenzo Stoakes <ljs@kernel.org>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Acked-by: David Hildenbrand (Arm) <david@kernel.org>
Signed-off-by: Nico Pache <npache@redhat.com>
---
include/trace/events/huge_memory.h | 34 +++++++++++++++++++-----------
mm/khugepaged.c | 9 ++++----
2 files changed, 27 insertions(+), 16 deletions(-)
diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
index 443e0bd13fdb..70c25136e7e8 100644
--- a/include/trace/events/huge_memory.h
+++ b/include/trace/events/huge_memory.h
@@ -90,40 +90,44 @@ TRACE_EVENT(mm_khugepaged_scan_pmd,
TRACE_EVENT(mm_collapse_huge_page,
- TP_PROTO(struct mm_struct *mm, int isolated, int status),
+ TP_PROTO(struct mm_struct *mm, int isolated, int status, unsigned int order),
- TP_ARGS(mm, isolated, status),
+ TP_ARGS(mm, isolated, status, order),
TP_STRUCT__entry(
__field(struct mm_struct *, mm)
__field(int, isolated)
__field(int, status)
+ __field(unsigned int, order)
),
TP_fast_assign(
__entry->mm = mm;
__entry->isolated = isolated;
__entry->status = status;
+ __entry->order = order;
),
- TP_printk("mm=%p, isolated=%d, status=%s",
+ TP_printk("mm=%p, isolated=%d, status=%s, order=%u",
__entry->mm,
__entry->isolated,
- __print_symbolic(__entry->status, SCAN_STATUS))
+ __print_symbolic(__entry->status, SCAN_STATUS),
+ __entry->order)
);
TRACE_EVENT(mm_collapse_huge_page_isolate,
TP_PROTO(struct folio *folio, int none_or_zero,
- int referenced, int status),
+ int referenced, int status, unsigned int order),
- TP_ARGS(folio, none_or_zero, referenced, status),
+ TP_ARGS(folio, none_or_zero, referenced, status, order),
TP_STRUCT__entry(
__field(unsigned long, pfn)
__field(int, none_or_zero)
__field(int, referenced)
__field(int, status)
+ __field(unsigned int, order)
),
TP_fast_assign(
@@ -131,26 +135,30 @@ TRACE_EVENT(mm_collapse_huge_page_isolate,
__entry->none_or_zero = none_or_zero;
__entry->referenced = referenced;
__entry->status = status;
+ __entry->order = order;
),
- TP_printk("scan_pfn=0x%lx, none_or_zero=%d, referenced=%d, status=%s",
+ TP_printk("scan_pfn=0x%lx, none_or_zero=%d, referenced=%d, status=%s, order=%u",
__entry->pfn,
__entry->none_or_zero,
__entry->referenced,
- __print_symbolic(__entry->status, SCAN_STATUS))
+ __print_symbolic(__entry->status, SCAN_STATUS),
+ __entry->order)
);
TRACE_EVENT(mm_collapse_huge_page_swapin,
- TP_PROTO(struct mm_struct *mm, int swapped_in, int referenced, int ret),
+ TP_PROTO(struct mm_struct *mm, int swapped_in, int referenced, int ret,
+ unsigned int order),
- TP_ARGS(mm, swapped_in, referenced, ret),
+ TP_ARGS(mm, swapped_in, referenced, ret, order),
TP_STRUCT__entry(
__field(struct mm_struct *, mm)
__field(int, swapped_in)
__field(int, referenced)
__field(int, ret)
+ __field(unsigned int, order)
),
TP_fast_assign(
@@ -158,13 +166,15 @@ TRACE_EVENT(mm_collapse_huge_page_swapin,
__entry->swapped_in = swapped_in;
__entry->referenced = referenced;
__entry->ret = ret;
+ __entry->order = order;
),
- TP_printk("mm=%p, swapped_in=%d, referenced=%d, ret=%d",
+ TP_printk("mm=%p, swapped_in=%d, referenced=%d, ret=%d, order=%u",
__entry->mm,
__entry->swapped_in,
__entry->referenced,
- __entry->ret)
+ __entry->ret,
+ __entry->order)
);
TRACE_EVENT(mm_khugepaged_scan_file,
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 27654ea3f5ca..f28066069437 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -779,13 +779,13 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
} else {
result = SCAN_SUCCEED;
trace_mm_collapse_huge_page_isolate(folio, none_or_zero,
- referenced, result);
+ referenced, result, order);
return result;
}
out:
release_pte_pages(pte, _pte, compound_pagelist);
trace_mm_collapse_huge_page_isolate(folio, none_or_zero,
- referenced, result);
+ referenced, result, order);
return result;
}
@@ -1181,7 +1181,8 @@ static enum scan_result __collapse_huge_page_swapin(struct mm_struct *mm,
result = SCAN_SUCCEED;
out:
- trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, result);
+ trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, result,
+ order);
return result;
}
@@ -1390,7 +1391,7 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long s
out_nolock:
if (folio)
folio_put(folio);
- trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
+ trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result, order);
return result;
}
--
2.54.0
^ permalink raw reply related
* [PATCH mm-unstable v17 08/14] mm/khugepaged: add per-order mTHP collapse failure statistics
From: Nico Pache @ 2026-05-11 18:58 UTC (permalink / raw)
To: linux-doc, linux-kernel, linux-mm, linux-trace-kernel
Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
byungchul, catalin.marinas, cl, corbet, dave.hansen, david,
dev.jain, gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
joshua.hahnjy, kas, lance.yang, liam, ljs, mathieu.desnoyers,
matthew.brost, mhiramat, mhocko, npache, peterx, pfalcato,
rakie.kim, raquini, rdunlap, richard.weiyang, rientjes, rostedt,
rppt, ryan.roberts, shivankg, sunnanyong, surenb,
thomas.hellstrom, tiwai, usamaarif642, vbabka, vishal.moola,
wangkefeng.wang, will, willy, yang, ying.huang, ziy, zokeefe
In-Reply-To: <20260511185817.686831-1-npache@redhat.com>
Add three new mTHP statistics to track collapse failures for different
orders when encountering swap PTEs, excessive none PTEs, and shared PTEs:
- collapse_exceed_swap_pte: Counts when mTHP collapse fails due to
encountering a swap PTE.
- collapse_exceed_none_pte: Counts when mTHP collapse fails due to
exceeding the none PTE threshold for the given order
- collapse_exceed_shared_pte: Counts when mTHP collapse fails due to
encountering a shared PTE.
These statistics complement the existing THP_SCAN_EXCEED_* events by
providing per-order granularity for mTHP collapse attempts. The stats are
exposed via sysfs under
`/sys/kernel/mm/transparent_hugepage/hugepages-*/stats/` for each
supported hugepage size.
As we currently do not support collapsing mTHPs that contain a swap or
shared entry, those statistics keep track of how often we are
encountering failed mTHP collapses due to these restrictions.
We will add support for mTHP collapse for anonymous pages next; let's also
track when this happens at the PMD level within the per-mTHP stats.
Signed-off-by: Nico Pache <npache@redhat.com>
---
Documentation/admin-guide/mm/transhuge.rst | 14 ++++++++++++++
include/linux/huge_mm.h | 3 +++
mm/huge_memory.c | 7 +++++++
mm/khugepaged.c | 21 +++++++++++++++++++--
4 files changed, 43 insertions(+), 2 deletions(-)
diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index c51932e6275d..80a4d0bed70b 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -714,6 +714,20 @@ nr_anon_partially_mapped
an anonymous THP as "partially mapped" and count it here, even though it
is not actually partially mapped anymore.
+collapse_exceed_none_pte
+ The number of collapse attempts that failed due to exceeding the
+ max_ptes_none threshold.
+
+collapse_exceed_swap_pte
+ The number of collapse attempts that failed due to exceeding the
+ max_ptes_swap threshold. For non-PMD orders this occurs if a mTHP range
+ contains at least one swap PTE.
+
+collapse_exceed_shared_pte
+ The number of collapse attempts that failed due to exceeding the
+ max_ptes_shared threshold. For non-PMD orders this occurs if a mTHP range
+ contains at least one shared PTE.
+
As the system ages, allocating huge pages may be expensive as the
system uses memory compaction to copy data around memory to free a
huge page for use. There are some counters in ``/proc/vmstat`` to help
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index ba7ae6808544..48496f09909b 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -144,6 +144,9 @@ enum mthp_stat_item {
MTHP_STAT_SPLIT_DEFERRED,
MTHP_STAT_NR_ANON,
MTHP_STAT_NR_ANON_PARTIALLY_MAPPED,
+ MTHP_STAT_COLLAPSE_EXCEED_SWAP,
+ MTHP_STAT_COLLAPSE_EXCEED_NONE,
+ MTHP_STAT_COLLAPSE_EXCEED_SHARED,
__MTHP_STAT_COUNT
};
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 05f482a72a89..3e9eabc74c6c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -717,6 +717,10 @@ DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED);
DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED);
DEFINE_MTHP_STAT_ATTR(nr_anon, MTHP_STAT_NR_ANON);
DEFINE_MTHP_STAT_ATTR(nr_anon_partially_mapped, MTHP_STAT_NR_ANON_PARTIALLY_MAPPED);
+DEFINE_MTHP_STAT_ATTR(collapse_exceed_swap_pte, MTHP_STAT_COLLAPSE_EXCEED_SWAP);
+DEFINE_MTHP_STAT_ATTR(collapse_exceed_none_pte, MTHP_STAT_COLLAPSE_EXCEED_NONE);
+DEFINE_MTHP_STAT_ATTR(collapse_exceed_shared_pte, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
+
static struct attribute *anon_stats_attrs[] = {
&anon_fault_alloc_attr.attr,
@@ -733,6 +737,9 @@ static struct attribute *anon_stats_attrs[] = {
&split_deferred_attr.attr,
&nr_anon_attr.attr,
&nr_anon_partially_mapped_attr.attr,
+ &collapse_exceed_swap_pte_attr.attr,
+ &collapse_exceed_none_pte_attr.attr,
+ &collapse_exceed_shared_pte_attr.attr,
NULL,
};
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index ba21b134fc86..27654ea3f5ca 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -645,7 +645,9 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
if (pte_none_or_zero(pteval)) {
if (++none_or_zero > max_ptes_none) {
result = SCAN_EXCEED_NONE_PTE;
- count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
+ if (is_pmd_order(order))
+ count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
+ count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_NONE);
goto out;
}
continue;
@@ -679,9 +681,17 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
/* See collapse_scan_pmd(). */
if (folio_maybe_mapped_shared(folio)) {
+ /*
+ * TODO: Support shared pages without leading to further
+ * mTHP collapses. Currently bringing in new pages via
+ * shared may cause a future higher order collapse on a
+ * rescan of the same range.
+ */
if (++shared > max_ptes_shared) {
result = SCAN_EXCEED_SHARED_PTE;
- count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
+ if (is_pmd_order(order))
+ count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
+ count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
goto out;
}
}
@@ -1130,6 +1140,7 @@ static enum scan_result __collapse_huge_page_swapin(struct mm_struct *mm,
* range.
*/
if (!is_pmd_order(order)) {
+ count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SWAP);
pte_unmap(pte);
mmap_read_unlock(mm);
result = SCAN_EXCEED_SWAP_PTE;
@@ -1426,6 +1437,8 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
if (++none_or_zero > max_ptes_none) {
result = SCAN_EXCEED_NONE_PTE;
count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
+ count_mthp_stat(HPAGE_PMD_ORDER,
+ MTHP_STAT_COLLAPSE_EXCEED_NONE);
goto out_unmap;
}
continue;
@@ -1434,6 +1447,8 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
if (++unmapped > max_ptes_swap) {
result = SCAN_EXCEED_SWAP_PTE;
count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
+ count_mthp_stat(HPAGE_PMD_ORDER,
+ MTHP_STAT_COLLAPSE_EXCEED_SWAP);
goto out_unmap;
}
/*
@@ -1491,6 +1506,8 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
if (++shared > max_ptes_shared) {
result = SCAN_EXCEED_SHARED_PTE;
count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
+ count_mthp_stat(HPAGE_PMD_ORDER,
+ MTHP_STAT_COLLAPSE_EXCEED_SHARED);
goto out_unmap;
}
}
--
2.54.0
^ permalink raw reply related
* [PATCH mm-unstable v17 07/14] mm/khugepaged: skip collapsing mTHP to smaller orders
From: Nico Pache @ 2026-05-11 18:58 UTC (permalink / raw)
To: linux-doc, linux-kernel, linux-mm, linux-trace-kernel
Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
byungchul, catalin.marinas, cl, corbet, dave.hansen, david,
dev.jain, gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
joshua.hahnjy, kas, lance.yang, liam, ljs, mathieu.desnoyers,
matthew.brost, mhiramat, mhocko, npache, peterx, pfalcato,
rakie.kim, raquini, rdunlap, richard.weiyang, rientjes, rostedt,
rppt, ryan.roberts, shivankg, sunnanyong, surenb,
thomas.hellstrom, tiwai, usamaarif642, vbabka, vishal.moola,
wangkefeng.wang, will, willy, yang, ying.huang, ziy, zokeefe,
Usama Arif
In-Reply-To: <20260511185817.686831-1-npache@redhat.com>
khugepaged may try to collapse a mTHP to a smaller mTHP, resulting in
some pages being unmapped. Skip these cases until we have a way to check
if it's OK to collapse to a smaller mTHP size (like in the case of a
partially mapped folio). This check is also not done during the scan phase
as the current collapse order is unknown at that time.
This patch is inspired by Dev Jain's work on khugepaged mTHP support [1].
[1] https://lore.kernel.org/lkml/20241216165105.56185-11-dev.jain@arm.com/
Reviewed-by: Lorenzo Stoakes <ljs@kernel.org>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Acked-by: Usama Arif <usama.arif@linux.dev>
Co-developed-by: Dev Jain <dev.jain@arm.com>
Signed-off-by: Dev Jain <dev.jain@arm.com>
Signed-off-by: Nico Pache <npache@redhat.com>
---
mm/khugepaged.c | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index f49bef78cf51..ba21b134fc86 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -685,6 +685,14 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
goto out;
}
}
+ /*
+ * TODO: In some cases of partially-mapped folios, we'd actually
+ * want to collapse.
+ */
+ if (!is_pmd_order(order) && folio_order(folio) >= order) {
+ result = SCAN_PTE_MAPPED_HUGEPAGE;
+ goto out;
+ }
if (folio_test_large(folio)) {
struct folio *f;
--
2.54.0
^ permalink raw reply related
* [PATCH mm-unstable v17 06/14] mm/khugepaged: generalize collapse_huge_page for mTHP collapse
From: Nico Pache @ 2026-05-11 18:58 UTC (permalink / raw)
To: linux-doc, linux-kernel, linux-mm, linux-trace-kernel
Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
byungchul, catalin.marinas, cl, corbet, dave.hansen, david,
dev.jain, gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
joshua.hahnjy, kas, lance.yang, liam, ljs, mathieu.desnoyers,
matthew.brost, mhiramat, mhocko, npache, peterx, pfalcato,
rakie.kim, raquini, rdunlap, richard.weiyang, rientjes, rostedt,
rppt, ryan.roberts, shivankg, sunnanyong, surenb,
thomas.hellstrom, tiwai, usamaarif642, vbabka, vishal.moola,
wangkefeng.wang, will, willy, yang, ying.huang, ziy, zokeefe
In-Reply-To: <20260511185817.686831-1-npache@redhat.com>
Pass an order and offset to collapse_huge_page to support collapsing anon
memory to arbitrary orders within a PMD. order indicates what mTHP size we
are attempting to collapse to, and offset indicates where in the PMD to
start the collapse attempt.
For non-PMD collapse we must leave the anon VMA write locked until after
we collapse the mTHP-- in the PMD case all the pages are isolated, but in
the mTHP case this is not true, and we must keep the lock to prevent
access/changes to the page tables. This can happen if the rmap walkers hit
a pmd_none while the PMD entry is currently unavailable due to being
temporarily removed during the collapse phase.
Signed-off-by: Nico Pache <npache@redhat.com>
---
mm/khugepaged.c | 93 +++++++++++++++++++++++++++++--------------------
1 file changed, 55 insertions(+), 38 deletions(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 37a5f6791816..f49bef78cf51 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1207,34 +1207,36 @@ static enum scan_result alloc_charge_folio(struct folio **foliop, struct mm_stru
* the mmap_lock during that. We must recheck the vma after taking it again in
* write mode.
*/
-static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long address,
- int referenced, int unmapped, struct collapse_control *cc)
+static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long start_addr,
+ int referenced, int unmapped, struct collapse_control *cc,
+ unsigned int order)
{
+ const unsigned long pmd_addr = start_addr & HPAGE_PMD_MASK;
+ const unsigned long end_addr = start_addr + (PAGE_SIZE << order);
LIST_HEAD(compound_pagelist);
pmd_t *pmd, _pmd;
- pte_t *pte;
+ pte_t *pte = NULL;
pgtable_t pgtable;
struct folio *folio;
spinlock_t *pmd_ptl, *pte_ptl;
enum scan_result result = SCAN_FAIL;
struct vm_area_struct *vma;
struct mmu_notifier_range range;
+ bool anon_vma_locked = false;
- VM_BUG_ON(address & ~HPAGE_PMD_MASK);
-
- result = alloc_charge_folio(&folio, mm, cc, HPAGE_PMD_ORDER);
+ result = alloc_charge_folio(&folio, mm, cc, order);
if (result != SCAN_SUCCEED)
goto out_nolock;
mmap_read_lock(mm);
- result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
- HPAGE_PMD_ORDER);
+ result = hugepage_vma_revalidate(mm, pmd_addr, /*expect_anon=*/ true,
+ &vma, cc, order);
if (result != SCAN_SUCCEED) {
mmap_read_unlock(mm);
goto out_nolock;
}
- result = find_pmd_or_thp_or_none(mm, address, &pmd);
+ result = find_pmd_or_thp_or_none(mm, pmd_addr, &pmd);
if (result != SCAN_SUCCEED) {
mmap_read_unlock(mm);
goto out_nolock;
@@ -1246,8 +1248,8 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
* released when it fails. So we jump out_nolock directly in
* that case. Continuing to collapse causes inconsistency.
*/
- result = __collapse_huge_page_swapin(mm, vma, address, pmd,
- referenced, HPAGE_PMD_ORDER);
+ result = __collapse_huge_page_swapin(mm, vma, start_addr, pmd,
+ referenced, order);
if (result != SCAN_SUCCEED)
goto out_nolock;
}
@@ -1262,20 +1264,21 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
* mmap_lock.
*/
mmap_write_lock(mm);
- result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
- HPAGE_PMD_ORDER);
+ result = hugepage_vma_revalidate(mm, pmd_addr, /*expect_anon=*/ true,
+ &vma, cc, order);
if (result != SCAN_SUCCEED)
goto out_up_write;
/* check if the pmd is still valid */
vma_start_write(vma);
- result = check_pmd_still_valid(mm, address, pmd);
+ result = check_pmd_still_valid(mm, pmd_addr, pmd);
if (result != SCAN_SUCCEED)
goto out_up_write;
anon_vma_lock_write(vma->anon_vma);
+ anon_vma_locked = true;
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
- address + HPAGE_PMD_SIZE);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, start_addr,
+ end_addr);
mmu_notifier_invalidate_range_start(&range);
pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
@@ -1287,26 +1290,23 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
* Parallel GUP-fast is fine since GUP-fast will back off when
* it detects PMD is changed.
*/
- _pmd = pmdp_collapse_flush(vma, address, pmd);
+ _pmd = pmdp_collapse_flush(vma, pmd_addr, pmd);
spin_unlock(pmd_ptl);
mmu_notifier_invalidate_range_end(&range);
tlb_remove_table_sync_one();
- pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
+ pte = pte_offset_map_lock(mm, &_pmd, start_addr, &pte_ptl);
if (pte) {
- result = __collapse_huge_page_isolate(vma, address, pte, cc,
- HPAGE_PMD_ORDER,
- &compound_pagelist);
+ result = __collapse_huge_page_isolate(vma, start_addr, pte, cc,
+ order, &compound_pagelist);
spin_unlock(pte_ptl);
} else {
result = SCAN_NO_PTE_TABLE;
}
if (unlikely(result != SCAN_SUCCEED)) {
- if (pte)
- pte_unmap(pte);
spin_lock(pmd_ptl);
- BUG_ON(!pmd_none(*pmd));
+ WARN_ON_ONCE(!pmd_none(*pmd));
/*
* We can only use set_pmd_at when establishing
* hugepmds and never for establishing regular pmds that
@@ -1314,21 +1314,24 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
*/
pmd_populate(mm, pmd, pmd_pgtable(_pmd));
spin_unlock(pmd_ptl);
- anon_vma_unlock_write(vma->anon_vma);
goto out_up_write;
}
/*
- * All pages are isolated and locked so anon_vma rmap
- * can't run anymore.
+ * For PMD collapse all pages are isolated and locked so anon_vma
+ * rmap can't run anymore. For mTHP collapse the PMD entry has been
+ * removed and not all pages are isolated and locked, so we must hold
+ * the lock to prevent neighboring folios from attempting to access
+ * this PMD until its reinstalled.
*/
- anon_vma_unlock_write(vma->anon_vma);
+ if (is_pmd_order(order)) {
+ anon_vma_unlock_write(vma->anon_vma);
+ anon_vma_locked = false;
+ }
result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
- vma, address, pte_ptl,
- HPAGE_PMD_ORDER,
- &compound_pagelist);
- pte_unmap(pte);
+ vma, start_addr, pte_ptl,
+ order, &compound_pagelist);
if (unlikely(result != SCAN_SUCCEED))
goto out_up_write;
@@ -1338,18 +1341,32 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
* write.
*/
__folio_mark_uptodate(folio);
- pgtable = pmd_pgtable(_pmd);
-
spin_lock(pmd_ptl);
- BUG_ON(!pmd_none(*pmd));
- pgtable_trans_huge_deposit(mm, pmd, pgtable);
- map_anon_folio_pmd_nopf(folio, pmd, vma, address);
+ WARN_ON_ONCE(!pmd_none(*pmd));
+ if (is_pmd_order(order)) {
+ pgtable = pmd_pgtable(_pmd);
+ pgtable_trans_huge_deposit(mm, pmd, pgtable);
+ map_anon_folio_pmd_nopf(folio, pmd, vma, pmd_addr);
+ } else {
+ /*
+ * set_ptes is called in map_anon_folio_pte_nopf with the
+ * pmd_ptl lock still held; this is safe as the PMD is expected
+ * to be none. The pmd entry is then repopulated below.
+ */
+ map_anon_folio_pte_nopf(folio, pte, vma, start_addr, /*uffd_wp=*/ false);
+ smp_wmb(); /* make PTEs visible before PMD. See pmd_install() */
+ pmd_populate(mm, pmd, pmd_pgtable(_pmd));
+ }
spin_unlock(pmd_ptl);
folio = NULL;
result = SCAN_SUCCEED;
out_up_write:
+ if (anon_vma_locked)
+ anon_vma_unlock_write(vma->anon_vma);
+ if (pte)
+ pte_unmap(pte);
mmap_write_unlock(mm);
out_nolock:
if (folio)
@@ -1529,7 +1546,7 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
/* collapse_huge_page expects the lock to be dropped before calling */
mmap_read_unlock(mm);
result = collapse_huge_page(mm, start_addr, referenced,
- unmapped, cc);
+ unmapped, cc, HPAGE_PMD_ORDER);
/* collapse_huge_page will return with the mmap_lock released */
*lock_dropped = true;
}
--
2.54.0
^ permalink raw reply related
* [PATCH mm-unstable v17 05/14] mm/khugepaged: require collapse_huge_page to enter/exit with the lock dropped
From: Nico Pache @ 2026-05-11 18:58 UTC (permalink / raw)
To: linux-doc, linux-kernel, linux-mm, linux-trace-kernel
Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
byungchul, catalin.marinas, cl, corbet, dave.hansen, david,
dev.jain, gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
joshua.hahnjy, kas, lance.yang, liam, ljs, mathieu.desnoyers,
matthew.brost, mhiramat, mhocko, npache, peterx, pfalcato,
rakie.kim, raquini, rdunlap, richard.weiyang, rientjes, rostedt,
rppt, ryan.roberts, shivankg, sunnanyong, surenb,
thomas.hellstrom, tiwai, usamaarif642, vbabka, vishal.moola,
wangkefeng.wang, will, willy, yang, ying.huang, ziy, zokeefe
In-Reply-To: <20260511185817.686831-1-npache@redhat.com>
Currently the collapse_huge_page function must be entered with the
mmap_read_lock held, and exits with it dropped. This patch moves the
unlock into its caller, and changes the semantics so that the function
is always entered and exited with the lock dropped.
Future patches rely on this expectation: in mTHP collapse we may
already have dropped the lock, and we do not want to conditionally
check for this by passing through the lock_dropped variable.
No functional change is expected as one of the first things the
collapse_huge_page function does is drop this lock before allocating the
hugepage.
Signed-off-by: Nico Pache <npache@redhat.com>
---
mm/khugepaged.c | 18 ++++++++++--------
1 file changed, 10 insertions(+), 8 deletions(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 27465161fa6d..37a5f6791816 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1199,6 +1199,14 @@ static enum scan_result alloc_charge_folio(struct folio **foliop, struct mm_stru
return SCAN_SUCCEED;
}
+/*
+ * collapse_huge_page expects the mmap_read_lock to be dropped before
+ * entering this function. The function will also always return with the lock
+ * dropped. The function starts by allocation a folio, which can potentially
+ * take a long time if it involves sync compaction, and we do not need to hold
+ * the mmap_lock during that. We must recheck the vma after taking it again in
+ * write mode.
+ */
static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long address,
int referenced, int unmapped, struct collapse_control *cc)
{
@@ -1214,14 +1222,6 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
- /*
- * Before allocating the hugepage, release the mmap_lock read lock.
- * The allocation can take potentially a long time if it involves
- * sync compaction, and we do not need to hold the mmap_lock during
- * that. We will recheck the vma after taking it again in write mode.
- */
- mmap_read_unlock(mm);
-
result = alloc_charge_folio(&folio, mm, cc, HPAGE_PMD_ORDER);
if (result != SCAN_SUCCEED)
goto out_nolock;
@@ -1526,6 +1526,8 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
out_unmap:
pte_unmap_unlock(pte, ptl);
if (result == SCAN_SUCCEED) {
+ /* collapse_huge_page expects the lock to be dropped before calling */
+ mmap_read_unlock(mm);
result = collapse_huge_page(mm, start_addr, referenced,
unmapped, cc);
/* collapse_huge_page will return with the mmap_lock released */
--
2.54.0
^ permalink raw reply related
* [PATCH mm-unstable v17 04/14] mm/khugepaged: generalize __collapse_huge_page_* for mTHP support
From: Nico Pache @ 2026-05-11 18:58 UTC (permalink / raw)
To: linux-doc, linux-kernel, linux-mm, linux-trace-kernel
Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
byungchul, catalin.marinas, cl, corbet, dave.hansen, david,
dev.jain, gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
joshua.hahnjy, kas, lance.yang, liam, ljs, mathieu.desnoyers,
matthew.brost, mhiramat, mhocko, npache, peterx, pfalcato,
rakie.kim, raquini, rdunlap, richard.weiyang, rientjes, rostedt,
rppt, ryan.roberts, shivankg, sunnanyong, surenb,
thomas.hellstrom, tiwai, usamaarif642, vbabka, vishal.moola,
wangkefeng.wang, will, willy, yang, ying.huang, ziy, zokeefe
In-Reply-To: <20260511185817.686831-1-npache@redhat.com>
Generalize the order of the __collapse_huge_page_* and collapse_max_*
functions to support future mTHP collapse.
The current mechanism for determining collapse with the
khugepaged_max_ptes_none value is not designed with mTHP in mind. This
raises a key design issue: if we support user defined max_ptes_none values
(even those scaled by order), a collapse of a lower order can introduce
a feedback loop, or "creep", when max_ptes_none is set to a value greater
than HPAGE_PMD_NR / 2. [1]
With this configuration, a successful collapse to order N will populate
enough pages to satisfy the collapse condition on order N+1 on the next
scan. This leads to unnecessary work and memory churn.
To fix this issue introduce a helper function that will limit mTHP
collapse support to two max_ptes_none values, 0 and HPAGE_PMD_NR - 1.
This effectively supports two modes: [2]
- max_ptes_none=0: never collapses if it encounters an empty PTE or a PTE
that maps the shared zeropage. Consequently, no memory bloat.
- max_ptes_none=511 (on 4k pagesz): Always collapse to the highest
available mTHP order.
This removes the possibility of "creep", while not modifying any uAPI
expectations. A warning will be emitted if any non-supported
max_ptes_none value is configured with mTHP enabled.
mTHP collapse will not honor the khugepaged_max_ptes_shared or
khugepaged_max_ptes_swap parameters, and will fail if it encounters a
shared or swapped entry.
No functional changes in this patch; however it defines future behavior
for mTHP collapse.
[1] - https://lore.kernel.org/all/e46ab3ab-a3d7-4fb7-9970-d0704bd5d05a@arm.com
[2] - https://lore.kernel.org/all/37375ace-5601-4d6c-9dac-d1c8268698e9@redhat.com
Co-developed-by: Dev Jain <dev.jain@arm.com>
Signed-off-by: Dev Jain <dev.jain@arm.com>
Signed-off-by: Nico Pache <npache@redhat.com>
---
include/trace/events/huge_memory.h | 3 +-
mm/khugepaged.c | 117 ++++++++++++++++++++---------
2 files changed, 85 insertions(+), 35 deletions(-)
diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
index bcdc57eea270..443e0bd13fdb 100644
--- a/include/trace/events/huge_memory.h
+++ b/include/trace/events/huge_memory.h
@@ -39,7 +39,8 @@
EM( SCAN_STORE_FAILED, "store_failed") \
EM( SCAN_COPY_MC, "copy_poisoned_page") \
EM( SCAN_PAGE_FILLED, "page_filled") \
- EMe(SCAN_PAGE_DIRTY_OR_WRITEBACK, "page_dirty_or_writeback")
+ EM(SCAN_PAGE_DIRTY_OR_WRITEBACK, "page_dirty_or_writeback") \
+ EMe(SCAN_INVALID_PTES_NONE, "invalid_ptes_none")
#undef EM
#undef EMe
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index f68853b3caa7..27465161fa6d 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -61,6 +61,7 @@ enum scan_result {
SCAN_COPY_MC,
SCAN_PAGE_FILLED,
SCAN_PAGE_DIRTY_OR_WRITEBACK,
+ SCAN_INVALID_PTES_NONE,
};
#define CREATE_TRACE_POINTS
@@ -353,37 +354,60 @@ static bool pte_none_or_zero(pte_t pte)
* PTEs for the given collapse operation.
* @cc: The collapse control struct
* @vma: The vma to check for userfaultfd
+ * @order: The folio order being collapsed to
*
* Return: Maximum number of none-page or zero-page PTEs allowed for the
* collapse operation.
*/
-static unsigned int collapse_max_ptes_none(struct collapse_control *cc,
- struct vm_area_struct *vma)
+static int collapse_max_ptes_none(struct collapse_control *cc,
+ struct vm_area_struct *vma, unsigned int order)
{
+ unsigned int max_ptes_none = khugepaged_max_ptes_none;
// If the vma is userfaultfd-armed, allow no none-page or zero-page PTEs.
if (vma && userfaultfd_armed(vma))
return 0;
// for MADV_COLLAPSE, allow any none-page or zero-page PTEs.
if (!cc->is_khugepaged)
return HPAGE_PMD_NR;
- // For all other cases repect the user defined maximum.
- return khugepaged_max_ptes_none;
+ // for PMD collapse, respect the user defined maximum.
+ if (is_pmd_order(order))
+ return max_ptes_none;
+ /* Zero/non-present collapse disabled. */
+ if (!max_ptes_none)
+ return 0;
+ // for mTHP collapse with the sysctl value set to KHUGEPAGED_MAX_PTES_LIMIT,
+ // scale the maximum number of PTEs to the order of the collapse.
+ if (max_ptes_none == KHUGEPAGED_MAX_PTES_LIMIT)
+ return (1 << order) - 1;
+
+ // We currently only support max_ptes_none values of 0 or KHUGEPAGED_MAX_PTES_LIMIT.
+ // Emit a warning and return -EINVAL.
+ pr_warn_once("mTHP collapse only supports max_ptes_none values of 0 or %u\n",
+ KHUGEPAGED_MAX_PTES_LIMIT);
+ return -EINVAL;
}
/**
* collapse_max_ptes_shared - Calculate maximum allowed PTEs that map shared
* anonymous pages for the given collapse operation.
* @cc: The collapse control struct
+ * @order: The folio order being collapsed to
*
* Return: Maximum number of PTEs that map shared anonymous pages for the
* collapse operation
*/
-static unsigned int collapse_max_ptes_shared(struct collapse_control *cc)
+static unsigned int collapse_max_ptes_shared(struct collapse_control *cc,
+ unsigned int order)
{
// for MADV_COLLAPSE, do not restrict the number of PTEs that map shared
// anonymous pages.
if (!cc->is_khugepaged)
return HPAGE_PMD_NR;
+ // for mTHP collapse do not allow collapsing anonymous memory pages that
+ // are shared between processes.
+ if (!is_pmd_order(order))
+ return 0;
+ // for PMD collapse, respect the user defined maximum.
return khugepaged_max_ptes_shared;
}
@@ -391,16 +415,22 @@ static unsigned int collapse_max_ptes_shared(struct collapse_control *cc)
* collapse_max_ptes_swap - Calculate the maximum allowed non-present PTEs or the
* maximum allowed non-present pagecache entries for the given collapse operation.
* @cc: The collapse control struct
+ * @order: The folio order being collapsed to
*
* Return: Maximum number of non-present PTEs or the maximum allowed non-present
* pagecache entries for the collapse operation.
*/
-static unsigned int collapse_max_ptes_swap(struct collapse_control *cc)
+static unsigned int collapse_max_ptes_swap(struct collapse_control *cc,
+ unsigned int order)
{
// for MADV_COLLAPSE, do not restrict the number PTEs entries or
// pagecache entries that are non-present.
if (!cc->is_khugepaged)
return HPAGE_PMD_NR;
+ // for mTHP collapse do not allow any non-present PTEs or pagecache entries.
+ if (!is_pmd_order(order))
+ return 0;
+ // for PMD collapse, respect the user defined maximum.
return khugepaged_max_ptes_swap;
}
@@ -594,18 +624,22 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte,
static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
unsigned long start_addr, pte_t *pte, struct collapse_control *cc,
- struct list_head *compound_pagelist)
+ unsigned int order, struct list_head *compound_pagelist)
{
+ const unsigned long nr_pages = 1UL << order;
struct page *page = NULL;
struct folio *folio = NULL;
unsigned long addr = start_addr;
pte_t *_pte;
int none_or_zero = 0, shared = 0, referenced = 0;
enum scan_result result = SCAN_FAIL;
- unsigned int max_ptes_none = collapse_max_ptes_none(cc, vma);
- unsigned int max_ptes_shared = collapse_max_ptes_shared(cc);
+ int max_ptes_none = collapse_max_ptes_none(cc, vma, order);
+ unsigned int max_ptes_shared = collapse_max_ptes_shared(cc, order);
+
+ if (max_ptes_none < 0)
+ return SCAN_INVALID_PTES_NONE;
- for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
+ for (_pte = pte; _pte < pte + nr_pages;
_pte++, addr += PAGE_SIZE) {
pte_t pteval = ptep_get(_pte);
if (pte_none_or_zero(pteval)) {
@@ -738,18 +772,18 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
}
static void __collapse_huge_page_copy_succeeded(pte_t *pte,
- struct vm_area_struct *vma,
- unsigned long address,
- spinlock_t *ptl,
- struct list_head *compound_pagelist)
+ struct vm_area_struct *vma, unsigned long address,
+ spinlock_t *ptl, unsigned int order,
+ struct list_head *compound_pagelist)
{
- unsigned long end = address + HPAGE_PMD_SIZE;
+ const unsigned long nr_pages = 1UL << order;
+ unsigned long end = address + (PAGE_SIZE << order);
struct folio *src, *tmp;
pte_t pteval;
pte_t *_pte;
unsigned int nr_ptes;
- for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte += nr_ptes,
+ for (_pte = pte; _pte < pte + nr_pages; _pte += nr_ptes,
address += nr_ptes * PAGE_SIZE) {
nr_ptes = 1;
pteval = ptep_get(_pte);
@@ -802,11 +836,10 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
}
static void __collapse_huge_page_copy_failed(pte_t *pte,
- pmd_t *pmd,
- pmd_t orig_pmd,
- struct vm_area_struct *vma,
- struct list_head *compound_pagelist)
+ pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma,
+ unsigned int order, struct list_head *compound_pagelist)
{
+ const unsigned long nr_pages = 1UL << order;
spinlock_t *pmd_ptl;
/*
@@ -822,7 +855,7 @@ static void __collapse_huge_page_copy_failed(pte_t *pte,
* Release both raw and compound pages isolated
* in __collapse_huge_page_isolate.
*/
- release_pte_pages(pte, pte + HPAGE_PMD_NR, compound_pagelist);
+ release_pte_pages(pte, pte + nr_pages, compound_pagelist);
}
/*
@@ -842,16 +875,17 @@ static void __collapse_huge_page_copy_failed(pte_t *pte,
*/
static enum scan_result __collapse_huge_page_copy(pte_t *pte, struct folio *folio,
pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma,
- unsigned long address, spinlock_t *ptl,
+ unsigned long address, spinlock_t *ptl, unsigned int order,
struct list_head *compound_pagelist)
{
+ const unsigned long nr_pages = 1UL << order;
unsigned int i;
enum scan_result result = SCAN_SUCCEED;
/*
* Copying pages' contents is subject to memory poison at any iteration.
*/
- for (i = 0; i < HPAGE_PMD_NR; i++) {
+ for (i = 0; i < nr_pages; i++) {
pte_t pteval = ptep_get(pte + i);
struct page *page = folio_page(folio, i);
unsigned long src_addr = address + i * PAGE_SIZE;
@@ -870,10 +904,10 @@ static enum scan_result __collapse_huge_page_copy(pte_t *pte, struct folio *foli
if (likely(result == SCAN_SUCCEED))
__collapse_huge_page_copy_succeeded(pte, vma, address, ptl,
- compound_pagelist);
+ order, compound_pagelist);
else
__collapse_huge_page_copy_failed(pte, pmd, orig_pmd, vma,
- compound_pagelist);
+ order, compound_pagelist);
return result;
}
@@ -1044,12 +1078,12 @@ static enum scan_result check_pmd_still_valid(struct mm_struct *mm,
* Returns result: if not SCAN_SUCCEED, mmap_lock has been released.
*/
static enum scan_result __collapse_huge_page_swapin(struct mm_struct *mm,
- struct vm_area_struct *vma, unsigned long start_addr, pmd_t *pmd,
- int referenced)
+ struct vm_area_struct *vma, unsigned long start_addr,
+ pmd_t *pmd, int referenced, unsigned int order)
{
int swapped_in = 0;
vm_fault_t ret = 0;
- unsigned long addr, end = start_addr + (HPAGE_PMD_NR * PAGE_SIZE);
+ unsigned long addr, end = start_addr + (PAGE_SIZE << order);
enum scan_result result;
pte_t *pte = NULL;
spinlock_t *ptl;
@@ -1081,6 +1115,19 @@ static enum scan_result __collapse_huge_page_swapin(struct mm_struct *mm,
pte_present(vmf.orig_pte))
continue;
+ /*
+ * TODO: Support swapin without leading to further mTHP
+ * collapses. Currently bringing in new pages via swapin may
+ * cause a future higher order collapse on a rescan of the same
+ * range.
+ */
+ if (!is_pmd_order(order)) {
+ pte_unmap(pte);
+ mmap_read_unlock(mm);
+ result = SCAN_EXCEED_SWAP_PTE;
+ goto out;
+ }
+
vmf.pte = pte;
vmf.ptl = ptl;
ret = do_swap_page(&vmf);
@@ -1200,7 +1247,7 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
* that case. Continuing to collapse causes inconsistency.
*/
result = __collapse_huge_page_swapin(mm, vma, address, pmd,
- referenced);
+ referenced, HPAGE_PMD_ORDER);
if (result != SCAN_SUCCEED)
goto out_nolock;
}
@@ -1248,6 +1295,7 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
if (pte) {
result = __collapse_huge_page_isolate(vma, address, pte, cc,
+ HPAGE_PMD_ORDER,
&compound_pagelist);
spin_unlock(pte_ptl);
} else {
@@ -1278,6 +1326,7 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
vma, address, pte_ptl,
+ HPAGE_PMD_ORDER,
&compound_pagelist);
pte_unmap(pte);
if (unlikely(result != SCAN_SUCCEED))
@@ -1313,9 +1362,9 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long start_addr,
bool *lock_dropped, struct collapse_control *cc)
{
- const unsigned int max_ptes_none = collapse_max_ptes_none(cc, vma);
- const unsigned int max_ptes_shared = collapse_max_ptes_shared(cc);
- const unsigned int max_ptes_swap = collapse_max_ptes_swap(cc);
+ const int max_ptes_none = collapse_max_ptes_none(cc, vma, HPAGE_PMD_ORDER);
+ const unsigned int max_ptes_shared = collapse_max_ptes_shared(cc, HPAGE_PMD_ORDER);
+ const unsigned int max_ptes_swap = collapse_max_ptes_swap(cc, HPAGE_PMD_ORDER);
pmd_t *pmd;
pte_t *pte, *_pte;
int none_or_zero = 0, shared = 0, referenced = 0;
@@ -2369,8 +2418,8 @@ static enum scan_result collapse_scan_file(struct mm_struct *mm,
unsigned long addr, struct file *file, pgoff_t start,
struct collapse_control *cc)
{
- const unsigned int max_ptes_none = collapse_max_ptes_none(cc, NULL);
- const unsigned int max_ptes_swap = collapse_max_ptes_swap(cc);
+ const int max_ptes_none = collapse_max_ptes_none(cc, NULL, HPAGE_PMD_ORDER);
+ const unsigned int max_ptes_swap = collapse_max_ptes_swap(cc, HPAGE_PMD_ORDER);
struct folio *folio = NULL;
struct address_space *mapping = file->f_mapping;
XA_STATE(xas, &mapping->i_pages, start);
--
2.54.0
^ permalink raw reply related
* [PATCH mm-unstable v17 03/14] mm/khugepaged: rework max_ptes_* handling with helper functions
From: Nico Pache @ 2026-05-11 18:58 UTC (permalink / raw)
To: linux-doc, linux-kernel, linux-mm, linux-trace-kernel
Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
byungchul, catalin.marinas, cl, corbet, dave.hansen, david,
dev.jain, gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
joshua.hahnjy, kas, lance.yang, liam, ljs, mathieu.desnoyers,
matthew.brost, mhiramat, mhocko, npache, peterx, pfalcato,
rakie.kim, raquini, rdunlap, richard.weiyang, rientjes, rostedt,
rppt, ryan.roberts, shivankg, sunnanyong, surenb,
thomas.hellstrom, tiwai, usamaarif642, vbabka, vishal.moola,
wangkefeng.wang, will, willy, yang, ying.huang, ziy, zokeefe,
Usama Arif
In-Reply-To: <20260511185817.686831-1-npache@redhat.com>
The following cleanup reworks all the max_ptes_* handling into helper
functions. This increases the code readability and will later be used to
implement the mTHP handling of these variables.
With these changes we abstract all the madvise_collapse() special casing
(don't respect the sysctls) away from the functions that utilize them. The
helpers will also be used later in this series to cleanly restrict the mTHP
collapse behavior.
No functional change is intended; however, we are now only reading the
sysfs variables once per scan, whereas before these variables were being
read on each loop iteration.
Suggested-by: David Hildenbrand <david@kernel.org>
Acked-by: David Hildenbrand (Arm) <david@kernel.org>
Acked-by: Usama Arif <usama.arif@linux.dev>
Signed-off-by: Nico Pache <npache@redhat.com>
---
mm/khugepaged.c | 118 +++++++++++++++++++++++++++++++++---------------
1 file changed, 82 insertions(+), 36 deletions(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index f0e29d5c7b1f..f68853b3caa7 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -348,6 +348,62 @@ static bool pte_none_or_zero(pte_t pte)
return pte_present(pte) && is_zero_pfn(pte_pfn(pte));
}
+/**
+ * collapse_max_ptes_none - Calculate maximum allowed none-page or zero-page
+ * PTEs for the given collapse operation.
+ * @cc: The collapse control struct
+ * @vma: The vma to check for userfaultfd
+ *
+ * Return: Maximum number of none-page or zero-page PTEs allowed for the
+ * collapse operation.
+ */
+static unsigned int collapse_max_ptes_none(struct collapse_control *cc,
+ struct vm_area_struct *vma)
+{
+ // If the vma is userfaultfd-armed, allow no none-page or zero-page PTEs.
+ if (vma && userfaultfd_armed(vma))
+ return 0;
+ // for MADV_COLLAPSE, allow any none-page or zero-page PTEs.
+ if (!cc->is_khugepaged)
+ return HPAGE_PMD_NR;
+ // For all other cases respect the user defined maximum.
+ return khugepaged_max_ptes_none;
+}
+
+/**
+ * collapse_max_ptes_shared - Calculate maximum allowed PTEs that map shared
+ * anonymous pages for the given collapse operation.
+ * @cc: The collapse control struct
+ *
+ * Return: Maximum number of PTEs that map shared anonymous pages for the
+ * collapse operation
+ */
+static unsigned int collapse_max_ptes_shared(struct collapse_control *cc)
+{
+ // for MADV_COLLAPSE, do not restrict the number of PTEs that map shared
+ // anonymous pages.
+ if (!cc->is_khugepaged)
+ return HPAGE_PMD_NR;
+ return khugepaged_max_ptes_shared;
+}
+
+/**
+ * collapse_max_ptes_swap - Calculate the maximum allowed non-present PTEs or the
+ * maximum allowed non-present pagecache entries for the given collapse operation.
+ * @cc: The collapse control struct
+ *
+ * Return: Maximum number of non-present PTEs or the maximum allowed non-present
+ * pagecache entries for the collapse operation.
+ */
+static unsigned int collapse_max_ptes_swap(struct collapse_control *cc)
+{
+ // for MADV_COLLAPSE, do not restrict the number of PTE entries or
+ // pagecache entries that are non-present.
+ if (!cc->is_khugepaged)
+ return HPAGE_PMD_NR;
+ return khugepaged_max_ptes_swap;
+}
+
int hugepage_madvise(struct vm_area_struct *vma,
vm_flags_t *vm_flags, int advice)
{
@@ -546,21 +602,19 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
pte_t *_pte;
int none_or_zero = 0, shared = 0, referenced = 0;
enum scan_result result = SCAN_FAIL;
+ unsigned int max_ptes_none = collapse_max_ptes_none(cc, vma);
+ unsigned int max_ptes_shared = collapse_max_ptes_shared(cc);
for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
_pte++, addr += PAGE_SIZE) {
pte_t pteval = ptep_get(_pte);
if (pte_none_or_zero(pteval)) {
- ++none_or_zero;
- if (!userfaultfd_armed(vma) &&
- (!cc->is_khugepaged ||
- none_or_zero <= khugepaged_max_ptes_none)) {
- continue;
- } else {
+ if (++none_or_zero > max_ptes_none) {
result = SCAN_EXCEED_NONE_PTE;
count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
goto out;
}
+ continue;
}
if (!pte_present(pteval)) {
result = SCAN_PTE_NON_PRESENT;
@@ -591,9 +645,7 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
/* See collapse_scan_pmd(). */
if (folio_maybe_mapped_shared(folio)) {
- ++shared;
- if (cc->is_khugepaged &&
- shared > khugepaged_max_ptes_shared) {
+ if (++shared > max_ptes_shared) {
result = SCAN_EXCEED_SHARED_PTE;
count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
goto out;
@@ -1261,6 +1313,9 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long start_addr,
bool *lock_dropped, struct collapse_control *cc)
{
+ const unsigned int max_ptes_none = collapse_max_ptes_none(cc, vma);
+ const unsigned int max_ptes_shared = collapse_max_ptes_shared(cc);
+ const unsigned int max_ptes_swap = collapse_max_ptes_swap(cc);
pmd_t *pmd;
pte_t *pte, *_pte;
int none_or_zero = 0, shared = 0, referenced = 0;
@@ -1294,36 +1349,29 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
pte_t pteval = ptep_get(_pte);
if (pte_none_or_zero(pteval)) {
- ++none_or_zero;
- if (!userfaultfd_armed(vma) &&
- (!cc->is_khugepaged ||
- none_or_zero <= khugepaged_max_ptes_none)) {
- continue;
- } else {
+ if (++none_or_zero > max_ptes_none) {
result = SCAN_EXCEED_NONE_PTE;
count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
goto out_unmap;
}
+ continue;
}
if (!pte_present(pteval)) {
- ++unmapped;
- if (!cc->is_khugepaged ||
- unmapped <= khugepaged_max_ptes_swap) {
- /*
- * Always be strict with uffd-wp
- * enabled swap entries. Please see
- * comment below for pte_uffd_wp().
- */
- if (pte_swp_uffd_wp_any(pteval)) {
- result = SCAN_PTE_UFFD_WP;
- goto out_unmap;
- }
- continue;
- } else {
+ if (++unmapped > max_ptes_swap) {
result = SCAN_EXCEED_SWAP_PTE;
count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
goto out_unmap;
}
+ /*
+ * Always be strict with uffd-wp
+ * enabled swap entries. Please see
+ * comment below for pte_uffd_wp().
+ */
+ if (pte_swp_uffd_wp_any(pteval)) {
+ result = SCAN_PTE_UFFD_WP;
+ goto out_unmap;
+ }
+ continue;
}
if (pte_uffd_wp(pteval)) {
/*
@@ -1366,9 +1414,7 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
* is shared.
*/
if (folio_maybe_mapped_shared(folio)) {
- ++shared;
- if (cc->is_khugepaged &&
- shared > khugepaged_max_ptes_shared) {
+ if (++shared > max_ptes_shared) {
result = SCAN_EXCEED_SHARED_PTE;
count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
goto out_unmap;
@@ -2323,6 +2369,8 @@ static enum scan_result collapse_scan_file(struct mm_struct *mm,
unsigned long addr, struct file *file, pgoff_t start,
struct collapse_control *cc)
{
+ const unsigned int max_ptes_none = collapse_max_ptes_none(cc, NULL);
+ const unsigned int max_ptes_swap = collapse_max_ptes_swap(cc);
struct folio *folio = NULL;
struct address_space *mapping = file->f_mapping;
XA_STATE(xas, &mapping->i_pages, start);
@@ -2341,8 +2389,7 @@ static enum scan_result collapse_scan_file(struct mm_struct *mm,
if (xa_is_value(folio)) {
swap += 1 << xas_get_order(&xas);
- if (cc->is_khugepaged &&
- swap > khugepaged_max_ptes_swap) {
+ if (swap > max_ptes_swap) {
result = SCAN_EXCEED_SWAP_PTE;
count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
break;
@@ -2413,8 +2460,7 @@ static enum scan_result collapse_scan_file(struct mm_struct *mm,
cc->progress += HPAGE_PMD_NR;
if (result == SCAN_SUCCEED) {
- if (cc->is_khugepaged &&
- present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
+ if (present < HPAGE_PMD_NR - max_ptes_none) {
result = SCAN_EXCEED_NONE_PTE;
count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
} else {
--
2.54.0
^ permalink raw reply related
* [PATCH mm-unstable v17 02/14] mm/khugepaged: generalize alloc_charge_folio()
From: Nico Pache @ 2026-05-11 18:58 UTC (permalink / raw)
To: linux-doc, linux-kernel, linux-mm, linux-trace-kernel
Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
byungchul, catalin.marinas, cl, corbet, dave.hansen, david,
dev.jain, gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
joshua.hahnjy, kas, lance.yang, liam, ljs, mathieu.desnoyers,
matthew.brost, mhiramat, mhocko, npache, peterx, pfalcato,
rakie.kim, raquini, rdunlap, richard.weiyang, rientjes, rostedt,
rppt, ryan.roberts, shivankg, sunnanyong, surenb,
thomas.hellstrom, tiwai, usamaarif642, vbabka, vishal.moola,
wangkefeng.wang, will, willy, yang, ying.huang, ziy, zokeefe,
Usama Arif
In-Reply-To: <20260511185817.686831-1-npache@redhat.com>
From: Dev Jain <dev.jain@arm.com>
Pass order to alloc_charge_folio() and update mTHP statistics.
Reviewed-by: Wei Yang <richard.weiyang@gmail.com>
Reviewed-by: Lance Yang <lance.yang@linux.dev>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Reviewed-by: Lorenzo Stoakes <ljs@kernel.org>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Acked-by: Usama Arif <usama.arif@linux.dev>
Acked-by: David Hildenbrand (Arm) <david@kernel.org>
Signed-off-by: Dev Jain <dev.jain@arm.com>
Co-developed-by: Nico Pache <npache@redhat.com>
Signed-off-by: Nico Pache <npache@redhat.com>
---
Documentation/admin-guide/mm/transhuge.rst | 8 ++++++++
include/linux/huge_mm.h | 2 ++
mm/huge_memory.c | 4 ++++
mm/khugepaged.c | 17 +++++++++++------
4 files changed, 25 insertions(+), 6 deletions(-)
diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index 5fbc3d89bb07..c51932e6275d 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -639,6 +639,14 @@ anon_fault_fallback_charge
instead falls back to using huge pages with lower orders or
small pages even though the allocation was successful.
+collapse_alloc
+ is incremented every time a huge page is successfully allocated for a
+ khugepaged collapse.
+
+collapse_alloc_failed
+ is incremented every time a huge page allocation fails during a
+ khugepaged collapse.
+
zswpout
is incremented every time a huge page is swapped out to zswap in one
piece without splitting.
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 2949e5acff35..ba7ae6808544 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -128,6 +128,8 @@ enum mthp_stat_item {
MTHP_STAT_ANON_FAULT_ALLOC,
MTHP_STAT_ANON_FAULT_FALLBACK,
MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
+ MTHP_STAT_COLLAPSE_ALLOC,
+ MTHP_STAT_COLLAPSE_ALLOC_FAILED,
MTHP_STAT_ZSWPOUT,
MTHP_STAT_SWPIN,
MTHP_STAT_SWPIN_FALLBACK,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e9d499da0ac7..05f482a72a89 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -699,6 +699,8 @@ static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
+DEFINE_MTHP_STAT_ATTR(collapse_alloc, MTHP_STAT_COLLAPSE_ALLOC);
+DEFINE_MTHP_STAT_ATTR(collapse_alloc_failed, MTHP_STAT_COLLAPSE_ALLOC_FAILED);
DEFINE_MTHP_STAT_ATTR(zswpout, MTHP_STAT_ZSWPOUT);
DEFINE_MTHP_STAT_ATTR(swpin, MTHP_STAT_SWPIN);
DEFINE_MTHP_STAT_ATTR(swpin_fallback, MTHP_STAT_SWPIN_FALLBACK);
@@ -764,6 +766,8 @@ static struct attribute *any_stats_attrs[] = {
#endif
&split_attr.attr,
&split_failed_attr.attr,
+ &collapse_alloc_attr.attr,
+ &collapse_alloc_failed_attr.attr,
NULL,
};
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 979885694351..f0e29d5c7b1f 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1068,21 +1068,26 @@ static enum scan_result __collapse_huge_page_swapin(struct mm_struct *mm,
}
static enum scan_result alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
- struct collapse_control *cc)
+ struct collapse_control *cc, unsigned int order)
{
gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() :
GFP_TRANSHUGE);
int node = collapse_find_target_node(cc);
struct folio *folio;
- folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, node, &cc->alloc_nmask);
+ folio = __folio_alloc(gfp, order, node, &cc->alloc_nmask);
if (!folio) {
*foliop = NULL;
- count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
+ if (is_pmd_order(order))
+ count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
+ count_mthp_stat(order, MTHP_STAT_COLLAPSE_ALLOC_FAILED);
return SCAN_ALLOC_HUGE_PAGE_FAIL;
}
- count_vm_event(THP_COLLAPSE_ALLOC);
+ if (is_pmd_order(order))
+ count_vm_event(THP_COLLAPSE_ALLOC);
+ count_mthp_stat(order, MTHP_STAT_COLLAPSE_ALLOC);
+
if (unlikely(mem_cgroup_charge(folio, mm, gfp))) {
folio_put(folio);
*foliop = NULL;
@@ -1118,7 +1123,7 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
*/
mmap_read_unlock(mm);
- result = alloc_charge_folio(&folio, mm, cc);
+ result = alloc_charge_folio(&folio, mm, cc, HPAGE_PMD_ORDER);
if (result != SCAN_SUCCEED)
goto out_nolock;
@@ -1899,7 +1904,7 @@ static enum scan_result collapse_file(struct mm_struct *mm, unsigned long addr,
VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
- result = alloc_charge_folio(&new_folio, mm, cc);
+ result = alloc_charge_folio(&new_folio, mm, cc, HPAGE_PMD_ORDER);
if (result != SCAN_SUCCEED)
goto out;
--
2.54.0
^ permalink raw reply related
* [PATCH mm-unstable v17 01/14] mm/khugepaged: generalize hugepage_vma_revalidate for mTHP support
From: Nico Pache @ 2026-05-11 18:58 UTC (permalink / raw)
To: linux-doc, linux-kernel, linux-mm, linux-trace-kernel
Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
byungchul, catalin.marinas, cl, corbet, dave.hansen, david,
dev.jain, gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
joshua.hahnjy, kas, lance.yang, liam, ljs, mathieu.desnoyers,
matthew.brost, mhiramat, mhocko, npache, peterx, pfalcato,
rakie.kim, raquini, rdunlap, richard.weiyang, rientjes, rostedt,
rppt, ryan.roberts, shivankg, sunnanyong, surenb,
thomas.hellstrom, tiwai, usamaarif642, vbabka, vishal.moola,
wangkefeng.wang, will, willy, yang, ying.huang, ziy, zokeefe,
Usama Arif
In-Reply-To: <20260511185817.686831-1-npache@redhat.com>
For khugepaged to support different mTHP orders, we must generalize
hugepage_vma_revalidate() to check that the PMD is not shared by another
VMA and that the requested order is enabled.
No functional change in this patch. Also correct a comment about the
functionality of the revalidation and fix a double-space issue.
Reviewed-by: Wei Yang <richard.weiyang@gmail.com>
Reviewed-by: Lance Yang <lance.yang@linux.dev>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Reviewed-by: Lorenzo Stoakes <ljs@kernel.org>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Acked-by: Usama Arif <usama.arif@linux.dev>
Acked-by: David Hildenbrand (Arm) <david@kernel.org>
Co-developed-by: Dev Jain <dev.jain@arm.com>
Signed-off-by: Dev Jain <dev.jain@arm.com>
Signed-off-by: Nico Pache <npache@redhat.com>
---
mm/khugepaged.c | 20 ++++++++++++--------
1 file changed, 12 insertions(+), 8 deletions(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 28a843f30b32..979885694351 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -902,12 +902,13 @@ static int collapse_find_target_node(struct collapse_control *cc)
/*
* If mmap_lock temporarily dropped, revalidate vma
- * before taking mmap_lock.
+ * after taking the mmap_lock again.
* Returns enum scan_result value.
*/
static enum scan_result hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
- bool expect_anon, struct vm_area_struct **vmap, struct collapse_control *cc)
+ bool expect_anon, struct vm_area_struct **vmap,
+ struct collapse_control *cc, unsigned int order)
{
struct vm_area_struct *vma;
enum tva_type type = cc->is_khugepaged ? TVA_KHUGEPAGED :
@@ -920,15 +921,16 @@ static enum scan_result hugepage_vma_revalidate(struct mm_struct *mm, unsigned l
if (!vma)
return SCAN_VMA_NULL;
+ /* Always check the PMD order to ensure it's not shared by another VMA */
if (!thp_vma_suitable_order(vma, address, PMD_ORDER))
return SCAN_ADDRESS_RANGE;
- if (!thp_vma_allowable_order(vma, vma->vm_flags, type, PMD_ORDER))
+ if (!thp_vma_allowable_orders(vma, vma->vm_flags, type, BIT(order)))
return SCAN_VMA_CHECK;
/*
* Anon VMA expected, the address may be unmapped then
* remapped to file after khugepaged reaquired the mmap_lock.
*
- * thp_vma_allowable_order may return true for qualified file
+ * thp_vma_allowable_orders may return true for qualified file
* vmas.
*/
if (expect_anon && (!(*vmap)->anon_vma || !vma_is_anonymous(*vmap)))
@@ -1121,7 +1123,8 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
goto out_nolock;
mmap_read_lock(mm);
- result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
+ result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
+ HPAGE_PMD_ORDER);
if (result != SCAN_SUCCEED) {
mmap_read_unlock(mm);
goto out_nolock;
@@ -1155,7 +1158,8 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
* mmap_lock.
*/
mmap_write_lock(mm);
- result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
+ result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
+ HPAGE_PMD_ORDER);
if (result != SCAN_SUCCEED)
goto out_up_write;
/* check if the pmd is still valid */
@@ -2858,8 +2862,8 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
mmap_unlocked = false;
*lock_dropped = true;
result = hugepage_vma_revalidate(mm, addr, false, &vma,
- cc);
- if (result != SCAN_SUCCEED) {
+ cc, HPAGE_PMD_ORDER);
+ if (result != SCAN_SUCCEED) {
last_fail = result;
goto out_nolock;
}
--
2.54.0
^ permalink raw reply related
* [PATCH mm-unstable v17 00/14] khugepaged: mTHP support
From: Nico Pache @ 2026-05-11 18:58 UTC (permalink / raw)
To: linux-doc, linux-kernel, linux-mm, linux-trace-kernel
Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
byungchul, catalin.marinas, cl, corbet, dave.hansen, david,
dev.jain, gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
joshua.hahnjy, kas, lance.yang, liam, ljs, mathieu.desnoyers,
matthew.brost, mhiramat, mhocko, npache, peterx, pfalcato,
rakie.kim, raquini, rdunlap, richard.weiyang, rientjes, rostedt,
rppt, ryan.roberts, shivankg, sunnanyong, surenb,
thomas.hellstrom, tiwai, usamaarif642, vbabka, vishal.moola,
wangkefeng.wang, will, willy, yang, ying.huang, ziy, zokeefe
The following series provides khugepaged with the capability to collapse
anonymous memory regions to mTHPs.
To achieve this we generalize the khugepaged functions to no longer depend
on PMD_ORDER. Then during the PMD scan, we use a bitmap to track individual
pages that are occupied (!none/zero). After the PMD scan is done, we use
the bitmap to find the optimal mTHP sizes for the PMD range. The
restriction on max_ptes_none is removed during the scan, to make sure we
account for the whole PMD range in the bitmap. When no mTHP size is
enabled, the legacy behavior of khugepaged is maintained.
We currently only support max_ptes_none values of 0 or HPAGE_PMD_NR - 1
(i.e. 511). If any other value is specified, the kernel will emit a warning
and no mTHP collapse will be attempted. If an mTHP collapse is attempted
but the range contains swapped-out or shared pages, we don't perform the
collapse.
It is now also possible to collapse to mTHPs without requiring the PMD THP
size to be enabled. These limitations are to prevent collapse "creep"
behavior. This prevents constantly promoting mTHPs to the next available
size, which would occur because a collapse introduces more non-zero pages
that would satisfy the promotion condition on subsequent scans.
Patch 1-2: Generalize hugepage_vma_revalidate and alloc_charge_folio
for arbitrary orders.
Patch 3: Rework max_ptes_* handling into helper functions
Patch 4: Generalize __collapse_huge_page_* for mTHP support
Patch 5: Require collapse_huge_page to enter/exit with the lock dropped
Patch 6: Generalize collapse_huge_page for mTHP collapse
Patch 7: Skip collapsing mTHP to smaller orders
Patch 8-9: Add per-order mTHP statistics and tracepoints
Patch 10: Introduce collapse_allowable_orders helper function
Patch 11-13: Introduce bitmap and mTHP collapse support, fully enabled
Patch 14: Documentation
Testing:
- Built for x86_64, aarch64, ppc64le, and s390x
- ran all arches on test suites provided by the kernel-tests project
- internal testing suites: functional testing and performance testing
- selftests mm
- I created a test script that I used to push khugepaged to its limits
while monitoring a number of stats and tracepoints. The code is
available here[1] (Run in legacy mode for these changes and set mthp
sizes to inherit)
The summary from my testing is that no significant regression was
noticed through this test. In some cases my changes had better collapse
latencies and were able to scan more pages in the same amount of
time/work, but for the most part the results were consistent.
- redis testing. I did some testing with these changes along with my defer
changes (see followup [2] post for more details). We've decided to get
the mTHP changes merged first before attempting the defer series.
- some basic testing on 64k page size.
- lots of general use.
V17 Changes:
- Added Acks/RB
- New patch(5): split the mmap_read_unlock() locking contract change out of
"generalize collapse_huge_page" into its own patch; add a comment
documenting the enter/exit-with-lock-dropped contract (Usama, David)
- [patch 03] Add const to max_ptes_none/shared/swap variables; improve the
three helper docstrings; replace the paragraphs with inline comments;
note that sysctl values are now snapshotted once per scan (Usama, David)
- [patch 04] Add SCAN_INVALID_PTES_NONE result code and return it instead
of SCAN_FAIL when collapse_max_ptes_none() returns -EINVAL (Usama);
snapshot khugepaged_max_ptes_none into a local variable to fix race on
the two comparisons (Usama); clean up mTHP docstring paragraphs into
inline comments; fix commit message wording (David)
- [patch 06] Remove /* PMD collapse */ and /* mTHP collapse */ comments
(David); move const declarations to top of variable list (David); add
comment explaining that map_anon_folio_pte_nopf() calls set_ptes under
pmd_ptl and is safe because PMD is expected to be none (Usama)
- [patch 08] Shorten sysfs counter documentation for
collapse_exceed_swap/shared_pte to concise one-liners; trim
collapse_exceed_none_pte description; fix "dont" → "do not" (David)
- [patch 10] Keep vm_flags parameter in khugepaged_enter_vma() and
collapse_allowable_orders() rather than dropping it and reading
vma->vm_flags internally; pass vm_flags explicitly at all three
collapse_allowable_orders() call sites (David, sashskio)
- [patch 11] Fix MTHP_STACK_SIZE: was exponential (~128); correct formula
is (height + 1) for a DFS on a binary tree; rewrite comment to explain
the DFS sizing (sashskio)
- [patch 12] Replace SCAN_PAGE_LRU with SCAN_PAGE_LAZYFREE in the
"goto next_order" early-bail cases; non-LRU page failures cannot be
recovered at any order and belong in the default (return) path
- [patch 13] Use tva_flags == TVA_KHUGEPAGED (strict equality) instead of
tva_flags & TVA_KHUGEPAGED; flatten nested if into single condition;
retain vm_flags parameter; pass vm_flags to collapse_allowable_orders()
V16: https://lore.kernel.org/all/20260419185750.260784-1-npache@redhat.com
V15: https://lore.kernel.org/all/20260226031741.230674-1-npache@redhat.com
V14: https://lore.kernel.org/all/20260122192841.128719-1-npache@redhat.com
V13: https://lore.kernel.org/all/20251201174627.23295-1-npache@redhat.com
V12: https://lore.kernel.org/all/20251022183717.70829-1-npache@redhat.com
V11: https://lore.kernel.org/all/20250912032810.197475-1-npache@redhat.com
V10: https://lore.kernel.org/all/20250819134205.622806-1-npache@redhat.com
V9 : https://lore.kernel.org/all/20250714003207.113275-1-npache@redhat.com
V8 : https://lore.kernel.org/all/20250702055742.102808-1-npache@redhat.com
V7 : https://lore.kernel.org/all/20250515032226.128900-1-npache@redhat.com
V6 : https://lore.kernel.org/all/20250515030312.125567-1-npache@redhat.com
V5 : https://lore.kernel.org/all/20250428181218.85925-1-npache@redhat.com
V4 : https://lore.kernel.org/all/20250417000238.74567-1-npache@redhat.com
V3 : https://lore.kernel.org/all/20250414220557.35388-1-npache@redhat.com
V2 : https://lore.kernel.org/all/20250211003028.213461-1-npache@redhat.com
V1 : https://lore.kernel.org/all/20250108233128.14484-1-npache@redhat.com
Baolin Wang (1):
mm/khugepaged: run khugepaged for all orders
Dev Jain (1):
mm/khugepaged: generalize alloc_charge_folio()
Nico Pache (12):
mm/khugepaged: generalize hugepage_vma_revalidate for mTHP support
mm/khugepaged: rework max_ptes_* handling with helper functions
mm/khugepaged: generalize __collapse_huge_page_* for mTHP support
mm/khugepaged: require collapse_huge_page to enter/exit with the lock
dropped
mm/khugepaged: generalize collapse_huge_page for mTHP collapse
mm/khugepaged: skip collapsing mTHP to smaller orders
mm/khugepaged: add per-order mTHP collapse failure statistics
mm/khugepaged: improve tracepoints for mTHP orders
mm/khugepaged: introduce collapse_allowable_orders helper function
mm/khugepaged: Introduce mTHP collapse support
mm/khugepaged: avoid unnecessary mTHP collapse attempts
Documentation: mm: update the admin guide for mTHP collapse
Documentation/admin-guide/mm/transhuge.rst | 71 ++-
include/linux/huge_mm.h | 5 +
include/trace/events/huge_memory.h | 37 +-
mm/huge_memory.c | 11 +
mm/khugepaged.c | 625 ++++++++++++++++-----
5 files changed, 579 insertions(+), 170 deletions(-)
base-commit: e9dd96806dbc2d50a66770b6a86962bd5d601153
--
2.54.0
^ permalink raw reply
* [RFC PATCH v2 10/10] selftests/verification: add tlob selftests
From: wen.yang @ 2026-05-11 18:24 UTC (permalink / raw)
To: Gabriele Monaco, Steven Rostedt
Cc: linux-trace-kernel, linux-kernel, Wen Yang
In-Reply-To: <cover.1778522945.git.wen.yang@linux.dev>
From: Wen Yang <wen.yang@linux.dev>
Add selftest coverage for the tlob RV monitor in
tools/testing/selftests/verification/.
Two helper binaries are built by tlob/Makefile: tlob_helper for the
ioctl interface (/dev/rv) and tlob_uprobe_target for the uprobe tests.
The top-level Makefile delegates to tlob/ via a generic MONITOR_SUBDIRS
pattern so monitor-specific build details stay within each monitor's
own subdirectory.
Eight test files cover the tracefs control interface (tracefs.tc), the
ioctl self-instrumentation interface (ioctl.tc, 8 scenarios), and the
uprobe external monitoring interface (uprobe_bind.tc, uprobe_violation.tc,
uprobe_no_event.tc, uprobe_multi.tc, uprobe_detail_sleeping.tc,
uprobe_detail_waiting.tc).
Tested on x86_64 with vng (virtme-ng):
TAP version 13
1..12
ok 1 Test monitor enable/disable
ok 2 Test monitor reactor setting
ok 3 Check available monitors
ok 4 Test wwnr monitor with printk reactor
ok 5 Test tlob ioctl self-instrumentation (within/over-budget, error paths)
ok 6 Test tlob monitor tracefs interface (enable/disable and files)
ok 7 uprobe binding: visible in monitor file, removable, duplicate offset rejected
ok 8 uprobe detail sleeping: sleeping_ns dominates when task blocks between probes
ok 9 uprobe detail waiting: waiting_ns dominates when task is preempted between probes
ok 10 Two bindings on same binary with different offsets and budgets fire independently
ok 11 Verify no spurious error_env_tlob events without an active uprobe binding
ok 12 uprobe violation: error_env_tlob and detail_env_tlob fire with correct fields
# Totals: pass:12 fail:0 xfail:0 xpass:0 skip:0 error:0
Suggested-by: Gabriele Monaco <gmonaco@redhat.com>
Signed-off-by: Wen Yang <wen.yang@linux.dev>
---
tools/testing/selftests/verification/Makefile | 21 +-
.../verification/test.d/tlob/ioctl.tc | 36 +
.../verification/test.d/tlob/tracefs.tc | 17 +
.../verification/test.d/tlob/uprobe_bind.tc | 34 +
.../test.d/tlob/uprobe_detail_sleeping.tc | 47 ++
.../test.d/tlob/uprobe_detail_waiting.tc | 60 ++
.../verification/test.d/tlob/uprobe_multi.tc | 60 ++
.../test.d/tlob/uprobe_no_event.tc | 19 +
.../test.d/tlob/uprobe_violation.tc | 60 ++
.../selftests/verification/tlob/Makefile | 21 +
.../selftests/verification/tlob/tlob_ioctl.c | 626 ++++++++++++++++++
.../selftests/verification/tlob/tlob_target.c | 138 ++++
12 files changed, 1138 insertions(+), 1 deletion(-)
create mode 100644 tools/testing/selftests/verification/test.d/tlob/ioctl.tc
create mode 100644 tools/testing/selftests/verification/test.d/tlob/tracefs.tc
create mode 100644 tools/testing/selftests/verification/test.d/tlob/uprobe_bind.tc
create mode 100644 tools/testing/selftests/verification/test.d/tlob/uprobe_detail_sleeping.tc
create mode 100644 tools/testing/selftests/verification/test.d/tlob/uprobe_detail_waiting.tc
create mode 100644 tools/testing/selftests/verification/test.d/tlob/uprobe_multi.tc
create mode 100644 tools/testing/selftests/verification/test.d/tlob/uprobe_no_event.tc
create mode 100644 tools/testing/selftests/verification/test.d/tlob/uprobe_violation.tc
create mode 100644 tools/testing/selftests/verification/tlob/Makefile
create mode 100644 tools/testing/selftests/verification/tlob/tlob_ioctl.c
create mode 100644 tools/testing/selftests/verification/tlob/tlob_target.c
diff --git a/tools/testing/selftests/verification/Makefile b/tools/testing/selftests/verification/Makefile
index aa8790c22a71..b5584fd3762d 100644
--- a/tools/testing/selftests/verification/Makefile
+++ b/tools/testing/selftests/verification/Makefile
@@ -1,8 +1,27 @@
# SPDX-License-Identifier: GPL-2.0
-all:
TEST_PROGS := verificationtest-ktap
TEST_FILES := test.d settings
EXTRA_CLEAN := $(OUTPUT)/logs/*
+# Subdirectories that provide helper binaries for the test runner.
+# Each entry must contain a Makefile that accepts OUTDIR= and deposits
+# its binaries there; verificationtest-ktap adds OUTDIR to PATH so
+# the ftracetest require-checks resolve the binaries by name.
+MONITOR_SUBDIRS := tlob
+
include ../lib.mk
+
+# Build and clean each monitor subdirectory.
+all: $(patsubst %,_build_%,$(MONITOR_SUBDIRS))
+
+clean: $(patsubst %,_clean_%,$(MONITOR_SUBDIRS))
+
+.PHONY: $(patsubst %,_build_%,$(MONITOR_SUBDIRS)) \
+ $(patsubst %,_clean_%,$(MONITOR_SUBDIRS))
+
+$(patsubst %,_build_%,$(MONITOR_SUBDIRS)): _build_%:
+ $(MAKE) -C $* OUTDIR="$(OUTPUT)" TOOLS_INCLUDES="$(TOOLS_INCLUDES)"
+
+$(patsubst %,_clean_%,$(MONITOR_SUBDIRS)): _clean_%:
+ $(MAKE) -C $* OUTDIR="$(OUTPUT)" clean
diff --git a/tools/testing/selftests/verification/test.d/tlob/ioctl.tc b/tools/testing/selftests/verification/test.d/tlob/ioctl.tc
new file mode 100644
index 000000000000..54ae249af9a6
--- /dev/null
+++ b/tools/testing/selftests/verification/test.d/tlob/ioctl.tc
@@ -0,0 +1,36 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0-or-later
+# description: Test tlob ioctl self-instrumentation (within/over-budget, error paths)
+# requires: tlob:monitor tlob_ioctl:program
+
+TLOB_HELPER=$(command -v tlob_ioctl)
+
+[ -c /dev/rv ] || exit_unsupported
+
+echo 1 > monitors/tlob/enable
+
+# within budget: 50 ms threshold, 10 ms workload
+"$TLOB_HELPER" within_budget
+
+# over budget in running state: 1 ms threshold, 100 ms busy-spin
+"$TLOB_HELPER" over_budget_running
+
+# over budget in sleeping state: 3 ms threshold, 50 ms sleep
+"$TLOB_HELPER" over_budget_sleeping
+
+# over budget in waiting state: 1 us threshold, sched_yield
+"$TLOB_HELPER" over_budget_waiting
+
+# error paths
+"$TLOB_HELPER" double_start
+"$TLOB_HELPER" stop_no_start
+
+# per-thread isolation
+"$TLOB_HELPER" multi_thread
+
+# bind against disabled monitor must return ENODEV, not crash
+echo 0 > monitors/tlob/enable
+"$TLOB_HELPER" not_enabled
+echo 1 > monitors/tlob/enable
+
+echo 0 > monitors/tlob/enable
diff --git a/tools/testing/selftests/verification/test.d/tlob/tracefs.tc b/tools/testing/selftests/verification/test.d/tlob/tracefs.tc
new file mode 100644
index 000000000000..5d1e7cc02498
--- /dev/null
+++ b/tools/testing/selftests/verification/test.d/tlob/tracefs.tc
@@ -0,0 +1,17 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0-or-later
+# description: Test tlob monitor tracefs interface (enable/disable and files)
+# requires: tlob:monitor
+
+check_requires monitors/tlob/enable monitors/tlob/desc monitors/tlob/monitor
+
+# enable / disable via the enable file
+echo 1 > monitors/tlob/enable
+grep -q 1 monitors/tlob/enable
+echo "tlob" >> enabled_monitors
+grep -q tlob enabled_monitors
+
+echo 0 > monitors/tlob/enable
+grep -q 0 monitors/tlob/enable
+echo "!tlob" >> enabled_monitors
+! grep -q "^tlob$" enabled_monitors
diff --git a/tools/testing/selftests/verification/test.d/tlob/uprobe_bind.tc b/tools/testing/selftests/verification/test.d/tlob/uprobe_bind.tc
new file mode 100644
index 000000000000..41e20d593855
--- /dev/null
+++ b/tools/testing/selftests/verification/test.d/tlob/uprobe_bind.tc
@@ -0,0 +1,34 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0-or-later
+# description: Test uprobe binding (visible in monitor file, removable, duplicate rejected)
+# requires: tlob:monitor tlob_ioctl:program tlob_target:program
+
+TLOB_HELPER=$(command -v tlob_ioctl)
+UPROBE_TARGET=$(command -v tlob_target)
+TLOB_MONITOR=monitors/tlob/monitor
+
+busy_offset=$("$TLOB_HELPER" sym_offset "$UPROBE_TARGET" tlob_busy_work 2>/dev/null)
+stop_offset=$("$TLOB_HELPER" sym_offset "$UPROBE_TARGET" tlob_busy_work_done 2>/dev/null)
+[ -n "$busy_offset" ] || exit_unsupported
+[ -n "$stop_offset" ] || exit_unsupported
+
+"$UPROBE_TARGET" 30000 &
+busy_pid=$!
+sleep 0.05
+
+echo 1 > monitors/tlob/enable
+echo "p ${UPROBE_TARGET}:${busy_offset} ${stop_offset} threshold=5000000" > "$TLOB_MONITOR"
+
+# Binding must appear in monitor file with canonical hex-offset format.
+grep -qE "^p ${UPROBE_TARGET}:0x[0-9a-f]+ 0x[0-9a-f]+ threshold=[0-9]+$" "$TLOB_MONITOR"
+grep -q "threshold=5000000" "$TLOB_MONITOR"
+
+# Duplicate offset_start must be rejected.
+! echo "p ${UPROBE_TARGET}:${busy_offset} ${stop_offset} threshold=9999" > "$TLOB_MONITOR" 2>/dev/null
+
+# Remove the binding; it must no longer appear.
+echo "-${UPROBE_TARGET}:${busy_offset}" > "$TLOB_MONITOR"
+! grep -q "^p .*:0x${busy_offset#0x} " "$TLOB_MONITOR"
+
+kill "$busy_pid" 2>/dev/null; wait "$busy_pid" 2>/dev/null || true
+echo 0 > monitors/tlob/enable
diff --git a/tools/testing/selftests/verification/test.d/tlob/uprobe_detail_sleeping.tc b/tools/testing/selftests/verification/test.d/tlob/uprobe_detail_sleeping.tc
new file mode 100644
index 000000000000..2b8656e0fef1
--- /dev/null
+++ b/tools/testing/selftests/verification/test.d/tlob/uprobe_detail_sleeping.tc
@@ -0,0 +1,47 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0-or-later
+# description: Test uprobe detail sleeping (sleeping_ns dominates when task blocks between probes)
+# requires: tlob:monitor tlob_ioctl:program tlob_target:program
+
+TLOB_HELPER=$(command -v tlob_ioctl)
+UPROBE_TARGET=$(command -v tlob_target)
+TLOB_MONITOR=monitors/tlob/monitor
+
+start_offset=$("$TLOB_HELPER" sym_offset "$UPROBE_TARGET" tlob_sleep_work 2>/dev/null)
+stop_offset=$("$TLOB_HELPER" sym_offset "$UPROBE_TARGET" tlob_sleep_work_done 2>/dev/null)
+[ -n "$start_offset" ] || exit_unsupported
+[ -n "$stop_offset" ] || exit_unsupported
+
+"$UPROBE_TARGET" 5000 sleep &
+busy_pid=$!
+sleep 0.05
+
+echo 1 > /sys/kernel/tracing/events/rv/detail_env_tlob/enable
+echo 1 > /sys/kernel/tracing/tracing_on
+echo 1 > monitors/tlob/enable
+echo > /sys/kernel/tracing/trace
+
+# 50 ms budget; task sleeps 200 ms per iteration -> sleeping_ns dominates.
+echo "p ${UPROBE_TARGET}:${start_offset} ${stop_offset} threshold=50000" > "$TLOB_MONITOR"
+
+found=0; i=0
+while [ "$i" -lt 30 ]; do
+ sleep 0.1
+ grep -q "detail_env_tlob" /sys/kernel/tracing/trace && { found=1; break; }
+ i=$((i+1))
+done
+
+echo "-${UPROBE_TARGET}:${start_offset}" > "$TLOB_MONITOR" 2>/dev/null
+kill "$busy_pid" 2>/dev/null; wait "$busy_pid" 2>/dev/null || true
+echo 0 > /sys/kernel/tracing/events/rv/detail_env_tlob/enable
+echo 0 > monitors/tlob/enable
+
+[ "$found" = "1" ]
+
+line=$(grep "detail_env_tlob" /sys/kernel/tracing/trace | head -n 1)
+running=$(echo "$line" | sed 's/.*running_ns=\([0-9]*\).*/\1/')
+waiting=$(echo "$line" | sed 's/.*waiting_ns=\([0-9]*\).*/\1/')
+sleeping=$(echo "$line" | sed 's/.*sleeping_ns=\([0-9]*\).*/\1/')
+[ "$sleeping" -gt "$((running + waiting))" ]
+
+echo > /sys/kernel/tracing/trace
diff --git a/tools/testing/selftests/verification/test.d/tlob/uprobe_detail_waiting.tc b/tools/testing/selftests/verification/test.d/tlob/uprobe_detail_waiting.tc
new file mode 100644
index 000000000000..0705854f24df
--- /dev/null
+++ b/tools/testing/selftests/verification/test.d/tlob/uprobe_detail_waiting.tc
@@ -0,0 +1,60 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0-or-later
+# description: Test uprobe detail waiting (waiting_ns dominates when task is preempted between probes)
+# requires: tlob:monitor tlob_ioctl:program tlob_target:program
+
+TLOB_HELPER=$(command -v tlob_ioctl)
+UPROBE_TARGET=$(command -v tlob_target)
+TLOB_MONITOR=monitors/tlob/monitor
+
+command -v chrt > /dev/null || exit_unsupported
+command -v taskset > /dev/null || exit_unsupported
+
+start_offset=$("$TLOB_HELPER" sym_offset "$UPROBE_TARGET" tlob_preempt_work 2>/dev/null)
+stop_offset=$("$TLOB_HELPER" sym_offset "$UPROBE_TARGET" tlob_preempt_work_done 2>/dev/null)
+[ -n "$start_offset" ] || exit_unsupported
+[ -n "$stop_offset" ] || exit_unsupported
+
+cpu=0
+
+echo 1 > /sys/kernel/tracing/events/rv/detail_env_tlob/enable
+echo 1 > /sys/kernel/tracing/tracing_on
+echo 1 > monitors/tlob/enable
+echo > /sys/kernel/tracing/trace
+
+# Register probe before the target starts so the start uprobe fires on the
+# first entry to tlob_preempt_work. Budget: 500 ms.
+echo "p ${UPROBE_TARGET}:${start_offset} ${stop_offset} threshold=500000" > "$TLOB_MONITOR"
+
+# Target starts; start probe fires on tlob_preempt_work entry.
+taskset -c "$cpu" "$UPROBE_TARGET" 5000 preempt &
+busy_pid=$!
+sleep 0.05
+
+# RT hog on the same CPU preempts the target; target stays in waiting state
+# (runnable, off-CPU) until the budget expires -> waiting_ns dominates.
+chrt -f 99 taskset -c "$cpu" sh -c 'while true; do :; done' 2>/dev/null &
+hog_pid=$!
+
+found=0; i=0
+while [ "$i" -lt 30 ]; do
+ sleep 0.1
+ grep -q "detail_env_tlob" /sys/kernel/tracing/trace && { found=1; break; }
+ i=$((i+1))
+done
+
+echo "-${UPROBE_TARGET}:${start_offset}" > "$TLOB_MONITOR" 2>/dev/null
+kill "$hog_pid" 2>/dev/null; wait "$hog_pid" 2>/dev/null || true
+kill "$busy_pid" 2>/dev/null; wait "$busy_pid" 2>/dev/null || true
+echo 0 > /sys/kernel/tracing/events/rv/detail_env_tlob/enable
+echo 0 > monitors/tlob/enable
+
+[ "$found" = "1" ]
+
+line=$(grep "detail_env_tlob" /sys/kernel/tracing/trace | head -n 1)
+running=$(echo "$line" | sed 's/.*running_ns=\([0-9]*\).*/\1/')
+sleeping=$(echo "$line" | sed 's/.*sleeping_ns=\([0-9]*\).*/\1/')
+waiting=$(echo "$line" | sed 's/.*waiting_ns=\([0-9]*\).*/\1/')
+[ "$waiting" -gt "$((running + sleeping))" ]
+
+echo > /sys/kernel/tracing/trace
diff --git a/tools/testing/selftests/verification/test.d/tlob/uprobe_multi.tc b/tools/testing/selftests/verification/test.d/tlob/uprobe_multi.tc
new file mode 100644
index 000000000000..c4b8f7108ae9
--- /dev/null
+++ b/tools/testing/selftests/verification/test.d/tlob/uprobe_multi.tc
@@ -0,0 +1,60 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0-or-later
+# description: Test two uprobe bindings on same binary (different offsets fire independently)
+# requires: tlob:monitor tlob_ioctl:program tlob_target:program
+
+TLOB_HELPER=$(command -v tlob_ioctl)
+UPROBE_TARGET=$(command -v tlob_target)
+TLOB_MONITOR=monitors/tlob/monitor
+
+busy_offset=$("$TLOB_HELPER" sym_offset "$UPROBE_TARGET" tlob_busy_work 2>/dev/null)
+busy_stop=$("$TLOB_HELPER" sym_offset "$UPROBE_TARGET" tlob_busy_work_done 2>/dev/null)
+sleep_offset=$("$TLOB_HELPER" sym_offset "$UPROBE_TARGET" tlob_sleep_work 2>/dev/null)
+sleep_stop=$("$TLOB_HELPER" sym_offset "$UPROBE_TARGET" tlob_sleep_work_done 2>/dev/null)
+[ -n "$busy_offset" ] || exit_unsupported
+[ -n "$busy_stop" ] || exit_unsupported
+[ -n "$sleep_offset" ] || exit_unsupported
+[ -n "$sleep_stop" ] || exit_unsupported
+
+"$UPROBE_TARGET" 30000 & # busy mode: tlob_busy_work fires every 200 ms
+busy_pid=$!
+"$UPROBE_TARGET" 30000 sleep & # sleep mode: tlob_sleep_work fires every 200 ms
+sleep_pid=$!
+sleep 0.05
+
+echo 1 > /sys/kernel/tracing/events/rv/error_env_tlob/enable
+echo 1 > /sys/kernel/tracing/events/rv/detail_env_tlob/enable
+echo 1 > /sys/kernel/tracing/tracing_on
+echo 1 > monitors/tlob/enable
+echo > /sys/kernel/tracing/trace
+
+# Binding A: 5 s budget on the busy probe - must not fire in 200 ms loops.
+echo "p ${UPROBE_TARGET}:${busy_offset} ${busy_stop} threshold=5000000" > "$TLOB_MONITOR"
+# Binding B: 10 us budget on the sleep probe - fires on first invocation.
+echo "p ${UPROBE_TARGET}:${sleep_offset} ${sleep_stop} threshold=10" > "$TLOB_MONITOR"
+
+# Wait up to 2 s for error_env_tlob from binding B.
+found=0; i=0
+while [ "$i" -lt 20 ]; do
+ sleep 0.1
+ grep -q "error_env_tlob" /sys/kernel/tracing/trace && { found=1; break; }
+ i=$((i+1))
+done
+
+echo "-${UPROBE_TARGET}:${busy_offset}" > "$TLOB_MONITOR" 2>/dev/null
+echo "-${UPROBE_TARGET}:${sleep_offset}" > "$TLOB_MONITOR" 2>/dev/null
+kill "$sleep_pid" 2>/dev/null; wait "$sleep_pid" 2>/dev/null || true
+kill "$busy_pid" 2>/dev/null; wait "$busy_pid" 2>/dev/null || true
+
+echo 0 > monitors/tlob/enable
+echo 0 > /sys/kernel/tracing/events/rv/error_env_tlob/enable
+echo 0 > /sys/kernel/tracing/events/rv/detail_env_tlob/enable
+
+[ "$found" = "1" ]
+# error_env_tlob payload: label and clock variable must be present.
+grep "error_env_tlob" /sys/kernel/tracing/trace | head -n 1 | grep -q "budget_exceeded"
+grep "error_env_tlob" /sys/kernel/tracing/trace | head -n 1 | grep -q "clk_elapsed="
+# detail_env_tlob must appear alongside the error.
+grep -q "detail_env_tlob" /sys/kernel/tracing/trace
+
+echo > /sys/kernel/tracing/trace
diff --git a/tools/testing/selftests/verification/test.d/tlob/uprobe_no_event.tc b/tools/testing/selftests/verification/test.d/tlob/uprobe_no_event.tc
new file mode 100644
index 000000000000..4a74853346e3
--- /dev/null
+++ b/tools/testing/selftests/verification/test.d/tlob/uprobe_no_event.tc
@@ -0,0 +1,19 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0-or-later
+# description: Test no spurious error_env_tlob events without an active uprobe binding
+# requires: tlob:monitor tlob_ioctl:program
+
+TLOB_MONITOR=monitors/tlob/monitor
+
+echo 1 > /sys/kernel/tracing/events/rv/error_env_tlob/enable
+echo 1 > /sys/kernel/tracing/tracing_on
+echo 1 > monitors/tlob/enable
+echo > /sys/kernel/tracing/trace
+
+sleep 0.5
+
+! grep -q "error_env_tlob" /sys/kernel/tracing/trace
+
+echo 0 > monitors/tlob/enable
+echo 0 > /sys/kernel/tracing/events/rv/error_env_tlob/enable
+echo > /sys/kernel/tracing/trace
diff --git a/tools/testing/selftests/verification/test.d/tlob/uprobe_violation.tc b/tools/testing/selftests/verification/test.d/tlob/uprobe_violation.tc
new file mode 100644
index 000000000000..624fdb950f6b
--- /dev/null
+++ b/tools/testing/selftests/verification/test.d/tlob/uprobe_violation.tc
@@ -0,0 +1,60 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0-or-later
+# description: Test uprobe violation (error_env_tlob and detail_env_tlob fire with correct fields)
+# requires: tlob:monitor tlob_ioctl:program tlob_target:program
+
+TLOB_HELPER=$(command -v tlob_ioctl)
+UPROBE_TARGET=$(command -v tlob_target)
+TLOB_MONITOR=monitors/tlob/monitor
+
+busy_offset=$("$TLOB_HELPER" sym_offset "$UPROBE_TARGET" tlob_busy_work 2>/dev/null)
+stop_offset=$("$TLOB_HELPER" sym_offset "$UPROBE_TARGET" tlob_busy_work_done 2>/dev/null)
+[ -n "$busy_offset" ] || exit_unsupported
+[ -n "$stop_offset" ] || exit_unsupported
+
+"$UPROBE_TARGET" 30000 &
+busy_pid=$!
+sleep 0.05
+
+echo 1 > /sys/kernel/tracing/events/rv/error_env_tlob/enable
+echo 1 > /sys/kernel/tracing/events/rv/detail_env_tlob/enable
+echo 1 > /sys/kernel/tracing/tracing_on
+echo 1 > monitors/tlob/enable
+echo > /sys/kernel/tracing/trace
+
+# 10 us budget - fires almost immediately; task is busy-spinning on-CPU.
+echo "p ${UPROBE_TARGET}:${busy_offset} ${stop_offset} threshold=10" > "$TLOB_MONITOR"
+
+# wait up to 2 s for detail_env_tlob
+found=0; i=0
+while [ "$i" -lt 20 ]; do
+ sleep 0.1
+ grep -q "detail_env_tlob" /sys/kernel/tracing/trace && { found=1; break; }
+ i=$((i+1))
+done
+
+echo "-${UPROBE_TARGET}:${busy_offset}" > "$TLOB_MONITOR" 2>/dev/null
+kill "$busy_pid" 2>/dev/null; wait "$busy_pid" 2>/dev/null || true
+echo 0 > /sys/kernel/tracing/events/rv/error_env_tlob/enable
+echo 0 > /sys/kernel/tracing/events/rv/detail_env_tlob/enable
+echo 0 > monitors/tlob/enable
+
+[ "$found" = "1" ]
+
+# error_env_tlob event label must be budget_exceeded
+grep "error_env_tlob" /sys/kernel/tracing/trace | head -n 1 | grep -q "budget_exceeded"
+
+# detail_env_tlob must have all five fields with the correct threshold
+line=$(grep "detail_env_tlob" /sys/kernel/tracing/trace | head -n 1)
+echo "$line" | grep -q "pid="
+echo "$line" | grep -q "threshold_us=10"
+echo "$line" | grep -q "running_ns="
+echo "$line" | grep -q "waiting_ns="
+echo "$line" | grep -q "sleeping_ns="
+
+# Busy-spin keeps the task on-CPU: running_ns must exceed sleeping_ns.
+running=$(echo "$line" | sed 's/.*running_ns=\([0-9]*\).*/\1/')
+sleeping=$(echo "$line" | sed 's/.*sleeping_ns=\([0-9]*\).*/\1/')
+[ "$running" -gt "$sleeping" ]
+
+echo > /sys/kernel/tracing/trace
diff --git a/tools/testing/selftests/verification/tlob/Makefile b/tools/testing/selftests/verification/tlob/Makefile
new file mode 100644
index 000000000000..1bedf946cb34
--- /dev/null
+++ b/tools/testing/selftests/verification/tlob/Makefile
@@ -0,0 +1,21 @@
+# SPDX-License-Identifier: GPL-2.0
+# Builds tlob selftest helper binaries.
+#
+# Invoked by ../Makefile; pass OUTDIR to control the output directory
+# and TOOLS_INCLUDES for the in-tree UAPI -isystem flag.
+
+OUTDIR ?= $(CURDIR)/..
+CFLAGS += $(TOOLS_INCLUDES)
+
+.PHONY: all
+all: $(OUTDIR)/tlob_ioctl $(OUTDIR)/tlob_target
+
+$(OUTDIR)/tlob_ioctl: tlob_ioctl.c
+ $(CC) $(CFLAGS) -o $@ $< -lpthread
+
+$(OUTDIR)/tlob_target: tlob_target.c
+ $(CC) $(CFLAGS) -o $@ $<
+
+.PHONY: clean
+clean:
+ $(RM) $(OUTDIR)/tlob_ioctl $(OUTDIR)/tlob_target
diff --git a/tools/testing/selftests/verification/tlob/tlob_ioctl.c b/tools/testing/selftests/verification/tlob/tlob_ioctl.c
new file mode 100644
index 000000000000..abb4e2e80a2c
--- /dev/null
+++ b/tools/testing/selftests/verification/tlob/tlob_ioctl.c
@@ -0,0 +1,626 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * tlob_ioctl.c - ioctl test driver and ELF utility for tlob selftests
+ *
+ * Usage: tlob_ioctl <subcommand> [args...]
+ *
+ * not_enabled - BIND_MONITOR with monitor disabled -> ENODEV
+ * within_budget - sleep within budget -> 0
+ * over_budget_running - busy-spin past budget -> EOVERFLOW
+ * over_budget_sleeping - sleep past budget -> EOVERFLOW
+ * over_budget_waiting - sched_yield into waiting state -> EOVERFLOW
+ * double_start - two starts without stop -> EALREADY
+ * stop_no_start - stop without start -> EINVAL
+ * multi_thread - two fds: thread A within budget, thread B over
+ * bench - TRACE_START/STOP latency (TAP output, always passes)
+ * sym_offset <binary> <symbol> - print ELF file offset of symbol
+ *
+ * Exit: 0 = pass, 1 = fail, 2 = skip (device not available).
+ */
+#define _GNU_SOURCE
+#include <elf.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <sched.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <time.h>
+#include <unistd.h>
+
+#include <linux/rv.h>
+
+static int rv_fd = -1;
+
+static int open_rv(void)
+{
+ struct rv_bind_args bind = { .monitor_name = "tlob" };
+
+ rv_fd = open("/dev/rv", O_RDWR);
+ if (rv_fd < 0) {
+ fprintf(stderr, "open /dev/rv: %s\n", strerror(errno));
+ return -1;
+ }
+ if (ioctl(rv_fd, RV_IOCTL_BIND_MONITOR, &bind) < 0) {
+ fprintf(stderr, "bind tlob: %s\n", strerror(errno));
+ close(rv_fd);
+ rv_fd = -1;
+ return -1;
+ }
+ return 0;
+}
+
+static void busy_spin_us(unsigned long us)
+{
+ struct timespec start, now;
+ unsigned long elapsed;
+
+ clock_gettime(CLOCK_MONOTONIC, &start);
+ do {
+ clock_gettime(CLOCK_MONOTONIC, &now);
+ elapsed = (unsigned long)(now.tv_sec - start.tv_sec)
+ * 1000000000UL
+ + (unsigned long)(now.tv_nsec - start.tv_nsec);
+ } while (elapsed < us * 1000UL);
+}
+
+static int trace_start(uint64_t threshold_us)
+{
+ struct tlob_start_args args = {
+ .threshold_us = threshold_us,
+ };
+
+ return ioctl(rv_fd, TLOB_IOCTL_TRACE_START, &args);
+}
+
+static int trace_stop(void)
+{
+ return ioctl(rv_fd, TLOB_IOCTL_TRACE_STOP, NULL);
+}
+
+/* Synchronous TRACE_START / TRACE_STOP tests */
+
+/* Binding to a disabled monitor must fail with ENODEV rather than crash */
+static int test_not_enabled(void)
+{
+ struct rv_bind_args bind = { .monitor_name = "tlob" };
+ int fd;
+ int ret;
+
+ fd = open("/dev/rv", O_RDWR);
+ if (fd < 0) {
+ fprintf(stderr, "open /dev/rv: %s\n", strerror(errno));
+ return 2; /* skip */
+ }
+
+ ret = ioctl(fd, RV_IOCTL_BIND_MONITOR, &bind);
+ close(fd);
+
+ if (ret == 0) {
+ fprintf(stderr, "RV_IOCTL_BIND_MONITOR: expected ENODEV, got success\n");
+ return 1;
+ }
+ if (errno != ENODEV) {
+ fprintf(stderr, "RV_IOCTL_BIND_MONITOR: expected ENODEV, got %s\n",
+ strerror(errno));
+ return 1;
+ }
+ return 0;
+}
+
+static int test_within_budget(void)
+{
+ int ret;
+
+ /* 50 ms budget */
+ if (trace_start(50000) < 0) {
+ fprintf(stderr, "TRACE_START: %s\n", strerror(errno));
+ return 1;
+ }
+ usleep(10000); /* 10 ms */
+ ret = trace_stop();
+ if (ret != 0) {
+ fprintf(stderr, "TRACE_STOP: expected 0, got %d errno=%s\n",
+ ret, strerror(errno));
+ return 1;
+ }
+ return 0;
+}
+
+static int test_over_budget_running(void)
+{
+ int ret;
+
+ /* 1 ms budget */
+ if (trace_start(1000) < 0) {
+ fprintf(stderr, "TRACE_START: %s\n", strerror(errno));
+ return 1;
+ }
+ busy_spin_us(100000); /* 100 ms */
+ ret = trace_stop();
+ if (ret == 0) {
+ fprintf(stderr, "TRACE_STOP: expected EOVERFLOW, got 0\n");
+ return 1;
+ }
+ if (errno != EOVERFLOW) {
+ fprintf(stderr, "TRACE_STOP: expected EOVERFLOW, got %s\n",
+ strerror(errno));
+ return 1;
+ }
+ return 0;
+}
+
+static int test_over_budget_sleeping(void)
+{
+ int ret;
+
+ /* 3 ms budget */
+ if (trace_start(3000) < 0) {
+ fprintf(stderr, "TRACE_START: %s\n", strerror(errno));
+ return 1;
+ }
+ usleep(50000); /* 50 ms; sleeping time counts toward budget */
+ ret = trace_stop();
+ if (ret == 0) {
+ fprintf(stderr, "TRACE_STOP: expected EOVERFLOW, got 0\n");
+ return 1;
+ }
+ if (errno != EOVERFLOW) {
+ fprintf(stderr, "TRACE_STOP: expected EOVERFLOW, got %s\n",
+ strerror(errno));
+ return 1;
+ }
+ return 0;
+}
+
+static int test_over_budget_waiting(void)
+{
+ int ret;
+
+ /* 1 us budget */
+ if (trace_start(1) < 0) {
+ fprintf(stderr, "TRACE_START: %s\n", strerror(errno));
+ return 1;
+ }
+ sched_yield(); /* running -> waiting -> running */
+ busy_spin_us(10); /* 10 us >> 1 us budget; hrtimer fires during spin */
+ ret = trace_stop();
+ if (ret == 0) {
+ fprintf(stderr, "TRACE_STOP: expected EOVERFLOW, got 0\n");
+ return 1;
+ }
+ if (errno != EOVERFLOW) {
+ fprintf(stderr, "TRACE_STOP: expected EOVERFLOW, got %s\n",
+ strerror(errno));
+ return 1;
+ }
+ return 0;
+}
+
+/* Error-handling tests */
+
+static int test_double_start(void)
+{
+ int ret;
+
+ /* 10 s: large enough that the hrtimer won't fire during the test */
+ if (trace_start(10000000ULL) < 0) {
+ fprintf(stderr, "first TRACE_START: %s\n", strerror(errno));
+ return 1;
+ }
+ ret = trace_start(10000000);
+ if (ret == 0) {
+ fprintf(stderr, "second TRACE_START: expected EALREADY, got 0\n");
+ trace_stop();
+ return 1;
+ }
+ if (errno != EALREADY) {
+ fprintf(stderr, "second TRACE_START: expected EALREADY, got %s\n",
+ strerror(errno));
+ trace_stop();
+ return 1;
+ }
+ trace_stop();
+ return 0;
+}
+
+static int test_stop_no_start(void)
+{
+ int ret;
+
+ /* Ensure clean state: ignore error from a stale entry */
+ trace_stop();
+
+ ret = trace_stop();
+ if (ret == 0) {
+ fprintf(stderr, "TRACE_STOP: expected EINVAL, got 0\n");
+ return 1;
+ }
+ if (errno != EINVAL) {
+ fprintf(stderr, "TRACE_STOP: expected EINVAL, got %s\n",
+ strerror(errno));
+ return 1;
+ }
+ return 0;
+}
+
+/* Two threads, each with its own fd: A within budget, B over budget. */
+
+struct mt_thread_args {
+ uint64_t threshold_us;
+ unsigned long workload_us;
+ int busy;
+ int expect_eoverflow;
+ int result;
+};
+
+static void *mt_thread_fn(void *arg)
+{
+ struct mt_thread_args *a = arg;
+ struct tlob_start_args args = { .threshold_us = a->threshold_us };
+ struct rv_bind_args bind = { .monitor_name = "tlob" };
+ int fd;
+ int ret;
+
+ fd = open("/dev/rv", O_RDWR);
+ if (fd < 0) {
+ fprintf(stderr, "thread open /dev/rv: %s\n", strerror(errno));
+ a->result = 1;
+ return NULL;
+ }
+ if (ioctl(fd, RV_IOCTL_BIND_MONITOR, &bind) < 0) {
+ fprintf(stderr, "thread bind tlob: %s\n", strerror(errno));
+ close(fd);
+ a->result = 1;
+ return NULL;
+ }
+
+ ret = ioctl(fd, TLOB_IOCTL_TRACE_START, &args);
+ if (ret < 0) {
+ fprintf(stderr, "thread TRACE_START: %s\n", strerror(errno));
+ close(fd);
+ a->result = 1;
+ return NULL;
+ }
+
+ if (a->busy)
+ busy_spin_us(a->workload_us);
+ else
+ usleep(a->workload_us);
+
+ ret = ioctl(fd, TLOB_IOCTL_TRACE_STOP, NULL);
+ if (a->expect_eoverflow) {
+ if (ret == 0 || errno != EOVERFLOW) {
+ fprintf(stderr, "thread: expected EOVERFLOW, got ret=%d errno=%s\n",
+ ret, strerror(errno));
+ close(fd);
+ a->result = 1;
+ return NULL;
+ }
+ } else {
+ if (ret != 0) {
+ fprintf(stderr, "thread: expected 0, got ret=%d errno=%s\n",
+ ret, strerror(errno));
+ close(fd);
+ a->result = 1;
+ return NULL;
+ }
+ }
+ close(fd);
+ a->result = 0;
+ return NULL;
+}
+
+static int test_multi_thread(void)
+{
+ pthread_t ta, tb;
+ struct mt_thread_args a = {
+ .threshold_us = 20000, /* 20 ms */
+ .workload_us = 5000, /* 5 ms sleep -> within budget */
+ .busy = 0,
+ .expect_eoverflow = 0,
+ };
+ struct mt_thread_args b = {
+ .threshold_us = 3000, /* 3 ms */
+ .workload_us = 30000, /* 30 ms spin -> over budget */
+ .busy = 1,
+ .expect_eoverflow = 1,
+ };
+
+ pthread_create(&ta, NULL, mt_thread_fn, &a);
+ pthread_create(&tb, NULL, mt_thread_fn, &b);
+ pthread_join(ta, NULL);
+ pthread_join(tb, NULL);
+
+ return (a.result || b.result) ? 1 : 0;
+}
+
+/*
+ * Benchmark TRACE_START, TRACE_STOP, and round-trip ioctls.
+ * Output uses TAP '#' prefix; always returns 0.
+ */
+#define BENCH_WARMUP 32
+#define BENCH_N 1000
+
+static long long timespec_diff_ns(const struct timespec *a,
+ const struct timespec *b)
+{
+ return (long long)(b->tv_sec - a->tv_sec) * 1000000000LL
+ + (b->tv_nsec - a->tv_nsec);
+}
+
+static int test_bench(void)
+{
+ struct tlob_start_args args = {
+ .threshold_us = 10000000ULL, /* 10 s */
+ };
+ struct timespec t0, t1;
+ long long total_start_ns = 0, total_stop_ns = 0, total_rt_ns = 0;
+ int i;
+
+ /* warm up */
+ for (i = 0; i < BENCH_WARMUP; i++) {
+ if (ioctl(rv_fd, TLOB_IOCTL_TRACE_START, &args) == 0)
+ ioctl(rv_fd, TLOB_IOCTL_TRACE_STOP, NULL);
+ }
+
+ /* start only */
+ for (i = 0; i < BENCH_N; i++) {
+ clock_gettime(CLOCK_MONOTONIC, &t0);
+ ioctl(rv_fd, TLOB_IOCTL_TRACE_START, &args);
+ clock_gettime(CLOCK_MONOTONIC, &t1);
+ total_start_ns += timespec_diff_ns(&t0, &t1);
+ ioctl(rv_fd, TLOB_IOCTL_TRACE_STOP, NULL);
+ }
+
+ /* stop only */
+ for (i = 0; i < BENCH_N; i++) {
+ ioctl(rv_fd, TLOB_IOCTL_TRACE_START, &args);
+ clock_gettime(CLOCK_MONOTONIC, &t0);
+ ioctl(rv_fd, TLOB_IOCTL_TRACE_STOP, NULL);
+ clock_gettime(CLOCK_MONOTONIC, &t1);
+ total_stop_ns += timespec_diff_ns(&t0, &t1);
+ }
+
+ /* round-trip */
+ clock_gettime(CLOCK_MONOTONIC, &t0);
+ for (i = 0; i < BENCH_N; i++) {
+ ioctl(rv_fd, TLOB_IOCTL_TRACE_START, &args);
+ ioctl(rv_fd, TLOB_IOCTL_TRACE_STOP, NULL);
+ }
+ clock_gettime(CLOCK_MONOTONIC, &t1);
+ total_rt_ns = timespec_diff_ns(&t0, &t1);
+
+ printf("# start ioctl only: %lld ns/iter (N=%d, includes syscall)\n",
+ total_start_ns / BENCH_N, BENCH_N);
+ printf("# stop ioctl only: %lld ns/iter (N=%d, includes syscall)\n",
+ total_stop_ns / BENCH_N, BENCH_N);
+ printf("# start+stop roundtrip: %lld ns/iter (N=%d, includes 2 syscalls)\n",
+ total_rt_ns / BENCH_N, BENCH_N);
+ return 0;
+}
+
+/*
+ * Print the ELF file offset of <symname> in <binary>. Walks .symtab
+ * (falling back to .dynsym) and converts vaddr to file offset via PT_LOAD.
+ * Supports 32- and 64-bit ELF.
+ */
+static int sym_offset(const char *binary, const char *symname)
+{
+ int fd;
+ struct stat st;
+ void *map;
+ Elf64_Ehdr *ehdr;
+ Elf32_Ehdr *ehdr32;
+ int is64;
+ uint64_t sym_vaddr = 0;
+ int found = 0;
+ uint64_t file_offset = 0;
+
+ fd = open(binary, O_RDONLY);
+ if (fd < 0) {
+ fprintf(stderr, "open %s: %s\n", binary, strerror(errno));
+ return 1;
+ }
+ if (fstat(fd, &st) < 0) {
+ close(fd);
+ return 1;
+ }
+ map = mmap(NULL, (size_t)st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
+ close(fd);
+ if (map == MAP_FAILED) {
+ fprintf(stderr, "mmap: %s\n", strerror(errno));
+ return 1;
+ }
+
+ ehdr = (Elf64_Ehdr *)map;
+ ehdr32 = (Elf32_Ehdr *)map;
+ if (st.st_size < 4 ||
+ ehdr->e_ident[EI_MAG0] != ELFMAG0 ||
+ ehdr->e_ident[EI_MAG1] != ELFMAG1 ||
+ ehdr->e_ident[EI_MAG2] != ELFMAG2 ||
+ ehdr->e_ident[EI_MAG3] != ELFMAG3) {
+ fprintf(stderr, "%s: not an ELF file\n", binary);
+ munmap(map, (size_t)st.st_size);
+ return 1;
+ }
+ is64 = (ehdr->e_ident[EI_CLASS] == ELFCLASS64);
+
+ if (is64) {
+ Elf64_Shdr *shdrs = (Elf64_Shdr *)((char *)map + ehdr->e_shoff);
+ Elf64_Shdr *shstrtab_hdr = &shdrs[ehdr->e_shstrndx];
+ const char *shstrtab = (char *)map + shstrtab_hdr->sh_offset;
+ int si;
+
+ /* prefer .symtab; fall back to .dynsym */
+ for (int pass = 0; pass < 2 && !found; pass++) {
+ const char *target = pass ? ".dynsym" : ".symtab";
+
+ for (si = 0; si < ehdr->e_shnum && !found; si++) {
+ Elf64_Shdr *sh = &shdrs[si];
+ const char *name = shstrtab + sh->sh_name;
+
+ if (strcmp(name, target) != 0)
+ continue;
+
+ Elf64_Shdr *strtab_sh = &shdrs[sh->sh_link];
+ const char *strtab = (char *)map + strtab_sh->sh_offset;
+ Elf64_Sym *syms = (Elf64_Sym *)((char *)map + sh->sh_offset);
+ uint64_t nsyms = sh->sh_size / sizeof(Elf64_Sym);
+ uint64_t j;
+
+ for (j = 0; j < nsyms; j++) {
+ if (strcmp(strtab + syms[j].st_name, symname) == 0) {
+ sym_vaddr = syms[j].st_value;
+ found = 1;
+ break;
+ }
+ }
+ }
+ }
+
+ if (!found) {
+ fprintf(stderr, "symbol '%s' not found in %s\n", symname, binary);
+ munmap(map, (size_t)st.st_size);
+ return 1;
+ }
+
+ /* Convert vaddr to file offset via PT_LOAD segments */
+ Elf64_Phdr *phdrs = (Elf64_Phdr *)((char *)map + ehdr->e_phoff);
+ int pi;
+
+ for (pi = 0; pi < ehdr->e_phnum; pi++) {
+ Elf64_Phdr *ph = &phdrs[pi];
+
+ if (ph->p_type != PT_LOAD)
+ continue;
+ if (sym_vaddr >= ph->p_vaddr &&
+ sym_vaddr < ph->p_vaddr + ph->p_filesz) {
+ file_offset = sym_vaddr - ph->p_vaddr + ph->p_offset;
+ break;
+ }
+ }
+ } else {
+ /* 32-bit ELF */
+ Elf32_Shdr *shdrs = (Elf32_Shdr *)((char *)map + ehdr32->e_shoff);
+ Elf32_Shdr *shstrtab_hdr = &shdrs[ehdr32->e_shstrndx];
+ const char *shstrtab = (char *)map + shstrtab_hdr->sh_offset;
+ int si;
+ uint32_t sym_vaddr32 = 0;
+
+ for (int pass = 0; pass < 2 && !found; pass++) {
+ const char *target = pass ? ".dynsym" : ".symtab";
+
+ for (si = 0; si < ehdr32->e_shnum && !found; si++) {
+ Elf32_Shdr *sh = &shdrs[si];
+ const char *name = shstrtab + sh->sh_name;
+
+ if (strcmp(name, target) != 0)
+ continue;
+
+ Elf32_Shdr *strtab_sh = &shdrs[sh->sh_link];
+ const char *strtab = (char *)map + strtab_sh->sh_offset;
+ Elf32_Sym *syms = (Elf32_Sym *)((char *)map + sh->sh_offset);
+ uint32_t nsyms = sh->sh_size / sizeof(Elf32_Sym);
+ uint32_t j;
+
+ for (j = 0; j < nsyms; j++) {
+ if (strcmp(strtab + syms[j].st_name, symname) == 0) {
+ sym_vaddr32 = syms[j].st_value;
+ found = 1;
+ break;
+ }
+ }
+ }
+ }
+
+ if (!found) {
+ fprintf(stderr, "symbol '%s' not found in %s\n", symname, binary);
+ munmap(map, (size_t)st.st_size);
+ return 1;
+ }
+
+ Elf32_Phdr *phdrs = (Elf32_Phdr *)((char *)map + ehdr32->e_phoff);
+ int pi;
+
+ for (pi = 0; pi < ehdr32->e_phnum; pi++) {
+ Elf32_Phdr *ph = &phdrs[pi];
+
+ if (ph->p_type != PT_LOAD)
+ continue;
+ if (sym_vaddr32 >= ph->p_vaddr &&
+ sym_vaddr32 < ph->p_vaddr + ph->p_filesz) {
+ file_offset = sym_vaddr32 - ph->p_vaddr + ph->p_offset;
+ break;
+ }
+ }
+ sym_vaddr = sym_vaddr32;
+ }
+
+ munmap(map, (size_t)st.st_size);
+
+ if (!file_offset && sym_vaddr) {
+ fprintf(stderr, "could not map vaddr 0x%lx to file offset\n",
+ (unsigned long)sym_vaddr);
+ return 1;
+ }
+
+ printf("0x%lx\n", (unsigned long)file_offset);
+ return 0;
+}
+
+int main(int argc, char *argv[])
+{
+ int rc;
+
+ if (argc < 2) {
+ fprintf(stderr, "Usage: %s <subcommand> [args...]\n", argv[0]);
+ return 1;
+ }
+
+ /* sym_offset does not need /dev/rv */
+ if (strcmp(argv[1], "sym_offset") == 0) {
+ if (argc < 4) {
+ fprintf(stderr, "Usage: %s sym_offset <binary> <symbol>\n",
+ argv[0]);
+ return 1;
+ }
+ return sym_offset(argv[2], argv[3]);
+ }
+
+ /* not_enabled: bypass open_rv(), whose bind would fail; the test expects ENODEV */
+ if (strcmp(argv[1], "not_enabled") == 0)
+ return test_not_enabled();
+
+ if (open_rv() < 0)
+ return 2; /* skip */
+
+ if (strcmp(argv[1], "bench") == 0)
+ rc = test_bench();
+ else if (strcmp(argv[1], "within_budget") == 0)
+ rc = test_within_budget();
+ else if (strcmp(argv[1], "over_budget_running") == 0)
+ rc = test_over_budget_running();
+ else if (strcmp(argv[1], "over_budget_sleeping") == 0)
+ rc = test_over_budget_sleeping();
+ else if (strcmp(argv[1], "over_budget_waiting") == 0)
+ rc = test_over_budget_waiting();
+ else if (strcmp(argv[1], "double_start") == 0)
+ rc = test_double_start();
+ else if (strcmp(argv[1], "stop_no_start") == 0)
+ rc = test_stop_no_start();
+ else if (strcmp(argv[1], "multi_thread") == 0)
+ rc = test_multi_thread();
+ else {
+ fprintf(stderr, "Unknown test: %s\n", argv[1]);
+ rc = 1;
+ }
+
+ close(rv_fd);
+ return rc;
+}
diff --git a/tools/testing/selftests/verification/tlob/tlob_target.c b/tools/testing/selftests/verification/tlob/tlob_target.c
new file mode 100644
index 000000000000..0fdbc575d71d
--- /dev/null
+++ b/tools/testing/selftests/verification/tlob/tlob_target.c
@@ -0,0 +1,138 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * tlob_target.c - uprobe target binary for tlob selftests.
+ *
+ * Provides three start/stop probe pairs, each designed to exercise a
+ * different dominant component of the detail_env_tlob ns breakdown:
+ *
+ * tlob_busy_work / tlob_busy_work_done - busy-spin: running_ns dominates
+ * tlob_sleep_work / tlob_sleep_work_done - nanosleep: sleeping_ns dominates
+ * tlob_preempt_work / tlob_preempt_work_done - busy-spin: waiting_ns dominates
+ * (needs an RT competitor on the same CPU)
+ *
+ * Usage: tlob_target <duration_ms> [mode]
+ *
+ * mode is one of: busy (default), sleep, preempt.
+ * Loops in 200 ms iterations until <duration_ms> has elapsed
+ * (0 = run for ~24 hours).
+ */
+#define _GNU_SOURCE
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#ifndef noinline
+#define noinline __attribute__((noinline))
+#endif
+
+static inline int timespec_before(const struct timespec *a,
+ const struct timespec *b)
+{
+ return a->tv_sec < b->tv_sec ||
+ (a->tv_sec == b->tv_sec && a->tv_nsec < b->tv_nsec);
+}
+
+static void timespec_add_ms(struct timespec *ts, unsigned long ms)
+{
+ ts->tv_sec += ms / 1000;
+ ts->tv_nsec += (long)(ms % 1000) * 1000000L;
+ if (ts->tv_nsec >= 1000000000L) {
+ ts->tv_sec++;
+ ts->tv_nsec -= 1000000000L;
+ }
+}
+
+/* stop probe; noinline keeps the entry point visible to uprobes */
+noinline void tlob_busy_work_done(void)
+{
+ /* empty: uprobe fires on entry */
+}
+
+/* start probe; busy-spin so running_ns dominates */
+noinline void tlob_busy_work(unsigned long duration_ns)
+{
+ struct timespec start, now;
+ unsigned long elapsed;
+
+ clock_gettime(CLOCK_MONOTONIC, &start);
+ do {
+ clock_gettime(CLOCK_MONOTONIC, &now);
+ elapsed = (unsigned long)(now.tv_sec - start.tv_sec)
+ * 1000000000UL
+ + (unsigned long)(now.tv_nsec - start.tv_nsec);
+ } while (elapsed < duration_ns);
+
+ tlob_busy_work_done();
+}
+
+/* stop probe; noinline keeps the entry point visible to uprobes */
+noinline void tlob_sleep_work_done(void)
+{
+ /* empty: uprobe fires on entry */
+}
+
+/* start probe; nanosleep so sleeping_ns dominates */
+noinline void tlob_sleep_work(unsigned long duration_ms)
+{
+ struct timespec ts = {
+ .tv_sec = duration_ms / 1000,
+ .tv_nsec = (long)(duration_ms % 1000) * 1000000L,
+ };
+ nanosleep(&ts, NULL);
+ tlob_sleep_work_done();
+}
+
+/* stop probe; noinline keeps the entry point visible to uprobes */
+noinline void tlob_preempt_work_done(void)
+{
+ /* empty: uprobe fires on entry */
+}
+
+/*
+ * start probe; busy-spin so an RT competitor on the same CPU drives
+ * waiting_ns (prev_state==0 -> preempt event, task stays runnable off-CPU).
+ */
+noinline void tlob_preempt_work(unsigned long duration_ms)
+{
+ struct timespec start, now;
+ unsigned long elapsed;
+
+ clock_gettime(CLOCK_MONOTONIC, &start);
+ do {
+ clock_gettime(CLOCK_MONOTONIC, &now);
+ elapsed = (unsigned long)(now.tv_sec - start.tv_sec)
+ * 1000000000UL
+ + (unsigned long)(now.tv_nsec - start.tv_nsec);
+ } while (elapsed < duration_ms * 1000000UL);
+
+ tlob_preempt_work_done();
+}
+
+int main(int argc, char *argv[])
+{
+ unsigned long duration_ms = 0;
+ const char *mode = "busy";
+ struct timespec deadline, now;
+
+ if (argc >= 2)
+ duration_ms = strtoul(argv[1], NULL, 10);
+ if (argc >= 3)
+ mode = argv[2];
+
+ clock_gettime(CLOCK_MONOTONIC, &deadline);
+ timespec_add_ms(&deadline, duration_ms ? duration_ms : 86400000UL);
+
+ do {
+ if (strcmp(mode, "sleep") == 0)
+ tlob_sleep_work(200);
+ else if (strcmp(mode, "preempt") == 0)
+ tlob_preempt_work(200);
+ else
+ tlob_busy_work(200 * 1000000UL);
+ clock_gettime(CLOCK_MONOTONIC, &now);
+ } while (timespec_before(&now, &deadline));
+
+ return 0;
+}
--
2.25.1
^ permalink raw reply related
* [RFC PATCH v2 09/10] rv/tlob: add KUnit tests for the tlob monitor
From: wen.yang @ 2026-05-11 18:24 UTC (permalink / raw)
To: Gabriele Monaco, Steven Rostedt
Cc: linux-trace-kernel, linux-kernel, Wen Yang
In-Reply-To: <cover.1778522945.git.wen.yang@linux.dev>
From: Wen Yang <wen.yang@linux.dev>
Add five KUnit test suites gated behind CONFIG_TLOB_KUNIT_TEST
(depends on RV_MON_TLOB && KUNIT; default KUNIT_ALL_TESTS) with a
.kunitconfig fragment for the kunit.py runner.
tlob_task_api tests the start/stop API and its error returns (-EALREADY,
-ESRCH, -EOVERFLOW, -ENOSPC, -ERANGE).
tlob_sched_integration covers context-switch accounting and monitoring
a kthread. tlob_uprobe_format exercises the uprobe line parser.
tlob_trace_output checks event_tlob and error_env_tlob field layout.
tlob_violation_react verifies error_env_tlob fires once on budget
expiry and zero times when the budget is not exceeded.
Suggested-by: Gabriele Monaco <gmonaco@redhat.com>
Signed-off-by: Wen Yang <wen.yang@linux.dev>
---
kernel/trace/rv/monitors/tlob/.kunitconfig | 5 +
kernel/trace/rv/monitors/tlob/tlob.c | 26 +
kernel/trace/rv/monitors/tlob/tlob_kunit.c | 881 +++++++++++++++++++++
3 files changed, 912 insertions(+)
create mode 100644 kernel/trace/rv/monitors/tlob/.kunitconfig
create mode 100644 kernel/trace/rv/monitors/tlob/tlob_kunit.c
diff --git a/kernel/trace/rv/monitors/tlob/.kunitconfig b/kernel/trace/rv/monitors/tlob/.kunitconfig
new file mode 100644
index 000000000000..977c58601ab7
--- /dev/null
+++ b/kernel/trace/rv/monitors/tlob/.kunitconfig
@@ -0,0 +1,5 @@
+CONFIG_FTRACE=y
+CONFIG_KUNIT=y
+CONFIG_RV=y
+CONFIG_RV_MON_TLOB=y
+CONFIG_TLOB_KUNIT_TEST=y
diff --git a/kernel/trace/rv/monitors/tlob/tlob.c b/kernel/trace/rv/monitors/tlob/tlob.c
index 475e972ae9aa..90e7035a0b55 100644
--- a/kernel/trace/rv/monitors/tlob/tlob.c
+++ b/kernel/trace/rv/monitors/tlob/tlob.c
@@ -1024,6 +1024,7 @@ EXPORT_SYMBOL_IF_KUNIT(tlob_num_monitored_read);
/* Tracepoint probes for KUnit; rv_trace.h is only included here. */
static struct tlob_captured_event tlob_kunit_last_event;
static struct tlob_captured_error_env tlob_kunit_last_error_env;
+static struct tlob_captured_detail tlob_kunit_last_detail;
static atomic_t tlob_kunit_event_cnt = ATOMIC_INIT(0);
static atomic_t tlob_kunit_error_env_cnt = ATOMIC_INIT(0);
@@ -1054,6 +1055,17 @@ static void tlob_kunit_error_env_probe(void *data, int id, char *state,
atomic_inc(&tlob_kunit_error_env_cnt);
}
+static void tlob_kunit_detail_probe(void *data, int pid, u64 threshold_us,
+ u64 running_ns, u64 waiting_ns,
+ u64 sleeping_ns)
+{
+ tlob_kunit_last_detail.pid = pid;
+ tlob_kunit_last_detail.threshold_us = threshold_us;
+ tlob_kunit_last_detail.running_ns = running_ns;
+ tlob_kunit_last_detail.waiting_ns = waiting_ns;
+ tlob_kunit_last_detail.sleeping_ns = sleeping_ns;
+}
+
int tlob_register_kunit_probes(void)
{
int ret;
@@ -1069,6 +1081,12 @@ int tlob_register_kunit_probes(void)
unregister_trace_event_tlob(tlob_kunit_event_probe, NULL);
return ret;
}
+ ret = register_trace_detail_env_tlob(tlob_kunit_detail_probe, NULL);
+ if (ret) {
+ unregister_trace_error_env_tlob(tlob_kunit_error_env_probe, NULL);
+ unregister_trace_event_tlob(tlob_kunit_event_probe, NULL);
+ return ret;
+ }
return 0;
}
EXPORT_SYMBOL_IF_KUNIT(tlob_register_kunit_probes);
@@ -1077,6 +1095,7 @@ void tlob_unregister_kunit_probes(void)
{
unregister_trace_event_tlob(tlob_kunit_event_probe, NULL);
unregister_trace_error_env_tlob(tlob_kunit_error_env_probe, NULL);
+ unregister_trace_detail_env_tlob(tlob_kunit_detail_probe, NULL);
tracepoint_synchronize_unregister();
}
EXPORT_SYMBOL_IF_KUNIT(tlob_unregister_kunit_probes);
@@ -1105,6 +1124,7 @@ void tlob_error_env_count_reset(void)
}
EXPORT_SYMBOL_IF_KUNIT(tlob_error_env_count_reset);
+
const struct tlob_captured_event *tlob_last_event_read(void)
{
return &tlob_kunit_last_event;
@@ -1117,6 +1137,12 @@ const struct tlob_captured_error_env *tlob_last_error_env_read(void)
}
EXPORT_SYMBOL_IF_KUNIT(tlob_last_error_env_read);
+const struct tlob_captured_detail *tlob_last_detail_read(void)
+{
+ return &tlob_kunit_last_detail;
+}
+EXPORT_SYMBOL_IF_KUNIT(tlob_last_detail_read);
+
#endif /* CONFIG_KUNIT */
VISIBLE_IF_KUNIT int tlob_enable_hooks(void)
diff --git a/kernel/trace/rv/monitors/tlob/tlob_kunit.c b/kernel/trace/rv/monitors/tlob/tlob_kunit.c
new file mode 100644
index 000000000000..ed2e7c7abaf8
--- /dev/null
+++ b/kernel/trace/rv/monitors/tlob/tlob_kunit.c
@@ -0,0 +1,881 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KUnit tests for the tlob RV monitor.
+ *
+ * tlob_task_api: start/stop lifecycle, error paths, violations.
+ * tlob_sched_integration: per-state accounting across real context switches.
+ * tlob_uprobe_format: uprobe binding format; add/remove acceptance and rejection.
+ * tlob_trace_output: trace event format for event_tlob, error_env_tlob.
+ * tlob_violation_react: error count per budget expiry; per-state breakdown.
+ *
+ * tlob_add_uprobe() duplicate-(binary, offset_start) constraint is not covered
+ * here: kern_path() requires a real filesystem; see selftests instead.
+ */
+#include <kunit/test.h>
+#include <linux/atomic.h>
+#include <linux/completion.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/ktime.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/sched/rt.h>
+#include <linux/sched/task.h>
+
+#include "tlob.h"
+
+MODULE_IMPORT_NS("EXPORTED_FOR_KUNIT_TESTING");
+
+/*
+ * Kthread cleanup guard: registers a kunit action that stops a kthread on
+ * test exit, even when a KUNIT_ASSERT fires before normal teardown code runs.
+ *
+ * Caller must call get_task_struct() before registering the guard.
+ * Set guard->task = NULL before normal-path teardown to prevent double-stop.
+ * Pass the completion to unblock on early exit, or NULL if not needed.
+ */
+struct tlob_kthread_guard {
+ struct task_struct *task;
+ struct completion *unblock;
+};
+
+static void kthread_guard_fn(void *arg)
+{
+ struct tlob_kthread_guard *g = arg;
+
+ if (!g->task)
+ return;
+ if (g->unblock)
+ complete(g->unblock);
+ kthread_stop(g->task);
+ put_task_struct(g->task);
+}
+
+static struct tlob_kthread_guard *
+tlob_guard_kthread(struct kunit *test, struct task_struct *task,
+ struct completion *unblock)
+{
+ struct tlob_kthread_guard *g;
+
+ g = kunit_kzalloc(test, sizeof(*g), GFP_KERNEL);
+ if (!g)
+ return NULL;
+ g->task = task;
+ g->unblock = unblock;
+ if (kunit_add_action_or_reset(test, kthread_guard_fn, g))
+ return NULL;
+ return g;
+}
+
+/* Suite 1: task API - lifecycle, error paths, violations. */
+
+/* Basic start/stop cycle */
+static void tlob_start_stop_ok(struct kunit *test)
+{
+ int ret;
+
+ ret = tlob_start_task(current, 10000000ULL);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+ KUNIT_EXPECT_EQ(test, tlob_stop_task(current), 0);
+ KUNIT_EXPECT_EQ(test, tlob_num_monitored_read(), 0);
+}
+
+/* Double start must return -EALREADY; double stop must return -ESRCH. */
+static void tlob_double_start(struct kunit *test)
+{
+ KUNIT_ASSERT_EQ(test, tlob_start_task(current, 10000000ULL), 0);
+ KUNIT_EXPECT_EQ(test, tlob_start_task(current, 10000000ULL), -EALREADY);
+ KUNIT_EXPECT_EQ(test, tlob_stop_task(current), 0);
+ KUNIT_EXPECT_EQ(test, tlob_stop_task(current), -ESRCH);
+ KUNIT_EXPECT_EQ(test, tlob_num_monitored_read(), 0);
+}
+
+/* Stop without start must return -ESRCH. */
+static void tlob_stop_without_start(struct kunit *test)
+{
+ tlob_stop_task(current);
+ KUNIT_EXPECT_EQ(test, tlob_stop_task(current), -ESRCH);
+ KUNIT_EXPECT_EQ(test, tlob_num_monitored_read(), 0);
+}
+
+/* threshold_us == 0 is invalid and must return -ERANGE. */
+static void tlob_zero_threshold(struct kunit *test)
+{
+ KUNIT_EXPECT_EQ(test, tlob_start_task(current, 0), -ERANGE);
+}
+
+/* 1 us budget: timer almost certainly fires before tlob_stop_task(). */
+static void tlob_immediate_deadline(struct kunit *test)
+{
+ int ret = tlob_start_task(current, 1);
+
+ KUNIT_ASSERT_EQ(test, ret, 0);
+ udelay(100);
+ /* timer fired -> -EOVERFLOW; if we won the race, 0 is also valid */
+ ret = tlob_stop_task(current);
+ KUNIT_EXPECT_TRUE(test, ret == 0 || ret == -EOVERFLOW);
+ KUNIT_EXPECT_EQ(test, tlob_num_monitored_read(), 0);
+}
+
+/*
+ * kthreads provide distinct task_structs; fill to TLOB_MAX_MONITORED,
+ * then verify -ENOSPC.
+ */
+struct tlob_waiter_ctx {
+ struct completion start;
+ struct completion done;
+};
+
+static int tlob_waiter_fn(void *arg)
+{
+ struct tlob_waiter_ctx *ctx = arg;
+
+ wait_for_completion(&ctx->start);
+ complete(&ctx->done);
+ return 0;
+}
+
+static void tlob_enospc(struct kunit *test)
+{
+ struct tlob_waiter_ctx *ctxs;
+ struct task_struct **threads;
+ int i, ret;
+
+ ctxs = kunit_kcalloc(test, TLOB_MAX_MONITORED,
+ sizeof(*ctxs), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, ctxs);
+
+ threads = kunit_kcalloc(test, TLOB_MAX_MONITORED,
+ sizeof(*threads), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, threads);
+
+ KUNIT_ASSERT_EQ(test, tlob_num_monitored_read(), 0);
+
+ for (i = 0; i < TLOB_MAX_MONITORED; i++) {
+ init_completion(&ctxs[i].start);
+ init_completion(&ctxs[i].done);
+
+ threads[i] = kthread_run(tlob_waiter_fn, &ctxs[i],
+ "tlob_waiter_%d", i);
+ if (IS_ERR(threads[i])) {
+ KUNIT_FAIL(test, "kthread_run failed at i=%d", i);
+ threads[i] = NULL;
+ goto cleanup;
+ }
+ get_task_struct(threads[i]);
+
+ ret = tlob_start_task(threads[i], 10000000ULL);
+ if (ret != 0) {
+ KUNIT_FAIL(test, "tlob_start_task failed at i=%d: %d",
+ i, ret);
+ put_task_struct(threads[i]);
+ complete(&ctxs[i].start);
+ threads[i] = NULL;
+ goto cleanup;
+ }
+ }
+
+ ret = tlob_start_task(current, 10000000ULL);
+ KUNIT_EXPECT_EQ(test, ret, -ENOSPC);
+
+cleanup:
+ /* cancel monitoring and unblock first, then wait for full exit */
+ for (i = 0; i < TLOB_MAX_MONITORED; i++) {
+ if (!threads[i])
+ break;
+ tlob_stop_task(threads[i]);
+ complete(&ctxs[i].start);
+ }
+ for (i = 0; i < TLOB_MAX_MONITORED; i++) {
+ if (!threads[i])
+ break;
+ kthread_stop(threads[i]);
+ put_task_struct(threads[i]);
+ }
+}
+
+/*
+ * Holder kthread holds a mutex for 80 ms; arm a 10 ms budget, burn ~1 ms
+ * on-CPU, then block on the mutex; timer fires while sleeping -> -EOVERFLOW.
+ */
+struct tlob_holder_ctx {
+ struct mutex lock;
+ struct completion ready;
+ unsigned int hold_ms;
+};
+
+static int tlob_holder_fn(void *arg)
+{
+ struct tlob_holder_ctx *ctx = arg;
+
+ mutex_lock(&ctx->lock);
+ complete(&ctx->ready);
+ msleep(ctx->hold_ms);
+ mutex_unlock(&ctx->lock);
+ return 0;
+}
+
+static void tlob_deadline_fires_sleeping(struct kunit *test)
+{
+ struct tlob_holder_ctx *ctx;
+ struct tlob_kthread_guard *guard;
+ struct task_struct *holder;
+ ktime_t t0;
+ int ret;
+
+ ctx = kunit_kzalloc(test, sizeof(*ctx), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, ctx);
+ ctx->hold_ms = 80;
+ mutex_init(&ctx->lock);
+ init_completion(&ctx->ready);
+
+ holder = kthread_run(tlob_holder_fn, ctx, "tlob_holder_kunit");
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, holder);
+ get_task_struct(holder);
+
+ guard = tlob_guard_kthread(test, holder, NULL);
+ KUNIT_ASSERT_NOT_NULL(test, guard);
+
+ wait_for_completion(&ctx->ready);
+
+ ret = tlob_start_task(current, 10000);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ t0 = ktime_get();
+ while (ktime_us_delta(ktime_get(), t0) < 1000)
+ cpu_relax();
+
+ /* block on mutex: running->sleeping; timer fires while sleeping */
+ mutex_lock(&ctx->lock);
+ mutex_unlock(&ctx->lock);
+
+ KUNIT_EXPECT_EQ(test, tlob_stop_task(current), -EOVERFLOW);
+
+ guard->task = NULL;
+ kthread_stop(holder);
+ put_task_struct(holder);
+}
+
+/*
+ * yield() triggers a preempt sched_switch (prev_state==0): running->waiting.
+ * Busy-spin 50 ms so the 2 ms budget fires regardless of scheduler timing.
+ */
+static void tlob_deadline_fires_waiting(struct kunit *test)
+{
+ ktime_t t0;
+ int ret;
+
+ ret = tlob_start_task(current, 2000);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ yield();
+
+ t0 = ktime_get();
+ while (ktime_us_delta(ktime_get(), t0) < 50000)
+ cpu_relax();
+
+ KUNIT_EXPECT_EQ(test, tlob_stop_task(current), -EOVERFLOW);
+}
+
+/* Arm a 1 ms budget and busy-spin for 50 ms; timer fires in running state. */
+static void tlob_deadline_fires_running(struct kunit *test)
+{
+ ktime_t t0;
+ int ret;
+
+ ret = tlob_start_task(current, 1000);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ t0 = ktime_get();
+ while (ktime_us_delta(ktime_get(), t0) < 50000)
+ cpu_relax();
+
+ KUNIT_EXPECT_EQ(test, tlob_stop_task(current), -EOVERFLOW);
+}
+
+/* Start three tasks, reinit monitor, verify all entries are gone. */
+static int tlob_dummy_fn(void *arg)
+{
+ wait_for_completion((struct completion *)arg);
+ return 0;
+}
+
+static void tlob_reinit_clears_all(struct kunit *test)
+{
+ struct completion *done1, *done2;
+ struct tlob_kthread_guard *guard1, *guard2;
+ struct task_struct *t1, *t2;
+ int ret;
+
+ done1 = kunit_kzalloc(test, sizeof(*done1), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, done1);
+ done2 = kunit_kzalloc(test, sizeof(*done2), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, done2);
+
+ init_completion(done1);
+ init_completion(done2);
+
+ t1 = kthread_run(tlob_dummy_fn, done1, "tlob_dummy1");
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, t1);
+ get_task_struct(t1);
+ guard1 = tlob_guard_kthread(test, t1, done1);
+ KUNIT_ASSERT_NOT_NULL(test, guard1);
+
+ t2 = kthread_run(tlob_dummy_fn, done2, "tlob_dummy2");
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, t2);
+ get_task_struct(t2);
+ guard2 = tlob_guard_kthread(test, t2, done2);
+ KUNIT_ASSERT_NOT_NULL(test, guard2);
+
+ KUNIT_ASSERT_EQ(test, tlob_start_task(current, 10000000ULL), 0);
+ KUNIT_ASSERT_EQ(test, tlob_start_task(t1, 10000000ULL), 0);
+ KUNIT_ASSERT_EQ(test, tlob_start_task(t2, 10000000ULL), 0);
+
+ tlob_destroy_monitor();
+ ret = tlob_init_monitor();
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ KUNIT_EXPECT_EQ(test, tlob_stop_task(current), -ESRCH);
+ KUNIT_EXPECT_EQ(test, tlob_stop_task(t1), -ESRCH);
+ KUNIT_EXPECT_EQ(test, tlob_stop_task(t2), -ESRCH);
+
+ /* null guards before teardown to prevent double-stop */
+ guard1->task = NULL;
+ guard2->task = NULL;
+ complete(done1);
+ complete(done2);
+ kthread_stop(t1);
+ kthread_stop(t2);
+ put_task_struct(t1);
+ put_task_struct(t2);
+}
+
+static int tlob_task_api_suite_init(struct kunit_suite *suite)
+{
+ rv_kunit_monitoring_on();
+ return tlob_init_monitor();
+}
+
+static void tlob_task_api_suite_exit(struct kunit_suite *suite)
+{
+ tlob_destroy_monitor();
+ rv_kunit_monitoring_off();
+}
+
+static void tlob_task_api_exit(struct kunit *test)
+{
+ /*
+ * tlob_stop_task() returns pool slots via call_rcu (da_pool_return_cb).
+ * Wait for all pending callbacks so each test starts with a full pool.
+ */
+ rcu_barrier();
+}
+
+static struct kunit_case tlob_task_api_cases[] = {
+ KUNIT_CASE(tlob_start_stop_ok),
+ KUNIT_CASE(tlob_double_start),
+ KUNIT_CASE(tlob_stop_without_start),
+ KUNIT_CASE(tlob_zero_threshold),
+ KUNIT_CASE(tlob_immediate_deadline),
+ KUNIT_CASE(tlob_enospc),
+ KUNIT_CASE(tlob_deadline_fires_sleeping),
+ KUNIT_CASE(tlob_deadline_fires_waiting),
+ KUNIT_CASE(tlob_deadline_fires_running),
+ KUNIT_CASE(tlob_reinit_clears_all),
+ {}
+};
+
+static struct kunit_suite tlob_task_api_suite = {
+ .name = "tlob_task_api",
+ .suite_init = tlob_task_api_suite_init,
+ .suite_exit = tlob_task_api_suite_exit,
+ .exit = tlob_task_api_exit,
+ .test_cases = tlob_task_api_cases,
+};
+
+/* Suite 2: sched integration - per-state ns accounting. */
+
+struct tlob_ping_ctx {
+ struct completion ping;
+ struct completion pong;
+};
+
+static int tlob_ping_fn(void *arg)
+{
+ struct tlob_ping_ctx *ctx = arg;
+
+ wait_for_completion(&ctx->ping);
+ complete(&ctx->pong);
+ return 0;
+}
+
+/* Force two context switches and verify stop returns 0 (within budget). */
+static void tlob_sched_switch_accounting(struct kunit *test)
+{
+ struct tlob_ping_ctx *ctx;
+ struct tlob_kthread_guard *guard;
+ struct task_struct *peer;
+ int ret;
+
+ ctx = kunit_kzalloc(test, sizeof(*ctx), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, ctx);
+ init_completion(&ctx->ping);
+ init_completion(&ctx->pong);
+
+ peer = kthread_run(tlob_ping_fn, ctx, "tlob_ping_kunit");
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, peer);
+ get_task_struct(peer);
+
+ guard = tlob_guard_kthread(test, peer, &ctx->ping);
+ KUNIT_ASSERT_NOT_NULL(test, guard);
+
+ ret = tlob_start_task(current, 5000000ULL);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ /* complete(ping) -> peer runs, forcing a context switch out and back */
+ complete(&ctx->ping);
+ wait_for_completion(&ctx->pong);
+
+ ret = tlob_stop_task(current);
+ KUNIT_EXPECT_EQ(test, ret, 0);
+
+ guard->task = NULL;
+ kthread_stop(peer);
+ put_task_struct(peer);
+}
+
+/* start/stop monitoring a kthread other than current */
+static int tlob_block_fn(void *arg)
+{
+ struct completion *done = arg;
+
+ msleep(20);
+ complete(done);
+ return 0;
+}
+
+static void tlob_monitor_other_task(struct kunit *test)
+{
+ struct completion *done;
+ struct tlob_kthread_guard *guard;
+ struct task_struct *target;
+ int ret;
+
+ done = kunit_kzalloc(test, sizeof(*done), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, done);
+ init_completion(done);
+
+ target = kthread_run(tlob_block_fn, done, "tlob_target_kunit");
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, target);
+ get_task_struct(target);
+
+ guard = tlob_guard_kthread(test, target, NULL);
+ KUNIT_ASSERT_NOT_NULL(test, guard);
+
+ ret = tlob_start_task(target, 5000000ULL);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ wait_for_completion(done);
+
+ /* 5 s budget won't fire in 20 ms; 0 or -EOVERFLOW are both valid */
+ ret = tlob_stop_task(target);
+ KUNIT_EXPECT_TRUE(test, ret == 0 || ret == -EOVERFLOW);
+
+ guard->task = NULL;
+ kthread_stop(target);
+ put_task_struct(target);
+}
+
+static int tlob_sched_suite_init(struct kunit_suite *suite)
+{
+ rv_kunit_monitoring_on();
+ return tlob_init_monitor();
+}
+
+static void tlob_sched_suite_exit(struct kunit_suite *suite)
+{
+ tlob_destroy_monitor();
+ rv_kunit_monitoring_off();
+}
+
+static struct kunit_case tlob_sched_integration_cases[] = {
+ KUNIT_CASE(tlob_sched_switch_accounting),
+ KUNIT_CASE(tlob_monitor_other_task),
+ {}
+};
+
+static struct kunit_suite tlob_sched_integration_suite = {
+ .name = "tlob_sched_integration",
+ .suite_init = tlob_sched_suite_init,
+ .suite_exit = tlob_sched_suite_exit,
+ .test_cases = tlob_sched_integration_cases,
+};
+
+/* Suite 3: uprobe binding format - add/remove acceptance and rejection. */
+
+static const char * const tlob_format_valid[] = {
+ "p /usr/bin/myapp:4768 4848 threshold=5000",
+ "p /usr/bin/myapp:0x12a0 0x12f0 threshold=10000",
+ "p /opt/my:app/bin:0x100 0x200 threshold=1000",
+};
+
+static const char * const tlob_format_invalid[] = {
+ /* add: malformed */
+ "p /usr/bin/myapp:0x100 0x200 threshold=0",
+ "p :0x100 0x200 threshold=5000",
+ "p /usr/bin/myapp:0x100 threshold=5000",
+ "p /usr/bin/myapp:-1 0x200 threshold=5000",
+ "p /usr/bin/myapp:0x100 0x200",
+ "p /usr/bin/myapp:0x100 0x100 threshold=5000",
+ /* remove: malformed */
+ "-usr/bin/myapp:0x100",
+ "-/usr/bin/myapp",
+ "-/:0x100",
+ "-/usr/bin/myapp:abc",
+};
+
+/*
+ * Valid add lines return -ENOENT (path does not exist in the test environment)
+ * rather than 0; a non-(-EINVAL) return confirms the format was accepted.
+ */
+static void tlob_format_accepted(struct kunit *test)
+{
+ char buf[128];
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(tlob_format_valid); i++) {
+ strscpy(buf, tlob_format_valid[i], sizeof(buf));
+ KUNIT_EXPECT_NE(test, tlob_create_or_delete_uprobe(buf), -EINVAL);
+ }
+}
+
+static void tlob_format_rejected(struct kunit *test)
+{
+ char buf[128];
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(tlob_format_invalid); i++) {
+ strscpy(buf, tlob_format_invalid[i], sizeof(buf));
+ KUNIT_EXPECT_EQ(test, tlob_create_or_delete_uprobe(buf), -EINVAL);
+ }
+}
+
+static struct kunit_case tlob_uprobe_format_cases[] = {
+ KUNIT_CASE(tlob_format_accepted),
+ KUNIT_CASE(tlob_format_rejected),
+ {}
+};
+
+static struct kunit_suite tlob_uprobe_format_suite = {
+ .name = "tlob_uprobe_format",
+ .test_cases = tlob_uprobe_format_cases,
+};
+
+/* Suite 4: trace output - verify event_tlob and error_env_tlob field values. */
+
+static void tlob_trace_event_format(struct kunit *test)
+{
+ const struct tlob_captured_event *ev;
+ int pid = current->pid;
+ int ret;
+
+ tlob_event_count_reset();
+ ret = tlob_start_task(current, 5000000ULL);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ /* sleep/wakeup/switch_in: running->sleeping->waiting->running */
+ msleep(20);
+
+ KUNIT_EXPECT_EQ(test, tlob_stop_task(current), 0);
+
+ KUNIT_EXPECT_GE(test, tlob_event_count_read(), 3);
+
+ ev = tlob_last_event_read();
+ KUNIT_EXPECT_EQ(test, ev->id, pid);
+ KUNIT_EXPECT_STREQ(test, ev->state, "waiting");
+ KUNIT_EXPECT_STREQ(test, ev->event, "switch_in");
+ KUNIT_EXPECT_STREQ(test, ev->next_state, "running");
+ KUNIT_EXPECT_TRUE(test, ev->final_state);
+}
+
+static void tlob_trace_error_env_format(struct kunit *test)
+{
+ const struct tlob_captured_error_env *err;
+ ktime_t t0;
+ int pid = current->pid;
+ int ret;
+
+ tlob_error_env_count_reset();
+ ret = tlob_start_task(current, 1000);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ t0 = ktime_get();
+ while (ktime_us_delta(ktime_get(), t0) < 50000)
+ cpu_relax();
+
+ tlob_stop_task(current);
+
+ KUNIT_ASSERT_GE(test, tlob_error_env_count_read(), 1);
+
+ err = tlob_last_error_env_read();
+ KUNIT_EXPECT_EQ(test, err->id, pid);
+ KUNIT_EXPECT_STREQ(test, err->state, "running");
+ KUNIT_EXPECT_STREQ(test, err->event, "budget_exceeded");
+ KUNIT_EXPECT_TRUE(test, strncmp(err->env, "clk_elapsed=", 12) == 0);
+}
+
+static int tlob_trace_suite_init(struct kunit_suite *suite)
+{
+ int ret;
+
+ rv_kunit_monitoring_on();
+ ret = tlob_init_monitor();
+ if (ret)
+ goto err_mon_off;
+ ret = tlob_register_kunit_probes();
+ if (ret)
+ goto err_destroy;
+ ret = tlob_enable_hooks();
+ if (ret)
+ goto err_probes;
+ return 0;
+
+err_probes:
+ tlob_unregister_kunit_probes();
+err_destroy:
+ tlob_destroy_monitor();
+err_mon_off:
+ rv_kunit_monitoring_off();
+ return ret;
+}
+
+static void tlob_trace_suite_exit(struct kunit_suite *suite)
+{
+ tlob_disable_hooks();
+ tlob_unregister_kunit_probes();
+ tlob_destroy_monitor();
+ rv_kunit_monitoring_off();
+}
+
+static struct kunit_case tlob_trace_output_cases[] = {
+ KUNIT_CASE(tlob_trace_event_format),
+ KUNIT_CASE(tlob_trace_error_env_format),
+ {}
+};
+
+static struct kunit_suite tlob_trace_output_suite = {
+ .name = "tlob_trace_output",
+ .suite_init = tlob_trace_suite_init,
+ .suite_exit = tlob_trace_suite_exit,
+ .test_cases = tlob_trace_output_cases,
+};
+
+/*
+ * Suite 5: violation reaction - complement to Suite 4.
+ * Suite 4 checks trace field values; Suite 5 checks semantics:
+ * error count per budget expiry and per-state ns breakdown.
+ */
+
+/* generous budget; usleep forces state transitions; no error must fire */
+static void tlob_no_error_within_budget(struct kunit *test)
+{
+ tlob_error_env_count_reset();
+ tlob_event_count_reset();
+
+ KUNIT_ASSERT_EQ(test, tlob_start_task(current, 10000000ULL), 0);
+ usleep_range(5000, 10000);
+ KUNIT_EXPECT_EQ(test, tlob_stop_task(current), 0);
+ KUNIT_EXPECT_EQ(test, tlob_error_env_count_read(), 0);
+ KUNIT_EXPECT_GE(test, tlob_event_count_read(), 2);
+}
+
+/* busy-spin 50 ms >> 1 ms budget; running_ns must dominate */
+static void tlob_detail_running_dominates(struct kunit *test)
+{
+ const struct tlob_captured_detail *d;
+ u64 total_ns;
+ ktime_t t0;
+ int ret;
+
+ tlob_error_env_count_reset();
+
+ ret = tlob_start_task(current, 1000);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ t0 = ktime_get();
+ while (ktime_us_delta(ktime_get(), t0) < 50000)
+ cpu_relax();
+
+ tlob_stop_task(current);
+
+ KUNIT_EXPECT_EQ(test, tlob_error_env_count_read(), 1);
+ d = tlob_last_detail_read();
+ KUNIT_EXPECT_EQ(test, d->pid, current->pid);
+ KUNIT_EXPECT_EQ(test, d->threshold_us, 1000ULL);
+ total_ns = d->running_ns + d->waiting_ns + d->sleeping_ns;
+ KUNIT_EXPECT_GE(test, total_ns, 1000ULL * 1000);
+ KUNIT_EXPECT_GT(test, d->running_ns, d->sleeping_ns + d->waiting_ns);
+}
+
+struct tlob_hog_ctx {
+ int spin_ms;
+};
+
+static int tlob_hog_fn(void *arg)
+{
+ struct tlob_hog_ctx *ctx = arg;
+ ktime_t t0 = ktime_get();
+
+ while (!kthread_should_stop() &&
+ ktime_ms_delta(ktime_get(), t0) < ctx->spin_ms)
+ cpu_relax();
+ return 0;
+}
+
+/*
+ * SCHED_FIFO kthread bound to the same CPU preempts the monitored task
+ * (sched_switch prev_state == 0: running->waiting) and holds the CPU for
+ * 80 ms >> 10 ms budget, guaranteeing the timer fires in waiting state.
+ */
+static void tlob_detail_waiting_dominates(struct kunit *test)
+{
+ struct tlob_hog_ctx *ctx;
+ struct task_struct *hog;
+ struct tlob_kthread_guard *guard;
+ const struct tlob_captured_detail *d;
+ struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
+ int ret;
+
+ tlob_error_env_count_reset();
+
+ ctx = kunit_kzalloc(test, sizeof(*ctx), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, ctx);
+ ctx->spin_ms = 80; /* hog spin time, well above the budget */
+
+ hog = kthread_create(tlob_hog_fn, ctx, "tlob_s5_hog");
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, hog);
+ get_task_struct(hog);
+
+ kthread_bind(hog, smp_processor_id());
+ sched_setscheduler_nocheck(hog, SCHED_FIFO, &param);
+
+ guard = tlob_guard_kthread(test, hog, NULL);
+ KUNIT_ASSERT_NOT_NULL(test, guard);
+
+ ret = tlob_start_task(current, 10000); /* 10 ms budget */
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ wake_up_process(hog); /* hog was created stopped; start it now */
+ yield(); /* sched_switch prev_state == 0: running->waiting */
+
+ tlob_stop_task(current);
+
+ KUNIT_EXPECT_EQ(test, tlob_error_env_count_read(), 1);
+ d = tlob_last_detail_read();
+ KUNIT_EXPECT_EQ(test, d->sleeping_ns, 0ULL);
+ KUNIT_EXPECT_GT(test, d->waiting_ns, d->running_ns + d->sleeping_ns);
+
+ guard->task = NULL; /* disarm guard: hog is stopped manually below */
+ kthread_stop(hog);
+ put_task_struct(hog);
+}
+
+/* block on mutex for 80 ms >> 10 ms budget; sleeping_ns must dominate */
+static void tlob_detail_sleeping_dominates(struct kunit *test)
+{
+ struct tlob_holder_ctx *ctx;
+ struct tlob_kthread_guard *guard;
+ struct task_struct *holder;
+ const struct tlob_captured_detail *d;
+ int ret;
+
+ tlob_error_env_count_reset();
+
+ ctx = kunit_kzalloc(test, sizeof(*ctx), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, ctx);
+ ctx->hold_ms = 80;
+ mutex_init(&ctx->lock);
+ init_completion(&ctx->ready);
+
+ holder = kthread_run(tlob_holder_fn, ctx, "tlob_s5_detail");
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, holder);
+ get_task_struct(holder);
+
+ guard = tlob_guard_kthread(test, holder, NULL);
+ KUNIT_ASSERT_NOT_NULL(test, guard);
+
+ wait_for_completion(&ctx->ready);
+
+ ret = tlob_start_task(current, 10000);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ mutex_lock(&ctx->lock);
+ mutex_unlock(&ctx->lock);
+
+ tlob_stop_task(current);
+
+ KUNIT_EXPECT_EQ(test, tlob_error_env_count_read(), 1);
+ d = tlob_last_detail_read();
+ KUNIT_EXPECT_GT(test, d->sleeping_ns, d->running_ns + d->waiting_ns);
+
+ guard->task = NULL;
+ kthread_stop(holder);
+ put_task_struct(holder);
+}
+
+static int tlob_violation_suite_init(struct kunit_suite *suite)
+{
+ int ret;
+
+ rv_kunit_monitoring_on();
+ ret = tlob_init_monitor();
+ if (ret)
+ goto err_mon_off;
+ ret = tlob_register_kunit_probes();
+ if (ret)
+ goto err_destroy;
+ ret = tlob_enable_hooks();
+ if (ret)
+ goto err_probes;
+ return 0;
+
+err_probes:
+ tlob_unregister_kunit_probes();
+err_destroy:
+ tlob_destroy_monitor();
+err_mon_off:
+ rv_kunit_monitoring_off();
+ return ret;
+}
+
+static void tlob_violation_suite_exit(struct kunit_suite *suite)
+{
+ tlob_disable_hooks();
+ tlob_unregister_kunit_probes();
+ tlob_destroy_monitor();
+ rv_kunit_monitoring_off();
+}
+
+static struct kunit_case tlob_violation_react_cases[] = {
+ KUNIT_CASE(tlob_no_error_within_budget),
+ KUNIT_CASE(tlob_detail_running_dominates),
+ KUNIT_CASE(tlob_detail_sleeping_dominates),
+ KUNIT_CASE(tlob_detail_waiting_dominates),
+ {}
+};
+
+static struct kunit_suite tlob_violation_react_suite = {
+ .name = "tlob_violation_react",
+ .suite_init = tlob_violation_suite_init,
+ .suite_exit = tlob_violation_suite_exit,
+ .test_cases = tlob_violation_react_cases,
+};
+
+kunit_test_suites(&tlob_task_api_suite,
+ &tlob_sched_integration_suite,
+ &tlob_uprobe_format_suite,
+ &tlob_trace_output_suite,
+ &tlob_violation_react_suite);
+
+MODULE_DESCRIPTION("KUnit tests for the tlob RV monitor");
+MODULE_LICENSE("GPL");
--
2.25.1
^ permalink raw reply related
* [RFC PATCH v2 08/10] rv/tlob: add tlob hybrid automaton monitor
From: wen.yang @ 2026-05-11 18:24 UTC (permalink / raw)
To: Gabriele Monaco, Steven Rostedt
Cc: linux-trace-kernel, linux-kernel, Wen Yang
In-Reply-To: <cover.1778522945.git.wen.yang@linux.dev>
From: Wen Yang <wen.yang@linux.dev>
Introduce tlob (task latency over budget), a per-task hybrid-automaton
RV monitor that measures elapsed time (CLOCK_MONOTONIC) across
a user-delimited code section and fires an error_env_tlob tracepoint
when the elapsed time exceeds a configurable per-invocation budget.
The monitor is built on RV_MON_PER_OBJ with HA_TIMER_HRTIMER. Three
states track the scheduler status of the monitored task:
running --(sleep)-------> sleeping
running --(preempt)-----> waiting
sleeping --(wakeup)------> waiting
waiting --(switch_in)--> running
A single clock invariant clk_elapsed < BUDGET_NS() is active in all
three states. The budget hrtimer is rearmed on each DA transition for
the remaining budget, keeping the absolute deadline fixed at
start_time + BUDGET_NS.
Per-task state is stored in the DA framework's hash table keyed by
task->pid. Storage is pre-allocated by tlob_start_task() with
GFP_KERNEL via da_create_or_get() before the scheduler tracepoints
can fire, using DA_SKIP_AUTO_ALLOC so that no kmalloc occurs on the
tracepoint hot path. This avoids both the kmalloc_nolock() restriction
(requires HAVE_ALIGNED_STRUCT_PAGE) and latency issues under PREEMPT_RT.
Nested monitoring is handled by nest_depth: tlob_start_task() on an
already-monitored pid returns -EEXIST and increments nest_depth without
disturbing the outer window; only the outermost tlob_stop_task()
performs real cleanup.
Two userspace interfaces are provided. The ioctl interface exposes
in-process self-instrumentation via /dev/rv with TLOB_IOCTL_TRACE_START
and TLOB_IOCTL_TRACE_STOP. The uprobe interface enables external
monitoring of unmodified binaries via tracefs:
echo "p PATH:OFFSET_START OFFSET_STOP threshold=NS" \
> /sys/kernel/tracing/rv/monitors/tlob/monitor
Violations are reported via error_env_tlob (HA clock-invariant)
regardless of which interface triggered them.
Suggested-by: Gabriele Monaco <gmonaco@redhat.com>
Signed-off-by: Wen Yang <wen.yang@linux.dev>
---
Documentation/trace/rv/index.rst | 1 +
Documentation/trace/rv/monitor_tlob.rst | 213 ++++
include/linux/rv.h | 45 +
include/rv/automata.h | 15 +
include/rv/ha_monitor.h | 33 +-
include/rv/rv_uprobe.h | 32 +
include/uapi/linux/rv.h | 86 ++
kernel/trace/rv/Kconfig | 2 +
kernel/trace/rv/Makefile | 4 +-
kernel/trace/rv/monitors/tlob/Kconfig | 69 ++
kernel/trace/rv/monitors/tlob/tlob.c | 1307 ++++++++++++++++++++
kernel/trace/rv/monitors/tlob/tlob.h | 171 +++
kernel/trace/rv/monitors/tlob/tlob_trace.h | 58 +
kernel/trace/rv/rv.c | 38 +
kernel/trace/rv/rv.h | 2 +
kernel/trace/rv/rv_chardev.c | 201 +++
kernel/trace/rv/rv_trace.h | 1 +
kernel/trace/rv/rv_uprobe.c | 46 +-
tools/include/uapi/linux/rv.h | 86 ++
19 files changed, 2400 insertions(+), 10 deletions(-)
create mode 100644 Documentation/trace/rv/monitor_tlob.rst
create mode 100644 include/uapi/linux/rv.h
create mode 100644 kernel/trace/rv/monitors/tlob/Kconfig
create mode 100644 kernel/trace/rv/monitors/tlob/tlob.c
create mode 100644 kernel/trace/rv/monitors/tlob/tlob.h
create mode 100644 kernel/trace/rv/monitors/tlob/tlob_trace.h
create mode 100644 kernel/trace/rv/rv_chardev.c
create mode 100644 tools/include/uapi/linux/rv.h
diff --git a/Documentation/trace/rv/index.rst b/Documentation/trace/rv/index.rst
index 29769f06bb0f..1501545b5f08 100644
--- a/Documentation/trace/rv/index.rst
+++ b/Documentation/trace/rv/index.rst
@@ -16,5 +16,6 @@ Runtime Verification
monitor_wwnr.rst
monitor_sched.rst
monitor_rtapp.rst
+ monitor_tlob.rst
monitor_stall.rst
monitor_deadline.rst
diff --git a/Documentation/trace/rv/monitor_tlob.rst b/Documentation/trace/rv/monitor_tlob.rst
new file mode 100644
index 000000000000..91b592630b3f
--- /dev/null
+++ b/Documentation/trace/rv/monitor_tlob.rst
@@ -0,0 +1,213 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+Monitor tlob
+============
+
+- Name: tlob - task latency over budget
+- Type: per-object hybrid automaton (RV_MON_PER_OBJ)
+- Author: Wen Yang <wen.yang@linux.dev>
+
+Description
+-----------
+
+The tlob monitor tracks per-task elapsed wall-clock time (CLOCK_MONOTONIC,
+spanning running, waiting, and sleeping states) and reports a violation when
+the monitored task exceeds a configurable per-invocation budget threshold.
+
+The monitor implements a three-state hybrid automaton with a single clock
+environment variable ``clk_elapsed``. The clock invariant
+``clk_elapsed < BUDGET_NS()`` is active in all three states; when it is
+violated the HA timer fires and the framework emits ``error_env_tlob``
+then calls ``da_monitor_reset()`` automatically::
+
+ | (initial, via task_start)
+ v
+ +--------------+
+ | running | <-----------+
+ +--------------+ |
+ | | |
+ sleep preempt switch_in
+ | | |
+ v v |
+ +---------+ +---------+ |
+ | sleeping| | waiting | -------+
+ +---------+ +---------+
+ | ^
+ +---wakeup---+
+
+ Key transitions:
+ running --(sleep)------> sleeping (task blocks waiting for a resource)
+ running --(preempt)----> waiting (task preempted, back in runqueue)
+ sleeping --(wakeup)-----> waiting (resource available, enters runqueue)
+ waiting --(switch_in)--> running (scheduler picks task, back on CPU)
+
+ ``task_start`` calls ``da_handle_start_event()`` with the synthetic event
+ ``switch_in_tlob`` to force the initial DA state to ``running`` (since
+ ``switch_in`` transitions waiting→running), then resets ``clk_elapsed`` and
+ arms the budget timer directly via ``ha_reset_clk_ns()`` + ``ha_start_timer_ns()``.
+ ``task_stop`` cancels the HA timer synchronously via
+ ``ha_cancel_timer_sync()`` then calls ``da_monitor_reset()`` directly.
+
+The non-running condition (monitor not yet started or reset after a
+stop/violation) is handled implicitly by the RV framework
+(``da_mon->monitoring == 0``) — it is not an explicit DA state.
+
+Per-task state lives in ``struct tlob_task_state`` which is stored as
+``monitor_target`` in the framework's ``da_monitor_storage``, indexed by
+pid. The per-invocation ``threshold_us`` is read via
+``ha_get_target(ha_mon)->threshold_us`` inside the HA constraint functions,
+following the same pattern as the ``nomiss`` monitor.
+
+Usage
+-----
+
+tracefs interface (uprobe-based external monitoring)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The ``monitor`` tracefs file instruments an unmodified binary via uprobes.
+The format follows the ftrace ``uprobe_events`` convention (``PATH:OFFSET``
+for the probe location, ``key=value`` for configuration parameters)::
+
+ p PATH:OFFSET_START OFFSET_STOP threshold=US
+
+The uprobe at ``OFFSET_START`` fires ``tlob_start_task()``; the uprobe at
+``OFFSET_STOP`` fires ``tlob_stop_task()``. Both offsets are ELF file
+offsets of entry points in ``PATH``. ``PATH`` may contain ``:``; the last
+``:`` in the ``PATH:OFFSET_START`` token is the separator.
+
+To remove a binding, write ``-PATH:OFFSET_START``. A complete example session::
+
+ echo 1 > /sys/kernel/tracing/rv/monitors/tlob/enable
+
+ echo "p /usr/bin/myapp:0x12a0 0x12f0 threshold=5000" \
+ > /sys/kernel/tracing/rv/monitors/tlob/monitor
+
+ # Remove a binding
+ echo "-/usr/bin/myapp:0x12a0" > /sys/kernel/tracing/rv/monitors/tlob/monitor
+
+ # List registered bindings
+ cat /sys/kernel/tracing/rv/monitors/tlob/monitor
+
+ # Read violations from the trace buffer
+ cat /sys/kernel/tracing/trace
+
+ioctl self-instrumentation (/dev/rv)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+``/dev/rv`` is a shared RV character device. Before using any monitor-specific
+ioctl, the fd must be bound to a monitor via ``RV_IOCTL_BIND_MONITOR``. Each
+open fd has independent per-fd monitoring state::
+
+ int fd = open("/dev/rv", O_RDWR);
+
+ /* Bind this fd to the tlob monitor. */
+ struct rv_bind_args bind = { .monitor_name = "tlob" };
+ ioctl(fd, RV_IOCTL_BIND_MONITOR, &bind);
+
+ struct tlob_start_args args = {
+ .threshold_us = 50000, /* 50 ms in microseconds */
+ };
+ ioctl(fd, TLOB_IOCTL_TRACE_START, &args);
+
+ /* ... code path under observation ... */
+
+ int ret = ioctl(fd, TLOB_IOCTL_TRACE_STOP, NULL);
+ /* ret == 0: within budget */
+ /* ret == -EOVERFLOW: budget exceeded */
+
+ close(fd);
+
+``TRACE_STOP`` returns ``-EOVERFLOW`` whenever the budget was exceeded.
+The HA timer calls ``da_monitor_reset()`` (storage remains); the
+synchronous ``ha_cancel_timer_sync()`` in ``tlob_stop_task()`` ensures the
+callback has completed before checking ``da_monitoring()``.
+
+Violation events
+~~~~~~~~~~~~~~~~
+
+Budget violations are always reported via the ``error_env_tlob`` RV
+tracepoint (HA clock-invariant violation), regardless of which interface
+triggered them::
+
+ cat /sys/kernel/tracing/trace
+
+To capture violations in a file::
+
+ trace-cmd record -e error_env_tlob &
+ # ... run workload ...
+ trace-cmd report
+
+tracefs files
+-------------
+
+The following files are created under
+``/sys/kernel/tracing/rv/monitors/tlob/``:
+
+``enable`` (rw)
+ Write ``1`` to enable the monitor; write ``0`` to disable it.
+
+``desc`` (ro)
+ Human-readable description of the monitor.
+
+``monitor`` (rw)
+ Write ``p PATH:OFFSET_START OFFSET_STOP threshold=US``
+ to bind two entry uprobes. Write ``-PATH:OFFSET_START`` to remove a
+ binding. Read to list registered bindings in the same format.
+
+Kernel API
+----------
+
+.. kernel-doc:: kernel/trace/rv/monitors/tlob/tlob.c
+ :functions: tlob_start_task tlob_stop_task
+
+``tlob_start_task(task, threshold_us)``
+ Begin monitoring *task* with a total latency budget of *threshold_us*
+ microseconds. Allocates per-task state, sets initial DA state to
+ ``running``, resets ``clk_elapsed``, and arms the HA budget timer.
+ Returns 0, -ENODEV (monitor disabled), -ERANGE (zero threshold),
+ -EALREADY (already monitoring), -ENOSPC (at capacity), or -ENOMEM.
+
+``tlob_stop_task(task)``
+ Stop monitoring *task*. Synchronously cancels the HA timer via
+ ``ha_cancel_timer_sync()``, checks ``da_monitoring()`` to determine outcome.
+ Returns 0 (clean stop, within budget), -EOVERFLOW (budget was exceeded),
+ -ESRCH (not monitored), or -EAGAIN (concurrent stop racing).
+
+Design notes
+------------
+
+State transitions are driven by two tracepoints:
+
+- ``sched_switch``: ``prev_state == 0`` (``TASK_RUNNING``, preempted,
+ stays on runqueue) → running→waiting; ``prev_state != 0`` (voluntarily
+ blocked, leaves runqueue) → running→sleeping; ``next`` pointer →
+ waiting→running.
+- ``sched_wakeup``: task moves back onto the runqueue → sleeping→waiting.
+
+No ``waiting → sleeping`` edge exists because a task can only block
+itself while executing on CPU. ``try_to_wake_up()`` is also a no-op
+when ``__state == TASK_RUNNING``, so ``sched_wakeup`` never fires while
+the task is in ``waiting`` state.
+
+Limitations:
+
+- The initial DA state is always ``running``, set by feeding the synthetic
+ event ``switch_in_tlob`` to ``da_handle_start_event()``. Monitoring a non-current
+ task that is already in waiting or sleeping state at call time misclassifies
+ the first interval as ``running_ns``.
+- ``TASK_STOPPED`` and ``TASK_TRACED`` carry ``prev_state != 0`` and are
+ therefore counted as ``sleeping_ns``, indistinguishable from
+ I/O-blocked time.
+- ``sched_wakeup_new`` is not hooked. In practice this is not an issue
+ because ``tlob_start_task`` is always called from a running context.
+
+Specification
+-------------
+
+Graphviz DOT file in tools/verification/models/tlob.dot.
+
+KUnit tests under ``kernel/trace/rv/monitors/tlob/tlob_kunit.c``
+(CONFIG_TLOB_KUNIT_TEST).
+
+User-space integration tests under ``tools/testing/selftests/verification/``
+(requires CONFIG_RV_MON_TLOB=y and root).
diff --git a/include/linux/rv.h b/include/linux/rv.h
index 541ba404926a..1ea91bb3f1c2 100644
--- a/include/linux/rv.h
+++ b/include/linux/rv.h
@@ -21,6 +21,13 @@
#include <linux/list.h>
#include <linux/types.h>
+/* Forward declaration: struct poll_table_struct is only needed by
+ * rv_chardev_ops::poll. Do not include <linux/poll.h> here: rv.h is itself
+ * included by sched.h, and poll.h → fs.h → rcupdate.h creates a
+ * header-ordering cycle with migrate_disable() on UML/non-SMP targets.
+ */
+struct poll_table_struct;
+
/*
* Deterministic automaton per-object variables.
*/
@@ -158,6 +165,44 @@ int rv_register_monitor(struct rv_monitor *monitor, struct rv_monitor *parent);
int rv_get_task_monitor_slot(void);
void rv_put_task_monitor_slot(int slot);
+/**
+ * struct rv_chardev_ops - per-monitor callbacks for the /dev/rv chardev
+ *
+ * Monitors that want to expose an ioctl self-instrumentation interface
+ * register an instance of this struct with rv_chardev_register_monitor().
+ *
+ * @owner: Module that owns this ops struct. Set to THIS_MODULE.
+ * The chardev holds a module reference for every bound fd so
+ * the module cannot be unloaded while any fd remains open.
+ * @bind: Called when userspace issues RV_IOCTL_BIND_MONITOR. Should
+ * allocate and return per-fd private data (opaque pointer), or
+ * ERR_PTR(errno) on failure.
+ * @ioctl: Called for every monitor-specific ioctl after binding. @priv
+ * is the pointer returned by @bind.
+ * @poll: Optional. Called from the fd's poll() / epoll_wait() path.
+ * Should call poll_wait(@file, wq, @wait) on the monitor's internal
+ * wait queue and return the current event mask (EPOLLIN | EPOLLRDNORM
+ * when an event is pending, 0 otherwise). If NULL, poll() always
+ * returns 0 (no events).
+ * @release: Called when the fd is closed. Must free @priv.
+ */
+struct rv_chardev_ops {
+ struct module *owner;
+ void *(*bind)(void);
+ long (*ioctl)(void *priv, unsigned int cmd, unsigned long arg);
+ __poll_t (*poll)(void *priv, struct file *file, struct poll_table_struct *wait);
+ void (*release)(void *priv);
+};
+
+int rv_chardev_register_monitor(const char *name,
+ const struct rv_chardev_ops *ops);
+void rv_chardev_unregister_monitor(const char *name);
+
+#if IS_ENABLED(CONFIG_KUNIT)
+void rv_kunit_monitoring_on(void);
+void rv_kunit_monitoring_off(void);
+#endif
+
#ifdef CONFIG_RV_REACTORS
int rv_unregister_reactor(struct rv_reactor *reactor);
int rv_register_reactor(struct rv_reactor *reactor);
diff --git a/include/rv/automata.h b/include/rv/automata.h
index 4a4eb40cf09a..ae819638d85a 100644
--- a/include/rv/automata.h
+++ b/include/rv/automata.h
@@ -41,6 +41,21 @@ static char *model_get_event_name(enum events event)
return RV_AUTOMATON_NAME.event_names[event];
}
+/*
+ * model_get_timer_event_name - label used when the HA timer fires (no event).
+ *
+ * Monitors may define MONITOR_TIMER_EVENT_NAME before including the model
+ * header to give the timer-fired violation a semantically meaningful label
+ * (e.g. "budget_exceeded" for tlob). Defaults to "none".
+ */
+#ifndef MONITOR_TIMER_EVENT_NAME
+#define MONITOR_TIMER_EVENT_NAME "none"
+#endif
+static inline char *model_get_timer_event_name(void)
+{
+ return MONITOR_TIMER_EVENT_NAME;
+}
+
/*
* model_get_initial_state - return the automaton's initial state
*/
diff --git a/include/rv/ha_monitor.h b/include/rv/ha_monitor.h
index d59507e8cb30..dfc993774089 100644
--- a/include/rv/ha_monitor.h
+++ b/include/rv/ha_monitor.h
@@ -28,6 +28,7 @@ static inline void ha_monitor_init_env(struct da_monitor *da_mon);
static inline void ha_monitor_reset_env(struct da_monitor *da_mon);
static inline void ha_setup_timer(struct ha_monitor *ha_mon);
static inline bool ha_cancel_timer(struct ha_monitor *ha_mon);
+static inline void ha_cancel_timer_sync(struct ha_monitor *ha_mon);
static bool ha_monitor_handle_constraint(struct da_monitor *da_mon,
enum states curr_state,
enum events event,
@@ -35,7 +36,10 @@ static bool ha_monitor_handle_constraint(struct da_monitor *da_mon,
da_id_type id);
#define da_monitor_event_hook ha_monitor_handle_constraint
#define da_monitor_init_hook ha_monitor_init_env
+/* Allow monitors to override da_monitor_reset_hook before including this header. */
+#ifndef da_monitor_reset_hook
#define da_monitor_reset_hook ha_monitor_reset_env
+#endif
#include <rv/da_monitor.h>
#include <linux/seq_buf.h>
@@ -70,7 +74,7 @@ static void ha_react(enum states curr_state, enum events event, char *env)
rv_react(&rv_this,
"rv: monitor %s does not allow event %s on state %s with env %s\n",
__stringify(MONITOR_NAME),
- event == EVENT_NONE ? EVENT_NONE_LBL : model_get_event_name(event),
+ event == EVENT_NONE ? model_get_timer_event_name() : model_get_event_name(event),
model_get_state_name(curr_state), env);
}
@@ -246,7 +250,7 @@ static inline void __ha_monitor_timer_callback(struct ha_monitor *ha_mon)
ha_get_env_string(&env_string, ha_mon, time_ns);
ha_react(curr_state, EVENT_NONE, env_string.buffer);
ha_trace_error_env(ha_mon, model_get_state_name(curr_state),
- EVENT_NONE_LBL, env_string.buffer,
+ model_get_timer_event_name(), env_string.buffer,
da_get_id(&ha_mon->da_mon));
da_monitor_reset(&ha_mon->da_mon);
@@ -412,6 +416,14 @@ static inline bool ha_cancel_timer(struct ha_monitor *ha_mon)
{
return timer_delete(&ha_mon->timer);
}
+/*
+ * ha_cancel_timer_sync - Cancel the timer, blocking until any running
+ * callback has completed.
+ */
+static inline void ha_cancel_timer_sync(struct ha_monitor *ha_mon)
+{
+ timer_delete_sync(&ha_mon->timer);
+}
#elif HA_TIMER_TYPE == HA_TIMER_HRTIMER
/*
* Helper functions to handle the monitor timer.
@@ -432,12 +444,12 @@ static enum hrtimer_restart ha_monitor_timer_callback(struct hrtimer *hrtimer)
static inline void ha_setup_timer(struct ha_monitor *ha_mon)
{
hrtimer_setup(&ha_mon->hrtimer, ha_monitor_timer_callback,
- CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
+ CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
}
static inline void ha_start_timer_ns(struct ha_monitor *ha_mon, enum envs env,
u64 expire, u64 time_ns)
{
- int mode = HRTIMER_MODE_REL_HARD;
+ int mode = HRTIMER_MODE_REL_SOFT;
u64 passed = ha_invariant_passed_ns(ha_mon, env, expire, time_ns);
if (RV_MON_TYPE == RV_MON_PER_CPU)
@@ -463,6 +475,18 @@ static inline bool ha_cancel_timer(struct ha_monitor *ha_mon)
{
return hrtimer_try_to_cancel(&ha_mon->hrtimer) == 1;
}
+/*
+ * ha_cancel_timer_sync - Cancel the timer, blocking until any running
+ * callback has completed.
+ *
+ * Use in teardown paths (e.g. stop_task) where the caller must know the
+ * callback has finished before inspecting or freeing monitor state.
+ * Must not be called from atomic context or within the timer callback.
+ */
+static inline void ha_cancel_timer_sync(struct ha_monitor *ha_mon)
+{
+ hrtimer_cancel(&ha_mon->hrtimer);
+}
#else /* HA_TIMER_NONE */
/*
* Start function is intentionally not defined, monitors using timers must
@@ -473,6 +497,7 @@ static inline bool ha_cancel_timer(struct ha_monitor *ha_mon)
{
return false;
}
+static inline void ha_cancel_timer_sync(struct ha_monitor *ha_mon) { }
#endif
#endif
diff --git a/include/rv/rv_uprobe.h b/include/rv/rv_uprobe.h
index 084cdb36a2ff..9106c5c9275e 100644
--- a/include/rv/rv_uprobe.h
+++ b/include/rv/rv_uprobe.h
@@ -79,9 +79,41 @@ struct rv_uprobe *rv_uprobe_attach(const char *binpath, loff_t offset,
* for any in-progress handler to finish, then releases the path reference
* and frees the rv_uprobe struct. The caller's priv data is NOT freed.
*
+ * When removing a single probe, prefer this over the three-phase API.
* Safe to call from process context only (uprobe_unregister_sync() may
* schedule).
*/
void rv_uprobe_detach(struct rv_uprobe *p);
+/**
+ * rv_uprobe_unregister_nosync - dequeue an uprobe without waiting
+ * @p: probe to dequeue; may be NULL (no-op)
+ *
+ * Removes the uprobe from the uprobe subsystem but does NOT wait for
+ * in-flight handlers to complete. The caller must call rv_uprobe_sync()
+ * before calling rv_uprobe_free() on the same probe.
+ *
+ * Use this to batch multiple deregistrations before a single rv_uprobe_sync().
+ */
+void rv_uprobe_unregister_nosync(struct rv_uprobe *p);
+
+/**
+ * rv_uprobe_sync - wait for all in-flight uprobe handlers to complete
+ *
+ * Global barrier: waits for every in-flight uprobe handler across the system
+ * to finish. Call once after a batch of rv_uprobe_unregister_nosync() calls
+ * and before any rv_uprobe_free() call.
+ */
+void rv_uprobe_sync(void);
+
+/**
+ * rv_uprobe_free - release resources of a previously deregistered probe
+ * @p: probe to free; may be NULL (no-op)
+ *
+ * Releases the path reference and frees the rv_uprobe struct. Must only
+ * be called after rv_uprobe_sync() has returned. The caller's priv data
+ * is NOT freed.
+ */
+void rv_uprobe_free(struct rv_uprobe *p);
+
#endif /* _RV_UPROBE_H */
diff --git a/include/uapi/linux/rv.h b/include/uapi/linux/rv.h
new file mode 100644
index 000000000000..a34e5426393b
--- /dev/null
+++ b/include/uapi/linux/rv.h
@@ -0,0 +1,86 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * UAPI definitions for Runtime Verification (RV) monitors.
+ *
+ * All RV monitors that expose an ioctl self-instrumentation interface
+ * share the magic byte RV_IOC_MAGIC ('r').
+ *
+ * Usage examples and design rationale are in:
+ * Documentation/trace/rv/monitor_tlob.rst
+ */
+
+#ifndef _UAPI_LINUX_RV_H
+#define _UAPI_LINUX_RV_H
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+/* Magic byte shared by all RV monitor ioctls. */
+#define RV_IOC_MAGIC 'r'
+
+/* Maximum monitor name length (including NUL terminator). */
+#define RV_MONITOR_NAME_MAX 32
+
+/* Generic /dev/rv ioctls (ioctl numbers 0–15 are reserved for the core) */
+
+/**
+ * struct rv_bind_args - arguments for RV_IOCTL_BIND_MONITOR
+ * @monitor_name: NUL-terminated name of the monitor to bind (e.g. "tlob").
+ */
+struct rv_bind_args {
+ char monitor_name[RV_MONITOR_NAME_MAX];
+};
+
+/*
+ * RV_IOCTL_BIND_MONITOR - associate this fd with a specific RV monitor.
+ *
+ * Must be called once after open() and before any monitor-specific ioctl.
+ *
+ * Returns 0 on success.
+ * Returns -EBUSY if this fd is already bound to a monitor.
+ * Returns -ENOENT if the requested monitor is not registered.
+ * Returns -ENOMEM on allocation failure.
+ */
+#define RV_IOCTL_BIND_MONITOR _IOW(RV_IOC_MAGIC, 0, struct rv_bind_args)
+
+/* tlob: task latency over budget monitor (ioctl numbers 1–15) */
+
+/**
+ * struct tlob_start_args - arguments for TLOB_IOCTL_TRACE_START
+ * @threshold_us: Total latency budget for this window, in microseconds.
+ * Must be greater than zero. Both on-CPU and off-CPU time
+ * (including runqueue wait) count toward this budget.
+ */
+struct tlob_start_args {
+ __u64 threshold_us;
+};
+
+/*
+ * TLOB_IOCTL_TRACE_START - begin monitoring the calling task.
+ *
+ * Arms a per-task hrtimer for threshold_us microseconds (CLOCK_MONOTONIC,
+ * so both on-CPU and off-CPU time count toward the budget).
+ *
+ * Returns 0 on success.
+ * Returns -EEXIST if TRACE_START was already called on this fd.
+ * Returns -ENOSPC if TLOB_MAX_MONITORED tasks are already being tracked.
+ * Returns -ENOMEM on allocation failure.
+ * Returns -ENODEV if the tlob monitor is not enabled.
+ * Returns -ERANGE if threshold_us is 0.
+ */
+#define TLOB_IOCTL_TRACE_START _IOW(RV_IOC_MAGIC, 1, struct tlob_start_args)
+
+/*
+ * TLOB_IOCTL_TRACE_STOP - end monitoring the calling task.
+ *
+ * Returns 0 if within budget.
+ * Returns -EOVERFLOW if the latency budget was exceeded.
+ * Returns -EINVAL if TLOB_IOCTL_TRACE_START was not called on this fd.
+ *
+ * poll/epoll: after TRACE_START the fd becomes readable (EPOLLIN) when the
+ * budget is exceeded. The caller may then issue TRACE_STOP to retrieve the
+ * result, or simply close the fd to clean up.
+ */
+#define TLOB_IOCTL_TRACE_STOP _IO(RV_IOC_MAGIC, 2)
+
+#endif /* _UAPI_LINUX_RV_H */
diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig
index e2e0033a00b9..1c36939db8e5 100644
--- a/kernel/trace/rv/Kconfig
+++ b/kernel/trace/rv/Kconfig
@@ -87,6 +87,8 @@ source "kernel/trace/rv/monitors/deadline/Kconfig"
source "kernel/trace/rv/monitors/nomiss/Kconfig"
# Add new deadline monitors here
+source "kernel/trace/rv/monitors/tlob/Kconfig"
+
# Add new monitors here
config RV_REACTORS
diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile
index f139b904bea3..8a5b5c84aff9 100644
--- a/kernel/trace/rv/Makefile
+++ b/kernel/trace/rv/Makefile
@@ -2,7 +2,7 @@
ccflags-y += -I $(src) # needed for trace events
-obj-$(CONFIG_RV) += rv.o
+obj-$(CONFIG_RV) += rv.o rv_chardev.o
obj-$(CONFIG_RV_MON_WIP) += monitors/wip/wip.o
obj-$(CONFIG_RV_MON_WWNR) += monitors/wwnr/wwnr.o
obj-$(CONFIG_RV_MON_SCHED) += monitors/sched/sched.o
@@ -17,6 +17,8 @@ obj-$(CONFIG_RV_MON_STS) += monitors/sts/sts.o
obj-$(CONFIG_RV_MON_NRP) += monitors/nrp/nrp.o
obj-$(CONFIG_RV_MON_SSSW) += monitors/sssw/sssw.o
obj-$(CONFIG_RV_MON_OPID) += monitors/opid/opid.o
+obj-$(CONFIG_RV_MON_TLOB) += monitors/tlob/tlob.o
+obj-$(CONFIG_TLOB_KUNIT_TEST) += monitors/tlob/tlob_kunit.o
obj-$(CONFIG_RV_MON_STALL) += monitors/stall/stall.o
obj-$(CONFIG_RV_MON_DEADLINE) += monitors/deadline/deadline.o
obj-$(CONFIG_RV_MON_NOMISS) += monitors/nomiss/nomiss.o
diff --git a/kernel/trace/rv/monitors/tlob/Kconfig b/kernel/trace/rv/monitors/tlob/Kconfig
new file mode 100644
index 000000000000..82e521891496
--- /dev/null
+++ b/kernel/trace/rv/monitors/tlob/Kconfig
@@ -0,0 +1,69 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+config RV_MON_TLOB
+ depends on RV
+ select RV_UPROBE
+ select HA_MON_EVENTS_ID
+ bool "tlob monitor"
+ help
+ Enable the tlob (task latency over budget) monitor. This monitor
+ tracks the elapsed time (CLOCK_MONOTONIC) of a marked code path
+ within a task (including both on-CPU and off-CPU time) and reports
+ a violation when the elapsed time exceeds a configurable budget.
+
+ The monitor uses a three-state hybrid automaton (running, waiting,
+ sleeping) stored per object using RV_MON_PER_OBJ. A single HA
+ clock invariant (clk_elapsed < BUDGET_NS) is enforced in all three
+ states via a per-task hrtimer.
+
+ States: running (initial, on-CPU), waiting (in runqueue, off-CPU),
+ sleeping (blocked on resource, off-CPU).
+ Key transitions:
+ running --(sleep)------> sleeping
+ running --(preempt)----> waiting
+ sleeping --(wakeup)-----> waiting
+ waiting --(switch_in)--> running
+ task_start calls da_handle_start_event() to set the initial state,
+ then arms the budget timer directly via ha_reset_clk_ns() +
+ ha_start_timer_ns(). task_stop cancels the timer synchronously via
+ ha_cancel_timer_sync() then calls da_monitor_reset().
+
+ Two userspace interfaces are provided:
+
+ tracefs uprobe binding (external, unmodified binaries):
+ echo "p PATH:OFFSET_START OFFSET_STOP threshold=US" \
+ > /sys/kernel/tracing/rv/monitors/tlob/monitor
+ The uprobe at offset_start fires tlob_start_task(); the uprobe at
+ offset_stop fires tlob_stop_task(). Both are plain entry uprobes
+ so a mistyped offset cannot corrupt the call stack.
+
+ /dev/rv ioctl (in-process self-instrumentation):
+ ioctl(fd, TLOB_IOCTL_TRACE_START, &args);
+ do_critical_work();
+ ret = ioctl(fd, TLOB_IOCTL_TRACE_STOP, NULL);
+ /* ret == -EOVERFLOW when budget exceeded */
+ Allows conditional monitoring, sub-function granularity, and
+ inline reaction to violations without polling the trace buffer.
+
+ Up to TLOB_MAX_MONITORED tasks may be monitored simultaneously.
+
+ Violations are always reported via the standard error_env_tlob RV
+ tracepoint regardless of which interface triggered them. The
+ tracefs interface requires only tracefs write permissions, avoiding
+ the CAP_BPF privilege needed for equivalent eBPF-based approaches.
+
+ For further information, see:
+ Documentation/trace/rv/monitor_tlob.rst
+
+config TLOB_KUNIT_TEST
+ tristate "KUnit tests for tlob monitor" if !KUNIT_ALL_TESTS
+ depends on RV_MON_TLOB && KUNIT
+ default KUNIT_ALL_TESTS
+ help
+ Enable KUnit in-kernel unit tests for the tlob RV monitor.
+
+ Tests cover automaton state transitions, the start/stop task
+ interface, scheduler context-switch accounting, and the uprobe
+ format string parser.
+
+ Say Y or M here to run the tlob KUnit test suite; otherwise say N.
diff --git a/kernel/trace/rv/monitors/tlob/tlob.c b/kernel/trace/rv/monitors/tlob/tlob.c
new file mode 100644
index 000000000000..475e972ae9aa
--- /dev/null
+++ b/kernel/trace/rv/monitors/tlob/tlob.c
@@ -0,0 +1,1307 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * tlob: task latency over budget monitor
+ *
+ * Track the elapsed wall-clock time of a marked code path and detect when
+ * a monitored task exceeds its per-task latency budget. CLOCK_MONOTONIC
+ * is used so both on-CPU and off-CPU time count toward the budget.
+ *
+ * On a budget violation, two tracepoints are emitted from the hrtimer
+ * callback: error_env_tlob signals the violation, and detail_env_tlob
+ * provides a per-state time breakdown (running_ns, waiting_ns, sleeping_ns)
+ * that pinpoints whether the overrun occurred in running, waiting, or sleeping state.
+ *
+ * The monitor uses RV_MON_PER_OBJ: per-task state (struct tlob_task_state)
+ * is stored as monitor_target in the framework's hash table.
+ *
+ * One HA clock invariant is enforced:
+ * clk_elapsed < BUDGET_NS() (active in all states)
+ *
+ * task_start uses da_handle_start_event() to set the initial state, then
+ * calls ha_reset_clk_ns() + ha_start_timer_ns() directly to initialise the
+ * clock and arm the budget timer; no dedicated timer-arming event is needed.
+ * The HA timer is cancelled synchronously by ha_cancel_timer_sync() in
+ * tlob_stop_task().
+ *
+ * Copyright (C) 2026 Wen Yang <wen.yang@linux.dev>
+ */
+#include <linux/completion.h>
+#include <linux/hrtimer.h>
+#include <linux/kernel.h>
+#include <linux/ktime.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/namei.h>
+#include <linux/refcount.h>
+#include <linux/rv.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/tracefs.h>
+#include <linux/uaccess.h>
+#include <kunit/visibility.h>
+#include <rv/instrumentation.h>
+#include <rv/rv_uprobe.h>
+#include <uapi/linux/rv.h>
+#include "../../rv.h"
+
+#define MODULE_NAME "tlob"
+
+#include <trace/events/sched.h>
+#include <rv_trace.h>
+
+/*
+ * Per-fd private data; one instance per open /dev/rv fd.
+ * monitoring: set while TRACE_START is active; cleared at TRACE_STOP.
+ * budget_exceeded: set by hrtimer callback; read at TRACE_STOP to report
+ * -EOVERFLOW even when cleanup was claimed by a concurrent stop_all or
+ * a task-exit handler.
+ */
+struct tlob_fpriv {
+ struct task_struct *task;
+ bool monitoring;
+ bool budget_exceeded;
+};
+
+/*
+ * Per-task latency monitoring state. One instance per monitoring window.
+ * Stored as monitor_target in da_monitor_storage; freed via call_rcu.
+ */
+struct tlob_task_state {
+ struct task_struct *task; /* via get_task_struct */
+ u64 threshold_us; /* budget in microseconds */
+
+ /* 1 = cleanup claimed; ha_setup_invariants won't restart the timer. */
+ atomic_t stopping;
+
+ /* Serialises the ns accumulators; held briefly (hardirq-safe). */
+ raw_spinlock_t entry_lock;
+ u64 running_ns; /* time in running state */
+ u64 waiting_ns; /* time in waiting state */
+ u64 sleeping_ns; /* time in sleeping state */
+ ktime_t last_ts;
+
+ /* store-release in TRACE_START ioctl, load-acquire in reset_notify. */
+ struct tlob_fpriv *fpriv;
+
+ struct rcu_head rcu; /* for call_rcu() teardown */
+};
+
+#define RV_MON_TYPE RV_MON_PER_OBJ
+#define HA_TIMER_TYPE HA_TIMER_HRTIMER
+/* Pool mode: da_handle_start_event uses da_fill_empty_storage, not kmalloc. */
+#define DA_SKIP_AUTO_ALLOC
+
+/* Type for da_monitor_storage.target; must be defined before the includes. */
+typedef struct tlob_task_state *monitor_target;
+
+/* Forward-declared so da_monitor_reset_hook works before ha_monitor.h. */
+static inline void tlob_reset_notify(struct da_monitor *da_mon);
+#define da_monitor_reset_hook tlob_reset_notify
+
+/*
+ * When the hrtimer fires (budget elapsed), the HA framework emits
+ * error_env_tlob with this label instead of the generic "none".
+ */
+#define MONITOR_TIMER_EVENT_NAME "budget_exceeded"
+
+#include "tlob.h"
+#include <rv/ha_monitor.h>
+
+/*
+ * Called from da_monitor_reset() on both normal stop and hrtimer expiry.
+ * On violation (stopping==0), emits detail_env_tlob.
+ */
+static inline void tlob_reset_notify(struct da_monitor *da_mon)
+{
+ struct ha_monitor *ha_mon = to_ha_monitor(da_mon);
+ struct tlob_task_state *ws;
+
+ ha_monitor_reset_env(da_mon);
+
+ ws = ha_get_target(ha_mon);
+ if (!ws)
+ return;
+
+ /*
+ * Emit per-state breakdown on budget violation only.
+ * stopping==0: timer callback owns this path (genuine overrun).
+ * stopping==1: normal stop claimed ownership first; skip.
+ */
+ if (!atomic_read(&ws->stopping)) {
+ unsigned int curr_state = READ_ONCE(da_mon->curr_state);
+ u64 running_ns, waiting_ns, sleeping_ns, partial_ns;
+ struct tlob_fpriv *fp;
+ unsigned long flags;
+
+ /*
+ * Snapshot accumulators; partial_ns covers curr_state time
+ * not yet folded in (transition-out pending).
+ */
+ raw_spin_lock_irqsave(&ws->entry_lock, flags);
+ partial_ns = ktime_get_ns() - ktime_to_ns(ws->last_ts);
+ running_ns = ws->running_ns +
+ (curr_state == running_tlob ? partial_ns : 0);
+ waiting_ns = ws->waiting_ns +
+ (curr_state == waiting_tlob ? partial_ns : 0);
+ sleeping_ns = ws->sleeping_ns +
+ (curr_state == sleeping_tlob ? partial_ns : 0);
+ raw_spin_unlock_irqrestore(&ws->entry_lock, flags);
+
+ trace_detail_env_tlob(da_get_id(da_mon), ws->threshold_us,
+ running_ns, waiting_ns, sleeping_ns);
+
+ /*
+ * Latch violation in the fd so TRACE_STOP can return -EOVERFLOW
+ * even if a concurrent stop_all or task-exit handler claims
+ * cleanup first. Pairs with smp_store_release in TRACE_START.
+ */
+ fp = smp_load_acquire(&ws->fpriv);
+ if (fp)
+ WRITE_ONCE(fp->budget_exceeded, true);
+ }
+}
+
+#define BUDGET_US(ha_mon) (ha_get_target(ha_mon)->threshold_us)
+#define BUDGET_NS(ha_mon) (BUDGET_US(ha_mon) * 1000ULL)
+
+/* HA constraint functions (called by ha_monitor_handle_constraint) */
+
+static u64 ha_get_env(struct ha_monitor *ha_mon, enum envs_tlob env, u64 time_ns)
+{
+ if (env == clk_elapsed_tlob)
+ return ha_get_clk_ns(ha_mon, env, time_ns);
+ return ENV_INVALID_VALUE;
+}
+
+static void ha_reset_env(struct ha_monitor *ha_mon, enum envs_tlob env, u64 time_ns)
+{
+ if (env == clk_elapsed_tlob)
+ ha_reset_clk_ns(ha_mon, env, time_ns);
+}
+
+/*
+ * ha_verify_invariants - clk_elapsed < BUDGET_NS must hold in all states.
+ */
+static inline bool ha_verify_invariants(struct ha_monitor *ha_mon,
+ enum states curr_state, enum events event,
+ enum states next_state, u64 time_ns)
+{
+ if (curr_state == running_tlob)
+ return ha_check_invariant_ns(ha_mon, clk_elapsed_tlob, time_ns);
+ else if (curr_state == sleeping_tlob)
+ return ha_check_invariant_ns(ha_mon, clk_elapsed_tlob, time_ns);
+ else if (curr_state == waiting_tlob)
+ return ha_check_invariant_ns(ha_mon, clk_elapsed_tlob, time_ns);
+ return true;
+}
+
+/*
+ * On a real state change, turn the clk_elapsed invariant (deadline) into a
+ * guard (reset anchor).  Nothing is done for self-loops, for states outside
+ * running/sleeping/waiting, or while the clock is still uninitialised
+ * (ENV_INVALID_VALUE): the race between da_handle_start_event() and
+ * ha_reset_clk_ns() would otherwise give U64_MAX - BUDGET_NS.
+ */
+static inline void ha_convert_inv_guard(struct ha_monitor *ha_mon,
+					enum states curr_state, enum events event,
+					enum states next_state, u64 time_ns)
+{
+	if (curr_state == next_state)
+		return;
+
+	if (curr_state != running_tlob &&
+	    curr_state != sleeping_tlob &&
+	    curr_state != waiting_tlob)
+		return;
+
+	if (ha_monitor_env_invalid(ha_mon, clk_elapsed_tlob))
+		return;
+
+	ha_inv_to_guard(ha_mon, clk_elapsed_tlob, BUDGET_NS(ha_mon), time_ns);
+}
+
+/*
+ * No per-event guard conditions for tlob; invariants suffice.
+ * Always returns true; the parameters are unused and only mirror the other
+ * ha_* constraint helpers called from ha_verify_constraint().
+ */
+static inline bool ha_verify_guards(struct ha_monitor *ha_mon,
+ enum states curr_state, enum events event,
+ enum states next_state, u64 time_ns)
+{
+ return true;
+}
+
+/*
+ * Arm or cancel the HA budget timer when the state actually changes.
+ *
+ * Entering running/sleeping/waiting (re)arms the timer unless the target is
+ * being stopped: sched_switch events can arrive after ha_cancel_timer_sync,
+ * and restarting the timer then triggers an ODEBUG "activate active" splat.
+ * Leaving one of those states for any other state cancels the timer.
+ */
+static inline void ha_setup_invariants(struct ha_monitor *ha_mon,
+				       enum states curr_state, enum events event,
+				       enum states next_state, u64 time_ns)
+{
+	if (next_state == curr_state)
+		return;
+
+	if (next_state == running_tlob ||
+	    next_state == sleeping_tlob ||
+	    next_state == waiting_tlob) {
+		if (atomic_read_acquire(&ha_get_target(ha_mon)->stopping))
+			return;
+		ha_start_timer_ns(ha_mon, clk_elapsed_tlob, BUDGET_NS(ha_mon), time_ns);
+		return;
+	}
+
+	if (curr_state == running_tlob ||
+	    curr_state == waiting_tlob ||
+	    curr_state == sleeping_tlob)
+		ha_cancel_timer(ha_mon);
+}
+
+/*
+ * Run the HA constraint pipeline for one transition, in order: verify the
+ * invariants, convert invariant to guard, verify the guards, then set up the
+ * invariants.  Returns false as soon as a verification step fails; the later
+ * steps are then skipped.
+ */
+static bool ha_verify_constraint(struct ha_monitor *ha_mon,
+				 enum states curr_state, enum events event,
+				 enum states next_state, u64 time_ns)
+{
+	bool ok;
+
+	ok = ha_verify_invariants(ha_mon, curr_state, event, next_state, time_ns);
+	if (ok) {
+		ha_convert_inv_guard(ha_mon, curr_state, event, next_state, time_ns);
+		ok = ha_verify_guards(ha_mon, curr_state, event, next_state, time_ns);
+	}
+	if (ok)
+		ha_setup_invariants(ha_mon, curr_state, event, next_state, time_ns);
+
+	return ok;
+}
+
+/* Slab cache for struct tlob_task_state; created in __tlob_init_monitor(). */
+static struct kmem_cache *tlob_state_cache;
+
+/* Tasks currently monitored: bumped in tlob_start_task(), dropped on stop. */
+static atomic_t tlob_num_monitored = ATOMIC_INIT(0);
+
+/* Uprobe binding list; protected by tlob_uprobe_mutex. */
+static LIST_HEAD(tlob_uprobe_list);
+static DEFINE_MUTEX(tlob_uprobe_mutex);
+
+/*
+ * Serialises duplicate-check + da_create_or_get() to prevent two concurrent
+ * callers for the same pid from both inserting into the hash table.
+ */
+static DEFINE_MUTEX(tlob_start_mutex);
+
+/*
+ * Counts open /dev/rv fds plus one synthetic ref held while enabled.
+ * __tlob_destroy_monitor() drops the synthetic ref and waits for zero
+ * before teardown, preventing kmem_cache_zalloc() on a destroyed cache.
+ * tlob_fd_released is completed by whichever fd release drops the last ref.
+ */
+static refcount_t tlob_fd_refcount = REFCOUNT_INIT(0);
+static DECLARE_COMPLETION(tlob_fd_released);
+
+/* Per-uprobe-binding state: a start + stop probe pair for one binary region. */
+struct tlob_uprobe_binding {
+ /* Link in tlob_uprobe_list. */
+ struct list_head list;
+ /* Budget handed to tlob_start_task() by the start-probe handler. */
+ u64 threshold_us;
+ /* Path as returned by d_path() when the binding was added. */
+ char binpath[TLOB_MAX_PATH];
+ /* File offsets of the start and stop probes. */
+ loff_t offset_start;
+ loff_t offset_stop;
+ /* Probe handles returned by rv_uprobe_attach_path(). */
+ struct rv_uprobe *start_probe;
+ struct rv_uprobe *stop_probe;
+};
+
+/* RCU callback: hand the tlob_task_state embedding @head back to its slab cache. */
+static void tlob_free_rcu(struct rcu_head *head)
+{
+	kmem_cache_free(tlob_state_cache,
+			container_of(head, struct tlob_task_state, rcu));
+}
+
+/*
+ * handle_sched_switch - account time and advance the DA on a context switch.
+ *
+ * For each side of the switch that is monitored, the time since last_ts is
+ * added to the matching counter under entry_lock, then one DA event is fed:
+ *   prev, prev_state != 0  -> sleep_tlob      (running -> sleeping)
+ *   prev, prev_state == 0  -> preempt_tlob    (running -> waiting)
+ *   next                   -> switch_in_tlob  (waiting -> running)
+ * The DA events are delivered after the RCU read section has ended.
+ */
+static void handle_sched_switch(void *data, bool preempt_unused,
+				struct task_struct *prev,
+				struct task_struct *next,
+				unsigned int prev_state)
+{
+	/* prev_state == 0: TASK_RUNNING (preempted); != 0: sleeping. */
+	const bool prev_preempted = !prev_state;
+	bool prev_tracked = false, next_tracked = false;
+	struct tlob_task_state *ws;
+	unsigned long flags;
+	ktime_t now;
+
+	rcu_read_lock();
+
+	/* Outgoing task: it was on the CPU until now. */
+	ws = da_get_target_by_id(prev->pid);
+	if (ws) {
+		raw_spin_lock_irqsave(&ws->entry_lock, flags);
+		now = ktime_get();
+		ws->running_ns += ktime_to_ns(ktime_sub(now, ws->last_ts));
+		ws->last_ts = now;
+		raw_spin_unlock_irqrestore(&ws->entry_lock, flags);
+		prev_tracked = true;
+	}
+
+	/* Incoming task: it was waiting for the CPU until now. */
+	ws = da_get_target_by_id(next->pid);
+	if (ws) {
+		raw_spin_lock_irqsave(&ws->entry_lock, flags);
+		now = ktime_get();
+		ws->waiting_ns += ktime_to_ns(ktime_sub(now, ws->last_ts));
+		ws->last_ts = now;
+		raw_spin_unlock_irqrestore(&ws->entry_lock, flags);
+		next_tracked = true;
+	}
+
+	rcu_read_unlock();
+
+	if (prev_tracked)
+		da_handle_event(prev->pid, NULL,
+				prev_preempted ? preempt_tlob : sleep_tlob);
+	if (next_tracked)
+		da_handle_event(next->pid, NULL, switch_in_tlob);
+}
+
+/*
+ * handle_sched_wakeup - sleeping -> waiting transition.
+ *
+ * Adds the time since last_ts to sleeping_ns and feeds wakeup_tlob to the DA.
+ * try_to_wake_up() skips TASK_RUNNING tasks, so this never fires for a
+ * task already in running or waiting state.
+ */
+static void handle_sched_wakeup(void *data, struct task_struct *p)
+{
+	struct tlob_task_state *ws;
+	unsigned long flags;
+	ktime_t now;
+
+	rcu_read_lock();
+	ws = da_get_target_by_id(p->pid);
+	if (!ws) {
+		rcu_read_unlock();
+		return;
+	}
+
+	/* Timestamp is sampled before taking entry_lock. */
+	now = ktime_get();
+	raw_spin_lock_irqsave(&ws->entry_lock, flags);
+	ws->sleeping_ns += ktime_to_ns(ktime_sub(now, ws->last_ts));
+	ws->last_ts = now;
+	raw_spin_unlock_irqrestore(&ws->entry_lock, flags);
+	rcu_read_unlock();
+
+	da_handle_event(p->pid, NULL, wakeup_tlob);
+}
+
+/*
+ * handle_sched_process_exit - clean up if a task exits without TRACE_STOP.
+ *
+ * Called in do_exit() context; the task still has a valid pid here.
+ * Only the lookup runs under RCU; tlob_stop_task() is called afterwards and
+ * its return value is ignored.
+ */
+static void handle_sched_process_exit(void *data, struct task_struct *p,
+				      bool group_dead)
+{
+	bool monitored;
+
+	rcu_read_lock();
+	monitored = da_get_target_by_id(p->pid) != NULL;
+	rcu_read_unlock();
+
+	if (!monitored)
+		return;
+
+	tlob_stop_task(p);
+}
+
+
+
+/**
+ * tlob_start_task - begin monitoring @task with budget @threshold_us us.
+ * @task: Task to monitor; may be current or another task.
+ * @threshold_us: Latency budget in microseconds (wall-clock; running +
+ *                waiting + sleeping).  Must be > 0 and small enough that
+ *                threshold_us * 1000 still fits in a u64.
+ *
+ * Allocates the per-task state, takes a reference on @task, registers the
+ * state under @task's pid, feeds the initial switch_in event and arms the
+ * budget timer.
+ *
+ * Return: 0 on success; -ENODEV if the monitor is not enabled; -ERANGE if
+ * @threshold_us is zero or too large; -EALREADY if @task is already
+ * monitored; -ENOMEM if allocation fails or the monitor slot disappears
+ * during setup; otherwise the error returned by da_create_or_get()
+ * (e.g. -ENOSPC).
+ */
+int tlob_start_task(struct task_struct *task, u64 threshold_us)
+{
+	struct tlob_task_state *ws_existing;
+	struct tlob_task_state *ws;
+	struct da_monitor *da_mon;
+	struct ha_monitor *ha_mon;
+	u64 now_ns;
+	int ret;
+
+	if (!da_monitor_enabled())
+		return -ENODEV;
+
+	/* BUDGET_NS() multiplies by 1000; refuse values that would wrap. */
+	if (threshold_us == 0 || threshold_us > U64_MAX / 1000ULL)
+		return -ERANGE;
+
+	/* Serialise duplicate-check + da_create_or_get for the same pid. */
+	guard(mutex)(&tlob_start_mutex);
+
+	rcu_read_lock();
+	ws_existing = da_get_target_by_id(task->pid);
+	rcu_read_unlock();
+	if (ws_existing)
+		return -EALREADY;
+
+	ws = kmem_cache_zalloc(tlob_state_cache, GFP_KERNEL);
+	if (!ws)
+		return -ENOMEM;
+
+	ws->task = task;
+	get_task_struct(task);
+	ws->threshold_us = threshold_us;
+	ws->last_ts = ktime_get();
+	raw_spin_lock_init(&ws->entry_lock);
+
+	/* Claim a pool slot (no kmalloc; DA_SKIP_AUTO_ALLOC + prealloc). */
+	ret = da_create_or_get(task->pid, ws);
+	if (ret) {
+		/* ws was never published, so it can be freed immediately. */
+		put_task_struct(task);
+		kmem_cache_free(tlob_state_cache, ws);
+		return ret;
+	}
+
+	atomic_inc(&tlob_num_monitored);
+
+	/* Hold RCU across handle + timer setup to keep da_mon valid. */
+	rcu_read_lock();
+	da_handle_start_event(task->pid, ws, switch_in_tlob);
+	da_mon = da_get_monitor(task->pid, NULL);
+	if (unlikely(!da_mon)) {
+		/* Slot registered; missing da_mon means concurrent destroy. */
+		rcu_read_unlock();
+		da_destroy_storage(task->pid);
+		atomic_dec(&tlob_num_monitored);
+		put_task_struct(task);
+		/*
+		 * ws has been visible through da_get_target_by_id() since
+		 * da_create_or_get(); RCU readers such as the sched
+		 * tracepoint handlers may still hold it, so defer the free
+		 * past a grace period like the other teardown paths do.
+		 */
+		call_rcu(&ws->rcu, tlob_free_rcu);
+		return -ENOMEM;
+	}
+	ha_mon = to_ha_monitor(da_mon);
+	now_ns = ktime_get_ns();
+	ha_reset_env(ha_mon, clk_elapsed_tlob, now_ns);
+	ha_start_timer_ns(ha_mon, clk_elapsed_tlob, BUDGET_NS(ha_mon), now_ns);
+	rcu_read_unlock();
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(tlob_start_task);
+
+/**
+ * tlob_stop_task - stop monitoring @task.
+ * @task: Task to stop.
+ *
+ * CAS on ws->stopping (0->1) under RCU claims cleanup ownership;
+ * the winner cancels the timer synchronously and frees all resources:
+ * the monitor storage slot, the task reference held in ws->task and,
+ * after an RCU grace period, ws itself.
+ *
+ * Return: 0, -EOVERFLOW (budget exceeded), -ESRCH (not monitored),
+ * or -EAGAIN (concurrent caller claimed cleanup).
+ */
+int tlob_stop_task(struct task_struct *task)
+{
+ struct da_monitor *da_mon;
+ struct ha_monitor *ha_mon;
+ struct tlob_task_state *ws;
+ bool budget_exceeded;
+
+ rcu_read_lock();
+ ws = da_get_target_by_id(task->pid);
+ if (!ws) {
+ rcu_read_unlock();
+ return -ESRCH;
+ }
+
+ da_mon = da_get_monitor(task->pid, NULL);
+ if (unlikely(!da_mon)) {
+ /* ws in hash but da_mon gone; internal inconsistency. */
+ rcu_read_unlock();
+ WARN_ON_ONCE(1);
+ return -ESRCH;
+ }
+
+ ha_mon = to_ha_monitor(da_mon);
+
+ /*
+ * CAS (0->1) claims cleanup ownership under RCU (ws guaranteed valid).
+ * _release pairs with atomic_read_acquire in ha_setup_invariants.
+ */
+ if (atomic_cmpxchg_release(&ws->stopping, 0, 1) != 0) {
+ rcu_read_unlock();
+ return -EAGAIN;
+ }
+
+ rcu_read_unlock();
+
+ /* Wait for in-flight timer callback before reading da_monitoring. */
+ ha_cancel_timer_sync(ha_mon);
+
+ /* Timer fired first -> budget exceeded; otherwise reset normally. */
+ rcu_read_lock();
+ budget_exceeded = !da_monitoring(da_mon);
+ if (!budget_exceeded)
+ da_monitor_reset(da_mon);
+ rcu_read_unlock();
+ /* Owner-only teardown: unregister the slot and drop the monitored count. */
+ da_destroy_storage(task->pid);
+ atomic_dec(&tlob_num_monitored);
+
+ /* Drop the reference taken in tlob_start_task(); free ws after a grace period. */
+ put_task_struct(ws->task);
+ call_rcu(&ws->rcu, tlob_free_rcu);
+ return budget_exceeded ? -EOVERFLOW : 0;
+}
+EXPORT_SYMBOL_GPL(tlob_stop_task);
+
+/*
+ * tlob_stop_all - stop monitoring every task that is still registered.
+ *
+ * Pass 1 snapshots up to TLOB_MAX_MONITORED pids from the monitor hash table
+ * under RCU.  Pass 2 looks each pid up again and, if this caller wins the
+ * ws->stopping CAS, performs the same teardown as tlob_stop_task(): cancel
+ * the timer synchronously, reset the monitor, release the storage slot and
+ * the task reference, and free ws after a grace period.  Entries that
+ * disappeared or were claimed by another caller in between are skipped.
+ */
+static void tlob_stop_all(void)
+{
+	struct da_monitor_storage *ms;
+	pid_t pids[TLOB_MAX_MONITORED];
+	int bkt, n = 0;
+
+	/* Snapshot pids under RCU; re-derive ws under a fresh lock below. */
+	rcu_read_lock();
+	hash_for_each_rcu(da_monitor_ht, bkt, ms, node) {
+		if (ms->target && n < TLOB_MAX_MONITORED)
+			pids[n++] = ms->id;
+	}
+	rcu_read_unlock();
+
+	for (int i = 0; i < n; i++) {
+		pid_t pid = pids[i];
+		struct da_monitor *da_mon;
+		struct ha_monitor *ha_mon;
+		struct tlob_task_state *ws;
+
+		rcu_read_lock();
+		da_mon = da_get_monitor(pid, NULL);
+		if (!da_mon) {
+			/* Cleaned up by tlob_stop_task or exit handler. */
+			rcu_read_unlock();
+			continue;
+		}
+
+		ws = da_get_target(da_mon);
+		if (!ws) {
+			/*
+			 * The snapshot above treats a NULL target as possible;
+			 * apply the same check here before touching ws.
+			 */
+			rcu_read_unlock();
+			continue;
+		}
+		ha_mon = to_ha_monitor(da_mon);
+
+		/* CAS (0->1) claims ownership; skip if another caller won. */
+		if (atomic_cmpxchg_release(&ws->stopping, 0, 1) != 0) {
+			rcu_read_unlock();
+			continue;
+		}
+		rcu_read_unlock();
+
+		/* From here on this caller owns the teardown of ws. */
+		ha_cancel_timer_sync(ha_mon);
+
+		scoped_guard(rcu) {
+			da_monitor_reset(da_mon);
+		}
+		da_destroy_storage(pid);
+		atomic_dec(&tlob_num_monitored);
+		put_task_struct(ws->task);
+		call_rcu(&ws->rcu, tlob_free_rcu);
+	}
+}
+
+/*
+ * Start-probe handler: begin monitoring the current task with the budget of
+ * the binding stored in @p->priv.  The result of tlob_start_task() is
+ * deliberately ignored; the handler always returns 0.
+ */
+static int tlob_uprobe_entry_handler(struct rv_uprobe *p, struct pt_regs *regs,
+				     __u64 *data)
+{
+	const struct tlob_uprobe_binding *binding = p->priv;
+
+	(void)tlob_start_task(current, binding->threshold_us);
+
+	return 0;
+}
+
+/*
+ * Stop-probe handler: stop monitoring the current task.  The result of
+ * tlob_stop_task() is deliberately ignored; the handler always returns 0.
+ */
+static int tlob_uprobe_stop_handler(struct rv_uprobe *p, struct pt_regs *regs,
+				    __u64 *data)
+{
+	(void)tlob_stop_task(current);
+
+	return 0;
+}
+
+/*
+ * Register start + stop entry uprobes for a binding.
+ * Called with tlob_uprobe_mutex held.
+ *
+ * @threshold_us: budget stored in the binding for the start handler.
+ * @binpath:      absolute path of the target file.
+ * @offset_start: file offset of the start probe.
+ * @offset_stop:  file offset of the stop probe.
+ *
+ * Returns 0 on success; -EINVAL if @binpath is not absolute or does not name
+ * a regular file; -ENOMEM if the binding cannot be allocated; -EEXIST if a
+ * binding with the same file and start offset is already listed; otherwise
+ * the error from kern_path(), d_path() or rv_uprobe_attach_path().
+ */
+static int tlob_add_uprobe(u64 threshold_us, const char *binpath,
+ loff_t offset_start, loff_t offset_stop)
+{
+ struct tlob_uprobe_binding *b, *tmp_b;
+ char pathbuf[TLOB_MAX_PATH];
+ struct path path;
+ char *canon;
+ int ret;
+
+ if (binpath[0] != '/')
+ return -EINVAL;
+
+ b = kzalloc_obj(*b, GFP_KERNEL);
+ if (!b)
+ return -ENOMEM;
+
+ b->threshold_us = threshold_us;
+ b->offset_start = offset_start;
+ b->offset_stop = offset_stop;
+
+ /* Resolve the path; the reference is dropped on every exit below. */
+ ret = kern_path(binpath, LOOKUP_FOLLOW, &path);
+ if (ret)
+ goto err_free;
+
+ if (!d_is_reg(path.dentry)) {
+ ret = -EINVAL;
+ goto err_path;
+ }
+
+ /* Reject duplicate start offset for the same binary. */
+ list_for_each_entry(tmp_b, &tlob_uprobe_list, list) {
+ if (tmp_b->offset_start == offset_start &&
+ tmp_b->start_probe->path.dentry == path.dentry) {
+ ret = -EEXIST;
+ goto err_path;
+ }
+ }
+
+ /* Record the path as reported by d_path() for tlob_monitor_read(). */
+ canon = d_path(&path, pathbuf, sizeof(pathbuf));
+ if (IS_ERR(canon)) {
+ ret = PTR_ERR(canon);
+ goto err_path;
+ }
+ strscpy(b->binpath, canon, sizeof(b->binpath));
+
+ /* Both probes share b (priv) and path; attach_path refs path itself. */
+ b->start_probe = rv_uprobe_attach_path(&path, offset_start,
+ tlob_uprobe_entry_handler, NULL, b);
+ if (IS_ERR(b->start_probe)) {
+ ret = PTR_ERR(b->start_probe);
+ b->start_probe = NULL;
+ goto err_path;
+ }
+
+ b->stop_probe = rv_uprobe_attach_path(&path, offset_stop,
+ tlob_uprobe_stop_handler, NULL, b);
+ if (IS_ERR(b->stop_probe)) {
+ ret = PTR_ERR(b->stop_probe);
+ b->stop_probe = NULL;
+ goto err_start;
+ }
+
+ /* Success: drop the local path reference and publish the binding. */
+ path_put(&path);
+ list_add_tail(&b->list, &tlob_uprobe_list);
+ return 0;
+
+ /* Unwind in reverse order of acquisition. */
+err_start:
+ rv_uprobe_detach(b->start_probe);
+err_path:
+ path_put(&path);
+err_free:
+ kfree(b);
+ return ret;
+}
+
+/*
+ * Remove the binding identified by (@binpath, @offset_start).
+ *
+ * @binpath is resolved with kern_path() and compared by dentry against the
+ * start probe of each listed binding.  The first match is unlinked, both of
+ * its probes are detached and the binding is freed.
+ *
+ * Returns 0 if a binding was removed, -ENOENT if none matched, or the error
+ * from kern_path().
+ */
+static int tlob_remove_uprobe_by_key(loff_t offset_start, const char *binpath)
+{
+	struct tlob_uprobe_binding *b, *match = NULL;
+	struct path remove_path;
+	int ret;
+
+	ret = kern_path(binpath, LOOKUP_FOLLOW, &remove_path);
+	if (ret)
+		return ret;
+
+	list_for_each_entry(b, &tlob_uprobe_list, list) {
+		if (b->offset_start == offset_start &&
+		    b->start_probe->path.dentry == remove_path.dentry) {
+			match = b;
+			break;
+		}
+	}
+
+	ret = -ENOENT;
+	if (match) {
+		list_del(&match->list);
+		rv_uprobe_detach(match->start_probe);
+		rv_uprobe_detach(match->stop_probe);
+		kfree(match);
+		ret = 0;
+	}
+
+	path_put(&remove_path);
+	return ret;
+}
+
+/*
+ * Remove every uprobe binding.
+ *
+ * Works in three phases so that only one rv_uprobe_sync() is needed:
+ * unregister all probes without waiting while moving their bindings to a
+ * private list (under tlob_uprobe_mutex), wait once, then free the probes
+ * and bindings.
+ */
+static void tlob_remove_all_uprobes(void)
+{
+ struct tlob_uprobe_binding *b, *tmp;
+ LIST_HEAD(pending);
+
+ mutex_lock(&tlob_uprobe_mutex);
+ list_for_each_entry_safe(b, tmp, &tlob_uprobe_list, list) {
+ list_move(&b->list, &pending);
+ rv_uprobe_unregister_nosync(b->start_probe);
+ rv_uprobe_unregister_nosync(b->stop_probe);
+ }
+ mutex_unlock(&tlob_uprobe_mutex);
+
+ /* Nothing was registered: skip the barrier entirely. */
+ if (list_empty(&pending))
+ return;
+
+ /*
+ * One global barrier for all probes dequeued above; no new handlers
+ * for any of them can fire after this returns.
+ */
+ rv_uprobe_sync();
+
+ list_for_each_entry_safe(b, tmp, &pending, list) {
+ rv_uprobe_free(b->start_probe);
+ rv_uprobe_free(b->stop_probe);
+ kfree(b);
+ }
+}
+
+/*
+ * read() handler of the tracefs "monitor" file: one line per uprobe binding,
+ * in the same "p PATH:START STOP threshold=US" form that write() accepts.
+ *
+ * The text is built into a temporary buffer sized from the number of
+ * bindings while tlob_uprobe_mutex is held; the copy to user space happens
+ * after the mutex has been released.  Returns the result of
+ * simple_read_from_buffer(), or -ENOMEM if the buffer cannot be allocated.
+ */
+static ssize_t tlob_monitor_read(struct file *file,
+				 char __user *ubuf,
+				 size_t count, loff_t *ppos)
+{
+	const int line_sz = TLOB_MAX_PATH + 128;
+	struct tlob_uprobe_binding *b;
+	int nr_bindings = 0, len = 0, buf_sz;
+	ssize_t ret;
+	char *buf;
+
+	mutex_lock(&tlob_uprobe_mutex);
+
+	list_for_each_entry(b, &tlob_uprobe_list, list)
+		nr_bindings++;
+
+	/* Always allocate room for at least one line. */
+	buf_sz = (nr_bindings > 0 ? nr_bindings : 1) * line_sz + 1;
+	buf = kmalloc(buf_sz, GFP_KERNEL);
+	if (!buf) {
+		mutex_unlock(&tlob_uprobe_mutex);
+		return -ENOMEM;
+	}
+
+	list_for_each_entry(b, &tlob_uprobe_list, list)
+		len += scnprintf(buf + len, buf_sz - len,
+				 "p %s:0x%llx 0x%llx threshold=%llu\n",
+				 b->binpath,
+				 (unsigned long long)b->offset_start,
+				 (unsigned long long)b->offset_stop,
+				 b->threshold_us);
+
+	mutex_unlock(&tlob_uprobe_mutex);
+
+	ret = simple_read_from_buffer(ubuf, count, ppos, buf, len);
+	kfree(buf);
+	return ret;
+}
+
+/*
+ * Parse "p PATH:OFFSET_START OFFSET_STOP threshold=US".
+ *
+ * @buf is modified in place (separators are overwritten with NULs) and
+ * *@path_out points into it.  PATH must be absolute and may contain ':';
+ * the last ':' of the first token separates path from offset.  Offsets are
+ * read with base auto-detection, must be non-negative, must differ from each
+ * other and must not be followed by stray characters.  OFFSET_STOP and
+ * threshold= may appear in either order; threshold must be non-zero.
+ *
+ * Returns 0 and fills the out parameters, or -EINVAL.
+ */
+static int tlob_parse_uprobe_line(char *buf, u64 *thr_out,
+				  char **path_out,
+				  loff_t *start_out, loff_t *stop_out)
+{
+	unsigned long long thr = 0, stop_val = 0;
+	long long start_val;
+	char *p, *path_token, *token, *colon;
+	bool got_stop = false, got_thr = false;
+	int n;
+
+	/* Must start with "p " */
+	if (buf[0] != 'p' || buf[1] != ' ')
+		return -EINVAL;
+
+	p = buf + 2;
+	while (*p == ' ')
+		p++;
+
+	/* First space-delimited token is PATH:OFFSET_START */
+	path_token = strsep(&p, " \t");
+	if (!path_token || !*path_token)
+		return -EINVAL;
+
+	/* Split at last ':' to handle paths that contain ':'. */
+	colon = strrchr(path_token, ':');
+	if (!colon || colon - path_token < 2)
+		return -EINVAL;
+	*colon = '\0';
+
+	if (path_token[0] != '/')
+		return -EINVAL;
+
+	n = 0;
+	if (sscanf(colon + 1, "%lli%n", &start_val, &n) != 1 || n == 0)
+		return -EINVAL;
+	/* Reject trailing junk ("0x10zz"), as kstrtoull() does for threshold=. */
+	if (colon[1 + n] != '\0')
+		return -EINVAL;
+	if (start_val < 0)
+		return -EINVAL;
+
+	/* Remaining tokens: OFFSET_STOP threshold=US */
+	while (p && (token = strsep(&p, " \t")) != NULL) {
+		/* Runs of separators produce empty tokens; skip them. */
+		if (!*token)
+			continue;
+		if (strncmp(token, "threshold=", 10) == 0) {
+			if (kstrtoull(token + 10, 0, &thr))
+				return -EINVAL;
+			got_thr = true;
+		} else if (!got_stop) {
+			long long sv;
+
+			n = 0;
+			if (sscanf(token, "%lli%n", &sv, &n) != 1 || n == 0)
+				return -EINVAL;
+			if (token[n] != '\0')
+				return -EINVAL;
+			if (sv < 0)
+				return -EINVAL;
+			stop_val = (unsigned long long)sv;
+			got_stop = true;
+		} else {
+			/* A second bare token is not part of the grammar. */
+			return -EINVAL;
+		}
+	}
+
+	if (!got_stop || !got_thr || thr == 0)
+		return -EINVAL;
+	if (start_val == (long long)stop_val)
+		return -EINVAL;
+
+	*thr_out = thr;
+	*path_out = path_token;
+	*start_out = (loff_t)start_val;
+	*stop_out = (loff_t)stop_val;
+	return 0;
+}
+
+/*
+ * Parse "-PATH:OFFSET_START" (ftrace uprobe_events removal convention).
+ *
+ * @buf is modified in place (the last ':' becomes a NUL) and *@path_out
+ * points into it.  PATH must be absolute; OFFSET_START is read with base
+ * auto-detection, must be non-negative and must not be followed by stray
+ * characters, matching what the add syntax accepts.
+ *
+ * Returns 0 and fills the out parameters, or -EINVAL.
+ */
+static int tlob_parse_remove_line(char *buf, char **path_out, loff_t *start_out)
+{
+	char *binpath, *colon;
+	long long off;
+	int n = 0;
+
+	if (buf[0] != '-')
+		return -EINVAL;
+	binpath = buf + 1;
+	if (binpath[0] != '/')
+		return -EINVAL;
+	colon = strrchr(binpath, ':');
+	if (!colon || colon - binpath < 2)
+		return -EINVAL;
+	*colon = '\0';
+	if (sscanf(colon + 1, "%lli%n", &off, &n) != 1 || n == 0)
+		return -EINVAL;
+	/* No binding can have a negative start offset or a junk suffix. */
+	if (colon[1 + n] != '\0' || off < 0)
+		return -EINVAL;
+	*path_out = binpath;
+	*start_out = (loff_t)off;
+	return 0;
+}
+
+/*
+ * Apply one command line to the uprobe binding list.
+ *
+ * A line starting with '-' removes a binding, anything else is parsed as an
+ * add request.  @buf is modified by the parsers.  Parsing runs unlocked; the
+ * list update runs under tlob_uprobe_mutex.  Returns 0 or a negative errno
+ * from the parser or from the add/remove helper.
+ */
+VISIBLE_IF_KUNIT int tlob_create_or_delete_uprobe(char *buf)
+{
+	const bool removing = buf[0] == '-';
+	loff_t offset_start, offset_stop;
+	u64 threshold_us;
+	char *binpath;
+	int err;
+
+	if (removing)
+		err = tlob_parse_remove_line(buf, &binpath, &offset_start);
+	else
+		err = tlob_parse_uprobe_line(buf, &threshold_us, &binpath,
+					     &offset_start, &offset_stop);
+	if (err)
+		return err;
+
+	guard(mutex)(&tlob_uprobe_mutex);
+
+	if (removing)
+		return tlob_remove_uprobe_by_key(offset_start, binpath);
+
+	return tlob_add_uprobe(threshold_us, binpath, offset_start, offset_stop);
+}
+EXPORT_SYMBOL_IF_KUNIT(tlob_create_or_delete_uprobe);
+
+/*
+ * write() handler of the tracefs "monitor" file.
+ *
+ * Copies one command of fewer than TLOB_MAX_PATH + 128 bytes from user
+ * space, strips a single trailing newline and passes it to
+ * tlob_create_or_delete_uprobe().  Returns @count on success, -EINVAL if the
+ * input is too long, -EFAULT on a failed copy, or the command's error.
+ */
+static ssize_t tlob_monitor_write(struct file *file,
+				  const char __user *ubuf,
+				  size_t count, loff_t *ppos)
+{
+	char buf[TLOB_MAX_PATH + 128];
+	int err;
+
+	if (count >= sizeof(buf))
+		return -EINVAL;
+	if (copy_from_user(buf, ubuf, count))
+		return -EFAULT;
+
+	buf[count] = '\0';
+	if (count > 0 && buf[count - 1] == '\n')
+		buf[count - 1] = '\0';
+
+	err = tlob_create_or_delete_uprobe(buf);
+	if (err)
+		return err;
+
+	return (ssize_t)count;
+}
+
+/* File operations of the tracefs "monitor" file created in register_tlob(). */
+static const struct file_operations tlob_monitor_fops = {
+ .open = simple_open,
+ .read = tlob_monitor_read,
+ .write = tlob_monitor_write,
+ .llseek = noop_llseek,
+};
+
+/*
+ * Set up everything the monitor needs before events can be handled: the
+ * tlob_task_state slab cache, the preallocated monitor pool, and the
+ * synthetic fd reference; then mark the monitor enabled.
+ *
+ * Returns 0, -ENOMEM if the cache cannot be created, or the error from
+ * da_monitor_init_prealloc() (the cache is destroyed again in that case).
+ */
+static int __tlob_init_monitor(void)
+{
+ int retval;
+
+ tlob_state_cache = kmem_cache_create("tlob_task_state",
+ sizeof(struct tlob_task_state),
+ 0, 0, NULL);
+ if (!tlob_state_cache)
+ return -ENOMEM;
+
+ atomic_set(&tlob_num_monitored, 0);
+
+ retval = da_monitor_init_prealloc(TLOB_MAX_MONITORED);
+ if (retval) {
+ kmem_cache_destroy(tlob_state_cache);
+ tlob_state_cache = NULL;
+ return retval;
+ }
+
+ /* Synthetic reference: held while the monitor is enabled. */
+ reinit_completion(&tlob_fd_released);
+ refcount_set(&tlob_fd_refcount, 1);
+
+ rv_this.enabled = 1;
+ return 0;
+}
+
+/*
+ * Tear the monitor down: mark it disabled, remove all uprobes, stop every
+ * monitored task, wait for RCU callbacks and for all open fds, then destroy
+ * the monitor pool and the slab cache.  The order of the steps matters; see
+ * the comments on each step.
+ */
+static void __tlob_destroy_monitor(void)
+{
+ rv_this.enabled = 0;
+ /*
+ * Remove uprobes first so stop_task can't race with tlob_stop_all().
+ * rv_uprobe_sync() inside ensures all in-flight handlers have finished.
+ */
+ tlob_remove_all_uprobes();
+ tlob_stop_all();
+ /* Wait for tlob_free_rcu and da_pool_return_cb before pool teardown. */
+ synchronize_rcu();
+
+ /*
+ * Drop the synthetic ref and wait for all open fds to close before
+ * teardown; prevents kmem_cache_zalloc() on the destroyed cache.
+ */
+ if (!refcount_dec_and_test(&tlob_fd_refcount))
+ wait_for_completion(&tlob_fd_released);
+
+ da_monitor_destroy();
+ kmem_cache_destroy(tlob_state_cache);
+ tlob_state_cache = NULL;
+}
+
+/* KUnit wrappers that acquire rv_interface_lock around monitor init/destroy. */
+#if IS_ENABLED(CONFIG_KUNIT)
+/* KUnit entry point: run __tlob_init_monitor() with rv_interface_lock held. */
+int tlob_init_monitor(void)
+{
+	guard(mutex)(&rv_interface_lock);
+
+	return __tlob_init_monitor();
+}
+EXPORT_SYMBOL_GPL(tlob_init_monitor);
+
+/* KUnit entry point: run __tlob_destroy_monitor() with rv_interface_lock held. */
+void tlob_destroy_monitor(void)
+{
+	guard(mutex)(&rv_interface_lock);
+
+	__tlob_destroy_monitor();
+}
+EXPORT_SYMBOL_GPL(tlob_destroy_monitor);
+
+/* KUnit helper: current value of the tlob_num_monitored counter. */
+int tlob_num_monitored_read(void)
+{
+ return atomic_read(&tlob_num_monitored);
+}
+EXPORT_SYMBOL_IF_KUNIT(tlob_num_monitored_read);
+
+/*
+ * Tracepoint probes for KUnit; rv_trace.h is only included here.
+ * The probes below store a copy of the most recent event / error_env record
+ * and count how many of each were seen.
+ */
+static struct tlob_captured_event tlob_kunit_last_event;
+static struct tlob_captured_error_env tlob_kunit_last_error_env;
+static atomic_t tlob_kunit_event_cnt = ATOMIC_INIT(0);
+static atomic_t tlob_kunit_error_env_cnt = ATOMIC_INIT(0);
+
+/*
+ * event_tlob probe: copy the record into tlob_kunit_last_event (strings are
+ * truncated to the capture buffers) and bump the event counter.
+ */
+static void tlob_kunit_event_probe(void *data, int id, char *state, char *event,
+				   char *next_state, bool final_state)
+{
+	struct tlob_captured_event *rec = &tlob_kunit_last_event;
+
+	rec->id = id;
+	strscpy(rec->state, state, sizeof(rec->state));
+	strscpy(rec->event, event, sizeof(rec->event));
+	strscpy(rec->next_state, next_state, sizeof(rec->next_state));
+	rec->final_state = final_state;
+
+	atomic_inc(&tlob_kunit_event_cnt);
+}
+
+/*
+ * error_env_tlob probe: copy the record into tlob_kunit_last_error_env
+ * (strings are truncated to the capture buffers) and bump the error counter.
+ */
+static void tlob_kunit_error_env_probe(void *data, int id, char *state,
+				       char *event, char *env)
+{
+	struct tlob_captured_error_env *rec = &tlob_kunit_last_error_env;
+
+	rec->id = id;
+	strscpy(rec->state, state, sizeof(rec->state));
+	strscpy(rec->event, event, sizeof(rec->event));
+	strscpy(rec->env, env, sizeof(rec->env));
+
+	atomic_inc(&tlob_kunit_error_env_cnt);
+}
+
+/*
+ * Reset both capture counters and attach the two KUnit probes.  If the
+ * second registration fails the first one is undone.  Returns 0 or the
+ * registration error.
+ */
+int tlob_register_kunit_probes(void)
+{
+	int err;
+
+	atomic_set(&tlob_kunit_event_cnt, 0);
+	atomic_set(&tlob_kunit_error_env_cnt, 0);
+
+	err = register_trace_event_tlob(tlob_kunit_event_probe, NULL);
+	if (err)
+		return err;
+
+	err = register_trace_error_env_tlob(tlob_kunit_error_env_probe, NULL);
+	if (err)
+		unregister_trace_event_tlob(tlob_kunit_event_probe, NULL);
+
+	return err;
+}
+EXPORT_SYMBOL_IF_KUNIT(tlob_register_kunit_probes);
+
+/*
+ * Detach both KUnit probes, then call tracepoint_synchronize_unregister()
+ * before returning.
+ */
+void tlob_unregister_kunit_probes(void)
+{
+ unregister_trace_event_tlob(tlob_kunit_event_probe, NULL);
+ unregister_trace_error_env_tlob(tlob_kunit_error_env_probe, NULL);
+ tracepoint_synchronize_unregister();
+}
+EXPORT_SYMBOL_IF_KUNIT(tlob_unregister_kunit_probes);
+
+/* KUnit helper: number of event_tlob records counted by the probe. */
+int tlob_event_count_read(void)
+{
+ return atomic_read(&tlob_kunit_event_cnt);
+}
+EXPORT_SYMBOL_IF_KUNIT(tlob_event_count_read);
+
+/* KUnit helper: reset the event_tlob counter to zero. */
+void tlob_event_count_reset(void)
+{
+ atomic_set(&tlob_kunit_event_cnt, 0);
+}
+EXPORT_SYMBOL_IF_KUNIT(tlob_event_count_reset);
+
+/* KUnit helper: number of error_env_tlob records counted by the probe. */
+int tlob_error_env_count_read(void)
+{
+ return atomic_read(&tlob_kunit_error_env_cnt);
+}
+EXPORT_SYMBOL_IF_KUNIT(tlob_error_env_count_read);
+
+/* KUnit helper: reset the error_env_tlob counter to zero. */
+void tlob_error_env_count_reset(void)
+{
+ atomic_set(&tlob_kunit_error_env_cnt, 0);
+}
+EXPORT_SYMBOL_IF_KUNIT(tlob_error_env_count_reset);
+
+/*
+ * KUnit helper: pointer to the static record last written by the event
+ * probe.  The storage is overwritten by the next event.
+ */
+const struct tlob_captured_event *tlob_last_event_read(void)
+{
+ return &tlob_kunit_last_event;
+}
+EXPORT_SYMBOL_IF_KUNIT(tlob_last_event_read);
+
+/*
+ * KUnit helper: pointer to the static record last written by the error_env
+ * probe.  The storage is overwritten by the next error_env event.
+ */
+const struct tlob_captured_error_env *tlob_last_error_env_read(void)
+{
+ return &tlob_kunit_last_error_env;
+}
+EXPORT_SYMBOL_IF_KUNIT(tlob_last_error_env_read);
+
+#endif /* CONFIG_KUNIT */
+
+/*
+ * Attach the sched_switch, sched_wakeup and sched_process_exit handlers.
+ * Always returns 0.
+ */
+VISIBLE_IF_KUNIT int tlob_enable_hooks(void)
+{
+ rv_attach_trace_probe("tlob", sched_switch, handle_sched_switch);
+ rv_attach_trace_probe("tlob", sched_wakeup, handle_sched_wakeup);
+ rv_attach_trace_probe("tlob", sched_process_exit, handle_sched_process_exit);
+ return 0;
+}
+EXPORT_SYMBOL_IF_KUNIT(tlob_enable_hooks);
+
+/* Detach the three handlers attached by tlob_enable_hooks(). */
+VISIBLE_IF_KUNIT void tlob_disable_hooks(void)
+{
+ rv_detach_trace_probe("tlob", sched_switch, handle_sched_switch);
+ rv_detach_trace_probe("tlob", sched_wakeup, handle_sched_wakeup);
+ rv_detach_trace_probe("tlob", sched_process_exit, handle_sched_process_exit);
+}
+EXPORT_SYMBOL_IF_KUNIT(tlob_disable_hooks);
+
+/*
+ * rv_monitor .enable callback: initialise the monitor, then attach the
+ * tracepoint handlers.  Returns the init error, or the result of
+ * tlob_enable_hooks().
+ */
+static int enable_tlob(void)
+{
+	int err = __tlob_init_monitor();
+
+	return err ? err : tlob_enable_hooks();
+}
+
+/*
+ * rv_monitor .disable callback: detach the tracepoint handlers first, then
+ * tear the monitor down.
+ */
+static void disable_tlob(void)
+{
+ tlob_disable_hooks();
+ __tlob_destroy_monitor();
+}
+
+/*
+ * Monitor descriptor registered in register_tlob().  .enabled is toggled by
+ * __tlob_init_monitor() / __tlob_destroy_monitor().
+ */
+static struct rv_monitor rv_this = {
+ .name = "tlob",
+ .description = "Per-task latency-over-budget monitor.",
+ .enable = enable_tlob,
+ .disable = disable_tlob,
+ .reset = da_monitor_reset_all,
+ .enabled = 0,
+};
+
+/*
+ * Chardev bind hook: allocate the per-fd private state and pin the monitor.
+ *
+ * The pin is a reference on tlob_fd_refcount that keeps the cache/pool alive
+ * for the fd's lifetime; it is balanced in tlob_chardev_release().  If the
+ * synthetic reference has already been dropped (__tlob_destroy_monitor ran
+ * to completion) the count is zero, no pin is taken and the caller gets
+ * -ENODEV instead of a corrupted zero refcount.
+ *
+ * Returns the private state, ERR_PTR(-ENOMEM) or ERR_PTR(-ENODEV).
+ */
+static void *tlob_chardev_bind(void)
+{
+	struct tlob_fpriv *fpriv;
+
+	fpriv = kzalloc_obj(*fpriv, GFP_KERNEL);
+	if (!fpriv)
+		return ERR_PTR(-ENOMEM);
+
+	if (refcount_inc_not_zero(&tlob_fd_refcount))
+		return fpriv;
+
+	kfree(fpriv);
+	return ERR_PTR(-ENODEV);
+}
+
+/*
+ * Chardev release hook: stop a still-active monitoring session, free the
+ * per-fd state and drop the fd's pin on the monitor.
+ *
+ * NOTE(review): when tlob_stop_task() does not win the cleanup (-EAGAIN),
+ * confirm nothing can still dereference ws->fpriv (set in TRACE_START)
+ * after the kfree(fp) below.
+ */
+static void tlob_chardev_release(void *priv)
+{
+ struct tlob_fpriv *fp = priv;
+
+ if (fp->monitoring) {
+ /* All return values are safe on close. */
+ (void)tlob_stop_task(fp->task);
+ /* Balances the get_task_struct() done in TRACE_START. */
+ put_task_struct(fp->task);
+ }
+
+ kfree(fp);
+
+ /* Release fd's pin; if last, wake __tlob_destroy_monitor. */
+ if (refcount_dec_and_test(&tlob_fd_refcount))
+ complete(&tlob_fd_released);
+}
+
+/*
+ * Chardev ioctl hook.
+ *
+ * TLOB_IOCTL_TRACE_START: start monitoring the calling task with the
+ *   threshold read from the user-supplied struct tlob_start_args.  Returns
+ *   -EALREADY if this fd already has an active session, -EFAULT on a failed
+ *   copy, or the result of tlob_start_task().
+ * TLOB_IOCTL_TRACE_STOP: stop the session started through this fd.  Returns
+ *   -EINVAL if there is none, 0 if the budget was kept, -EOVERFLOW if it was
+ *   exceeded.
+ * Any other command returns -ENOTTY.
+ */
+static long tlob_chardev_ioctl(void *priv, unsigned int cmd, unsigned long arg)
+{
+ struct tlob_fpriv *fp = priv;
+ struct tlob_start_args args;
+ struct task_struct *task;
+ int ret;
+
+ switch (cmd) {
+ case TLOB_IOCTL_TRACE_START:
+ if (fp->monitoring)
+ return -EALREADY;
+
+ if (copy_from_user(&args, (void __user *)arg, sizeof(args)))
+ return -EFAULT;
+
+ ret = tlob_start_task(current, args.threshold_us);
+ if (ret)
+ return ret;
+
+ /* The fd keeps its own task reference until STOP or release. */
+ fp->task = current;
+ get_task_struct(current);
+ fp->budget_exceeded = false;
+
+ /* Link fd so hrtimer callback can latch budget_exceeded. */
+ scoped_guard(rcu) {
+ struct tlob_task_state *ws = da_get_target_by_id(current->pid);
+
+ if (ws)
+ smp_store_release(&ws->fpriv, fp);
+ }
+
+ fp->monitoring = true;
+ return 0;
+
+ case TLOB_IOCTL_TRACE_STOP:
+ if (!fp->monitoring)
+ return -EINVAL;
+
+ /* Clear the session state before stopping. */
+ task = fp->task;
+ fp->monitoring = false;
+ fp->task = NULL;
+
+ ret = tlob_stop_task(task);
+ put_task_struct(task);
+
+ /*
+ * -EOVERFLOW: budget exceeded; propagate to caller.
+ * -EAGAIN: concurrent stop_all claimed cleanup; fall through to
+ * budget_exceeded latch set by the hrtimer callback.
+ * -ESRCH: task exited before TRACE_STOP (process-exit handler
+ * claimed cleanup); same latch applies. Not an internal error.
+ */
+ if (ret == -EAGAIN || ret == -ESRCH)
+ return READ_ONCE(fp->budget_exceeded) ? -EOVERFLOW : 0;
+ return ret;
+
+ default:
+ return -ENOTTY;
+ }
+}
+
+/* Chardev callbacks registered under the name "tlob" in register_tlob(). */
+static const struct rv_chardev_ops tlob_chardev_ops = {
+ .owner = THIS_MODULE,
+ .bind = tlob_chardev_bind,
+ .ioctl = tlob_chardev_ioctl,
+ .release = tlob_chardev_release,
+};
+
+/*
+ * Module init: register the chardev callbacks, register the RV monitor and,
+ * when the monitor has a tracefs directory, create its "monitor" file.
+ * Every step that succeeded is undone if a later one fails.
+ */
+static int __init register_tlob(void)
+{
+	int ret;
+
+	ret = rv_chardev_register_monitor("tlob", &tlob_chardev_ops);
+	if (ret)
+		return ret;
+
+	ret = rv_register_monitor(&rv_this, NULL);
+	if (ret)
+		goto err_chardev;
+
+	if (rv_this.root_d &&
+	    !tracefs_create_file("monitor", 0644, rv_this.root_d, NULL,
+				 &tlob_monitor_fops)) {
+		ret = -ENOMEM;
+		goto err_monitor;
+	}
+
+	return 0;
+
+err_monitor:
+	rv_unregister_monitor(&rv_this);
+err_chardev:
+	rv_chardev_unregister_monitor("tlob");
+	return ret;
+}
+
+/* Module exit: unregister the chardev callbacks, then the RV monitor. */
+static void __exit unregister_tlob(void)
+{
+ rv_chardev_unregister_monitor("tlob");
+ rv_unregister_monitor(&rv_this);
+}
+
+module_init(register_tlob);
+module_exit(unregister_tlob);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Wen Yang <wen.yang@linux.dev>");
+MODULE_DESCRIPTION("tlob: task latency over budget per-task monitor.");
diff --git a/kernel/trace/rv/monitors/tlob/tlob.h b/kernel/trace/rv/monitors/tlob/tlob.h
new file mode 100644
index 000000000000..71c1735d27d2
--- /dev/null
+++ b/kernel/trace/rv/monitors/tlob/tlob.h
@@ -0,0 +1,171 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _RV_TLOB_H
+#define _RV_TLOB_H
+
+/*
+ * C representation of the tlob hybrid automaton.
+ *
+ * Three-state HA following sched_stat / wwnr monitor naming conventions:
+ *
+ * running (initial) - task is executing on CPU [sched_stat: runtime]
+ * waiting - task is in runqueue, awaiting CPU [sched_stat: wait ]
+ * sleeping - task is blocked, awaiting resource[sched_stat: sleep ]
+ *
+ * Events (derived from sched_switch / sched_wakeup tracepoints):
+ * sleep - sched_switch, prev_state != 0 running → sleeping
+ * preempt - sched_switch, prev_state == 0 running → waiting
+ * wakeup - sched_wakeup sleeping → waiting
+ * switch_in - sched_switch, next == task waiting → running
+ *
+ * One HA clock invariant:
+ * clk_elapsed < BUDGET_NS() active in all states (total latency budget)
+ *
+ * task_start and task_stop are NOT DA events:
+ * task_start calls da_handle_start_event() to set initial state, then
+ * ha_reset_clk_ns() + ha_start_timer_ns() to initialise the clock and arm
+ * the timer directly.
+ * task_stop calls ha_cancel_timer_sync() + da_monitor_reset() directly.
+ *
+ * For the format description see:
+ * Documentation/trace/rv/deterministic_automata.rst
+ */
+
+#include <linux/rv.h>
+#include <linux/sched.h>
+
+#define MONITOR_NAME tlob
+
+/*
+ * Automaton states, in the row order of automaton_tlob.function.
+ * state_max_tlob is the number of states and doubles as INVALID_STATE.
+ */
+enum states_tlob {
+ running_tlob,
+ waiting_tlob,
+ sleeping_tlob,
+ state_max_tlob,
+};
+
+#define INVALID_STATE state_max_tlob
+
+/*
+ * Automaton events, in the column order of automaton_tlob.function.
+ * event_max_tlob is the number of events.
+ */
+enum events_tlob {
+ sleep_tlob,
+ preempt_tlob,
+ wakeup_tlob,
+ switch_in_tlob,
+ event_max_tlob,
+};
+
+/*
+ * HA environment variable: clk_elapsed is the only clock.
+ * It measures wall-clock time since task_start and is active in all states.
+ * env_max_tlob is the number of variables; env_max_stored_tlob is set equal
+ * to it and is checked against MAX_HA_ENV_LEN below.
+ */
+enum envs_tlob {
+ clk_elapsed_tlob,
+ env_max_tlob,
+ env_max_stored_tlob = env_max_tlob,
+};
+
+_Static_assert(env_max_stored_tlob <= MAX_HA_ENV_LEN, "Not enough slots");
+#define HA_CLK_NS
+
+/* Table-driven description of the tlob automaton. */
+struct automaton_tlob {
+ /* Printable names, indexed by state / event / env enum value. */
+ char *state_names[state_max_tlob];
+ char *event_names[event_max_tlob];
+ char *env_names[env_max_tlob];
+ /* Transition table: function[state][event] is the next state. */
+ unsigned char function[state_max_tlob][event_max_tlob];
+ unsigned char initial_state;
+ /* Per-state flag, indexed by state enum value. */
+ bool final_states[state_max_tlob];
+};
+
+/*
+ * The tlob automaton.  Rows of .function follow enum states_tlob, columns
+ * follow enum events_tlob; INVALID_STATE marks a transition the model does
+ * not allow.  running is both the initial state and the only state flagged
+ * in .final_states.
+ */
+static const struct automaton_tlob automaton_tlob = {
+ .state_names = {
+ "running",
+ "waiting",
+ "sleeping",
+ },
+ .event_names = {
+ "sleep",
+ "preempt",
+ "wakeup",
+ "switch_in",
+ },
+ .env_names = {
+ "clk_elapsed",
+ },
+ .function = {
+ /* running */
+ {
+ sleeping_tlob, /* sleep (sched_switch, prev_state != 0) */
+ waiting_tlob, /* preempt (sched_switch, prev_state == 0) */
+ INVALID_STATE, /* wakeup (TASK_RUNNING can't be woken) */
+ INVALID_STATE, /* switch_in (already on CPU) */
+ },
+ /* waiting */
+ {
+ INVALID_STATE, /* sleep (not on CPU) */
+ INVALID_STATE, /* preempt (not on CPU) */
+ INVALID_STATE, /* wakeup (already TASK_RUNNING) */
+ running_tlob, /* switch_in */
+ },
+ /* sleeping */
+ {
+ INVALID_STATE, /* sleep (already sleeping) */
+ INVALID_STATE, /* preempt (not on CPU) */
+ waiting_tlob, /* wakeup */
+ INVALID_STATE, /* switch_in (must go through waiting first) */
+ },
+ },
+ .initial_state = running_tlob,
+ .final_states = { 1, 0, 0 },
+};
+
+/* Maximum number of concurrently monitored tasks. */
+#define TLOB_MAX_MONITORED 64U
+
+/* Maximum binary path length for uprobe binding. */
+#define TLOB_MAX_PATH 256
+
+/* Exported to ioctl/uprobe layers and KUnit */
+int tlob_start_task(struct task_struct *task, u64 threshold_us);
+int tlob_stop_task(struct task_struct *task);
+
+#if IS_ENABLED(CONFIG_KUNIT)
+int tlob_init_monitor(void);
+void tlob_destroy_monitor(void);
+int tlob_enable_hooks(void);
+void tlob_disable_hooks(void);
+int tlob_create_or_delete_uprobe(char *buf);
+int tlob_num_monitored_read(void);
+
+/*
+ * Copy of one event_tlob tracepoint record, as captured by the KUnit probe.
+ * Strings longer than the arrays are truncated by the probe.
+ */
+struct tlob_captured_event {
+ int id;
+ char state[16];
+ char event[16];
+ char next_state[16];
+ bool final_state;
+};
+
+/*
+ * Copy of one error_env_tlob tracepoint record, as captured by the KUnit
+ * probe.  Strings longer than the arrays are truncated by the probe.
+ */
+struct tlob_captured_error_env {
+ int id;
+ char state[16];
+ char event[16];
+ char env[64];
+};
+
+/*
+ * Per-state time breakdown returned by tlob_last_detail_read().
+ * NOTE(review): the fields match the detail_env_tlob tracepoint arguments;
+ * presumably filled from that event -- confirm against the probe.
+ */
+struct tlob_captured_detail {
+ int pid;
+ u64 threshold_us;
+ u64 running_ns;
+ u64 waiting_ns;
+ u64 sleeping_ns;
+};
+
+int tlob_register_kunit_probes(void);
+void tlob_unregister_kunit_probes(void);
+int tlob_event_count_read(void);
+void tlob_event_count_reset(void);
+int tlob_error_env_count_read(void);
+void tlob_error_env_count_reset(void);
+const struct tlob_captured_event *tlob_last_event_read(void);
+const struct tlob_captured_error_env *tlob_last_error_env_read(void);
+const struct tlob_captured_detail *tlob_last_detail_read(void);
+#endif /* CONFIG_KUNIT */
+
+#endif /* _RV_TLOB_H */
diff --git a/kernel/trace/rv/monitors/tlob/tlob_trace.h b/kernel/trace/rv/monitors/tlob/tlob_trace.h
new file mode 100644
index 000000000000..08d34e1b0ab8
--- /dev/null
+++ b/kernel/trace/rv/monitors/tlob/tlob_trace.h
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Snippet to be included in rv_trace.h for tlob tracepoints.
+ *
+ * event_tlob and error_tlob are defined on the event_da_monitor_id and
+ * error_da_monitor_id classes, following the same pattern as nomiss.
+ * error_env_tlob carries the environment variable name that caused the
+ * clock-invariant violation (budget exceeded).
+ * The id field carries the pid of the monitored task.
+ */
+
+#ifdef CONFIG_RV_MON_TLOB
+/* id is the pid of the monitored task */
+DEFINE_EVENT(event_da_monitor_id, event_tlob,
+ TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state),
+ TP_ARGS(id, state, event, next_state, final_state));
+
+DEFINE_EVENT(error_da_monitor_id, error_tlob,
+ TP_PROTO(int id, char *state, char *event),
+ TP_ARGS(id, state, event));
+
+DEFINE_EVENT(error_env_da_monitor_id, error_env_tlob,
+ TP_PROTO(int id, char *state, char *event, char *env),
+ TP_ARGS(id, state, event, env));
+
+/*
+ * detail_env_tlob - per-state time breakdown emitted alongside error_env_tlob.
+ *
+ * Fired once per budget violation, immediately after error_env_tlob, from
+ * the hrtimer callback (hardirq context). The three _ns fields sum to
+ * approximately threshold_us * 1000; any rounding comes from the partial
+ * time accumulated in the current state since the last transition.
+ *
+ * All five arguments are stored unchanged in the ring-buffer entry and
+ * printed as decimal key=value pairs. Note the unit mismatch between
+ * threshold_us (microseconds) and the *_ns fields (nanoseconds).
+ */
+TRACE_EVENT(detail_env_tlob,
+ TP_PROTO(int pid, u64 threshold_us,
+ u64 running_ns, u64 waiting_ns, u64 sleeping_ns),
+ TP_ARGS(pid, threshold_us, running_ns, waiting_ns, sleeping_ns),
+ TP_STRUCT__entry(
+ __field(int, pid)
+ __field(u64, threshold_us)
+ __field(u64, running_ns)
+ __field(u64, waiting_ns)
+ __field(u64, sleeping_ns)
+ ),
+ TP_fast_assign(
+ __entry->pid = pid;
+ __entry->threshold_us = threshold_us;
+ __entry->running_ns = running_ns;
+ __entry->waiting_ns = waiting_ns;
+ __entry->sleeping_ns = sleeping_ns;
+ ),
+ TP_printk("pid=%d threshold_us=%llu running_ns=%llu waiting_ns=%llu sleeping_ns=%llu",
+ __entry->pid, __entry->threshold_us,
+ __entry->running_ns, __entry->waiting_ns,
+ __entry->sleeping_ns)
+);
+#endif /* CONFIG_RV_MON_TLOB */
diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c
index ee4e68102f17..a45c4763dbe5 100644
--- a/kernel/trace/rv/rv.c
+++ b/kernel/trace/rv/rv.c
@@ -142,10 +142,17 @@
#include <linux/module.h>
#include <linux/init.h>
#include <linux/slab.h>
+#include <kunit/visibility.h>
#ifdef CONFIG_RV_MON_EVENTS
#define CREATE_TRACE_POINTS
#include <rv_trace.h>
+
+#ifdef CONFIG_RV_MON_TLOB
+EXPORT_TRACEPOINT_SYMBOL_GPL(error_tlob);
+EXPORT_TRACEPOINT_SYMBOL_GPL(event_tlob);
+EXPORT_TRACEPOINT_SYMBOL_GPL(error_env_tlob);
+#endif
#endif
#include "rv.h"
@@ -696,6 +703,33 @@ static void turn_monitoring_on(void)
WRITE_ONCE(monitoring_on, true);
}
+#if IS_ENABLED(CONFIG_KUNIT)
+/*
+ * Value of monitoring_on sampled by rv_kunit_monitoring_on(), so that
+ * rv_kunit_monitoring_off() can put the flag back instead of clearing it
+ * unconditionally. Not nestable: each _on() must be paired with one _off()
+ * before the next _on().
+ */
+static bool rv_kunit_saved_monitoring_on;
+
+/**
+ * rv_kunit_monitoring_on - enable the global monitoring_on flag for KUnit tests.
+ *
+ * KUnit test suite_init functions must call this before initialising any
+ * monitor, mirroring the turn_monitoring_on() call in rv_init_interface().
+ * The previous value of the flag is remembered; the matching
+ * rv_kunit_monitoring_off() must be called in suite_exit to restore it so
+ * that test suites do not interfere with each other or with the state set
+ * up by rv_init_interface().
+ */
+void rv_kunit_monitoring_on(void)
+{
+	rv_kunit_saved_monitoring_on = READ_ONCE(monitoring_on);
+	turn_monitoring_on();
+}
+EXPORT_SYMBOL_IF_KUNIT(rv_kunit_monitoring_on);
+
+/**
+ * rv_kunit_monitoring_off - restore the global monitoring_on flag after a test.
+ *
+ * Must be called in suite_exit after rv_kunit_monitoring_on(). Monitoring is
+ * only turned off if it was off before the suite started; if it was already
+ * on (the state rv_init_interface() leaves behind), the flag is left set.
+ */
+void rv_kunit_monitoring_off(void)
+{
+	if (rv_kunit_saved_monitoring_on)
+		return;
+
+	turn_monitoring_off();
+}
+EXPORT_SYMBOL_IF_KUNIT(rv_kunit_monitoring_off);
+#endif /* CONFIG_KUNIT */
+
static void turn_monitoring_on_with_reset(void)
{
lockdep_assert_held(&rv_interface_lock);
@@ -846,6 +880,10 @@ int __init rv_init_interface(void)
if (retval)
return 1;
+ retval = rv_chardev_init();
+ if (retval)
+ return 1;
+
turn_monitoring_on();
rv_root.root_dir = no_free_ptr(root_dir);
diff --git a/kernel/trace/rv/rv.h b/kernel/trace/rv/rv.h
index 2c0f51ff9d5c..82c9a2b57596 100644
--- a/kernel/trace/rv/rv.h
+++ b/kernel/trace/rv/rv.h
@@ -31,6 +31,8 @@ int rv_enable_monitor(struct rv_monitor *mon);
bool rv_is_container_monitor(struct rv_monitor *mon);
bool rv_is_nested_monitor(struct rv_monitor *mon);
+int rv_chardev_init(void);
+
#ifdef CONFIG_RV_REACTORS
int reactor_populate_monitor(struct rv_monitor *mon, struct dentry *root);
int init_rv_reactors(struct dentry *root_dir);
diff --git a/kernel/trace/rv/rv_chardev.c b/kernel/trace/rv/rv_chardev.c
new file mode 100644
index 000000000000..1fba1642ebc1
--- /dev/null
+++ b/kernel/trace/rv/rv_chardev.c
@@ -0,0 +1,201 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/poll.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/rv.h>
+#include <uapi/linux/rv.h>
+
+#include "rv.h"
+
+static_assert(MAX_RV_MONITOR_NAME_SIZE == RV_MONITOR_NAME_MAX,
+ "RV internal and UAPI monitor name size constants must match");
+
+/*
+ * Per-open-file state of /dev/rv, allocated zeroed in rv_dev_open() and
+ * stored in file->private_data.
+ */
+struct rv_fd_priv {
+ /* NULL until RV_IOCTL_BIND_MONITOR succeeds; then the bound monitor's ops */
+ const struct rv_chardev_ops *ops;
+ /* value returned by ops->bind(); handed back to ioctl/poll/release */
+ void *monitor_priv;
+};
+
+/* One registered monitor in the /dev/rv registry (rv_chardev_list). */
+struct rv_chardev_entry {
+ /* private copy of the monitor name, compared against the bind request */
+ char name[MAX_RV_MONITOR_NAME_SIZE];
+ /* callbacks supplied by the monitor at registration time */
+ const struct rv_chardev_ops *ops;
+ /* linkage on rv_chardev_list */
+ struct list_head list;
+};
+
+/* Protected by rv_interface_lock (from rv.h / rv.c). */
+static LIST_HEAD(rv_chardev_list);
+
+/**
+ * rv_chardev_register_monitor - expose a monitor via /dev/rv
+ * @name: Monitor name, must match the rv_monitor .name field.
+ * @ops: Callbacks providing bind / ioctl / release (poll is optional).
+ *
+ * The bind, ioctl and release callbacks are invoked unconditionally by the
+ * /dev/rv file operations, so a registration that lacks any of them is
+ * rejected here rather than crashing on the first user ioctl.
+ *
+ * Returns 0 on success, -EINVAL if @name is NULL, empty or too long or if
+ * @ops is NULL or lacks a mandatory callback, -EEXIST if @name is already
+ * registered, -ENOMEM on OOM.
+ */
+int rv_chardev_register_monitor(const char *name,
+				const struct rv_chardev_ops *ops)
+{
+	struct rv_chardev_entry *e, *existing;
+
+	if (!name || !*name)
+		return -EINVAL;
+
+	if (!ops || !ops->bind || !ops->ioctl || !ops->release)
+		return -EINVAL;
+
+	if (strlen(name) >= MAX_RV_MONITOR_NAME_SIZE)
+		return -EINVAL;
+
+	/* Allocate before taking the lock; freed again on a duplicate name. */
+	e = kmalloc_obj(*e, GFP_KERNEL);
+	if (!e)
+		return -ENOMEM;
+
+	strscpy(e->name, name, sizeof(e->name));
+	e->ops = ops;
+
+	guard(mutex)(&rv_interface_lock);
+	list_for_each_entry(existing, &rv_chardev_list, list) {
+		if (strcmp(existing->name, name) == 0) {
+			kfree(e);
+			return -EEXIST;
+		}
+	}
+	list_add_tail(&e->list, &rv_chardev_list);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(rv_chardev_register_monitor);
+
+/**
+ * rv_chardev_unregister_monitor - drop a monitor from the /dev/rv registry
+ * @name: Name that was given to rv_chardev_register_monitor().
+ *
+ * Only the registry entry is removed. File descriptors that are already
+ * bound keep their ops pointer until they are closed, so the caller has to
+ * make sure that no new bind can succeed afterwards, typically by calling
+ * this before the module providing the ops is unloaded. An unknown @name is
+ * silently ignored.
+ */
+void rv_chardev_unregister_monitor(const char *name)
+{
+	struct rv_chardev_entry *pos, *found = NULL;
+
+	guard(mutex)(&rv_interface_lock);
+
+	list_for_each_entry(pos, &rv_chardev_list, list) {
+		if (!strcmp(pos->name, name)) {
+			found = pos;
+			break;
+		}
+	}
+	if (!found)
+		return;
+
+	list_del(&found->list);
+	kfree(found);
+}
+EXPORT_SYMBOL_GPL(rv_chardev_unregister_monitor);
+
+/*
+ * open() handler for /dev/rv: attach a zeroed, still unbound rv_fd_priv to
+ * the file. Returns 0, or -ENOMEM if the allocation fails.
+ */
+static int rv_dev_open(struct inode *inode, struct file *file)
+{
+	struct rv_fd_priv *priv = kzalloc_obj(*priv, GFP_KERNEL);
+
+	if (priv) {
+		file->private_data = priv;
+		return 0;
+	}
+
+	return -ENOMEM;
+}
+
+/*
+ * release() handler for /dev/rv: if the file was bound, let the monitor tear
+ * down its private state and drop the module reference taken at bind time,
+ * then free the per-file state. Always returns 0.
+ */
+static int rv_dev_release(struct inode *inode, struct file *file)
+{
+	struct rv_fd_priv *priv = file->private_data;
+	const struct rv_chardev_ops *bound = priv->ops;
+
+	if (bound) {
+		bound->release(priv->monitor_priv);
+		module_put(bound->owner);
+	}
+
+	kfree(priv);
+	return 0;
+}
+
+/*
+ * Serialises the check-and-set of rv_fd_priv::ops. Without it two threads
+ * issuing RV_IOCTL_BIND_MONITOR on the same fd can both pass the "already
+ * bound" check, and the loser's module reference and monitor_priv are
+ * overwritten and leaked.
+ */
+static DEFINE_MUTEX(rv_bind_lock);
+
+/*
+ * rv_bind_monitor - bind an open /dev/rv file to the monitor named by userspace.
+ * @fp:   per-file state; must not be bound yet
+ * @uarg: user pointer to a struct rv_bind_args
+ *
+ * Returns 0 on success, -EBUSY if the file is already bound, -EFAULT if the
+ * argument cannot be copied in, -ENOENT if no monitor of that name is
+ * registered, -ENODEV if the owning module cannot be pinned, or the error
+ * encoded in the pointer returned by the monitor's bind() callback.
+ */
+static int rv_bind_monitor(struct rv_fd_priv *fp, const char __user *uarg)
+{
+	const struct rv_chardev_ops *ops = NULL;
+	struct rv_bind_args args;
+	void *priv;
+
+	/* Cheap early exit; re-checked under rv_bind_lock below. */
+	if (READ_ONCE(fp->ops))
+		return -EBUSY;
+
+	if (copy_from_user(&args, uarg, sizeof(args)))
+		return -EFAULT;
+
+	/* Userspace is not trusted to NUL-terminate the name. */
+	args.monitor_name[RV_MONITOR_NAME_MAX - 1] = '\0';
+
+	guard(mutex)(&rv_bind_lock);
+
+	if (fp->ops)
+		return -EBUSY;
+
+	/*
+	 * Pin the owning module while the list entry is still valid under
+	 * rv_interface_lock, preventing a concurrent rmmod from completing
+	 * between lookup and reference acquisition. bind() may sleep
+	 * (GFP_KERNEL inside), so it runs after the lock is dropped.
+	 */
+	scoped_guard(mutex, &rv_interface_lock) {
+		struct rv_chardev_entry *e;
+
+		list_for_each_entry(e, &rv_chardev_list, list) {
+			if (strcmp(e->name, args.monitor_name) != 0)
+				continue;
+			if (!try_module_get(e->ops->owner))
+				return -ENODEV;
+			ops = e->ops;
+			break;
+		}
+	}
+
+	if (!ops)
+		return -ENOENT;
+
+	priv = ops->bind();
+	if (IS_ERR(priv)) {
+		module_put(ops->owner);
+		return PTR_ERR(priv);
+	}
+
+	/*
+	 * Publish monitor_priv before ops: rv_dev_ioctl() and rv_dev_poll()
+	 * load ops with acquire semantics, so once they see it non-NULL they
+	 * also see the matching monitor_priv.
+	 */
+	fp->monitor_priv = priv;
+	smp_store_release(&fp->ops, ops);
+	return 0;
+}
+
+/*
+ * ioctl() handler for /dev/rv. RV_IOCTL_BIND_MONITOR is handled by the core;
+ * every other command is forwarded to the bound monitor's ioctl callback.
+ * Returns -ENXIO if the file has not been bound yet.
+ */
+static long rv_dev_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct rv_fd_priv *fp = file->private_data;
+	const struct rv_chardev_ops *ops;
+
+	if (cmd == RV_IOCTL_BIND_MONITOR)
+		return rv_bind_monitor(fp, (const char __user *)arg);
+
+	/* Pairs with smp_store_release() in rv_bind_monitor(). */
+	ops = smp_load_acquire(&fp->ops);
+	if (!ops)
+		return -ENXIO;
+
+	return ops->ioctl(fp->monitor_priv, cmd, arg);
+}
+
+/*
+ * poll() handler for /dev/rv. Reports no events while the file is unbound or
+ * when the bound monitor has no poll callback; otherwise defers to it.
+ */
+static __poll_t rv_dev_poll(struct file *file, poll_table *wait)
+{
+	struct rv_fd_priv *fp = file->private_data;
+	/* Pairs with smp_store_release() in rv_bind_monitor(). */
+	const struct rv_chardev_ops *ops = smp_load_acquire(&fp->ops);
+
+	if (!ops || !ops->poll)
+		return 0;
+
+	return ops->poll(fp->monitor_priv, file, wait);
+}
+
+/*
+ * File operations for /dev/rv.
+ *
+ * The ioctls visible here take either no argument or a pointer to a
+ * structure whose layout is the same for 32-bit and 64-bit userspace, so
+ * compat tasks go through compat_ptr_ioctl(), which converts the user
+ * pointer with compat_ptr() before calling .unlocked_ioctl. Passing the raw
+ * compat argument to the native handler is wrong on architectures where
+ * compat_ptr() is not a plain cast.
+ */
+static const struct file_operations rv_dev_fops = {
+	.owner		= THIS_MODULE,
+	.open		= rv_dev_open,
+	.release	= rv_dev_release,
+	.unlocked_ioctl	= rv_dev_ioctl,
+	.compat_ioctl	= compat_ptr_ioctl,
+	.poll		= rv_dev_poll,
+};
+
+static struct miscdevice rv_miscdev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "rv",
+ .fops = &rv_dev_fops,
+};
+
+/*
+ * Register the "rv" misc device (dynamic minor) described by rv_miscdev.
+ * Returns the result of misc_register(): 0 on success, non-zero on failure.
+ */
+int __init rv_chardev_init(void)
+{
+ return misc_register(&rv_miscdev);
+}
diff --git a/kernel/trace/rv/rv_trace.h b/kernel/trace/rv/rv_trace.h
index 9622c269789c..a4bc215c1f15 100644
--- a/kernel/trace/rv/rv_trace.h
+++ b/kernel/trace/rv/rv_trace.h
@@ -189,6 +189,7 @@ DECLARE_EVENT_CLASS(error_env_da_monitor_id,
#include <monitors/stall/stall_trace.h>
#include <monitors/nomiss/nomiss_trace.h>
+#include <monitors/tlob/tlob_trace.h>
// Add new monitors based on CONFIG_HA_MON_EVENTS_ID here
#endif
diff --git a/kernel/trace/rv/rv_uprobe.c b/kernel/trace/rv/rv_uprobe.c
index bc28399cfd4b..1ba7b80c1d87 100644
--- a/kernel/trace/rv/rv_uprobe.c
+++ b/kernel/trace/rv/rv_uprobe.c
@@ -132,13 +132,10 @@ EXPORT_SYMBOL_GPL(rv_uprobe_attach);
*/
void rv_uprobe_detach(struct rv_uprobe *p)
{
- struct rv_uprobe_impl *impl;
-
if (!p)
return;
- impl = container_of(p, struct rv_uprobe_impl, pub);
- uprobe_unregister_nosync(impl->uprobe, &impl->uc);
+ rv_uprobe_unregister_nosync(p);
/*
* uprobe_unregister_sync() is a global barrier: it waits for all
* in-flight uprobe handlers across the entire system to complete,
@@ -146,8 +143,47 @@ void rv_uprobe_detach(struct rv_uprobe *p)
* guarantees that no handler touching impl->pub.priv is running by
* the time we return, even if the caller immediately frees priv.
*/
+ rv_uprobe_sync();
+ rv_uprobe_free(p);
+}
+EXPORT_SYMBOL_GPL(rv_uprobe_detach);
+
+/**
+ * rv_uprobe_unregister_nosync - dequeue an uprobe without waiting
+ * @p: probe to unregister; may be NULL (no-op)
+ *
+ * Only calls uprobe_unregister_nosync() on the underlying uprobe. Nothing is
+ * freed and no handler is waited for: rv_uprobe_detach() follows this call
+ * with rv_uprobe_sync() and then rv_uprobe_free(), and callers using the
+ * split API are expected to keep that order.
+ */
+void rv_uprobe_unregister_nosync(struct rv_uprobe *p)
+{
+ struct rv_uprobe_impl *impl;
+
+ if (!p)
+ return;
+
+ /* p is the public part embedded in the private rv_uprobe_impl */
+ impl = container_of(p, struct rv_uprobe_impl, pub);
+ uprobe_unregister_nosync(impl->uprobe, &impl->uc);
+}
+EXPORT_SYMBOL_GPL(rv_uprobe_unregister_nosync);
+
+/**
+ * rv_uprobe_sync - wait for all in-flight uprobe handlers to complete
+ *
+ * Thin wrapper around uprobe_unregister_sync(). It takes no probe argument,
+ * so one call can follow several rv_uprobe_unregister_nosync() calls before
+ * the probes are released with rv_uprobe_free().
+ */
+void rv_uprobe_sync(void)
+{
 uprobe_unregister_sync();
+}
+EXPORT_SYMBOL_GPL(rv_uprobe_sync);
+
+/**
+ * rv_uprobe_free - release resources of a previously deregistered probe
+ * @p: probe to free; may be NULL (no-op)
+ *
+ * Drops the path reference held in @p and frees the enclosing
+ * rv_uprobe_impl. This function neither unregisters the uprobe nor waits
+ * for handlers; rv_uprobe_detach() calls rv_uprobe_unregister_nosync() and
+ * rv_uprobe_sync() before it. @p must not be used after this returns.
+ */
+void rv_uprobe_free(struct rv_uprobe *p)
+{
+ struct rv_uprobe_impl *impl;
+
+ if (!p)
+ return;
+
+ /* p is the public part embedded in the private rv_uprobe_impl */
+ impl = container_of(p, struct rv_uprobe_impl, pub);
 path_put(&p->path);
 kfree(impl);
 }
-EXPORT_SYMBOL_GPL(rv_uprobe_detach);
+EXPORT_SYMBOL_GPL(rv_uprobe_free);
diff --git a/tools/include/uapi/linux/rv.h b/tools/include/uapi/linux/rv.h
new file mode 100644
index 000000000000..a34e5426393b
--- /dev/null
+++ b/tools/include/uapi/linux/rv.h
@@ -0,0 +1,86 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * UAPI definitions for Runtime Verification (RV) monitors.
+ *
+ * All RV monitors that expose an ioctl self-instrumentation interface
+ * share the magic byte RV_IOC_MAGIC ('r').
+ *
+ * Usage examples and design rationale are in:
+ * Documentation/trace/rv/monitor_tlob.rst
+ */
+
+#ifndef _UAPI_LINUX_RV_H
+#define _UAPI_LINUX_RV_H
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+/* Magic byte shared by all RV monitor ioctls. */
+#define RV_IOC_MAGIC 'r'
+
+/* Maximum monitor name length (including NUL terminator). */
+#define RV_MONITOR_NAME_MAX 32
+
+/* Generic /dev/rv ioctls (ioctl number 0 is reserved for the core) */
+
+/**
+ * struct rv_bind_args - arguments for RV_IOCTL_BIND_MONITOR
+ * @monitor_name: NUL-terminated name of the monitor to bind (e.g. "tlob").
+ */
+struct rv_bind_args {
+ char monitor_name[RV_MONITOR_NAME_MAX];
+};
+
+/*
+ * RV_IOCTL_BIND_MONITOR - associate this fd with a specific RV monitor.
+ *
+ * Must be called once after open() and before any monitor-specific ioctl.
+ *
+ * Returns 0 on success.
+ * Returns -EBUSY if this fd is already bound to a monitor.
+ * Returns -EFAULT if struct rv_bind_args cannot be read from user space.
+ * Returns -ENOENT if the requested monitor is not registered.
+ * Returns -ENODEV if a reference to the module providing the monitor cannot
+ * be taken.
+ * Returns -ENOMEM on allocation failure.
+ */
+#define RV_IOCTL_BIND_MONITOR _IOW(RV_IOC_MAGIC, 0, struct rv_bind_args)
+
+/* tlob: task latency over budget monitor (ioctl numbers 1–15) */
+
+/**
+ * struct tlob_start_args - arguments for TLOB_IOCTL_TRACE_START
+ * @threshold_us: Total latency budget for this window, in microseconds.
+ * Must be greater than zero. Both on-CPU and off-CPU time
+ * (including runqueue wait) count toward this budget.
+ */
+struct tlob_start_args {
+ __u64 threshold_us;
+};
+
+/*
+ * TLOB_IOCTL_TRACE_START - begin monitoring the calling task.
+ *
+ * Arms a per-task hrtimer for threshold_us microseconds (CLOCK_MONOTONIC,
+ * so both on-CPU and off-CPU time count toward the budget).
+ *
+ * Returns 0 on success.
+ * Returns -EEXIST if TRACE_START was already called on this fd.
+ * Returns -ENOSPC if TLOB_MAX_MONITORED tasks are already being tracked.
+ * Returns -ENOMEM on allocation failure.
+ * Returns -ENODEV if the tlob monitor is not enabled.
+ * Returns -ERANGE if threshold_us is 0.
+ */
+#define TLOB_IOCTL_TRACE_START _IOW(RV_IOC_MAGIC, 1, struct tlob_start_args)
+
+/*
+ * TLOB_IOCTL_TRACE_STOP - end monitoring the calling task.
+ *
+ * Returns 0 if within budget.
+ * Returns -EOVERFLOW if the latency budget was exceeded.
+ * Returns -EINVAL if TLOB_IOCTL_TRACE_START was not called on this fd.
+ *
+ * poll/epoll: after TRACE_START the fd becomes readable (EPOLLIN) when the
+ * budget is exceeded. The caller may then issue TRACE_STOP to retrieve the
+ * result, or simply close the fd to clean up.
+ */
+#define TLOB_IOCTL_TRACE_STOP _IO(RV_IOC_MAGIC, 2)
+
+#endif /* _UAPI_LINUX_RV_H */
--
2.25.1
^ permalink raw reply related
* [RFC PATCH v2 07/10] rv/tlob: add tlob model DOT file
From: wen.yang @ 2026-05-11 18:24 UTC (permalink / raw)
To: Gabriele Monaco, Steven Rostedt
Cc: linux-trace-kernel, linux-kernel, Wen Yang
In-Reply-To: <cover.1778522945.git.wen.yang@linux.dev>
From: Wen Yang <wen.yang@linux.dev>
Add the Graphviz DOT specification for the tlob (task latency over
budget) hybrid automaton.
The model defines three states: running (initial), waiting
(in the scheduler runqueue), and sleeping (blocked on a
resource), with the transitions:
running --(sleep)-------> sleeping
running --(preempt)-----> waiting
sleeping --(wakeup)------> waiting
waiting --(switch_in)--> running
A single clock invariant clk_elapsed < BUDGET_NS() is active in all
three states. The HA framework enforces it via a per-task hrtimer;
expiry emits error_env_tlob and resets the monitor automatically.
Suggested-by: Gabriele Monaco <gmonaco@redhat.com>
Signed-off-by: Wen Yang <wen.yang@linux.dev>
---
MAINTAINERS | 3 +++
tools/verification/models/tlob.dot | 21 +++++++++++++++++++++
2 files changed, 24 insertions(+)
create mode 100644 tools/verification/models/tlob.dot
diff --git a/MAINTAINERS b/MAINTAINERS
index 74c86cf9bc65..beb7224d08ef 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -23317,7 +23317,10 @@ S: Maintained
F: Documentation/trace/rv/
F: include/linux/rv.h
F: include/rv/
+F: include/uapi/linux/rv.h
F: kernel/trace/rv/
+F: samples/rv/
+F: tools/testing/selftests/rv/
F: tools/testing/selftests/verification/
F: tools/verification/
diff --git a/tools/verification/models/tlob.dot b/tools/verification/models/tlob.dot
new file mode 100644
index 000000000000..8421b1120e80
--- /dev/null
+++ b/tools/verification/models/tlob.dot
@@ -0,0 +1,21 @@
+digraph state_automaton {
+ center = true;
+ size = "7,11";
+ {node [shape = plaintext, style=invis, label=""] "__init_running"};
+ {node [shape = ellipse] "running"};
+ {node [shape = plaintext] "running"};
+ {node [shape = plaintext] "waiting"};
+ {node [shape = plaintext] "sleeping"};
+ "__init_running" -> "running" [ label = "reset(clk_elapsed)" ];
+ "running" [label = "running\nclk_elapsed < BUDGET_NS()", color = green3];
+ "waiting" [label = "waiting\nclk_elapsed < BUDGET_NS()"];
+ "sleeping" [label = "sleeping\nclk_elapsed < BUDGET_NS()"];
+ "running" -> "sleeping" [ label = "sleep" ];
+ "running" -> "waiting" [ label = "preempt" ];
+ "waiting" -> "running" [ label = "switch_in" ];
+ "sleeping" -> "waiting" [ label = "wakeup" ];
+ { rank = min ;
+ "__init_running";
+ "running";
+ }
+}
--
2.25.1
^ permalink raw reply related
* [RFC PATCH v2 06/10] rvgen: support reset() on the __init arrow for global-window HA clocks
From: wen.yang @ 2026-05-11 18:24 UTC (permalink / raw)
To: Gabriele Monaco, Steven Rostedt
Cc: linux-trace-kernel, linux-kernel, Wen Yang
In-Reply-To: <cover.1778522945.git.wen.yang@linux.dev>
From: Wen Yang <wen.yang@linux.dev>
rvgen rejects a state invariant when its env is never reset on any
state-transition edge. This prevents expressing monitors where a clock
tracks the full monitoring window — reset once at object creation,
active in all states.
Allow reset() annotations on the __init_STATE -> STATE arrow.
automata.py adds listed envs to the new env_init_started set (and to
env_stored so the HA framework allocates per-object storage). dot2k.py
uses env_init_started for three purposes:
- Generate a handle_monitor_start() skeleton that resets the env and
arms the timer after the caller sets up DA storage and initial state.
- Guard ha_inv_to_guard calls with !ha_monitor_env_invalid() for these
envs: a concurrent DA event between da_handle_start_event() and
ha_reset_env() would otherwise store U64_MAX - BUDGET as the guard
anchor, silently disabling enforcement.
- Always generate ha_verify_guards() for monitors with invariants,
providing a stable extension point for future per-event guards.
Models without __init resets (e.g. stall.dot) are unaffected.
Signed-off-by: Wen Yang <wen.yang@linux.dev>
---
tools/verification/rvgen/rvgen/automata.py | 26 ++++++
tools/verification/rvgen/rvgen/dot2k.py | 100 +++++++++++++++++++--
2 files changed, 119 insertions(+), 7 deletions(-)
diff --git a/tools/verification/rvgen/rvgen/automata.py b/tools/verification/rvgen/rvgen/automata.py
index b9f8149f7118..178a1a4ffd8a 100644
--- a/tools/verification/rvgen/rvgen/automata.py
+++ b/tools/verification/rvgen/rvgen/automata.py
@@ -69,15 +69,41 @@ class Automata:
self.states, self.initial_state, self.final_states = self.__get_state_variables()
self.env_types = {}
self.env_stored = set()
+ self.env_init_started = set()
self.constraint_vars = set()
self.self_loop_reset_events = set()
self.events, self.envs = self.__get_event_variables()
+ self.__parse_init_resets()
self.function, self.constraints = self.__create_matrix()
self.events_start, self.events_start_run = self.__store_init_events()
self.env_stored = sorted(self.env_stored)
+ self.env_init_started = sorted(self.env_init_started)
self.constraint_vars = sorted(self.constraint_vars)
self.self_loop_reset_events = sorted(self.self_loop_reset_events)
+ def __parse_init_resets(self) -> None:
+ """Parse reset() annotations on the __init_STATE -> STATE arrow.
+
+ Adds each listed env to env_stored (HA framework allocates per-object
+ storage) and env_init_started (ha2k generates handle_monitor_start()).
+ """
+ init_prefix = f'"{self.init_marker}'
+ for line in map(str.lstrip, self.__dot_lines):
+ if not line.startswith(init_prefix):
+ continue
+ split_line = line.split()
+ if len(split_line) < 3 or split_line[1] != "->":
+ continue
+ if "label" not in line:
+ continue
+ label = "".join(split_line[split_line.index("label") + 2:-1]).replace('"', '')
+ for part in label.split(";"):
+ reset_m = self.constraint_reset.search(part.strip())
+ if reset_m:
+ env = reset_m["env"]
+ self.env_stored.add(env)
+ self.env_init_started.add(env)
+
def __get_model_name(self) -> str:
basename = ntpath.basename(self.__dot_path)
if not basename.endswith(".dot") and not basename.endswith(".gv"):
diff --git a/tools/verification/rvgen/rvgen/dot2k.py b/tools/verification/rvgen/rvgen/dot2k.py
index e6f476b903b0..e8066260c0af 100644
--- a/tools/verification/rvgen/rvgen/dot2k.py
+++ b/tools/verification/rvgen/rvgen/dot2k.py
@@ -366,7 +366,18 @@ f"""static inline void ha_convert_inv_guard(struct ha_monitor *ha_mon,
conf_g = [e for s, e in conflict_guards if s == state]
if not conf_i and not conf_g:
continue
- buff.append(f"\t{_else}if (curr_state == {self.states[state]}{self.enum_suffix})")
+
+ state_name = f"{self.states[state]}{self.enum_suffix}"
+ env_full = self.__get_constraint_env(constr)
+ env_bare = env_full[:-len(self.enum_suffix)]
+ if env_bare in self.env_init_started:
+ # env_store is ENV_INVALID_VALUE until handle_monitor_start();
+ # skip ha_inv_to_guard during the init race window.
+ cont = "\t\t " if _else else "\t "
+ buff.append(f"\t{_else}if (curr_state == {state_name} &&")
+ buff.append(f"{cont}!ha_monitor_env_invalid(ha_mon, {env_full}))")
+ else:
+ buff.append(f"\t{_else}if (curr_state == {state_name})")
buff.append(f"\t\t{self.__start_to_conv(constr)};")
_else = "else "
@@ -376,16 +387,22 @@ f"""static inline void ha_convert_inv_guard(struct ha_monitor *ha_mon,
def __fill_verify_guards_func(self) -> list[str]:
buff = []
- if not self.guards:
+ # Always generate for monitors with invariants: stable extension
+ # point for future guard conditions.
+ if not self.guards and not self.invariants:
return []
buff.append(
f"""static inline bool ha_verify_guards(struct ha_monitor *ha_mon,
\t\t\t\t enum {self.enum_states_def} curr_state, enum {self.enum_events_def} event,
\t\t\t\t enum {self.enum_states_def} next_state, u64 time_ns)
-{{
-\tbool res = true;
-""")
+{{""")
+
+ if not self.guards:
+ buff.append("\treturn true;\n}\n")
+ return buff
+
+ buff.append("\tbool res = true;\n")
_else = ""
for edge, constr in sorted(self.guards.items()):
@@ -522,7 +539,7 @@ f"""static bool ha_verify_constraint(struct ha_monitor *ha_mon,
buff.append("\tha_convert_inv_guard(ha_mon, curr_state, event, "
"next_state, time_ns);\n")
- if self.guards:
+ if self.guards or self.invariants:
buff.append("\tif (!ha_verify_guards(ha_mon, curr_state, event, "
"next_state, time_ns))\n\t\treturn false;\n")
@@ -599,8 +616,77 @@ f"""static bool ha_verify_constraint(struct ha_monitor *ha_mon,
buff.append("}\n")
return buff
+ def __fill_init_start_helper(self) -> list[str]:
+ """Generate handle_monitor_start() for envs reset on the __init arrow.
+
+ env_store is invalid inside da_handle_start_event(); this helper must
+ be called after DA storage is allocated and initial state is set.
+ """
+ if not self.env_init_started:
+ return []
+
+ # Collect the ha_start_timer call for each init-started env from the
+ # first state invariant that references it.
+ timer_calls: dict[str, str] = {}
+ for env in self.env_init_started:
+ env_full = f"{env}{self.enum_suffix}"
+ for constr in self.invariants.values():
+ if env_full in constr:
+ timer_calls[env] = constr
+ break
+
+ buff = []
+ buff.append(
+"""/*
+ * handle_monitor_start - reset per-object clock(s) and arm the timer.
+ *
+ * env_store is invalid inside da_handle_start_event(); call this helper
+ * after allocating DA storage and setting the initial DA state.
+ *
+ * XXX: replace the placeholders with the actual logic for your monitor.
+ */""")
+
+ if self.monitor_type == "per_obj":
+ buff.append("static int handle_monitor_start(int id, monitor_target t)")
+ buff.append("{")
+ buff.append("\tstruct ha_monitor *ha_mon;")
+ buff.append("\tu64 time_ns = ktime_get_ns();\n")
+ buff.append("\t/* XXX: allocate DA storage, e.g. da_create_or_get(id, t) */")
+ buff.append("\t/* XXX: set initial DA state, e.g. da_handle_start_event(id, t, <event>) */")
+ buff.append("\tha_mon = /* XXX: retrieve ha_monitor for (id, t) */;")
+ elif self.monitor_type == "per_task":
+ buff.append("static int handle_monitor_start(struct task_struct *p)")
+ buff.append("{")
+ buff.append("\tstruct ha_monitor *ha_mon;")
+ buff.append("\tu64 time_ns = ktime_get_ns();\n")
+ buff.append("\t/* XXX: allocate DA storage, e.g. da_create_or_get(p->pid, p) */")
+ buff.append("\t/* XXX: set initial DA state, e.g. da_handle_start_event(p->pid, p, <event>) */")
+ buff.append("\tha_mon = /* XXX: retrieve ha_monitor for p */;")
+ else:
+ buff.append("static int handle_monitor_start(void)")
+ buff.append("{")
+ buff.append("\tstruct ha_monitor *ha_mon;")
+ buff.append("\tu64 time_ns = ktime_get_ns();\n")
+ buff.append("\tha_mon = /* XXX: retrieve global ha_monitor */;")
+
+ buff.append("\tif (!ha_mon)")
+ buff.append("\t\treturn -ENOENT;")
+
+ for env in self.env_init_started:
+ buff.append(f"\tha_reset_env(ha_mon, {env}{self.enum_suffix}, time_ns);")
+ if env in timer_calls:
+ buff.append(f"\t{timer_calls[env]};")
+ else:
+ buff.append(f"\t/* XXX: arm timer for {env} */")
+
+ buff.append("\treturn 0;")
+ buff.append("}\n")
+ return buff
+
def _fill_hybrid_definitions(self) -> list[str]:
- return self.__fill_hybrid_get_reset_functions() + self.__fill_constr_func()
+ return (self.__fill_hybrid_get_reset_functions() +
+ self.__fill_init_start_helper() +
+ self.__fill_constr_func())
def _fill_timer_type(self) -> list:
if self.invariants:
--
2.25.1
^ permalink raw reply related
* [RFC PATCH v2 05/10] rv: add generic uprobe infrastructure for RV monitors
From: wen.yang @ 2026-05-11 18:24 UTC (permalink / raw)
To: Gabriele Monaco, Steven Rostedt
Cc: linux-trace-kernel, linux-kernel, Wen Yang
In-Reply-To: <cover.1778522945.git.wen.yang@linux.dev>
From: Wen Yang <wen.yang@linux.dev>
Introduce rv_uprobe, a thin wrapper around uprobe_consumer providing
rv_uprobe_attach_path(), rv_uprobe_attach(), and rv_uprobe_detach()
for RV monitors. An opaque priv pointer is forwarded unchanged to
entry/return handlers so monitors can carry per-binding state (e.g. a
latency threshold) to the hot path without any global lookup.
rv_uprobe_detach() is fully synchronous (nosync + sync + path_put +
kfree), closing the use-after-free window present in open-coded
patterns where kfree() precedes uprobe_unregister_sync().
Signed-off-by: Wen Yang <wen.yang@linux.dev>
---
include/rv/rv_uprobe.h | 87 ++++++++++++++++++++
kernel/trace/rv/Kconfig | 4 +
kernel/trace/rv/Makefile | 1 +
kernel/trace/rv/rv_uprobe.c | 153 ++++++++++++++++++++++++++++++++++++
4 files changed, 245 insertions(+)
create mode 100644 include/rv/rv_uprobe.h
create mode 100644 kernel/trace/rv/rv_uprobe.c
diff --git a/include/rv/rv_uprobe.h b/include/rv/rv_uprobe.h
new file mode 100644
index 000000000000..084cdb36a2ff
--- /dev/null
+++ b/include/rv/rv_uprobe.h
@@ -0,0 +1,87 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Generic uprobe infrastructure for RV monitors.
+ *
+ */
+
+#ifndef _RV_UPROBE_H
+#define _RV_UPROBE_H
+
+#include <linux/path.h>
+#include <linux/types.h>
+
+struct pt_regs;
+
+/**
+ * struct rv_uprobe - a single uprobe registered on behalf of an RV monitor
+ *
+ * @offset: byte offset within the ELF binary where the probe is installed
+ * @priv: monitor-private pointer; set at attach time, never touched by
+ * this layer; passed unchanged to entry_fn / ret_fn
+ * @path: resolved path of the probed binary (read-only after attach);
+ * callers may use path.dentry for identity comparisons
+ *
+ * The implementation fields (uprobe_consumer, uprobe handle, callbacks) are
+ * private to rv_uprobe.c and are not exposed here; monitors must not access
+ * them directly.
+ */
+struct rv_uprobe {
+ /* public: read-only after rv_uprobe_attach*() */
+ loff_t offset;
+ void *priv;
+ struct path path;
+};
+
+/**
+ * rv_uprobe_attach_path - register an uprobe given an already-resolved path
+ * @path: path of the target binary; rv_uprobe takes its own reference
+ * @offset: byte offset within the binary
+ * @entry_fn: called on probe hit (entry); may be NULL
+ * @ret_fn: called on function return (uretprobe); may be NULL
+ * @priv: opaque pointer forwarded to callbacks unchanged
+ *
+ * Use this variant when the caller has already resolved the path (e.g. to
+ * register multiple probes on the same binary with a single kern_path call).
+ * The inode is derived internally via d_real_inode(), so inode and path are
+ * always consistent.
+ *
+ * Returns a pointer to the new rv_uprobe on success, ERR_PTR on failure.
+ */
+struct rv_uprobe *rv_uprobe_attach_path(struct path *path, loff_t offset,
+ int (*entry_fn)(struct rv_uprobe *p, struct pt_regs *regs, __u64 *data),
+ int (*ret_fn)(struct rv_uprobe *p, unsigned long func,
+ struct pt_regs *regs, __u64 *data),
+ void *priv);
+
+/**
+ * rv_uprobe_attach - resolve binpath and register an uprobe
+ * @binpath: absolute path to the target binary
+ * @offset: byte offset within the binary
+ * @entry_fn: called on probe hit (entry); may be NULL
+ * @ret_fn: called on function return (uretprobe); may be NULL
+ * @priv: opaque pointer forwarded to callbacks unchanged
+ *
+ * Resolves binpath via kern_path(), then delegates to rv_uprobe_attach_path().
+ *
+ * Returns a pointer to the new rv_uprobe on success, ERR_PTR on failure.
+ */
+struct rv_uprobe *rv_uprobe_attach(const char *binpath, loff_t offset,
+ int (*entry_fn)(struct rv_uprobe *p, struct pt_regs *regs, __u64 *data),
+ int (*ret_fn)(struct rv_uprobe *p, unsigned long func,
+ struct pt_regs *regs, __u64 *data),
+ void *priv);
+
+/**
+ * rv_uprobe_detach - synchronously unregister an uprobe and free it
+ * @p: probe to detach; may be NULL (no-op)
+ *
+ * Calls uprobe_unregister_nosync(), then uprobe_unregister_sync() to wait
+ * for any in-progress handler to finish, then releases the path reference
+ * and frees the rv_uprobe struct. The caller's priv data is NOT freed.
+ *
+ * Safe to call from process context only (uprobe_unregister_sync() may
+ * schedule).
+ */
+void rv_uprobe_detach(struct rv_uprobe *p);
+
+#endif /* _RV_UPROBE_H */
diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig
index 3884b14df375..e2e0033a00b9 100644
--- a/kernel/trace/rv/Kconfig
+++ b/kernel/trace/rv/Kconfig
@@ -59,6 +59,10 @@ config RV_PER_TASK_MONITORS
This option configures the maximum number of per-task RV monitors that can run
simultaneously.
+config RV_UPROBE
+ bool
+ depends on RV && UPROBES
+
source "kernel/trace/rv/monitors/wip/Kconfig"
source "kernel/trace/rv/monitors/wwnr/Kconfig"
diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile
index 94498da35b37..f139b904bea3 100644
--- a/kernel/trace/rv/Makefile
+++ b/kernel/trace/rv/Makefile
@@ -21,6 +21,7 @@ obj-$(CONFIG_RV_MON_STALL) += monitors/stall/stall.o
obj-$(CONFIG_RV_MON_DEADLINE) += monitors/deadline/deadline.o
obj-$(CONFIG_RV_MON_NOMISS) += monitors/nomiss/nomiss.o
# Add new monitors here
+obj-$(CONFIG_RV_UPROBE) += rv_uprobe.o
obj-$(CONFIG_RV_REACTORS) += rv_reactors.o
obj-$(CONFIG_RV_REACT_PRINTK) += reactor_printk.o
obj-$(CONFIG_RV_REACT_PANIC) += reactor_panic.o
diff --git a/kernel/trace/rv/rv_uprobe.c b/kernel/trace/rv/rv_uprobe.c
new file mode 100644
index 000000000000..bc28399cfd4b
--- /dev/null
+++ b/kernel/trace/rv/rv_uprobe.c
@@ -0,0 +1,153 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Generic uprobe infrastructure for RV monitors.
+ *
+ */
+#include <linux/dcache.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/slab.h>
+#include <linux/uprobes.h>
+#include <rv/rv_uprobe.h>
+
+/*
+ * Private extension of struct rv_uprobe. Allocated by rv_uprobe_attach*()
+ * and returned to callers as &impl->pub.
+ */
+struct rv_uprobe_impl {
+ struct rv_uprobe pub; /* must be first; callers hold &pub */
+ struct uprobe_consumer uc; /* passed to uprobe_register(); handlers map it back to impl */
+ struct uprobe *uprobe; /* value returned by uprobe_register() */
+ int (*entry_fn)(struct rv_uprobe *p, struct pt_regs *regs, __u64 *data); /* may be NULL */
+ int (*ret_fn)(struct rv_uprobe *p, unsigned long func,
+ struct pt_regs *regs, __u64 *data); /* may be NULL, but not both callbacks */
+};
+
+static int rv_uprobe_handler(struct uprobe_consumer *uc,
+ struct pt_regs *regs, __u64 *data)
+{
+ struct rv_uprobe_impl *impl = container_of(uc, struct rv_uprobe_impl, uc);
+
+ if (impl->entry_fn) /* forward to the caller's entry callback, if any */
+ return impl->entry_fn(&impl->pub, regs, data); /* callback sees the public handle */
+ return 0; /* no entry callback set */
+}
+
+static int rv_uprobe_ret_handler(struct uprobe_consumer *uc,
+ unsigned long func,
+ struct pt_regs *regs, __u64 *data)
+{
+ struct rv_uprobe_impl *impl = container_of(uc, struct rv_uprobe_impl, uc);
+
+ if (impl->ret_fn) /* forward to the caller's return callback, if any */
+ return impl->ret_fn(&impl->pub, func, regs, data); /* callback sees the public handle */
+ return 0; /* no return callback set */
+}
+
+static struct rv_uprobe *
+__rv_uprobe_attach(struct inode *inode, struct path *path, loff_t offset,
+ int (*entry_fn)(struct rv_uprobe *p, struct pt_regs *regs, __u64 *data),
+ int (*ret_fn)(struct rv_uprobe *p, unsigned long func,
+ struct pt_regs *regs, __u64 *data),
+ void *priv)
+{
+ struct rv_uprobe_impl *impl;
+ int ret;
+
+ if (!entry_fn && !ret_fn) /* at least one callback is required */
+ return ERR_PTR(-EINVAL);
+
+ impl = kzalloc_obj(*impl, GFP_KERNEL);
+ if (!impl)
+ return ERR_PTR(-ENOMEM);
+
+ impl->pub.offset = offset;
+ impl->pub.priv = priv;
+ impl->entry_fn = entry_fn;
+ impl->ret_fn = ret_fn;
+ path_get(path); /* dropped on the error path below or in rv_uprobe_detach() */
+ impl->pub.path = *path;
+
+ if (entry_fn) /* install only the handlers that have a callback */
+ impl->uc.handler = rv_uprobe_handler;
+ if (ret_fn)
+ impl->uc.ret_handler = rv_uprobe_ret_handler;
+
+ impl->uprobe = uprobe_register(inode, offset, 0, &impl->uc); /* NOTE(review): 0 is presumably ref_ctr_offset; confirm */
+ if (IS_ERR(impl->uprobe)) {
+ ret = PTR_ERR(impl->uprobe); /* save the error before impl is freed */
+ path_put(&impl->pub.path);
+ kfree(impl);
+ return ERR_PTR(ret);
+ }
+
+ return &impl->pub; /* callers only receive the embedded public handle */
+}
+
+/**
+ * rv_uprobe_attach_path - register an uprobe given an already-resolved path
+ */
+struct rv_uprobe *rv_uprobe_attach_path(struct path *path, loff_t offset,
+ int (*entry_fn)(struct rv_uprobe *p, struct pt_regs *regs, __u64 *data),
+ int (*ret_fn)(struct rv_uprobe *p, unsigned long func,
+ struct pt_regs *regs, __u64 *data),
+ void *priv)
+{
+ struct inode *inode = d_real_inode(path->dentry); /* inode handed to uprobe_register() */
+
+ return __rv_uprobe_attach(inode, path, offset, entry_fn, ret_fn, priv); /* no d_is_reg() check here */
+}
+EXPORT_SYMBOL_GPL(rv_uprobe_attach_path);
+
+/**
+ * rv_uprobe_attach - resolve binpath and register an uprobe
+ */
+struct rv_uprobe *rv_uprobe_attach(const char *binpath, loff_t offset,
+ int (*entry_fn)(struct rv_uprobe *p, struct pt_regs *regs, __u64 *data),
+ int (*ret_fn)(struct rv_uprobe *p, unsigned long func,
+ struct pt_regs *regs, __u64 *data),
+ void *priv)
+{
+ struct rv_uprobe *p;
+ struct path path;
+ int ret;
+
+ ret = kern_path(binpath, LOOKUP_FOLLOW, &path); /* on success, path holds a reference */
+ if (ret)
+ return ERR_PTR(ret);
+
+ if (!d_is_reg(path.dentry)) { /* only regular files are accepted */
+ path_put(&path);
+ return ERR_PTR(-EINVAL);
+ }
+
+ p = rv_uprobe_attach_path(&path, offset, entry_fn, ret_fn, priv);
+ path_put(&path); /* drop the lookup reference; a live probe holds its own via path_get() */
+ return p; /* valid handle or ERR_PTR from the attach */
+}
+EXPORT_SYMBOL_GPL(rv_uprobe_attach);
+
+/**
+ * rv_uprobe_detach - synchronously unregister an uprobe and free it
+ */
+void rv_uprobe_detach(struct rv_uprobe *p)
+{
+ struct rv_uprobe_impl *impl;
+
+ if (!p) /* NULL is accepted and treated as a no-op */
+ return;
+
+ impl = container_of(p, struct rv_uprobe_impl, pub);
+ uprobe_unregister_nosync(impl->uprobe, &impl->uc);
+ /*
+ * uprobe_unregister_sync() is a global barrier: it waits for all
+ * in-flight uprobe handlers across the entire system to complete,
+ * not just handlers for this probe. This is intentional -- it
+ * guarantees that no handler touching impl->pub.priv is running by
+ * the time we return, even if the caller immediately frees priv.
+ */
+ uprobe_unregister_sync();
+ path_put(&p->path); /* pairs with path_get() in __rv_uprobe_attach() */
+ kfree(impl); /* frees the wrapper only; priv stays with the caller */
+}
+EXPORT_SYMBOL_GPL(rv_uprobe_detach);
--
2.25.1
^ permalink raw reply related
* [RFC PATCH v2 00/10] rv/tlob: Add task latency over budget RV monitor
From: wen.yang @ 2026-05-11 18:24 UTC (permalink / raw)
To: Gabriele Monaco, Steven Rostedt
Cc: linux-trace-kernel, linux-kernel, Wen Yang
From: Wen Yang <wen.yang@linux.dev>
This series introduces tlob (task latency over budget), a per-task
hybrid automaton RV monitor that measures elapsed wall-clock time across
a user-delimited code section and fires when the time exceeds a
configurable budget.
Background
----------
The existing wwnr monitor uses a two-state DA to detect tasks that are
woken but never run. tlob extends the RV framework to a three-state
hybrid automaton:
running (initial) -- on CPU
waiting -- in the scheduler runqueue, not yet on CPU
sleeping -- blocked on a lock, I/O, or similar resource
A single HA clock invariant, clk_elapsed < BUDGET_NS(), is active in
all states. The framework enforces it via a per-task hrtimer. On
expiry, error_env_tlob is emitted, followed by detail_env_tlob which
carries a per-state time breakdown (running_ns, waiting_ns, sleeping_ns)
that pinpoints whether the overrun occurred in the running, waiting, or
sleeping state.
Two userspace interfaces are provided:
ioctl (/dev/rv): TLOB_IOCTL_TRACE_START / TLOB_IOCTL_TRACE_STOP for
in-process self-instrumentation; TRACE_STOP returns -EOVERFLOW if
the budget was exceeded during the window.
tracefs (monitors/tlob/monitor): write "p PATH:START STOP threshold=NS"
to attach uprobes to an unmodified binary. The start uprobe calls
tlob_start_task() and the stop uprobe calls tlob_stop_task().
Infrastructure additions
------------------------
The series also includes several prerequisite additions to the RV
framework that are useful beyond tlob:
rv/da: two bug fixes for race conditions that affect existing per-task
monitors: one in the start and memory ordering of the monitoring flag,
and one in the ordering and synchronization of da_monitor_destroy()
(patches 1-2).
rv/da: pre-allocated storage pool (da_monitor_init_prealloc) to avoid
kmalloc on the scheduler tracepoint hot path and to support
PREEMPT_RT configurations (patch 4).
rv: generic uprobe infrastructure (rv_uprobe) providing a thin wrapper
around uprobe_consumer with a per-binding priv pointer and fully
synchronous detach semantics (patch 5).
rvgen: support for reset() annotations on the __init arrow, enabling
monitors where a clock is reset once at object creation and active
in all states (patch 6).
Testing
-------
KUnit (five suites, 19 test cases):
tlob_task_api -- start/stop lifecycle, error paths, -ENOSPC,
deadline firing in each state
tlob_sched_integration -- context-switch accounting, monitoring a
kthread other than current
tlob_uprobe_format -- add/remove format acceptance and rejection
via tlob_create_or_delete_uprobe()
tlob_trace_output -- event_tlob and error_env_tlob field values
tlob_violation_react -- error count per budget expiry; per-state
ns breakdown dominance
kselftest (12 tests, run via verificationtest-ktap on x86_64 with vng):
TAP version 13
1..12
ok 1 Test monitor enable/disable
ok 2 Test monitor reactor setting
ok 3 Check available monitors
ok 4 Test wwnr monitor with printk reactor
ok 5 Test tlob ioctl self-instrumentation (within/over-budget, error paths)
ok 6 Test tlob monitor tracefs interface (enable/disable and files)
ok 7 uprobe binding: visible in monitor file, removable, duplicate offset rejected
ok 8 uprobe detail sleeping: sleeping_ns dominates when task blocks between probes
ok 9 uprobe detail waiting: waiting_ns dominates when task is preempted between probes
ok 10 Two bindings on same binary with different offsets and budgets fire independently
ok 11 Verify no spurious error_env_tlob events without an active uprobe binding
ok 12 uprobe violation: error_env_tlob and detail_env_tlob fire with correct fields
# Totals: pass:12 fail:0 xfail:0 xpass:0 skip:0 error:0
Changes in v2 (addressing review by Gabriele)
----------------------------------------------------------------
- Switch from a custom hash table to RV_MON_PER_OBJ with
"typedef struct tlob_task_state *monitor_target"; the per-invocation
threshold is carried in the target struct and accessed via
ha_get_target(ha_mon)->threshold_us, following the nomiss monitor
pattern.
- Replace the explicit unmonitored state and da_monitor_init_hook()
timer arming with da_handle_start_event() for initial state setup,
then ha_reset_clk_ns() + ha_start_timer_ns() directly. The HA clock
invariant clk_elapsed < BUDGET_NS() is now expressed in the DOT model
and enforced by the HA framework.
- Drop the v1 KUnit test suite that validated every DA transition matrix
entry; replace with five suites that test meaningful behaviours:
start/stop lifecycle, context-switch accounting, uprobe format
validation, trace event field values, and violation reaction with
per-state ns breakdown.
- Move selftests from tools/testing/selftests/rv/ into the existing
tools/testing/selftests/verification/ harness; test logic is expressed
as ftracetest .tc scripts under test.d/tlob/, with helper binaries
(tlob_ioctl, tlob_target) built by tlob/Makefile and located via PATH.
Wen Yang (10):
rv/da: fix monitor start ordering and memory ordering for monitoring
flag
rv/da: fix per-task da_monitor_destroy() ordering and sync
selftests/verification: fix verificationtest-ktap for out-of-tree
execution
rv/da: add pre-allocated storage pool for per-object monitors
rv: add generic uprobe infrastructure for RV monitors
rvgen: support reset() on the __init arrow for global-window HA clocks
rv/tlob: add tlob model DOT file
rv/tlob: add tlob hybrid automaton monitor
rv/tlob: add KUnit tests for the tlob monitor
selftests/verification: add tlob selftests
Documentation/trace/rv/index.rst | 1 +
Documentation/trace/rv/monitor_tlob.rst | 213 +++
MAINTAINERS | 3 +
include/linux/rv.h | 45 +
include/rv/automata.h | 15 +
include/rv/da_monitor.h | 234 ++-
include/rv/ha_monitor.h | 33 +-
include/rv/rv_uprobe.h | 119 ++
include/uapi/linux/rv.h | 86 ++
kernel/trace/rv/Kconfig | 6 +
kernel/trace/rv/Makefile | 5 +-
kernel/trace/rv/monitors/tlob/.kunitconfig | 5 +
kernel/trace/rv/monitors/tlob/Kconfig | 69 +
kernel/trace/rv/monitors/tlob/tlob.c | 1333 +++++++++++++++++
kernel/trace/rv/monitors/tlob/tlob.h | 171 +++
kernel/trace/rv/monitors/tlob/tlob_kunit.c | 881 +++++++++++
kernel/trace/rv/monitors/tlob/tlob_trace.h | 58 +
kernel/trace/rv/rv.c | 38 +
kernel/trace/rv/rv.h | 2 +
kernel/trace/rv/rv_chardev.c | 201 +++
kernel/trace/rv/rv_trace.h | 1 +
kernel/trace/rv/rv_uprobe.c | 189 +++
tools/include/uapi/linux/rv.h | 86 ++
tools/testing/selftests/verification/Makefile | 21 +-
.../verification/test.d/tlob/ioctl.tc | 36 +
.../verification/test.d/tlob/tracefs.tc | 17 +
.../verification/test.d/tlob/uprobe_bind.tc | 34 +
.../test.d/tlob/uprobe_detail_sleeping.tc | 47 +
.../test.d/tlob/uprobe_detail_waiting.tc | 60 +
.../verification/test.d/tlob/uprobe_multi.tc | 60 +
.../test.d/tlob/uprobe_no_event.tc | 19 +
.../test.d/tlob/uprobe_violation.tc | 60 +
.../selftests/verification/tlob/Makefile | 21 +
.../selftests/verification/tlob/tlob_ioctl.c | 626 ++++++++
.../selftests/verification/tlob/tlob_target.c | 138 ++
.../verification/verificationtest-ktap | 4 +-
tools/verification/models/tlob.dot | 21 +
tools/verification/rvgen/rvgen/automata.py | 26 +
tools/verification/rvgen/rvgen/dot2k.py | 100 +-
39 files changed, 5043 insertions(+), 41 deletions(-)
create mode 100644 Documentation/trace/rv/monitor_tlob.rst
create mode 100644 include/rv/rv_uprobe.h
create mode 100644 include/uapi/linux/rv.h
create mode 100644 kernel/trace/rv/monitors/tlob/.kunitconfig
create mode 100644 kernel/trace/rv/monitors/tlob/Kconfig
create mode 100644 kernel/trace/rv/monitors/tlob/tlob.c
create mode 100644 kernel/trace/rv/monitors/tlob/tlob.h
create mode 100644 kernel/trace/rv/monitors/tlob/tlob_kunit.c
create mode 100644 kernel/trace/rv/monitors/tlob/tlob_trace.h
create mode 100644 kernel/trace/rv/rv_chardev.c
create mode 100644 kernel/trace/rv/rv_uprobe.c
create mode 100644 tools/include/uapi/linux/rv.h
create mode 100644 tools/testing/selftests/verification/test.d/tlob/ioctl.tc
create mode 100644 tools/testing/selftests/verification/test.d/tlob/tracefs.tc
create mode 100644 tools/testing/selftests/verification/test.d/tlob/uprobe_bind.tc
create mode 100644 tools/testing/selftests/verification/test.d/tlob/uprobe_detail_sleeping.tc
create mode 100644 tools/testing/selftests/verification/test.d/tlob/uprobe_detail_waiting.tc
create mode 100644 tools/testing/selftests/verification/test.d/tlob/uprobe_multi.tc
create mode 100644 tools/testing/selftests/verification/test.d/tlob/uprobe_no_event.tc
create mode 100644 tools/testing/selftests/verification/test.d/tlob/uprobe_violation.tc
create mode 100644 tools/testing/selftests/verification/tlob/Makefile
create mode 100644 tools/testing/selftests/verification/tlob/tlob_ioctl.c
create mode 100644 tools/testing/selftests/verification/tlob/tlob_target.c
create mode 100644 tools/verification/models/tlob.dot
--
2.25.1
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox