* [PATCH v2 4/9] mm: move free_reserved_area() to mm/memblock.c
From: Mike Rapoport @ 2026-03-23 7:48 UTC (permalink / raw)
To: Andrew Morton
Cc: Alexander Potapenko, Alexander Viro, Andreas Larsson,
Ard Biesheuvel, Borislav Petkov, Brendan Jackman,
Christophe Leroy (CS GROUP), Catalin Marinas, Christian Brauner,
David S. Miller, Dave Hansen, David Hildenbrand, Dmitry Vyukov,
Ilias Apalodimas, Ingo Molnar, Jan Kara, Johannes Weiner,
Liam R. Howlett, Lorenzo Stoakes, Madhavan Srinivasan,
Marco Elver, Marek Szyprowski, Masami Hiramatsu, Michael Ellerman,
Michal Hocko, Mike Rapoport, Nicholas Piggin, H. Peter Anvin,
Rob Herring, Robin Murphy, Saravana Kannan, Suren Baghdasaryan,
Thomas Gleixner, Vlastimil Babka, Will Deacon, Zi Yan, devicetree,
iommu, kasan-dev, linux-arm-kernel, linux-efi, linux-fsdevel,
linux-kernel, linux-mm, linux-trace-kernel, linuxppc-dev,
sparclinux, x86
In-Reply-To: <20260323074836.3653702-1-rppt@kernel.org>
From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
free_reserved_area() is related to memblock as it frees reserved memory
back to the buddy allocator, similar to what memblock_free_late() does.
Move free_reserved_area() to mm/memblock.c to prepare for further
consolidation of the functions that free reserved memory.
No functional changes.
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
---
mm/memblock.c | 37 ++++++++++++++++++++++++++++++-
mm/page_alloc.c | 36 ------------------------------
tools/include/linux/mm.h | 1 +
tools/testing/memblock/internal.h | 34 +++++++++++++++++++++++++---
4 files changed, 68 insertions(+), 40 deletions(-)
diff --git a/mm/memblock.c b/mm/memblock.c
index d4a02f1750e9..c0896efbee97 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -893,6 +893,42 @@ int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size)
return memblock_remove_range(&memblock.memory, base, size);
}
+unsigned long free_reserved_area(void *start, void *end, int poison, const char *s)
+{
+ void *pos;
+ unsigned long pages = 0;
+
+ start = (void *)PAGE_ALIGN((unsigned long)start);
+ end = (void *)((unsigned long)end & PAGE_MASK);
+ for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
+ struct page *page = virt_to_page(pos);
+ void *direct_map_addr;
+
+ /*
+ * 'direct_map_addr' might be different from 'pos'
+ * because some architectures' virt_to_page()
+ * work with aliases. Getting the direct map
+ * address ensures that we get a _writeable_
+ * alias for the memset().
+ */
+ direct_map_addr = page_address(page);
+ /*
+ * Perform a kasan-unchecked memset() since this memory
+ * has not been initialized.
+ */
+ direct_map_addr = kasan_reset_tag(direct_map_addr);
+ if ((unsigned int)poison <= 0xFF)
+ memset(direct_map_addr, poison, PAGE_SIZE);
+
+ free_reserved_page(page);
+ }
+
+ if (pages && s)
+ pr_info("Freeing %s memory: %ldK\n", s, K(pages));
+
+ return pages;
+}
+
/**
* memblock_free - free boot memory allocation
* @ptr: starting address of the boot memory allocation
@@ -1776,7 +1812,6 @@ void __init memblock_free_late(phys_addr_t base, phys_addr_t size)
totalram_pages_inc();
}
}
-
/*
* Remaining API functions
*/
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2d4b6f1a554e..df3d61253001 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6234,42 +6234,6 @@ void adjust_managed_page_count(struct page *page, long count)
}
EXPORT_SYMBOL(adjust_managed_page_count);
-unsigned long free_reserved_area(void *start, void *end, int poison, const char *s)
-{
- void *pos;
- unsigned long pages = 0;
-
- start = (void *)PAGE_ALIGN((unsigned long)start);
- end = (void *)((unsigned long)end & PAGE_MASK);
- for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
- struct page *page = virt_to_page(pos);
- void *direct_map_addr;
-
- /*
- * 'direct_map_addr' might be different from 'pos'
- * because some architectures' virt_to_page()
- * work with aliases. Getting the direct map
- * address ensures that we get a _writeable_
- * alias for the memset().
- */
- direct_map_addr = page_address(page);
- /*
- * Perform a kasan-unchecked memset() since this memory
- * has not been initialized.
- */
- direct_map_addr = kasan_reset_tag(direct_map_addr);
- if ((unsigned int)poison <= 0xFF)
- memset(direct_map_addr, poison, PAGE_SIZE);
-
- free_reserved_page(page);
- }
-
- if (pages && s)
- pr_info("Freeing %s memory: %ldK\n", s, K(pages));
-
- return pages;
-}
-
void free_reserved_page(struct page *page)
{
clear_page_tag_ref(page);
diff --git a/tools/include/linux/mm.h b/tools/include/linux/mm.h
index 028f3faf46e7..4407d8396108 100644
--- a/tools/include/linux/mm.h
+++ b/tools/include/linux/mm.h
@@ -17,6 +17,7 @@
#define __va(x) ((void *)((unsigned long)(x)))
#define __pa(x) ((unsigned long)(x))
+#define __pa_symbol(x) ((unsigned long)(x))
#define pfn_to_page(pfn) ((void *)((pfn) * PAGE_SIZE))
diff --git a/tools/testing/memblock/internal.h b/tools/testing/memblock/internal.h
index 009b97bbdd22..b72be2968104 100644
--- a/tools/testing/memblock/internal.h
+++ b/tools/testing/memblock/internal.h
@@ -11,9 +11,22 @@ static int memblock_debug = 1;
#define pr_warn_ratelimited(fmt, ...) printf(fmt, ##__VA_ARGS__)
+#define K(x) ((x) << (PAGE_SHIFT-10))
+
bool mirrored_kernelcore = false;
struct page {};
+static inline void *page_address(struct page *page)
+{
+ BUG();
+ return page;
+}
+
+static inline struct page *virt_to_page(void *virt)
+{
+ BUG();
+ return virt;
+}
void memblock_free_pages(unsigned long pfn, unsigned int order)
{
@@ -23,10 +36,25 @@ static inline void accept_memory(phys_addr_t start, unsigned long size)
{
}
-static inline unsigned long free_reserved_area(void *start, void *end,
- int poison, const char *s)
+unsigned long free_reserved_area(void *start, void *end, int poison, const char *s);
+void free_reserved_page(struct page *page);
+
+static inline bool deferred_pages_enabled(void)
+{
+ return false;
+}
+
+#define for_each_valid_pfn(pfn, start_pfn, end_pfn) \
+ for ((pfn) = (start_pfn); (pfn) < (end_pfn); (pfn)++)
+
+static inline void *kasan_reset_tag(const void *addr)
+{
+ return (void *)addr;
+}
+
+static inline bool __is_kernel(unsigned long addr)
{
- return 0;
+ return false;
}
#endif
--
2.53.0
^ permalink raw reply related
* [PATCH v2 3/9] powerpc: opal-core: pair alloc_pages_exact() with free_pages_exact()
From: Mike Rapoport @ 2026-03-23 7:48 UTC (permalink / raw)
To: Andrew Morton
Cc: Alexander Potapenko, Alexander Viro, Andreas Larsson,
Ard Biesheuvel, Borislav Petkov, Brendan Jackman,
Christophe Leroy (CS GROUP), Catalin Marinas, Christian Brauner,
David S. Miller, Dave Hansen, David Hildenbrand, Dmitry Vyukov,
Ilias Apalodimas, Ingo Molnar, Jan Kara, Johannes Weiner,
Liam R. Howlett, Lorenzo Stoakes, Madhavan Srinivasan,
Marco Elver, Marek Szyprowski, Masami Hiramatsu, Michael Ellerman,
Michal Hocko, Mike Rapoport, Nicholas Piggin, H. Peter Anvin,
Rob Herring, Robin Murphy, Saravana Kannan, Suren Baghdasaryan,
Thomas Gleixner, Vlastimil Babka, Will Deacon, Zi Yan, devicetree,
iommu, kasan-dev, linux-arm-kernel, linux-efi, linux-fsdevel,
linux-kernel, linux-mm, linux-trace-kernel, linuxppc-dev,
sparclinux, x86
In-Reply-To: <20260323074836.3653702-1-rppt@kernel.org>
From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
opal-core allocates buffers with alloc_pages_exact(), but then
marks them as reserved and frees using free_reserved_area().
This is completely unnecessary and the pages allocated with
alloc_pages_exact() can be naturally freed with free_pages_exact().
Replace freeing of memory in opalcore_cleanup() with
free_pages_exact() and simplify allocation code so that it won't mark
allocated pages as reserved.
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
---
arch/powerpc/platforms/powernv/opal-core.c | 11 +----------
1 file changed, 1 insertion(+), 10 deletions(-)
diff --git a/arch/powerpc/platforms/powernv/opal-core.c b/arch/powerpc/platforms/powernv/opal-core.c
index e76e462f55f6..32662d30d70f 100644
--- a/arch/powerpc/platforms/powernv/opal-core.c
+++ b/arch/powerpc/platforms/powernv/opal-core.c
@@ -303,7 +303,6 @@ static int __init create_opalcore(void)
struct device_node *dn;
struct opalcore *new;
loff_t opalcore_off;
- struct page *page;
Elf64_Phdr *phdr;
Elf64_Ehdr *elf;
int i, ret;
@@ -328,11 +327,6 @@ static int __init create_opalcore(void)
oc_conf->opalcorebuf_sz = 0;
return -ENOMEM;
}
- count = oc_conf->opalcorebuf_sz / PAGE_SIZE;
- page = virt_to_page(oc_conf->opalcorebuf);
- for (i = 0; i < count; i++)
- mark_page_reserved(page + i);
-
pr_debug("opalcorebuf = 0x%llx\n", (u64)oc_conf->opalcorebuf);
/* Read OPAL related device-tree entries */
@@ -437,10 +431,7 @@ static void opalcore_cleanup(void)
/* free the buffer used for setting up OPAL core */
if (oc_conf->opalcorebuf) {
- void *end = (void *)((u64)oc_conf->opalcorebuf +
- oc_conf->opalcorebuf_sz);
-
- free_reserved_area(oc_conf->opalcorebuf, end, -1, NULL);
+ free_pages_exact(oc_conf->opalcorebuf, oc_conf->opalcorebuf_sz);
oc_conf->opalcorebuf = NULL;
oc_conf->opalcorebuf_sz = 0;
}
--
2.53.0
^ permalink raw reply related
* [PATCH v2 2/9] powerpc: fadump: pair alloc_pages_exact() with free_pages_exact()
From: Mike Rapoport @ 2026-03-23 7:48 UTC (permalink / raw)
To: Andrew Morton
Cc: Alexander Potapenko, Alexander Viro, Andreas Larsson,
Ard Biesheuvel, Borislav Petkov, Brendan Jackman,
Christophe Leroy (CS GROUP), Catalin Marinas, Christian Brauner,
David S. Miller, Dave Hansen, David Hildenbrand, Dmitry Vyukov,
Ilias Apalodimas, Ingo Molnar, Jan Kara, Johannes Weiner,
Liam R. Howlett, Lorenzo Stoakes, Madhavan Srinivasan,
Marco Elver, Marek Szyprowski, Masami Hiramatsu, Michael Ellerman,
Michal Hocko, Mike Rapoport, Nicholas Piggin, H. Peter Anvin,
Rob Herring, Robin Murphy, Saravana Kannan, Suren Baghdasaryan,
Thomas Gleixner, Vlastimil Babka, Will Deacon, Zi Yan, devicetree,
iommu, kasan-dev, linux-arm-kernel, linux-efi, linux-fsdevel,
linux-kernel, linux-mm, linux-trace-kernel, linuxppc-dev,
sparclinux, x86
In-Reply-To: <20260323074836.3653702-1-rppt@kernel.org>
From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
fadump allocates buffers with alloc_pages_exact(), but then marks them
as reserved and frees using free_reserved_area().
This is completely unnecessary and the pages allocated with
alloc_pages_exact() can be naturally freed with free_pages_exact().
Replace freeing of memory in fadump_free_buffer() with
free_pages_exact() and simplify allocation code so that it won't mark
allocated pages as reserved.
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
---
arch/powerpc/kernel/fadump.c | 16 ++--------------
1 file changed, 2 insertions(+), 14 deletions(-)
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 4ebc333dd786..501d43bf18f3 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -775,24 +775,12 @@ void __init fadump_update_elfcore_header(char *bufp)
static void *__init fadump_alloc_buffer(unsigned long size)
{
- unsigned long count, i;
- struct page *page;
- void *vaddr;
-
- vaddr = alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO);
- if (!vaddr)
- return NULL;
-
- count = PAGE_ALIGN(size) / PAGE_SIZE;
- page = virt_to_page(vaddr);
- for (i = 0; i < count; i++)
- mark_page_reserved(page + i);
- return vaddr;
+ return alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO);
}
static void fadump_free_buffer(unsigned long vaddr, unsigned long size)
{
- free_reserved_area((void *)vaddr, (void *)(vaddr + size), -1, NULL);
+ free_pages_exact((void *)vaddr, size);
}
s32 __init fadump_setup_cpu_notes_buf(u32 num_cpus)
--
2.53.0
^ permalink raw reply related
* [PATCH v2 1/9] memblock: reserve_mem: fix end calculation in reserve_mem_release_by_name()
From: Mike Rapoport @ 2026-03-23 7:48 UTC (permalink / raw)
To: Andrew Morton
Cc: Alexander Potapenko, Alexander Viro, Andreas Larsson,
Ard Biesheuvel, Borislav Petkov, Brendan Jackman,
Christophe Leroy (CS GROUP), Catalin Marinas, Christian Brauner,
David S. Miller, Dave Hansen, David Hildenbrand, Dmitry Vyukov,
Ilias Apalodimas, Ingo Molnar, Jan Kara, Johannes Weiner,
Liam R. Howlett, Lorenzo Stoakes, Madhavan Srinivasan,
Marco Elver, Marek Szyprowski, Masami Hiramatsu, Michael Ellerman,
Michal Hocko, Mike Rapoport, Nicholas Piggin, H. Peter Anvin,
Rob Herring, Robin Murphy, Saravana Kannan, Suren Baghdasaryan,
Thomas Gleixner, Vlastimil Babka, Will Deacon, Zi Yan, devicetree,
iommu, kasan-dev, linux-arm-kernel, linux-efi, linux-fsdevel,
linux-kernel, linux-mm, linux-trace-kernel, linuxppc-dev,
sparclinux, x86
In-Reply-To: <20260323074836.3653702-1-rppt@kernel.org>
From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
free_reserved_area() expects end parameter to point to the first address
after the area, but reserve_mem_release_by_name() passes it the last
address inside the area.
Remove subtraction of one in calculation of the area end.
Fixes: 74e2498ccf7b ("mm/memblock: Add reserved memory release function")
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
---
mm/memblock.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mm/memblock.c b/mm/memblock.c
index b3ddfdec7a80..d4a02f1750e9 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -2434,7 +2434,7 @@ int reserve_mem_release_by_name(const char *name)
return 0;
start = phys_to_virt(map->start);
- end = start + map->size - 1;
+ end = start + map->size;
snprintf(buf, sizeof(buf), "reserve_mem:%s", name);
free_reserved_area(start, end, 0, buf);
map->size = 0;
--
2.53.0
^ permalink raw reply related
* [PATCH v2 0/9] memblock: improve late freeing of reserved memory
From: Mike Rapoport @ 2026-03-23 7:48 UTC (permalink / raw)
To: Andrew Morton
Cc: Alexander Potapenko, Alexander Viro, Andreas Larsson,
Ard Biesheuvel, Borislav Petkov, Brendan Jackman,
Christophe Leroy (CS GROUP), Catalin Marinas, Christian Brauner,
David S. Miller, Dave Hansen, David Hildenbrand, Dmitry Vyukov,
Ilias Apalodimas, Ingo Molnar, Jan Kara, Johannes Weiner,
Liam R. Howlett, Lorenzo Stoakes, Madhavan Srinivasan,
Marco Elver, Marek Szyprowski, Masami Hiramatsu, Michael Ellerman,
Michal Hocko, Mike Rapoport, Nicholas Piggin, H. Peter Anvin,
Rob Herring, Robin Murphy, Saravana Kannan, Suren Baghdasaryan,
Thomas Gleixner, Vlastimil Babka, Will Deacon, Zi Yan, devicetree,
iommu, kasan-dev, linux-arm-kernel, linux-efi, linux-fsdevel,
linux-kernel, linux-mm, linux-trace-kernel, linuxppc-dev,
sparclinux, x86
From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
Hi,
Following a recent discussion about leaks in x86 EFI [1], I audited usage of
memblock_free_late() and free_reserved_area() and made some improvements to how
we handle late freeing of the memory allocated with memblock.
[1] https://lore.kernel.org/all/ec2aaef14783869b3be6e3c253b2dcbf67dbc12a.camel@kernel.crashing.org/
v2 changes:
* fix UAF issue in memblock_discard() reported by 0day and sashiko
* address relevant sashiko comments
* squash memblock test stubs changes into the commit with core updates
v1: https://lore.kernel.org/all/20260318105827.1358927-1-rppt@kernel.org
Mike Rapoport (Microsoft) (9):
memblock: reserve_mem: fix end calculation in reserve_mem_release_by_name()
powerpc: fadump: pair alloc_pages_exact() with free_pages_exact()
powerpc: opal-core: pair alloc_pages_exact() with free_pages_exact()
mm: move free_reserved_area() to mm/memblock.c
memblock: make free_reserved_area() more robust
memblock: extract page freeing from free_reserved_area() into a helper
memblock: make free_reserved_area() update memblock if ARCH_KEEP_MEMBLOCK=y
memblock, treewide: make memblock_free() handle late freeing
memblock: warn when freeing reserved memory before memory map is initialized
arch/arm64/mm/init.c | 3 -
arch/powerpc/kernel/fadump.c | 16 +--
arch/powerpc/platforms/powernv/opal-core.c | 11 +-
arch/sparc/kernel/mdesc.c | 4 +-
arch/x86/kernel/setup.c | 2 +-
arch/x86/platform/efi/memmap.c | 5 +-
arch/x86/platform/efi/quirks.c | 2 +-
drivers/firmware/efi/apple-properties.c | 2 +-
drivers/of/kexec.c | 2 +-
include/linux/memblock.h | 2 -
init/initramfs.c | 7 --
kernel/dma/swiotlb.c | 6 +-
lib/bootconfig.c | 2 +-
mm/internal.h | 10 ++
mm/kfence/core.c | 4 +-
mm/memblock.c | 124 +++++++++++++++------
mm/page_alloc.c | 46 --------
tools/include/linux/mm.h | 1 +
tools/testing/memblock/internal.h | 34 +++++-
19 files changed, 144 insertions(+), 139 deletions(-)
base-commit: 1f318b96cc84d7c2ab792fcc0bfd42a7ca890681
--
2.53.0
^ permalink raw reply
* Re: NULL pointer dereference when booting ppc64_guest_defconfig in QEMU on -next
From: Harry Yoo (Oracle) @ 2026-03-23 1:53 UTC (permalink / raw)
To: Mathieu Desnoyers
Cc: Harry Yoo, Nathan Chancellor, Thomas Weißschuh,
Michal Clapinski, Andrew Morton, Thomas Gleixner, Steven Rostedt,
Masami Hiramatsu, linux-mm, linux-trace-kernel, linux-kernel
In-Reply-To: <7a8faee8-0eb5-4e58-a6d5-ef711791e3f4@efficios.com>
On Fri, Mar 20, 2026 at 10:20:37AM -0400, Mathieu Desnoyers wrote:
> On 2026-03-20 09:31, Mathieu Desnoyers wrote:
> > On 2026-03-20 09:21, Harry Yoo (Oracle) wrote:
> > > On Fri, Mar 20, 2026 at 08:35:46AM -0400, Mathieu Desnoyers wrote:
> > > > On 2026-03-20 00:17, Harry Yoo wrote:
> > > > [...]
> > > > > > [1]: https://lore.kernel.org/20260227153730.1556542-4-
> > > > > > mathieu.desnoyers@efficios.com/
> > > > >
> > > > > @Mathieu: In patch 1/3 description,
> > > > > > Changes since v7:
> > > > > > - Explicitly initialize the subsystem from start_kernel() right
> > > > > > after mm_core_init() so it is up and running before
> > > > > > the creation of
> > > > > > the first mm at boot.
> > > > >
> > > > > But how does this work when someone calls mm_cpumask() on
> > > > > init_mm early?
> > > > > Looks like it will behave incorrectly because get_rss_stat_items_size()
> > > > > returns zero?
> > > >
> > > > It doesn't work as expected at all. I missed that all users of
> > > > mm_cpumask()
> > > > end up relying on get_rss_stat_items_size(), which now calls
> > > > percpu_counter_tree_items_size(), which depends on initialization from
> > > > percpu_counter_tree_subsystem_init().
> > > >
> > > > If you add a call to percpu_counter_tree_subsystem_init in
> > > > arch/powerpc/kernel/setup_arch() just before:
>
> [...]
>
> One thing we could do to catch this kind of init sequence issue
> is to add a WARN_ON_ONCE in percpu_counter_tree_items_size:
>
> size_t percpu_counter_tree_items_size(void)
> {
> if (WARN_ON_ONCE(!nr_cpus_order))
> return 0;
> return counter_config->nr_items * sizeof(struct percpu_counter_tree_level_item);
Looks good!
--
Cheers,
Harry / Hyeonggon
^ permalink raw reply
* Re: NULL pointer dereference when booting ppc64_guest_defconfig in QEMU on -next
From: Harry Yoo (Oracle) @ 2026-03-23 1:53 UTC (permalink / raw)
To: Mathieu Desnoyers
Cc: Harry Yoo, Nathan Chancellor, Thomas Weißschuh,
Michal Clapinski, Andrew Morton, Thomas Gleixner, Steven Rostedt,
Masami Hiramatsu, linux-mm, linux-trace-kernel, linux-kernel
In-Reply-To: <7458d8fd-5922-4e0b-9cd5-91880282aaa3@efficios.com>
On Fri, Mar 20, 2026 at 09:31:57AM -0400, Mathieu Desnoyers wrote:
> On 2026-03-20 09:21, Harry Yoo (Oracle) wrote:
> > On Fri, Mar 20, 2026 at 08:35:46AM -0400, Mathieu Desnoyers wrote:
> > > On 2026-03-20 00:17, Harry Yoo wrote:
> > > [...]
> > > > > [1]: https://lore.kernel.org/20260227153730.1556542-4-mathieu.desnoyers@efficios.com/
> > > >
> > > > @Mathieu: In patch 1/3 description,
> > > > > Changes since v7:
> > > > > - Explicitly initialize the subsystem from start_kernel() right
> > > > > after mm_core_init() so it is up and running before the creation of
> > > > > the first mm at boot.
> > > >
> > > > But how does this work when someone calls mm_cpumask() on init_mm early?
> > > > Looks like it will behave incorrectly because get_rss_stat_items_size()
> > > > returns zero?
> > >
> > > It doesn't work as expected at all. I missed that all users of mm_cpumask()
> > > end up relying on get_rss_stat_items_size(), which now calls
> > > percpu_counter_tree_items_size(), which depends on initialization from
> > > percpu_counter_tree_subsystem_init().
> > >
> > > If you add a call to percpu_counter_tree_subsystem_init in
> > > arch/powerpc/kernel/setup_arch() just before:
> > >
> > > VM_WARN_ON(cpumask_test_cpu(smp_processor_id(), mm_cpumask(&init_mm)));
> > > cpumask_set_cpu(smp_processor_id(), mm_cpumask(&init_mm));
> > >
> > > Does the warning go away ?
> >
> > Hmm it goes away, but I'm not sure if it is okay to use nr_cpu_ids
> > before setup_nr_cpu_ids() is called?
>
> AFAIU on powerpc setup_nr_cpu_ids() is called near the end of
> smp_setup_cpu_maps(), which is called early in setup_arch,
> at least before the two lines which use mm_cpumask.
Right.
> > > Alternatively, we could use a lazy initialization invoking
> > > percpu_counter_tree_subsystem_init from percpu_counter_tree_items_size
> > > when the initialization is not already done.
> >
> > So this probably isn't a way to go?
>
> I'd favor explicit initialization, so the inter-dependencies are clear.
Ack.
> > Hmm perhaps we should treat init_mm as a special case in
> > mm_cpus_allowed() and mm_cpumask().
>
> I'd prefer not to go there if boot sequence permits and keep things
> simple.
>
> I think we're in a situation very similar to tree RCU, here is what
> is done in rcu_init_geometry:
>
> static bool initialized;
>
> if (initialized) {
> /*
> * Warn if setup_nr_cpu_ids() had not yet been invoked,
> * unless nr_cpus_ids == NR_CPUS, in which case who cares?
> */
> WARN_ON_ONCE(old_nr_cpu_ids != nr_cpu_ids);
> return;
> }
>
> old_nr_cpu_ids = nr_cpu_ids;
> initialized = true;
Yeah, as long as nr_cpus_order doesn't change after init,
that will work for HPCC. powerpc seems to be a special case that calls
mm_cpumask() very early in the boot process, so explicitly calling the
init function seems to be fair.
By the way, thinking about it differently - it would probably be simpler
to just eliminate mm_cpumask's dependency on HPCC init by
placing those cpumasks before percpu counter tree items... (but yeah,
that would make mm_struct a bit larger due to alignment requirements)
--
Cheers,
Harry / Hyeonggon
^ permalink raw reply
* Re: [PATCH v11 4/5] ring-buffer: Skip invalid sub-buffers when rewinding persistent ring buffer
From: kernel test robot @ 2026-03-22 23:18 UTC (permalink / raw)
To: Masami Hiramatsu (Google), Steven Rostedt
Cc: llvm, oe-kbuild-all, Masami Hiramatsu, Mathieu Desnoyers,
linux-kernel, linux-trace-kernel, Ian Rogers
In-Reply-To: <177391156211.193994.7531495945584650297.stgit@mhiramat.tok.corp.google.com>
Hi Masami,
kernel test robot noticed the following build errors:
[auto build test ERROR on trace/for-next]
[also build test ERROR on geert-m68k/for-next geert-m68k/for-linus openrisc/for-next deller-parisc/for-next powerpc/next powerpc/fixes s390/features uml/next tip/x86/core uml/fixes v7.0-rc4 next-20260320]
[cannot apply to linus/master]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Masami-Hiramatsu-Google/ring-buffer-Fix-to-update-per-subbuf-entries-of-persistent-ring-buffer/20260322-122412
base: https://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace for-next
patch link: https://lore.kernel.org/r/177391156211.193994.7531495945584650297.stgit%40mhiramat.tok.corp.google.com
patch subject: [PATCH v11 4/5] ring-buffer: Skip invalid sub-buffers when rewinding persistent ring buffer
config: x86_64-kexec (https://download.01.org/0day-ci/archive/20260323/202603230725.uMAZiKJx-lkp@intel.com/config)
compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260323/202603230725.uMAZiKJx-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202603230725.uMAZiKJx-lkp@intel.com/
All errors (new ones prefixed by >>):
>> kernel/trace/ring_buffer.c:1965:15: error: use of undeclared identifier 'bpage'
1965 | local_set(&bpage->entries, 0);
| ^
kernel/trace/ring_buffer.c:1966:15: error: use of undeclared identifier 'bpage'
1966 | local_set(&bpage->page->commit, 0);
| ^
2 errors generated.
vim +/bpage +1965 kernel/trace/ring_buffer.c
1910
1911 /* If the meta data has been validated, now validate the events */
1912 static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
1913 {
1914 struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta;
1915 struct buffer_page *head_page, *orig_head;
1916 unsigned long entry_bytes = 0;
1917 unsigned long entries = 0;
1918 int discarded = 0;
1919 int ret;
1920 u64 ts;
1921 int i;
1922
1923 if (!meta || !meta->head_buffer)
1924 return;
1925
1926 orig_head = head_page = cpu_buffer->head_page;
1927
1928 /* Do the reader page first */
1929 ret = rb_validate_buffer(cpu_buffer->reader_page, cpu_buffer->cpu, meta);
1930 if (ret < 0) {
1931 pr_info("Ring buffer meta [%d] invalid reader page detected\n",
1932 cpu_buffer->cpu);
1933 discarded++;
1934 } else {
1935 entries += ret;
1936 entry_bytes += rb_page_size(cpu_buffer->reader_page);
1937 }
1938
1939 ts = head_page->page->time_stamp;
1940
1941 /*
1942 * Try to rewind the head so that we can read the pages which already
1943 * read in the previous boot.
1944 */
1945 if (head_page == cpu_buffer->tail_page)
1946 goto skip_rewind;
1947
1948 rb_dec_page(&head_page);
1949 for (i = 0; i < meta->nr_subbufs + 1; i++, rb_dec_page(&head_page)) {
1950
1951 /* Rewind until tail (writer) page. */
1952 if (head_page == cpu_buffer->tail_page)
1953 break;
1954
1955 /* Rewind until unused page (no timestamp, no commit). */
1956 if (!head_page->page->time_stamp && rb_page_commit(head_page) == 0)
1957 break;
1958
1959 /*
1960 * Skip if the page is invalid, or its timestamp is newer than the
1961 * previous valid page.
1962 */
1963 ret = rb_validate_buffer(head_page, cpu_buffer->cpu, meta);
1964 if (ret >= 0 && ts < head_page->page->time_stamp) {
> 1965 local_set(&bpage->entries, 0);
1966 local_set(&bpage->page->commit, 0);
1967 head_page->page->time_stamp = ts;
1968 ret = -1;
1969 }
1970 if (ret < 0) {
1971 if (!discarded)
1972 pr_info("Ring buffer meta [%d] invalid buffer page detected\n",
1973 cpu_buffer->cpu);
1974 discarded++;
1975 } else {
1976 entries += ret;
1977 entry_bytes += rb_page_size(head_page);
1978 if (ret > 0)
1979 local_inc(&cpu_buffer->pages_touched);
1980 ts = head_page->page->time_stamp;
1981 }
1982 }
1983 if (i)
1984 pr_info("Ring buffer [%d] rewound %d pages\n", cpu_buffer->cpu, i);
1985
1986 /* The last rewound page must be skipped. */
1987 if (head_page != orig_head)
1988 rb_inc_page(&head_page);
1989
1990 /*
1991 * If the ring buffer was rewound, then inject the reader page
1992 * into the location just before the original head page.
1993 */
1994 if (head_page != orig_head) {
1995 struct buffer_page *bpage = orig_head;
1996
1997 rb_dec_page(&bpage);
1998 /*
1999 * Insert the reader_page before the original head page.
2000 * Since the list encode RB_PAGE flags, general list
2001 * operations should be avoided.
2002 */
2003 cpu_buffer->reader_page->list.next = &orig_head->list;
2004 cpu_buffer->reader_page->list.prev = orig_head->list.prev;
2005 orig_head->list.prev = &cpu_buffer->reader_page->list;
2006 bpage->list.next = &cpu_buffer->reader_page->list;
2007
2008 /* Make the head_page the reader page */
2009 cpu_buffer->reader_page = head_page;
2010 bpage = head_page;
2011 rb_inc_page(&head_page);
2012 head_page->list.prev = bpage->list.prev;
2013 rb_dec_page(&bpage);
2014 bpage->list.next = &head_page->list;
2015 rb_set_list_to_head(&bpage->list);
2016 cpu_buffer->pages = &head_page->list;
2017
2018 cpu_buffer->head_page = head_page;
2019 meta->head_buffer = (unsigned long)head_page->page;
2020
2021 /* Reset all the indexes */
2022 bpage = cpu_buffer->reader_page;
2023 meta->buffers[0] = rb_meta_subbuf_idx(meta, bpage->page);
2024 bpage->id = 0;
2025
2026 for (i = 1, bpage = head_page; i < meta->nr_subbufs;
2027 i++, rb_inc_page(&bpage)) {
2028 meta->buffers[i] = rb_meta_subbuf_idx(meta, bpage->page);
2029 bpage->id = i;
2030 }
2031
2032 /* We'll restart verifying from orig_head */
2033 head_page = orig_head;
2034 }
2035
2036 skip_rewind:
2037 /* If the commit_buffer is the reader page, update the commit page */
2038 if (meta->commit_buffer == (unsigned long)cpu_buffer->reader_page->page) {
2039 cpu_buffer->commit_page = cpu_buffer->reader_page;
2040 /* Nothing more to do, the only page is the reader page */
2041 goto done;
2042 }
2043
2044 /* Iterate until finding the commit page */
2045 for (i = 0; i < meta->nr_subbufs + 1; i++, rb_inc_page(&head_page)) {
2046
2047 /* Reader page has already been done */
2048 if (head_page == cpu_buffer->reader_page)
2049 continue;
2050
2051 ret = rb_validate_buffer(head_page, cpu_buffer->cpu, meta);
2052 if (ret < 0) {
2053 if (!discarded)
2054 pr_info("Ring buffer meta [%d] invalid buffer page detected\n",
2055 cpu_buffer->cpu);
2056 discarded++;
2057 } else {
2058 /* If the buffer has content, update pages_touched */
2059 if (ret)
2060 local_inc(&cpu_buffer->pages_touched);
2061
2062 entries += ret;
2063 entry_bytes += rb_page_size(head_page);
2064 }
2065 if (head_page == cpu_buffer->commit_page)
2066 break;
2067 }
2068
2069 if (head_page != cpu_buffer->commit_page) {
2070 pr_info("Ring buffer meta [%d] commit page not found\n",
2071 cpu_buffer->cpu);
2072 goto invalid;
2073 }
2074 done:
2075 local_set(&cpu_buffer->entries, entries);
2076 local_set(&cpu_buffer->entries_bytes, entry_bytes);
2077
2078 pr_info("Ring buffer meta [%d] is from previous boot! (%d pages discarded)\n",
2079 cpu_buffer->cpu, discarded);
2080 return;
2081
2082 invalid:
2083 /* The content of the buffers are invalid, reset the meta data */
2084 meta->head_buffer = 0;
2085 meta->commit_buffer = 0;
2086
2087 /* Reset the reader page */
2088 local_set(&cpu_buffer->reader_page->entries, 0);
2089 local_set(&cpu_buffer->reader_page->page->commit, 0);
2090
2091 /* Reset all the subbuffers */
2092 for (i = 0; i < meta->nr_subbufs - 1; i++, rb_inc_page(&head_page)) {
2093 local_set(&head_page->entries, 0);
2094 rb_init_page(head_page->page);
2095 }
2096 }
2097
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply
* Re: [PATCH v3 0/4] locking: contended_release tracepoint instrumentation
From: Dmitry Ilvokhin @ 2026-03-22 12:10 UTC (permalink / raw)
To: Andrew Morton
Cc: Arnd Bergmann, Dennis Zhou, Tejun Heo, Christoph Lameter,
Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers,
Peter Zijlstra, Ingo Molnar, Will Deacon, Boqun Feng, Waiman Long,
linux-arch, linux-kernel, linux-mm, linux-trace-kernel,
kernel-team
In-Reply-To: <20260321171002.013ee5a9d8c789e2a4a53afc@linux-foundation.org>
On Sat, Mar 21, 2026 at 05:10:02PM -0700, Andrew Morton wrote:
> On Wed, 18 Mar 2026 18:45:17 +0000 Dmitry Ilvokhin <d@ilvokhin.com> wrote:
>
> > The existing contention_begin/contention_end tracepoints fire on the
> > waiter side. The lock holder's identity and stack can be captured at
> > contention_begin time (e.g. perf lock contention --lock-owner), but
> > this reflects the holder's state when a waiter arrives, not when the
> > lock is actually released.
> >
> > This series adds a contended_release tracepoint that fires on the
> > holder side when a lock with waiters is released. This provides:
> >
> > - Hold time estimation: when the holder's own acquisition was
> > contended, its contention_end (acquisition) and contended_release
> > can be correlated to measure how long the lock was held under
> > contention.
> >
> > - The holder's stack at release time, which may differ from what perf lock
> > contention --lock-owner captures if the holder does significant work between
> > the waiter's arrival and the unlock.
> >
> > The series is structured as follows:
> >
> > 1. Remove unnecessary linux/sched.h include from trace/events/lock.h.
> > 2. Extract __percpu_up_read() out of the inline percpu_up_read() to
> > avoid binary size increase from adding a tracepoint.
> > 3. Add contended_release tracepoint and instrument sleepable locks:
> > mutex, rtmutex, semaphore, rwsem, percpu-rwsem, and rwbase_rt.
>
> AI review:
> https://sashiko.dev/#/patchset/cover.1773858853.git.d@ilvokhin.com
Thanks, Andrew, appreciate you sharing the link.
The AI review looks reasonable. I'll go through it and address the
feedback in the next revision. The kernel test robot is also reporting
failures on some configs, which seem related to the Sashiko comments.
^ permalink raw reply
* Re: [PATCH v11 4/5] ring-buffer: Skip invalid sub-buffers when rewinding persistent ring buffer
From: kernel test robot @ 2026-03-22 10:13 UTC (permalink / raw)
To: Masami Hiramatsu (Google), Steven Rostedt
Cc: oe-kbuild-all, Masami Hiramatsu, Mathieu Desnoyers, linux-kernel,
linux-trace-kernel, Ian Rogers
In-Reply-To: <177391156211.193994.7531495945584650297.stgit@mhiramat.tok.corp.google.com>
Hi Masami,
kernel test robot noticed the following build errors:
[auto build test ERROR on trace/for-next]
[also build test ERROR on geert-m68k/for-next geert-m68k/for-linus openrisc/for-next deller-parisc/for-next powerpc/next powerpc/fixes s390/features uml/next tip/x86/core linus/master uml/fixes v7.0-rc4 next-20260320]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Masami-Hiramatsu-Google/ring-buffer-Fix-to-update-per-subbuf-entries-of-persistent-ring-buffer/20260322-122412
base: https://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace for-next
patch link: https://lore.kernel.org/r/177391156211.193994.7531495945584650297.stgit%40mhiramat.tok.corp.google.com
patch subject: [PATCH v11 4/5] ring-buffer: Skip invalid sub-buffers when rewinding persistent ring buffer
config: arc-defconfig (https://download.01.org/0day-ci/archive/20260322/202603221806.j3AZggGX-lkp@intel.com/config)
compiler: arc-linux-gcc (GCC) 15.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260322/202603221806.j3AZggGX-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202603221806.j3AZggGX-lkp@intel.com/
All errors (new ones prefixed by >>):
In file included from ./arch/arc/include/generated/asm/local.h:1,
from include/linux/ring_buffer_types.h:5,
from kernel/trace/ring_buffer.c:7:
kernel/trace/ring_buffer.c: In function 'rb_meta_validate_events':
>> kernel/trace/ring_buffer.c:1965:36: error: 'bpage' undeclared (first use in this function); did you mean 'page'?
1965 | local_set(&bpage->entries, 0);
| ^~~~~
include/asm-generic/local.h:30:44: note: in definition of macro 'local_set'
30 | #define local_set(l,i) atomic_long_set((&(l)->a),(i))
| ^
kernel/trace/ring_buffer.c:1965:36: note: each undeclared identifier is reported only once for each function it appears in
1965 | local_set(&bpage->entries, 0);
| ^~~~~
include/asm-generic/local.h:30:44: note: in definition of macro 'local_set'
30 | #define local_set(l,i) atomic_long_set((&(l)->a),(i))
| ^
vim +1965 kernel/trace/ring_buffer.c
1910
1911 /* If the meta data has been validated, now validate the events */
1912 static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
1913 {
1914 struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta;
1915 struct buffer_page *head_page, *orig_head;
1916 unsigned long entry_bytes = 0;
1917 unsigned long entries = 0;
1918 int discarded = 0;
1919 int ret;
1920 u64 ts;
1921 int i;
1922
1923 if (!meta || !meta->head_buffer)
1924 return;
1925
1926 orig_head = head_page = cpu_buffer->head_page;
1927
1928 /* Do the reader page first */
1929 ret = rb_validate_buffer(cpu_buffer->reader_page, cpu_buffer->cpu, meta);
1930 if (ret < 0) {
1931 pr_info("Ring buffer meta [%d] invalid reader page detected\n",
1932 cpu_buffer->cpu);
1933 discarded++;
1934 } else {
1935 entries += ret;
1936 entry_bytes += rb_page_size(cpu_buffer->reader_page);
1937 }
1938
1939 ts = head_page->page->time_stamp;
1940
1941 /*
1942 * Try to rewind the head so that we can read the pages which already
1943 * read in the previous boot.
1944 */
1945 if (head_page == cpu_buffer->tail_page)
1946 goto skip_rewind;
1947
1948 rb_dec_page(&head_page);
1949 for (i = 0; i < meta->nr_subbufs + 1; i++, rb_dec_page(&head_page)) {
1950
1951 /* Rewind until tail (writer) page. */
1952 if (head_page == cpu_buffer->tail_page)
1953 break;
1954
1955 /* Rewind until unused page (no timestamp, no commit). */
1956 if (!head_page->page->time_stamp && rb_page_commit(head_page) == 0)
1957 break;
1958
1959 /*
1960 * Skip if the page is invalid, or its timestamp is newer than the
1961 * previous valid page.
1962 */
1963 ret = rb_validate_buffer(head_page, cpu_buffer->cpu, meta);
1964 if (ret >= 0 && ts < head_page->page->time_stamp) {
> 1965 local_set(&bpage->entries, 0);
1966 local_set(&bpage->page->commit, 0);
1967 head_page->page->time_stamp = ts;
1968 ret = -1;
1969 }
1970 if (ret < 0) {
1971 if (!discarded)
1972 pr_info("Ring buffer meta [%d] invalid buffer page detected\n",
1973 cpu_buffer->cpu);
1974 discarded++;
1975 } else {
1976 entries += ret;
1977 entry_bytes += rb_page_size(head_page);
1978 if (ret > 0)
1979 local_inc(&cpu_buffer->pages_touched);
1980 ts = head_page->page->time_stamp;
1981 }
1982 }
1983 if (i)
1984 pr_info("Ring buffer [%d] rewound %d pages\n", cpu_buffer->cpu, i);
1985
1986 /* The last rewound page must be skipped. */
1987 if (head_page != orig_head)
1988 rb_inc_page(&head_page);
1989
1990 /*
1991 * If the ring buffer was rewound, then inject the reader page
1992 * into the location just before the original head page.
1993 */
1994 if (head_page != orig_head) {
1995 struct buffer_page *bpage = orig_head;
1996
1997 rb_dec_page(&bpage);
1998 /*
1999 * Insert the reader_page before the original head page.
2000 * Since the list encode RB_PAGE flags, general list
2001 * operations should be avoided.
2002 */
2003 cpu_buffer->reader_page->list.next = &orig_head->list;
2004 cpu_buffer->reader_page->list.prev = orig_head->list.prev;
2005 orig_head->list.prev = &cpu_buffer->reader_page->list;
2006 bpage->list.next = &cpu_buffer->reader_page->list;
2007
2008 /* Make the head_page the reader page */
2009 cpu_buffer->reader_page = head_page;
2010 bpage = head_page;
2011 rb_inc_page(&head_page);
2012 head_page->list.prev = bpage->list.prev;
2013 rb_dec_page(&bpage);
2014 bpage->list.next = &head_page->list;
2015 rb_set_list_to_head(&bpage->list);
2016 cpu_buffer->pages = &head_page->list;
2017
2018 cpu_buffer->head_page = head_page;
2019 meta->head_buffer = (unsigned long)head_page->page;
2020
2021 /* Reset all the indexes */
2022 bpage = cpu_buffer->reader_page;
2023 meta->buffers[0] = rb_meta_subbuf_idx(meta, bpage->page);
2024 bpage->id = 0;
2025
2026 for (i = 1, bpage = head_page; i < meta->nr_subbufs;
2027 i++, rb_inc_page(&bpage)) {
2028 meta->buffers[i] = rb_meta_subbuf_idx(meta, bpage->page);
2029 bpage->id = i;
2030 }
2031
2032 /* We'll restart verifying from orig_head */
2033 head_page = orig_head;
2034 }
2035
2036 skip_rewind:
2037 /* If the commit_buffer is the reader page, update the commit page */
2038 if (meta->commit_buffer == (unsigned long)cpu_buffer->reader_page->page) {
2039 cpu_buffer->commit_page = cpu_buffer->reader_page;
2040 /* Nothing more to do, the only page is the reader page */
2041 goto done;
2042 }
2043
2044 /* Iterate until finding the commit page */
2045 for (i = 0; i < meta->nr_subbufs + 1; i++, rb_inc_page(&head_page)) {
2046
2047 /* Reader page has already been done */
2048 if (head_page == cpu_buffer->reader_page)
2049 continue;
2050
2051 ret = rb_validate_buffer(head_page, cpu_buffer->cpu, meta);
2052 if (ret < 0) {
2053 if (!discarded)
2054 pr_info("Ring buffer meta [%d] invalid buffer page detected\n",
2055 cpu_buffer->cpu);
2056 discarded++;
2057 } else {
2058 /* If the buffer has content, update pages_touched */
2059 if (ret)
2060 local_inc(&cpu_buffer->pages_touched);
2061
2062 entries += ret;
2063 entry_bytes += rb_page_size(head_page);
2064 }
2065 if (head_page == cpu_buffer->commit_page)
2066 break;
2067 }
2068
2069 if (head_page != cpu_buffer->commit_page) {
2070 pr_info("Ring buffer meta [%d] commit page not found\n",
2071 cpu_buffer->cpu);
2072 goto invalid;
2073 }
2074 done:
2075 local_set(&cpu_buffer->entries, entries);
2076 local_set(&cpu_buffer->entries_bytes, entry_bytes);
2077
2078 pr_info("Ring buffer meta [%d] is from previous boot! (%d pages discarded)\n",
2079 cpu_buffer->cpu, discarded);
2080 return;
2081
2082 invalid:
2083 /* The content of the buffers are invalid, reset the meta data */
2084 meta->head_buffer = 0;
2085 meta->commit_buffer = 0;
2086
2087 /* Reset the reader page */
2088 local_set(&cpu_buffer->reader_page->entries, 0);
2089 local_set(&cpu_buffer->reader_page->page->commit, 0);
2090
2091 /* Reset all the subbuffers */
2092 for (i = 0; i < meta->nr_subbufs - 1; i++, rb_inc_page(&head_page)) {
2093 local_set(&head_page->entries, 0);
2094 rb_init_page(head_page->page);
2095 }
2096 }
2097
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply
* [PATCH] blktrace: reject buf_size smaller than blk_io_trace
From: Deepanshu Kartikey @ 2026-03-22 5:18 UTC (permalink / raw)
To: axboe, rostedt, mhiramat, mathieu.desnoyers
Cc: linux-block, linux-kernel, linux-trace-kernel, Deepanshu Kartikey,
syzbot+ed8bc247f231c1a48e21, Deepanshu Kartikey
blk_trace_setup() accepts any non-zero buf_size.
If buf_size < sizeof(struct blk_io_trace), relay_reserve()
always returns NULL and all trace events are silently dropped.
Reject such values early with -EINVAL.
Reported-by: syzbot+ed8bc247f231c1a48e21@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=ed8bc247f231c1a48e21
Signed-off-by: Deepanshu Kartikey <Kartikey406@gmail.com>
---
kernel/trace/blktrace.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 8cd2520b4c99..6cc7d83ed1c2 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -773,7 +773,7 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
if (ret)
return -EFAULT;
- if (!buts.buf_size || !buts.buf_nr)
+ if (buts.buf_size < sizeof(struct blk_io_trace) || !buts.buf_nr)
return -EINVAL;
buts2 = (struct blk_user_trace_setup2) {
--
2.43.0
^ permalink raw reply related
* Re: [PATCH v3 0/4] locking: contended_release tracepoint instrumentation
From: Andrew Morton @ 2026-03-22 0:10 UTC (permalink / raw)
To: Dmitry Ilvokhin
Cc: Arnd Bergmann, Dennis Zhou, Tejun Heo, Christoph Lameter,
Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers,
Peter Zijlstra, Ingo Molnar, Will Deacon, Boqun Feng, Waiman Long,
linux-arch, linux-kernel, linux-mm, linux-trace-kernel,
kernel-team
In-Reply-To: <cover.1773858853.git.d@ilvokhin.com>
On Wed, 18 Mar 2026 18:45:17 +0000 Dmitry Ilvokhin <d@ilvokhin.com> wrote:
> The existing contention_begin/contention_end tracepoints fire on the
> waiter side. The lock holder's identity and stack can be captured at
> contention_begin time (e.g. perf lock contention --lock-owner), but
> this reflects the holder's state when a waiter arrives, not when the
> lock is actually released.
>
> This series adds a contended_release tracepoint that fires on the
> holder side when a lock with waiters is released. This provides:
>
> - Hold time estimation: when the holder's own acquisition was
> contended, its contention_end (acquisition) and contended_release
> can be correlated to measure how long the lock was held under
> contention.
>
> - The holder's stack at release time, which may differ from what perf lock
> contention --lock-owner captures if the holder does significant work between
> the waiter's arrival and the unlock.
>
> The series is structured as follows:
>
> 1. Remove unnecessary linux/sched.h include from trace/events/lock.h.
> 2. Extract __percpu_up_read() out of the inline percpu_up_read() to
> avoid binary size increase from adding a tracepoint.
> 3. Add contended_release tracepoint and instrument sleepable locks:
> mutex, rtmutex, semaphore, rwsem, percpu-rwsem, and rwbase_rt.
AI review:
https://sashiko.dev/#/patchset/cover.1773858853.git.d@ilvokhin.com
^ permalink raw reply
* Re: [PATCH 3/3] rtla: Parse cmdline using libsubcmd
From: Costa Shulyupin @ 2026-03-21 16:08 UTC (permalink / raw)
To: Tomas Glozar
Cc: Steven Rostedt, John Kacur, Luis Goncalves, Crystal Wood,
Wander Lairson Costa, Ivan Pravdin, Namhyung Kim, Ian Rogers,
Arnaldo Carvalho de Melo, LKML, linux-trace-kernel,
linux-perf-users
In-Reply-To: <20260320150651.51057-4-tglozar@redhat.com>
On Fri, 20 Mar 2026 at 17:07, Tomas Glozar <tglozar@redhat.com> wrote:
>> +#define TIMERLAT_OPT_NANO OPT_CALLBACK('n', "nano", params, NULL, \
> + "display data in nanoseconds", \
> + opt_nano_cb)
-n/--nano incorrectly requires a value
File: src/cli.c:463
Cause: TIMERLAT_OPT_NANO used OPT_CALLBACK, which expects an argument,
but -n is a flag.
Fix: Changed to OPT_CALLBACK_NOOPT (see the diff below).
> + HIST_OPT_NO_IRQ,
--no-irq clashes with auto-negation of --irq
File: src/cli.c:1042
Cause: libsubcmd auto-generates --no-X negations for every option.
--no-irq (histogram boolean) collides with the auto-negation of --irq
(stop threshold). The first match wins, so --irq was matched first and
its negation intercepted the call.
Fix: Moved HIST_OPT_NO_IRQ before RTLA_OPT_STOP('i', "irq", ...) in
the options array so the explicit --no-irq boolean is found first.
> + HIST_OPT_NO_THREAD,
--no-thread - same issue
Cause: Same collision between --no-thread boolean and auto-negation
of --thread.
Fix: Same reordering approach.
Costa and Claude
diff --git a/tools/tracing/rtla/src/cli.c b/tools/tracing/rtla/src/cli.c
index d029a698e8a7..c6b9ed920539 100644
--- a/tools/tracing/rtla/src/cli.c
+++ b/tools/tracing/rtla/src/cli.c
@@ -460,7 +460,7 @@ static int opt_osnoise_on_end_cb(const struct
option *opt, const char *arg, int
"save the stack trace at the IRQ if a thread latency is higher than
the argument in us", \
opt_llong_callback)
-#define TIMERLAT_OPT_NANO OPT_CALLBACK('n', "nano", params, NULL, \
+#define TIMERLAT_OPT_NANO OPT_CALLBACK_NOOPT('n', "nano", params, NULL, \
"display data in nanoseconds", \
opt_nano_cb)
@@ -1011,6 +1011,12 @@ struct common_params
*timerlat_hist_parse_args(int argc, char **argv)
cb_data.trace_output = NULL;
const struct option timerlat_hist_options[] = {
+ OPT_GROUP("Histogram Options:"),
+ HIST_OPT_NO_IRQ,
+ HIST_OPT_NO_THREAD,
+ HIST_OPT_BUCKET_SIZE,
+ HIST_OPT_ENTRIES,
+
OPT_GROUP("Tracing Options:"),
TIMERLAT_OPT_PERIOD,
RTLA_OPT_STOP('i', "irq", "irq latency"),
@@ -1034,11 +1040,7 @@ struct common_params
*timerlat_hist_parse_args(int argc, char **argv)
RTLA_OPT_KERNEL_THREADS,
RTLA_OPT_USER_LOAD,
- OPT_GROUP("Histogram Options:"),
- HIST_OPT_BUCKET_SIZE,
- HIST_OPT_ENTRIES,
- HIST_OPT_NO_IRQ,
- HIST_OPT_NO_THREAD,
+ OPT_GROUP(""),
HIST_OPT_NO_HEADER,
HIST_OPT_NO_SUMMARY,
HIST_OPT_NO_INDEX,
^ permalink raw reply related
* Re: NULL pointer dereference when booting ppc64_guest_defconfig in QEMU on -next
From: Andrew Morton @ 2026-03-21 2:21 UTC (permalink / raw)
To: Ritesh Harjani
Cc: Mathieu Desnoyers, Harry Yoo (Oracle), linuxppc-dev, Harry Yoo,
Nathan Chancellor, Thomas Weißschuh, Michal Clapinski,
Thomas Gleixner, Steven Rostedt, Masami Hiramatsu, linux-mm,
linux-trace-kernel, linux-kernel, Srikar Dronamraju,
Madhavan Srinivasan
In-Reply-To: <h5qax9x2.ritesh.list@gmail.com>
On Sat, 21 Mar 2026 06:42:41 +0530 Ritesh Harjani (IBM) <ritesh.list@gmail.com> wrote:
> Looks like this is causing regressions in linux-next with warnings
> similar to what Harry also pointed out. Do we have any solution for
> this, or are we planning to hold on to this patch[1] and maybe even
> remove it temporarily from linux-next, until this is fixed?
Yes, I'll disable this patchset.
^ permalink raw reply
* Re: NULL pointer dereference when booting ppc64_guest_defconfig in QEMU on -next
From: Ritesh Harjani @ 2026-03-21 1:12 UTC (permalink / raw)
To: Mathieu Desnoyers, Harry Yoo (Oracle), linuxppc-dev
Cc: Harry Yoo, Nathan Chancellor, Thomas Weißschuh,
Michal Clapinski, Andrew Morton, Thomas Gleixner, Steven Rostedt,
Masami Hiramatsu, linux-mm, linux-trace-kernel, linux-kernel,
Srikar Dronamraju, Madhavan Srinivasan
In-Reply-To: <7a8faee8-0eb5-4e58-a6d5-ef711791e3f4@efficios.com>
++ linuxppc-dev
Mathieu Desnoyers <mathieu.desnoyers@efficios.com> writes:
> On 2026-03-20 09:31, Mathieu Desnoyers wrote:
>> On 2026-03-20 09:21, Harry Yoo (Oracle) wrote:
>>> On Fri, Mar 20, 2026 at 08:35:46AM -0400, Mathieu Desnoyers wrote:
>>>> On 2026-03-20 00:17, Harry Yoo wrote:
>>>> [...]
>>>>>> [1]: https://lore.kernel.org/20260227153730.1556542-4-
>>>>>> mathieu.desnoyers@efficios.com/
>>>>>
>>>>> @Mathieu: In patch 1/3 description,
>>>>>> Changes since v7:
>>>>>> - Explicitly initialize the subsystem from start_kernel() right
>>>>>> after mm_core_init() so it is up and running before the
>>>>>> creation of
>>>>>> the first mm at boot.
>>>>>
>>>>> But how does this work when someone calls mm_cpumask() on init_mm
>>>>> early?
>>>>> Looks like it will behave incorrectly because get_rss_stat_items_size()
>>>>> returns zero?
>>>>
>>>> It doesn't work as expected at all. I missed that all users of
>>>> mm_cpumask()
>>>> end up relying on get_rss_stat_items_size(), which now calls
>>>> percpu_counter_tree_items_size(), which depends on initialization from
>>>> percpu_counter_tree_subsystem_init().
>>>>
>>>> If you add a call to percpu_counter_tree_subsystem_init in
>>>> arch/powerpc/kernel/setup_arch() just before:
Although powerpc is showing the warning because of VM_WARN_ON_ONCE(),
this looks more like a generic problem, where use of mm_cpumask()
before and after percpu_counter_tree_items_size() could lead to
different results (as you also pointed out above).
Looks like this is causing regressions in linux-next with warnings
similar to what Harry also pointed out. Do we have any solution for
this, or are we planning to hold on to this patch[1] and maybe even
remove it temporarily from linux-next, until this is fixed?
[1]: https://lore.kernel.org/all/20260227153730.1556542-1-mathieu.desnoyers@efficios.com/
[ 0.000000] WARNING: arch/powerpc/mm/mmu_context.c:106 at switch_mm_irqs_off+0x1a0/0x1d0, CPU#2: swapper/0
[ 0.000000] Modules linked in:
[ 0.000000] CPU: 2 UID: 0 PID: 0 Comm: swapper Not tainted 7.0.0-rc4-next-20260317-00008-g5585e414f073 #4 PREEMPTLAZY
[ 0.000000] Hardware name: IBM PowerNV (emulated by qemu) POWER10 0x801200 opal:v7.1 PowerNV
[ 0.000000] NIP: c00000000008f3b0 LR: c00000000008f330 CTR: c000000000090e20
[ 0.000000] REGS: c000000003cb79b0 TRAP: 0700 Not tainted (7.0.0-rc4-next-20260317-00008-g5585e414f073)
[ 0.000000] MSR: 9000000002021033 <SF,HV,VEC,ME,IR,DR,RI,LE> CR:24022224 XER: 00000000
<...>
[ 0.000000] NIP [c00000000008f3b0] switch_mm_irqs_off+0x1a0/0x1d0
[ 0.000000] LR [c00000000008f330] switch_mm_irqs_off+0x120/0x1d0
[ 0.000000] Call Trace:
[ 0.000000] [c000000003cb7c50] [0500210400000080] 0x500210400000080 (unreliable)
[ 0.000000] [c000000003cb7cb0] [c0000000000ad850] start_using_temp_mm+0x34/0xb0
[ 0.000000] [c000000003cb7cf0] [c0000000000ae8b8] patch_mem+0x110/0x530
[ 0.000000] [c000000003cb7d70] [c000000000077f30] ftrace_modify_code+0x114/0x154
[ 0.000000] [c000000003cb7dd0] [c00000000036a690] ftrace_process_locs+0x408/0x810
[ 0.000000] [c000000003cb7ec0] [c0000000030584ec] ftrace_init+0x68/0x1c4
[ 0.000000] [c000000003cb7f30] [c00000000300d3b8] start_kernel+0x680/0xc44
[ 0.000000] [c000000003cb7fe0] [c00000000000e99c] start_here_common+0x1c/0x20
-ritesh
^ permalink raw reply
* Re: [PATCH net-next v4 13/13] net/mlx5: Add a shared devlink instance for PFs on same chip
From: Adam Young @ 2026-03-20 23:16 UTC (permalink / raw)
To: Jiri Pirko, netdev
Cc: davem, edumazet, kuba, pabeni, horms, donald.hunter, corbet,
skhan, saeedm, leon, tariqt, mbloch, przemyslaw.kitszel, mschmidt,
andrew+netdev, rostedt, mhiramat, mathieu.desnoyers, chuck.lever,
matttbe, cjubran, daniel.zahka, linux-doc, linux-rdma,
linux-trace-kernel
In-Reply-To: <20260312100407.551173-14-jiri@resnulli.us>
This breaks on my system:
On 7.0.0 it boots fine. With net-next/main currently at this commit:
commit 8737d7194d6d5947c3d7d8813895b44a25b84477 (net-next/main,
net-next/HEAD)
Author: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Fri Mar 13 17:28:36 2026 +0100
I get:
[ 21.859081] mlx5_core 0005:01:00.0: probe_one:2017:(pid 10):
mlx5_shd_init failed with error code -2
[ 21.863266] mlx5_core 0005:01:00.0: probe with driver mlx5_core
failed with error -2
[ 21.866360] mlx5_core 0005:01:00.1: probe_one:2017:(pid 10):
mlx5_shd_init failed with error code -2
[ 21.869937] mlx5_core 0005:01:00.1: probe with driver mlx5_core
failed with error -2
I am happy to help debug: what do you need from me?
On 3/12/26 06:04, Jiri Pirko wrote:
> From: Jiri Pirko <jiri@nvidia.com>
>
> Use the previously introduced shared devlink infrastructure to create
> a shared devlink instance for mlx5 PFs that reside on the same physical
> chip. The shared instance is identified by the chip's serial number
> extracted from PCI VPD (V3 keyword, with fallback to serial number
> for older devices).
>
> Each PF that probes calls mlx5_shd_init() which extracts the chip serial
> number and uses devlink_shd_get() to get or create the shared instance.
> When a PF is removed, mlx5_shd_uninit() calls devlink_shd_put()
> to release the reference. The shared instance is automatically destroyed
> when the last PF is removed.
>
> Make the PF devlink instances nested in this shared devlink instance,
> allowing userspace to identify which PFs belong to the same physical
> chip.
>
> Example:
>
> pci/0000:08:00.0: index 0
> nested_devlink:
> auxiliary/mlx5_core.eth.0
> devlink_index/1: index 1
> nested_devlink:
> pci/0000:08:00.0
> pci/0000:08:00.1
> auxiliary/mlx5_core.eth.0: index 2
> pci/0000:08:00.1: index 3
> nested_devlink:
> auxiliary/mlx5_core.eth.1
> auxiliary/mlx5_core.eth.1: index 4
>
> Signed-off-by: Jiri Pirko <jiri@nvidia.com>
> ---
> v2->v3:
> - removed "const" from "sn"
> - passing driver pointer to devlink_shd_get()
> ---
> .../net/ethernet/mellanox/mlx5/core/Makefile | 5 +-
> .../net/ethernet/mellanox/mlx5/core/main.c | 17 ++++++
> .../ethernet/mellanox/mlx5/core/sh_devlink.c | 61 +++++++++++++++++++
> .../ethernet/mellanox/mlx5/core/sh_devlink.h | 12 ++++
> include/linux/mlx5/driver.h | 1 +
> 5 files changed, 94 insertions(+), 2 deletions(-)
> create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/sh_devlink.c
> create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/sh_devlink.h
>
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
> index 8ffa286a18f5..d39fe9c4a87c 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
> @@ -16,8 +16,9 @@ mlx5_core-y := main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \
> transobj.o vport.o sriov.o fs_cmd.o fs_core.o pci_irq.o \
> fs_counters.o fs_ft_pool.o rl.o lag/debugfs.o lag/lag.o dev.o events.o wq.o lib/gid.o \
> lib/devcom.o lib/pci_vsc.o lib/dm.o lib/fs_ttc.o diag/fs_tracepoint.o \
> - diag/fw_tracer.o diag/crdump.o devlink.o diag/rsc_dump.o diag/reporter_vnic.o \
> - fw_reset.o qos.o lib/tout.o lib/aso.o wc.o fs_pool.o lib/nv_param.o
> + diag/fw_tracer.o diag/crdump.o devlink.o sh_devlink.o diag/rsc_dump.o \
> + diag/reporter_vnic.o fw_reset.o qos.o lib/tout.o lib/aso.o wc.o fs_pool.o \
> + lib/nv_param.o
>
> #
> # Netdev basic
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
> index fdc3ba20912e..1c35c3fc3bb3 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
> @@ -74,6 +74,7 @@
> #include "mlx5_irq.h"
> #include "hwmon.h"
> #include "lag/lag.h"
> +#include "sh_devlink.h"
>
> MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
> MODULE_DESCRIPTION("Mellanox 5th generation network adapters (ConnectX series) core driver");
> @@ -1520,10 +1521,16 @@ int mlx5_init_one(struct mlx5_core_dev *dev)
> int err;
>
> devl_lock(devlink);
> + if (dev->shd) {
> + err = devl_nested_devlink_set(dev->shd, devlink);
> + if (err)
> + goto unlock;
> + }
> devl_register(devlink);
> err = mlx5_init_one_devl_locked(dev);
> if (err)
> devl_unregister(devlink);
> +unlock:
> devl_unlock(devlink);
> return err;
> }
> @@ -2005,6 +2012,13 @@ static int probe_one(struct pci_dev *pdev, const struct pci_device_id *id)
> goto pci_init_err;
> }
>
> + err = mlx5_shd_init(dev);
> + if (err) {
> + mlx5_core_err(dev, "mlx5_shd_init failed with error code %d\n",
> + err);
> + goto shd_init_err;
> + }
> +
> err = mlx5_init_one(dev);
> if (err) {
> mlx5_core_err(dev, "mlx5_init_one failed with error code %d\n",
> @@ -2018,6 +2032,8 @@ static int probe_one(struct pci_dev *pdev, const struct pci_device_id *id)
> return 0;
>
> err_init_one:
> + mlx5_shd_uninit(dev);
> +shd_init_err:
> mlx5_pci_close(dev);
> pci_init_err:
> mlx5_mdev_uninit(dev);
> @@ -2039,6 +2055,7 @@ static void remove_one(struct pci_dev *pdev)
> mlx5_drain_health_wq(dev);
> mlx5_sriov_disable(pdev, false);
> mlx5_uninit_one(dev);
> + mlx5_shd_uninit(dev);
> mlx5_pci_close(dev);
> mlx5_mdev_uninit(dev);
> mlx5_adev_idx_free(dev->priv.adev_idx);
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sh_devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/sh_devlink.c
> new file mode 100644
> index 000000000000..bc33f95302df
> --- /dev/null
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/sh_devlink.c
> @@ -0,0 +1,61 @@
> +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
> +/* Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
> +
> +#include <linux/mlx5/driver.h>
> +#include <net/devlink.h>
> +
> +#include "sh_devlink.h"
> +
> +static const struct devlink_ops mlx5_shd_ops = {
> +};
> +
> +int mlx5_shd_init(struct mlx5_core_dev *dev)
> +{
> + u8 *vpd_data __free(kfree) = NULL;
> + struct pci_dev *pdev = dev->pdev;
> + unsigned int vpd_size, kw_len;
> + struct devlink *devlink;
> + char *sn, *end;
> + int start;
> + int err;
> +
> + if (!mlx5_core_is_pf(dev))
> + return 0;
> +
> + vpd_data = pci_vpd_alloc(pdev, &vpd_size);
> + if (IS_ERR(vpd_data)) {
> + err = PTR_ERR(vpd_data);
> + return err == -ENODEV ? 0 : err;
> + }
> + start = pci_vpd_find_ro_info_keyword(vpd_data, vpd_size, "V3", &kw_len);
> + if (start < 0) {
> + /* Fall-back to SN for older devices. */
> + start = pci_vpd_find_ro_info_keyword(vpd_data, vpd_size,
> + PCI_VPD_RO_KEYWORD_SERIALNO, &kw_len);
> + if (start < 0)
> + return -ENOENT;
> + }
> + sn = kstrndup(vpd_data + start, kw_len, GFP_KERNEL);
> + if (!sn)
> + return -ENOMEM;
> + /* Firmware may return spaces at the end of the string, strip it. */
> + end = strchrnul(sn, ' ');
> + *end = '\0';
> +
> + /* Get or create shared devlink instance */
> + devlink = devlink_shd_get(sn, &mlx5_shd_ops, 0, pdev->dev.driver);
> + kfree(sn);
> + if (!devlink)
> + return -ENOMEM;
> +
> + dev->shd = devlink;
> + return 0;
> +}
> +
> +void mlx5_shd_uninit(struct mlx5_core_dev *dev)
> +{
> + if (!dev->shd)
> + return;
> +
> + devlink_shd_put(dev->shd);
> +}
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sh_devlink.h b/drivers/net/ethernet/mellanox/mlx5/core/sh_devlink.h
> new file mode 100644
> index 000000000000..8ab8d6940227
> --- /dev/null
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/sh_devlink.h
> @@ -0,0 +1,12 @@
> +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
> +/* Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
> +
> +#ifndef __MLX5_SH_DEVLINK_H__
> +#define __MLX5_SH_DEVLINK_H__
> +
> +#include <linux/mlx5/driver.h>
> +
> +int mlx5_shd_init(struct mlx5_core_dev *dev);
> +void mlx5_shd_uninit(struct mlx5_core_dev *dev);
> +
> +#endif /* __MLX5_SH_DEVLINK_H__ */
> diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
> index 04dcd09f7517..1268fcf35ec7 100644
> --- a/include/linux/mlx5/driver.h
> +++ b/include/linux/mlx5/driver.h
> @@ -798,6 +798,7 @@ struct mlx5_core_dev {
> enum mlx5_wc_state wc_state;
> /* sync write combining state */
> struct mutex wc_state_lock;
> + struct devlink *shd;
> };
>
> struct mlx5_db {
^ permalink raw reply
* Re: [PATCH] tracing: Adjust cmd_check_undefined to show unexpected undefined symbols
From: Arnd Bergmann @ 2026-03-20 21:34 UTC (permalink / raw)
To: Nathan Chancellor, Marc Zyngier, Vincent Donnefort
Cc: Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers, linux-kernel,
linux-trace-kernel, kvmarm
In-Reply-To: <20260320-cmd_check_undefined-verbose-v1-1-54fc5b061f94@kernel.org>
On Fri, Mar 20, 2026, at 22:29, Nathan Chancellor wrote:
> When the check_undefined command in kernel/trace/Makefile fails, there
> is no output, making it hard to understand why the build failed. Capture
> the output of the $(NM) + grep command and print it when failing to make
> it clearer what the problem is.
>
> Fixes: a717943d8ecc ("tracing: Check for undefined symbols in
> simple_ring_buffer")
> Signed-off-by: Nathan Chancellor <nathan@kernel.org>
Acked-by: Arnd Bergmann <arnd@arndb.de>
This does seem very helpful, as I still expect this to come up regularly.
Arnd
^ permalink raw reply
* [PATCH] tracing: Adjust cmd_check_undefined to show unexpected undefined symbols
From: Nathan Chancellor @ 2026-03-20 21:29 UTC (permalink / raw)
To: Marc Zyngier, Vincent Donnefort
Cc: Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers,
Arnd Bergmann, linux-kernel, linux-trace-kernel, kvmarm,
Nathan Chancellor
When the check_undefined command in kernel/trace/Makefile fails, there
is no output, making it hard to understand why the build failed. Capture
the output of the $(NM) + grep command and print it when failing to make
it clearer what the problem is.
Fixes: a717943d8ecc ("tracing: Check for undefined symbols in simple_ring_buffer")
Signed-off-by: Nathan Chancellor <nathan@kernel.org>
---
Commit a717943d8ecc ("tracing: Check for undefined symbols in
simple_ring_buffer") and its follow up fixes are in the kvmarm tree so
this should go there as well. This is the rebased version of my
suggestion in the original thread:
https://lore.kernel.org/20260311221816.GA316631@ax162/
---
kernel/trace/Makefile | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index c5e14ffd36ee..d662c1a64cd5 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -174,7 +174,13 @@ UNDEFINED_ALLOWLIST = __asan __gcov __kasan __kcsan __hwasan __sancov __sanitize
$(shell $(NM) -u $(obj)/undefsyms_base.o 2>/dev/null | awk '{print $$2}')
quiet_cmd_check_undefined = NM $<
- cmd_check_undefined = test -z "`$(NM) -u $< | grep -v $(addprefix -e , $(UNDEFINED_ALLOWLIST))`"
+ cmd_check_undefined = \
+ undefsyms=$$($(NM) -u $< | grep -v $(addprefix -e , $(UNDEFINED_ALLOWLIST)) || true); \
+ if [ -n "$$undefsyms" ]; then \
+ echo "Unexpected symbols in $<:" >&2; \
+ echo "$$undefsyms" >&2; \
+ false; \
+ fi
$(obj)/%.o.checked: $(obj)/%.o $(obj)/undefsyms_base.o FORCE
$(call if_changed,check_undefined)
---
base-commit: e3d585ed3ff891a00c2284fef4be9cf8581735ab
change-id: 20260320-cmd_check_undefined-verbose-7d15f13f615d
Best regards,
--
Nathan Chancellor <nathan@kernel.org>
^ permalink raw reply related
* Re: [PATCH] coredump: add tracepoint for coredump events
From: Steven Rostedt @ 2026-03-20 18:48 UTC (permalink / raw)
To: Christian Brauner
Cc: Breno Leitao, Alexander Viro, Jan Kara, Masami Hiramatsu,
Mathieu Desnoyers, linux-kernel, linux-fsdevel,
linux-trace-kernel, bpf, kernel-team, Andrii Nakryiko
In-Reply-To: <20260320-erlitt-ergibt-255e86a66414@brauner>
On Fri, 20 Mar 2026 14:21:23 +0100
Christian Brauner <brauner@kernel.org> wrote:
> > +TRACE_EVENT(coredump,
> > +
> > + TP_PROTO(int sig),
> > +
> > + TP_ARGS(sig),
> > +
> > + TP_STRUCT__entry(
> > + __field(int, sig)
> > + __array(char, comm, TASK_COMM_LEN)
> > + __field(pid_t, pid)
> > + ),
> > +
> > + TP_fast_assign(
> > + __entry->sig = sig;
> > + memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
> > + __entry->pid = current->pid;
>
> That's the TID as seen in the global pid namespace.
> I assume this is what you want but worth noting.
Not to mention the pid is saved in all trace events and is available for
perf and bpf too. Even the change log showed it:
sleep-634 [036] ..... 145.222206: coredump: sig=11 comm=sleep pid=634
^^^ ^^^
So it should not be included. It's duplicate and only wastes space. Now if
you wanted to save the name space pid, that may be useful.
-- Steve
^ permalink raw reply
* Re: [PATCH 3/3] rtla: Parse cmdline using libsubcmd
From: Wander Lairson Costa @ 2026-03-20 17:31 UTC (permalink / raw)
To: Tomas Glozar
Cc: Steven Rostedt, John Kacur, Luis Goncalves, Crystal Wood,
Costa Shulyupin, Ivan Pravdin, Namhyung Kim, Ian Rogers,
Arnaldo Carvalho de Melo, LKML, linux-trace-kernel,
linux-perf-users
In-Reply-To: <20260320150651.51057-4-tglozar@redhat.com>
On Fri, Mar 20, 2026 at 04:06:51PM +0100, Tomas Glozar wrote:
> Instead of using getopt_long() directly to parse the command line
> arguments given to an RTLA tool, use libsubcmd's parse_options().
>
> Utilizing libsubcmd for parsing command line arguments has several
> benefits:
>
> - A help message is automatically generated by libsubcmd from the
> specification, removing the need of writing it by hand.
> - Options are sorted into groups based on which part of tracing (CPU,
> thread, auto-analysis, tuning, histogram) they relate to.
> - Common parsing patterns for numerical and boolean values now share
> code, with the target variable being stored in the option array.
>
> To avoid duplication of the option parsing logic, RTLA-specific
> macros defining struct option values are created:
>
> - RTLA_OPT_* for options common to all tools
> - OSNOISE_OPT_* and TIMERLAT_OPT_* for options specific to
[snip]
> -int getopt_auto(int argc, char **argv, const struct option *long_opts);
> int common_parse_options(int argc, char **argv, struct common_params *common);
The function common_parse_options() body was removed, but the declaration remains.
> int common_apply_config(struct osnoise_tool *tool, struct common_params *params);
[snip]
^ permalink raw reply
* Re: [PATCH v3 2/2] blk-mq: expose tag starvation counts via debugfs
From: Laurence Oberman @ 2026-03-20 15:08 UTC (permalink / raw)
To: Aaron Tomlin, axboe, rostedt, mhiramat, mathieu.desnoyers
Cc: johannes.thumshirn, kch, bvanassche, dlemoal, ritesh.list, neelx,
sean, mproche, chjohnst, linux-block, linux-kernel,
linux-trace-kernel
In-Reply-To: <20260319221956.332770-3-atomlin@atomlin.com>
On Thu, 2026-03-19 at 18:19 -0400, Aaron Tomlin wrote:
> In high-performance storage environments, particularly when utilising
> RAID controllers with shared tag sets (BLK_MQ_F_TAG_HCTX_SHARED),
> severe
> latency spikes can occur when fast devices are starved of available
> tags.
>
> This patch introduces two new debugfs attributes for each block
> hardware queue:
> - /sys/kernel/debug/block/[device]/hctxN/wait_on_hw_tag
> - /sys/kernel/debug/block/[device]/hctxN/wait_on_sched_tag
>
> These files expose atomic counters that increment each time a
> submitting
> context is forced into an uninterruptible sleep via io_schedule() due
> to
> the complete exhaustion of physical driver tags or software scheduler
> tags, respectively.
>
> To guarantee zero performance overhead for production kernels
> compiled
> without debugfs, the underlying atomic_t variables and their
> associated
> increment routines are strictly guarded behind CONFIG_BLK_DEBUG_FS.
> When this configuration is disabled, the tracking logic compiles down
> to a safe no-op.
>
> Signed-off-by: Aaron Tomlin <atomlin@atomlin.com>
> ---
> block/blk-mq-debugfs.c | 56
> ++++++++++++++++++++++++++++++++++++++++++
> block/blk-mq-debugfs.h | 7 ++++++
> block/blk-mq-tag.c | 4 +++
> include/linux/blk-mq.h | 10 ++++++++
> 4 files changed, 77 insertions(+)
>
> diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
> index 28167c9baa55..078561d7da38 100644
> --- a/block/blk-mq-debugfs.c
> +++ b/block/blk-mq-debugfs.c
> @@ -483,6 +483,42 @@ static int hctx_dispatch_busy_show(void *data,
> struct seq_file *m)
> return 0;
> }
>
> +/**
> + * hctx_wait_on_hw_tag_show - display hardware tag starvation count
> + * @data: generic pointer to the associated hardware context (hctx)
> + * @m: seq_file pointer for debugfs output formatting
> + *
> + * Prints the cumulative number of times a submitting context was
> forced
> + * to block due to the exhaustion of physical hardware driver tags.
> + *
> + * Return: 0 on success.
> + */
> +static int hctx_wait_on_hw_tag_show(void *data, struct seq_file *m)
> +{
> + struct blk_mq_hw_ctx *hctx = data;
> +
> + seq_printf(m, "%d\n", atomic_read(&hctx->wait_on_hw_tag));
> + return 0;
> +}
> +
> +/**
> + * hctx_wait_on_sched_tag_show - display scheduler tag starvation
> count
> + * @data: generic pointer to the associated hardware context (hctx)
> + * @m: seq_file pointer for debugfs output formatting
> + *
> + * Prints the cumulative number of times a submitting context was
> forced
> + * to block due to the exhaustion of software scheduler tags.
> + *
> + * Return: 0 on success.
> + */
> +static int hctx_wait_on_sched_tag_show(void *data, struct seq_file
> *m)
> +{
> + struct blk_mq_hw_ctx *hctx = data;
> +
> + seq_printf(m, "%d\n", atomic_read(&hctx-
> >wait_on_sched_tag));
> + return 0;
> +}
> +
> #define CTX_RQ_SEQ_OPS(name,
> type) \
> static void *ctx_##name##_rq_list_start(struct seq_file *m, loff_t
> *pos) \
> __acquires(&ctx-
> >lock) \
> @@ -598,6 +634,8 @@ static const struct blk_mq_debugfs_attr
> blk_mq_debugfs_hctx_attrs[] = {
> {"active", 0400, hctx_active_show},
> {"dispatch_busy", 0400, hctx_dispatch_busy_show},
> {"type", 0400, hctx_type_show},
> + {"wait_on_hw_tag", 0400, hctx_wait_on_hw_tag_show},
> + {"wait_on_sched_tag", 0400, hctx_wait_on_sched_tag_show},
> {},
> };
>
> @@ -814,3 +852,21 @@ void blk_mq_debugfs_unregister_sched_hctx(struct
> blk_mq_hw_ctx *hctx)
> debugfs_remove_recursive(hctx->sched_debugfs_dir);
> hctx->sched_debugfs_dir = NULL;
> }
> +
> +/**
> + * blk_mq_debugfs_inc_wait_tags - increment the tag starvation
> counters
> + * @hctx: hardware context associated with the tag allocation
> + * @is_sched: boolean indicating whether the starved pool is the
> software scheduler
> + *
> + * Evaluates the exhausted tag pool and increments the appropriate
> debugfs
> + * starvation counter. This is invoked immediately before the
> submitting
> + * context is forced into an uninterruptible sleep via
> io_schedule().
> + */
> +void blk_mq_debugfs_inc_wait_tags(struct blk_mq_hw_ctx *hctx,
> + bool is_sched)
> +{
> + if (is_sched)
> + atomic_inc(&hctx->wait_on_sched_tag);
> + else
> + atomic_inc(&hctx->wait_on_hw_tag);
> +}
> diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h
> index 49bb1aaa83dc..2cda555d5730 100644
> --- a/block/blk-mq-debugfs.h
> +++ b/block/blk-mq-debugfs.h
> @@ -34,6 +34,8 @@ void blk_mq_debugfs_register_sched_hctx(struct
> request_queue *q,
> void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx
> *hctx);
>
> void blk_mq_debugfs_register_rq_qos(struct request_queue *q);
> +void blk_mq_debugfs_inc_wait_tags(struct blk_mq_hw_ctx *hctx,
> + bool is_sched);
> #else
> static inline void blk_mq_debugfs_register(struct request_queue *q)
> {
> @@ -77,6 +79,11 @@ static inline void
> blk_mq_debugfs_register_rq_qos(struct request_queue *q)
> {
> }
>
> +static inline void blk_mq_debugfs_inc_wait_tags(struct blk_mq_hw_ctx
> *hctx,
> + bool is_sched)
> +{
> +}
> +
> #endif
>
> #if defined(CONFIG_BLK_DEV_ZONED) && defined(CONFIG_BLK_DEBUG_FS)
> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> index 66138dd043d4..3cc6a97a87a0 100644
> --- a/block/blk-mq-tag.c
> +++ b/block/blk-mq-tag.c
> @@ -17,6 +17,7 @@
> #include "blk.h"
> #include "blk-mq.h"
> #include "blk-mq-sched.h"
> +#include "blk-mq-debugfs.h"
>
> /*
> * Recalculate wakeup batch when tag is shared by hctx.
> @@ -191,6 +192,9 @@ unsigned int blk_mq_get_tag(struct
> blk_mq_alloc_data *data)
> trace_block_rq_tag_wait(data->q, data->hctx,
> data->rq_flags &
> RQF_SCHED_TAGS);
>
> + blk_mq_debugfs_inc_wait_tags(data->hctx,
> + data->rq_flags &
> RQF_SCHED_TAGS);
> +
> bt_prev = bt;
> io_schedule();
>
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index 18a2388ba581..f3d8ea93b23f 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -453,6 +453,16 @@ struct blk_mq_hw_ctx {
> struct dentry *debugfs_dir;
> /** @sched_debugfs_dir: debugfs directory for the
> scheduler. */
> struct dentry *sched_debugfs_dir;
> + /**
> + * @wait_on_hw_tag: Cumulative counter incremented each time
> a submitting
> + * context is forced to block due to physical hardware
> driver tag exhaustion.
> + */
> + atomic_t wait_on_hw_tag;
> + /**
> + * @wait_on_sched_tag: Cumulative counter incremented each
> time a submitting
> + * context is forced to block due to software scheduler tag
> exhaustion.
> + */
> + atomic_t wait_on_sched_tag;
> #endif
>
> /**
For [PATCH v3 2/2] blk-mq: expose tag starvation counts via debugfs
Tested-by: Laurence Oberman <loberman@redhat.com>
Every 10.0s: grep . /sys/kernel/debug/block/nvme0n1/hctx0/wait_on_*
rhel95: Fri Mar 20 11:04:15 2026
/sys/kernel/debug/block/nvme0n1/hctx0/wait_on_hw_tag:103260 <---
cumulative
/sys/kernel/debug/block/nvme0n1/hctx0/wait_on_sched_tag:0
The patch to me looks good, but will need others to confirm
Reviewed-by: Laurence Oberman <loberman@redhat.com>
^ permalink raw reply
* [PATCH 3/3] rtla: Parse cmdline using libsubcmd
From: Tomas Glozar @ 2026-03-20 15:06 UTC (permalink / raw)
To: Steven Rostedt, Tomas Glozar
Cc: John Kacur, Luis Goncalves, Crystal Wood, Costa Shulyupin,
Wander Lairson Costa, Ivan Pravdin, Namhyung Kim, Ian Rogers,
Arnaldo Carvalho de Melo, LKML, linux-trace-kernel,
linux-perf-users
In-Reply-To: <20260320150651.51057-1-tglozar@redhat.com>
Instead of using getopt_long() directly to parse the command line
arguments given to an RTLA tool, use libsubcmd's parse_options().
Utilizing libsubcmd for parsing command line arguments has several
benefits:
- A help message is automatically generated by libsubcmd from the
specification, removing the need of writing it by hand.
- Options are sorted into groups based on which part of tracing (CPU,
thread, auto-analysis, tuning, histogram) they relate to.
- Common parsing patterns for numerical and boolean values now share
code, with the target variable being stored in the option array.
To avoid duplication of the option parsing logic, RTLA-specific
macros defining struct option values are created:
- RTLA_OPT_* for options common to all tools
- OSNOISE_OPT_* and TIMERLAT_OPT_* for options specific to
osnoise/timerlat tools
- HIST_OPT_* macros for options specific to histogram-based tools.
Individual *_parse_args() functions then construct an array out of
these macros that is then passed to libsubcmd's parse_options().
All code specific to command line options parsing is moved out of the
individual tool files into a new file, cli.c, which also contains the
contents of the rtla.c file.
The return value of tool-level help option changes to 129, as this is
the value set by libsubcmd; this is reflected in affected test cases.
The implementation of help for command-level and tracer-level help
remains the same.
Assisted-by: Composer:composer-1.5
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
---
tools/tracing/rtla/src/Build | 2 +-
tools/tracing/rtla/src/cli.c | 1207 ++++++++++++++++++++++++
tools/tracing/rtla/src/cli.h | 7 +
tools/tracing/rtla/src/common.c | 109 ---
tools/tracing/rtla/src/common.h | 26 +-
tools/tracing/rtla/src/osnoise_hist.c | 221 +----
tools/tracing/rtla/src/osnoise_top.c | 200 +---
tools/tracing/rtla/src/rtla.c | 89 --
tools/tracing/rtla/src/timerlat.h | 4 +-
tools/tracing/rtla/src/timerlat_hist.c | 317 +------
tools/tracing/rtla/src/timerlat_top.c | 285 +-----
tools/tracing/rtla/src/utils.c | 28 +-
tools/tracing/rtla/src/utils.h | 3 +-
tools/tracing/rtla/tests/hwnoise.t | 2 +-
14 files changed, 1236 insertions(+), 1264 deletions(-)
create mode 100644 tools/tracing/rtla/src/cli.c
create mode 100644 tools/tracing/rtla/src/cli.h
delete mode 100644 tools/tracing/rtla/src/rtla.c
diff --git a/tools/tracing/rtla/src/Build b/tools/tracing/rtla/src/Build
index 329e24a40cf7..a1f3ab927207 100644
--- a/tools/tracing/rtla/src/Build
+++ b/tools/tracing/rtla/src/Build
@@ -11,4 +11,4 @@ rtla-y += timerlat_hist.o
rtla-y += timerlat_u.o
rtla-y += timerlat_aa.o
rtla-y += timerlat_bpf.o
-rtla-y += rtla.o
+rtla-y += cli.o
diff --git a/tools/tracing/rtla/src/cli.c b/tools/tracing/rtla/src/cli.c
new file mode 100644
index 000000000000..d029a698e8a7
--- /dev/null
+++ b/tools/tracing/rtla/src/cli.c
@@ -0,0 +1,1207 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2021 Red Hat Inc, Daniel Bristot de Oliveira <bristot@kernel.org>
+ */
+
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/types.h>
+
+#include <linux/kernel.h>
+#include <subcmd/parse-options.h>
+
+#include "cli.h"
+#include "osnoise.h"
+#include "timerlat.h"
+
+struct osnoise_cb_data {
+ struct osnoise_params *params;
+ char *trace_output;
+};
+
+struct timerlat_cb_data {
+ struct timerlat_params *params;
+ char *trace_output;
+};
+
+static const char * const osnoise_top_usage[] = {
+ "rtla osnoise [top] [<options>]",
+ NULL,
+};
+
+static const char * const osnoise_hist_usage[] = {
+ "rtla osnoise hist [<options>]",
+ NULL,
+};
+
+static const char * const timerlat_top_usage[] = {
+ "rtla timerlat [top] [<options>]",
+ NULL,
+};
+
+static const char * const timerlat_hist_usage[] = {
+ "rtla timerlat hist [<options>]",
+ NULL,
+};
+
+static const char * const hwnoise_usage[] = {
+ "rtla hwnoise [<options>]",
+ NULL,
+};
+
+static const int common_parse_options_flags = PARSE_OPT_OPTARG_ALLOW_NEXT;
+
+/*
+ * Macros for command line options common to all tools
+ *
+ * Note: Some of the options are common to both timerlat and osnoise, but
+ * have a slightly different meaning. Such options take additional arguments
+ * that have to be provided by the *_parse_args() function of the corresponding
+ * tool.
+ *
+ * All macros defined here assume the presence of a params variable of
+ * the corresponding tool type (i.e struct timerlat_params or struct osnoise_params)
+ * and a cb_data variable of the matching type.
+ */
+
+#define RTLA_OPT_STOP(short, long, name) OPT_CALLBACK(short, long, \
+ &params->common.stop_us, \
+ "us", \
+ "stop trace if " name " is higher than the argument in us", \
+ opt_llong_callback)
+
+#define RTLA_OPT_STOP_TOTAL(short, long, name) OPT_CALLBACK(short, long, \
+ &params->common.stop_total_us, \
+ "us", \
+ "stop trace if " name " is higher than the argument in us", \
+ opt_llong_callback)
+
+#define RTLA_OPT_TRACE_OUTPUT(tracer, cb) OPT_CALLBACK_OPTARG('t', "trace", \
+ (const char **)&cb_data.trace_output, \
+ tracer "_trace.txt", \
+ "[file]", \
+ "save the stopped trace to [file|" tracer "_trace.txt]", \
+ cb)
+
+#define RTLA_OPT_CPUS OPT_CALLBACK('c', "cpus", &params->common, \
+ "cpu-list", \
+ "run the tracer only on the given cpus", \
+ opt_cpus_cb)
+
+#define RTLA_OPT_CGROUP OPT_CALLBACK_OPTARG('C', "cgroup", &params->common, \
+ "[cgroup_name]", NULL, \
+ "set cgroup, no argument means rtla's cgroup will be inherited", \
+ opt_cgroup_cb)
+
+#define RTLA_OPT_USER_THREADS OPT_CALLBACK('u', "user-threads", params, NULL, \
+ "use rtla user-space threads instead of kernel-space timerlat threads", \
+ opt_user_threads_cb)
+
+#define RTLA_OPT_KERNEL_THREADS OPT_BOOLEAN('k', "kernel-threads", \
+ &params->common.kernel_workload, \
+ "use timerlat kernel-space threads instead of rtla user-space threads")
+
+#define RTLA_OPT_USER_LOAD OPT_BOOLEAN('U', "user-load", &params->common.user_data, \
+ "enable timerlat for user-defined user-space workload")
+
+#define RTLA_OPT_DURATION OPT_CALLBACK('d', "duration", &params->common, \
+ "time[s|m|h|d]", \
+ "set the duration of the session", \
+ opt_duration_cb)
+
+#define RTLA_OPT_EVENT OPT_CALLBACK('e', "event", &params->common.events, \
+ "sys:event", \
+ "enable the <sys:event> in the trace instance, multiple -e are allowed", \
+ opt_event_cb)
+
+#define RTLA_OPT_HOUSEKEEPING OPT_CALLBACK('H', "house-keeping", &params->common, \
+ "cpu-list", \
+ "run rtla control threads only on the given cpus", \
+ opt_housekeeping_cb)
+
+#define RTLA_OPT_PRIORITY OPT_CALLBACK('P', "priority", &params->common, \
+ "o:prio|r:prio|f:prio|d:runtime:period", \
+ "set scheduling parameters", \
+ opt_priority_cb)
+
+#define RTLA_OPT_TRIGGER OPT_CALLBACK(0, "trigger", &params->common, \
+ "trigger", \
+ "enable a trace event trigger to the previous -e event", \
+ opt_trigger_cb)
+
+#define RTLA_OPT_FILTER OPT_CALLBACK(0, "filter", &params->common, \
+ "filter", \
+ "enable a trace event filter to the previous -e event", \
+ opt_filter_cb)
+
+#define RTLA_OPT_QUIET OPT_BOOLEAN('q', "quiet", &params->common.quiet, \
+ "print only a summary at the end")
+
+#define RTLA_OPT_TRACE_BUFFER_SIZE OPT_CALLBACK(0, "trace-buffer-size", \
+ &params->common.buffer_size, "kB", \
+ "set the per-cpu trace buffer size in kB", \
+ opt_int_callback)
+
+#define RTLA_OPT_WARM_UP OPT_CALLBACK(0, "warm-up", &params->common.warmup, "s", \
+ "let the workload run for s seconds before collecting data", \
+ opt_int_callback)
+
+#define RTLA_OPT_AUTO(cb) OPT_CALLBACK('a', "auto", &cb_data, "us", \
+ "set automatic trace mode, stopping the session if argument in us sample is hit", \
+ cb)
+
+#define RTLA_OPT_ON_THRESHOLD(threshold, cb) OPT_CALLBACK(0, "on-threshold", \
+ &params->common.threshold_actions, \
+ "action", \
+ "define action to be executed at " threshold " threshold, multiple are allowed", \
+ cb)
+
+#define RTLA_OPT_ON_END(cb) OPT_CALLBACK(0, "on-end", &params->common.end_actions, \
+ "action", \
+ "define action to be executed at measurement end, multiple are allowed", \
+ cb)
+
+#define RTLA_OPT_DEBUG OPT_BOOLEAN('D', "debug", &config_debug, \
+ "print debug info")
+
+#define RTLA_OPT_HELP OPT_BOOLEAN('h', "help", (bool *)NULL, \
+ "show help")
+
+/*
+ * Common callback functions for command line options
+ */
+
+static int opt_llong_callback(const struct option *opt, const char *arg, int unset)
+{
+ long long *value = opt->value;
+
+ if (unset || !arg)
+ return -1;
+
+ *value = get_llong_from_str((char *)arg);
+ return 0;
+}
+
+static int opt_int_callback(const struct option *opt, const char *arg, int unset)
+{
+ int *value = opt->value;
+
+ if (unset || !arg)
+ return -1;
+
+ if (strtoi(arg, value))
+ return -1;
+
+ return 0;
+}
+
+static int opt_cpus_cb(const struct option *opt, const char *arg, int unset)
+{
+ struct common_params *params = opt->value;
+ int retval;
+
+ if (unset || !arg)
+ return -1;
+
+ retval = parse_cpu_set((char *)arg, &params->monitored_cpus);
+ if (retval)
+ fatal("Invalid -c cpu list");
+ params->cpus = (char *)arg;
+
+ return 0;
+}
+
+static int opt_cgroup_cb(const struct option *opt, const char *arg, int unset)
+{
+ struct common_params *params = opt->value;
+
+ params->cgroup = 1;
+ params->cgroup_name = (char *)arg;
+ if (params->cgroup_name && params->cgroup_name[0] == '=')
+ /* Allow -C=<cgroup_name> next to -C[ ]<cgroup_name> */
+ ++params->cgroup_name;
+
+ return 0;
+}
+
+static int opt_duration_cb(const struct option *opt, const char *arg, int unset)
+{
+ struct common_params *params = opt->value;
+
+ if (unset || !arg)
+ return -1;
+
+ params->duration = parse_seconds_duration((char *)arg);
+ if (!params->duration)
+ fatal("Invalid -d duration");
+
+ return 0;
+}
+
+static int opt_event_cb(const struct option *opt, const char *arg, int unset)
+{
+ struct trace_events **events = opt->value;
+ struct trace_events *tevent;
+
+ if (unset || !arg)
+ return -1;
+
+ tevent = trace_event_alloc((char *)arg);
+ if (!tevent)
+ fatal("Error alloc trace event");
+
+ if (*events)
+ tevent->next = *events;
+ *events = tevent;
+
+ return 0;
+}
+
+static int opt_housekeeping_cb(const struct option *opt, const char *arg, int unset)
+{
+ struct common_params *params = opt->value;
+ int retval;
+
+ if (unset || !arg)
+ return -1;
+
+ params->hk_cpus = 1;
+ retval = parse_cpu_set((char *)arg, &params->hk_cpu_set);
+ if (retval)
+ fatal("Error parsing house keeping CPUs");
+
+ return 0;
+}
+
+static int opt_priority_cb(const struct option *opt, const char *arg, int unset)
+{
+ struct common_params *params = opt->value;
+ int retval;
+
+ if (unset || !arg)
+ return -1;
+
+ retval = parse_prio((char *)arg, &params->sched_param);
+ if (retval == -1)
+ fatal("Invalid -P priority");
+ params->set_sched = 1;
+
+ return 0;
+}
+
+static int opt_trigger_cb(const struct option *opt, const char *arg, int unset)
+{
+ struct common_params *params = opt->value;
+
+ if (unset || !arg)
+ return -1;
+
+ if (!params->events)
+ fatal("--trigger requires a previous -e");
+
+ trace_event_add_trigger(params->events, (char *)arg);
+
+ return 0;
+}
+
+static int opt_filter_cb(const struct option *opt, const char *arg, int unset)
+{
+ struct common_params *params = opt->value;
+
+ if (unset || !arg)
+ return -1;
+
+ if (!params->events)
+ fatal("--filter requires a previous -e");
+
+ trace_event_add_filter(params->events, (char *)arg);
+
+ return 0;
+}
+
+/*
+ * Macros for command line options specific to osnoise
+ */
+#define OSNOISE_OPT_PERIOD OPT_CALLBACK('p', "period", &params->period, "us", \
+ "osnoise period in us", \
+ opt_osnoise_period_cb)
+
+#define OSNOISE_OPT_RUNTIME OPT_CALLBACK('r', "runtime", &params->runtime, "us", \
+ "osnoise runtime in us", \
+ opt_osnoise_runtime_cb)
+
+#define OSNOISE_OPT_THRESHOLD OPT_CALLBACK('T', "threshold", &params->threshold, "us", \
+ "the minimum delta to be considered a noise", \
+ opt_osnoise_threshold_cb)
+
+/*
+ * Callback functions for command line options for osnoise tools
+ */
+
+static int opt_osnoise_auto_cb(const struct option *opt, const char *arg, int unset)
+{
+ struct osnoise_cb_data *cb_data = opt->value;
+ struct osnoise_params *params = cb_data->params;
+ long long auto_thresh;
+
+ if (unset || !arg)
+ return -1;
+
+ auto_thresh = get_llong_from_str((char *)arg);
+ params->common.stop_us = auto_thresh;
+ params->threshold = 1;
+
+ if (!cb_data->trace_output)
+ cb_data->trace_output = "osnoise_trace.txt";
+
+ return 0;
+}
+
+static int opt_osnoise_period_cb(const struct option *opt, const char *arg, int unset)
+{
+ unsigned long long *period = opt->value;
+
+ if (unset || !arg)
+ return -1;
+
+ *period = get_llong_from_str((char *)arg);
+ if (*period > 10000000)
+ fatal("Period longer than 10 s");
+
+ return 0;
+}
+
+static int opt_osnoise_runtime_cb(const struct option *opt, const char *arg, int unset)
+{
+ unsigned long long *runtime = opt->value;
+
+ if (unset || !arg)
+ return -1;
+
+ *runtime = get_llong_from_str((char *)arg);
+ if (*runtime < 100)
+ fatal("Runtime shorter than 100 us");
+
+ return 0;
+}
+
+static int opt_osnoise_trace_output_cb(const struct option *opt, const char *arg, int unset)
+{
+ const char **trace_output = opt->value;
+
+ if (unset)
+ return -1;
+
+ if (!arg) {
+ *trace_output = "osnoise_trace.txt";
+ } else {
+ *trace_output = (char *)arg;
+ if (*trace_output && (*trace_output)[0] == '=')
+ /* Allow -t=<trace_output> next to -t[ ]<trace_output> */
+ ++*trace_output;
+ }
+
+ return 0;
+}
+
+static int opt_osnoise_threshold_cb(const struct option *opt, const char *arg, int unset)
+{
+ long long *threshold = opt->value;
+
+ if (unset || !arg)
+ return -1;
+
+ *threshold = get_llong_from_str((char *)arg);
+
+ return 0;
+}
+
+static int opt_osnoise_on_threshold_cb(const struct option *opt, const char *arg, int unset)
+{
+ struct actions *actions = opt->value;
+ int retval;
+
+ if (unset || !arg)
+ return -1;
+
+ retval = actions_parse(actions, (char *)arg, "osnoise_trace.txt");
+ if (retval)
+ fatal("Invalid action %s", arg);
+
+ return 0;
+}
+
+static int opt_osnoise_on_end_cb(const struct option *opt, const char *arg, int unset)
+{
+ struct actions *actions = opt->value;
+ int retval;
+
+ if (unset || !arg)
+ return -1;
+
+ retval = actions_parse(actions, (char *)arg, "osnoise_trace.txt");
+ if (retval)
+ fatal("Invalid action %s", arg);
+
+ return 0;
+}
+
+/*
+ * Macros for command line options specific to timerlat
+ */
+#define TIMERLAT_OPT_PERIOD OPT_CALLBACK('p', "period", &params->timerlat_period_us, "us", \
+ "timerlat period in us", \
+ opt_timerlat_period_cb)
+
+#define TIMERLAT_OPT_STACK OPT_CALLBACK('s', "stack", &params->print_stack, "us", \
+ "save the stack trace at the IRQ if a thread latency is higher than the argument in us", \
+ opt_llong_callback)
+
+#define TIMERLAT_OPT_NANO OPT_CALLBACK('n', "nano", params, NULL, \
+ "display data in nanoseconds", \
+ opt_nano_cb)
+
+#define TIMERLAT_OPT_DMA_LATENCY OPT_CALLBACK(0, "dma-latency", &params->dma_latency, "us", \
+ "set /dev/cpu_dma_latency latency <us> to reduce exit from idle latency", \
+ opt_dma_latency_cb)
+
+#define TIMERLAT_OPT_DEEPEST_IDLE_STATE OPT_CALLBACK(0, "deepest-idle-state", \
+ &params->deepest_idle_state, "n", \
+ "only go down to idle state n on cpus used by timerlat to reduce exit from idle latency", \
+ opt_int_callback)
+
+#define TIMERLAT_OPT_AA_ONLY OPT_CALLBACK(0, "aa-only", params, "us", \
+ "stop if <us> latency is hit, only printing the auto analysis (reduces CPU usage)", \
+ opt_aa_only_cb)
+
+#define TIMERLAT_OPT_NO_AA OPT_BOOLEAN(0, "no-aa", &params->no_aa, \
+ "disable auto-analysis, reducing rtla timerlat cpu usage")
+
+#define TIMERLAT_OPT_DUMPS_TASKS OPT_BOOLEAN(0, "dump-tasks", &params->dump_tasks, \
+ "prints the task running on all CPUs if stop conditions are met (depends on !--no-aa)")
+
+#define TIMERLAT_OPT_BPF_ACTION OPT_STRING(0, "bpf-action", &params->bpf_action_program, \
+ "program", \
+ "load and execute BPF program when latency threshold is exceeded")
+
+#define TIMERLAT_OPT_STACK_FORMAT OPT_CALLBACK(0, "stack-format", &params->stack_format, "format", \
+ "set the stack format (truncate, skip, full)", \
+ opt_stack_format_cb)
+
+/*
+ * Callback functions for command line options for timerlat tools
+ */
+
+static int opt_timerlat_period_cb(const struct option *opt, const char *arg, int unset)
+{
+ long long *period = opt->value;
+
+ if (unset || !arg)
+ return -1;
+
+ *period = get_llong_from_str((char *)arg);
+ if (*period > 1000000)
+ fatal("Period longer than 1 s");
+
+ return 0;
+}
+
+static int opt_timerlat_auto_cb(const struct option *opt, const char *arg, int unset)
+{
+ struct timerlat_cb_data *cb_data = opt->value;
+ struct timerlat_params *params = cb_data->params;
+ long long auto_thresh;
+
+ if (unset || !arg)
+ return -1;
+
+ auto_thresh = get_llong_from_str((char *)arg);
+ params->common.stop_total_us = auto_thresh;
+ params->common.stop_us = auto_thresh;
+ params->print_stack = auto_thresh;
+
+ if (!cb_data->trace_output)
+ cb_data->trace_output = "timerlat_trace.txt";
+
+ return 0;
+}
+
+static int opt_dma_latency_cb(const struct option *opt, const char *arg, int unset)
+{
+ int *dma_latency = opt->value;
+ int retval;
+
+ if (unset || !arg)
+ return -1;
+
+ retval = strtoi((char *)arg, dma_latency);
+ if (retval)
+ fatal("Invalid -dma-latency %s", arg);
+ if (*dma_latency < 0 || *dma_latency > 10000)
+ fatal("--dma-latency needs to be >= 0 and < 10000");
+
+ return 0;
+}
+
+static int opt_aa_only_cb(const struct option *opt, const char *arg, int unset)
+{
+ struct timerlat_params *params = opt->value;
+ long long auto_thresh;
+
+ if (unset || !arg)
+ return -1;
+
+ auto_thresh = get_llong_from_str((char *)arg);
+ params->common.stop_total_us = auto_thresh;
+ params->common.stop_us = auto_thresh;
+ params->print_stack = auto_thresh;
+ params->common.aa_only = 1;
+
+ return 0;
+}
+
+static int opt_timerlat_trace_output_cb(const struct option *opt, const char *arg, int unset)
+{
+ const char **trace_output = opt->value;
+
+ if (unset)
+ return -1;
+
+ if (!arg) {
+ *trace_output = "timerlat_trace.txt";
+ } else {
+ *trace_output = (char *)arg;
+ if (*trace_output && (*trace_output)[0] == '=')
+ /* Allow -t=<trace_output> next to -t[ ]<trace_output> */
+ ++*trace_output;
+ }
+
+ return 0;
+}
+
+static int opt_timerlat_on_threshold_cb(const struct option *opt, const char *arg, int unset)
+{
+ struct actions *actions = opt->value;
+ int retval;
+
+ if (unset || !arg)
+ return -1;
+
+ retval = actions_parse(actions, (char *)arg, "timerlat_trace.txt");
+ if (retval)
+ fatal("Invalid action %s", arg);
+
+ return 0;
+}
+
+static int opt_timerlat_on_end_cb(const struct option *opt, const char *arg, int unset)
+{
+ struct actions *actions = opt->value;
+ int retval;
+
+ if (unset || !arg)
+ return -1;
+
+ retval = actions_parse(actions, (char *)arg, "timerlat_trace.txt");
+ if (retval)
+ fatal("Invalid action %s", arg);
+
+ return 0;
+}
+
+static int opt_user_threads_cb(const struct option *opt, const char *arg, int unset)
+{
+ struct timerlat_params *params = opt->value;
+
+ if (unset)
+ return 0;
+
+ params->common.user_workload = true;
+ params->common.user_data = true;
+
+ return 0;
+}
+
+static int opt_nano_cb(const struct option *opt, const char *arg, int unset)
+{
+ struct timerlat_params *params = opt->value;
+
+ if (unset)
+ return 0;
+
+ params->common.output_divisor = 1;
+
+ return 0;
+}
+
+static int opt_stack_format_cb(const struct option *opt, const char *arg, int unset)
+{
+ int *format = opt->value;
+
+ if (unset || !arg)
+ return -1;
+
+ *format = parse_stack_format((char *)arg);
+
+ if (*format == -1)
+ fatal("Invalid --stack-format option");
+
+ return 0;
+}
+
+/*
+ * Macros for command line options specific to histogram-based tools
+ */
+#define HIST_OPT_BUCKET_SIZE OPT_CALLBACK('b', "bucket-size", \
+ &params->common.hist.bucket_size, "N", \
+ "set the histogram bucket size (default 1)", \
+ opt_bucket_size_cb)
+
+#define HIST_OPT_ENTRIES OPT_CALLBACK('E', "entries", &params->common.hist.entries, "N", \
+ "set the number of entries of the histogram (default 256)", \
+ opt_entries_cb)
+
+#define HIST_OPT_NO_IRQ OPT_BOOLEAN(0, "no-irq", &params->common.hist.no_irq, \
+ "ignore IRQ latencies")
+
+#define HIST_OPT_NO_THREAD OPT_BOOLEAN(0, "no-thread", &params->common.hist.no_thread, \
+ "ignore thread latencies")
+
+#define HIST_OPT_NO_HEADER OPT_BOOLEAN(0, "no-header", &params->common.hist.no_header, \
+ "do not print header")
+
+#define HIST_OPT_NO_SUMMARY OPT_BOOLEAN(0, "no-summary", &params->common.hist.no_summary, \
+ "do not print summary")
+
+#define HIST_OPT_NO_INDEX OPT_BOOLEAN(0, "no-index", ¶ms->common.hist.no_index, \
+ "do not print index")
+
+#define HIST_OPT_WITH_ZEROS OPT_BOOLEAN(0, "with-zeros", ¶ms->common.hist.with_zeros, \
+ "print zero only entries")
+
+/* Histogram-specific callbacks */
+
+/*
+ * opt_bucket_size_cb - option callback for the histogram bucket size
+ *
+ * opt->value must point to the int that receives the bucket size.
+ *
+ * The parsed number is range-checked before it is narrowed to int, so that
+ * negative values, and values that would only fall inside the range after
+ * truncation, are rejected through fatal() instead of being stored.
+ *
+ * Returns -1 when the option is negated or has no argument, 0 otherwise.
+ */
+static int opt_bucket_size_cb(const struct option *opt, const char *arg, int unset)
+{
+	int *bucket_size = opt->value;
+	long long val;
+
+	if (unset || !arg)
+		return -1;
+
+	val = get_llong_from_str((char *)arg);
+	if (val <= 0 || val >= 1000000)
+		fatal("Bucket size needs to be > 0 and < 1000000");
+
+	*bucket_size = val;
+
+	return 0;
+}
+
+/*
+ * opt_entries_cb - option callback for the number of histogram entries
+ *
+ * opt->value must point to the int that receives the number of entries.
+ *
+ * The parsed number is range-checked before it is narrowed to int, so that
+ * values that would only fall inside the range after truncation are rejected
+ * through fatal() instead of being stored. Both bounds are inclusive.
+ *
+ * Returns -1 when the option is negated or has no argument, 0 otherwise.
+ */
+static int opt_entries_cb(const struct option *opt, const char *arg, int unset)
+{
+	int *entries = opt->value;
+	long long val;
+
+	if (unset || !arg)
+		return -1;
+
+	val = get_llong_from_str((char *)arg);
+	if (val < 10 || val > 9999999)
+		fatal("Entries must be >= 10 and <= 9999999");
+
+	*entries = val;
+
+	return 0;
+}
+
+/*
+ * osnoise_top_parse_args - allocate, parse and fill the cmd line parameters
+ *
+ * When argv[0] is "hwnoise" the parameters are set to MODE_HWNOISE with their
+ * own runtime/period defaults and the hwnoise usage text is used; otherwise
+ * the osnoise top usage text is used.
+ *
+ * cb_data carries the params pointer and the trace output file name; it is
+ * presumably referenced by the RTLA_OPT_* callback entries of the option
+ * table (confirm against the macro definitions).
+ *
+ * Returns a pointer to the common part of the newly allocated parameters, or
+ * NULL if parse_options() returns a negative value.
+ */
+struct common_params *osnoise_top_parse_args(int argc, char **argv)
+{
+	struct osnoise_params *params;
+	struct osnoise_cb_data cb_data;
+	const char * const *usage;
+
+	params = calloc_fatal(1, sizeof(*params));
+
+	cb_data.params = params;
+	cb_data.trace_output = NULL;
+
+	if (strcmp(argv[0], "hwnoise") == 0) {
+		params->mode = MODE_HWNOISE;
+		/*
+		 * Limit CPU usage to 75% (runtime 750000 out of a period of
+		 * 1000000) to avoid killing the system.
+		 */
+		params->runtime = 750000;
+		params->period = 1000000;
+		usage = hwnoise_usage;
+	} else {
+		usage = osnoise_top_usage;
+	}
+
+	const struct option osnoise_top_options[] = {
+		OPT_GROUP("Tracing Options:"),
+		OSNOISE_OPT_PERIOD,
+		OSNOISE_OPT_RUNTIME,
+		RTLA_OPT_STOP('s', "stop", "single sample"),
+		RTLA_OPT_STOP_TOTAL('S', "stop-total", "total sample"),
+		OSNOISE_OPT_THRESHOLD,
+		RTLA_OPT_TRACE_OUTPUT("osnoise", opt_osnoise_trace_output_cb),
+
+		OPT_GROUP("Event Configuration:"),
+		RTLA_OPT_EVENT,
+		RTLA_OPT_FILTER,
+		RTLA_OPT_TRIGGER,
+
+		OPT_GROUP("CPU Configuration:"),
+		RTLA_OPT_CPUS,
+		RTLA_OPT_HOUSEKEEPING,
+
+		OPT_GROUP("Thread Configuration:"),
+		RTLA_OPT_PRIORITY,
+		RTLA_OPT_CGROUP,
+
+		OPT_GROUP("Output:"),
+		RTLA_OPT_QUIET,
+
+		OPT_GROUP("System Tuning:"),
+		RTLA_OPT_TRACE_BUFFER_SIZE,
+		RTLA_OPT_WARM_UP,
+
+		OPT_GROUP("Auto Analysis and Actions:"),
+		RTLA_OPT_AUTO(opt_osnoise_auto_cb),
+		RTLA_OPT_ON_THRESHOLD("stop-total", opt_osnoise_on_threshold_cb),
+		RTLA_OPT_ON_END(opt_osnoise_on_end_cb),
+
+		OPT_GROUP("General:"),
+		RTLA_OPT_DURATION,
+		RTLA_OPT_DEBUG,
+		RTLA_OPT_HELP,
+
+		OPT_END(),
+	};
+
+	actions_init(&params->common.threshold_actions);
+	actions_init(&params->common.end_actions);
+
+	argc = parse_options(argc, (const char **)argv,
+			     osnoise_top_options,
+			     usage,
+			     common_parse_options_flags);
+	if (argc < 0)
+		return NULL;
+
+	/* a trace output file collected during parsing becomes a threshold action */
+	if (cb_data.trace_output)
+		actions_add_trace_output(&params->common.threshold_actions, cb_data.trace_output);
+
+	if (geteuid())
+		fatal("osnoise needs root permission");
+
+	return &params->common;
+}
+
+/*
+ * osnoise_hist_parse_args - allocate, parse and fill the cmd line parameters
+ *
+ * Defaults set before parsing: output divisor 1000 (microseconds), histogram
+ * bucket size 1 and 256 histogram entries.
+ *
+ * cb_data carries the params pointer and the trace output file name; it is
+ * presumably referenced by the RTLA_OPT_* callback entries of the option
+ * table (confirm against the macro definitions).
+ *
+ * Returns a pointer to the common part of the newly allocated parameters, or
+ * NULL if parse_options() returns a negative value.
+ */
+struct common_params *osnoise_hist_parse_args(int argc, char *argv[])
+{
+	struct osnoise_params *params;
+	struct osnoise_cb_data cb_data;
+
+	params = calloc_fatal(1, sizeof(*params));
+
+	cb_data.params = params;
+	cb_data.trace_output = NULL;
+
+	const struct option osnoise_hist_options[] = {
+		OPT_GROUP("Tracing Options:"),
+		OSNOISE_OPT_PERIOD,
+		OSNOISE_OPT_RUNTIME,
+		RTLA_OPT_STOP('s', "stop", "single sample"),
+		RTLA_OPT_STOP_TOTAL('S', "stop-total", "total sample"),
+		OSNOISE_OPT_THRESHOLD,
+		RTLA_OPT_TRACE_OUTPUT("osnoise", opt_osnoise_trace_output_cb),
+
+		OPT_GROUP("Event Configuration:"),
+		RTLA_OPT_EVENT,
+		RTLA_OPT_FILTER,
+		RTLA_OPT_TRIGGER,
+
+		OPT_GROUP("CPU Configuration:"),
+		RTLA_OPT_CPUS,
+		RTLA_OPT_HOUSEKEEPING,
+
+		OPT_GROUP("Thread Configuration:"),
+		RTLA_OPT_PRIORITY,
+		RTLA_OPT_CGROUP,
+
+		OPT_GROUP("Histogram Options:"),
+		HIST_OPT_BUCKET_SIZE,
+		HIST_OPT_ENTRIES,
+		HIST_OPT_NO_HEADER,
+		HIST_OPT_NO_SUMMARY,
+		HIST_OPT_NO_INDEX,
+		HIST_OPT_WITH_ZEROS,
+
+		OPT_GROUP("System Tuning:"),
+		RTLA_OPT_TRACE_BUFFER_SIZE,
+		RTLA_OPT_WARM_UP,
+
+		OPT_GROUP("Auto Analysis and Actions:"),
+		RTLA_OPT_AUTO(opt_osnoise_auto_cb),
+		RTLA_OPT_ON_THRESHOLD("stop-total", opt_osnoise_on_threshold_cb),
+		RTLA_OPT_ON_END(opt_osnoise_on_end_cb),
+
+		OPT_GROUP("General:"),
+		RTLA_OPT_DURATION,
+		RTLA_OPT_DEBUG,
+		RTLA_OPT_HELP,
+
+		OPT_END(),
+	};
+
+	actions_init(&params->common.threshold_actions);
+	actions_init(&params->common.end_actions);
+
+	/* display data in microseconds */
+	params->common.output_divisor = 1000;
+	params->common.hist.bucket_size = 1;
+	params->common.hist.entries = 256;
+
+	argc = parse_options(argc, (const char **)argv,
+			     osnoise_hist_options, osnoise_hist_usage,
+			     common_parse_options_flags);
+	if (argc < 0)
+		return NULL;
+
+	/* a trace output file collected during parsing becomes a threshold action */
+	if (cb_data.trace_output)
+		actions_add_trace_output(&params->common.threshold_actions, cb_data.trace_output);
+
+	if (geteuid())
+		fatal("rtla needs root permission");
+
+	if (params->common.hist.no_index && !params->common.hist.with_zeros)
+		fatal("no-index set and with-zeros not set - it does not make sense");
+
+	return &params->common;
+}
+
+/*
+ * timerlat_top_parse_args - allocate, parse and fill the cmd line parameters
+ *
+ * Defaults set before parsing: dma_latency -1 and deepest_idle_state -2
+ * (both meaning disabled), output divisor 1000 (microseconds), BPF tracing
+ * mode and the truncate stack format.
+ *
+ * After parsing, auto analysis is turned off when no stop threshold was
+ * given, conflicting options are rejected through fatal(), and the tracing
+ * mode is switched from BPF to mixed when a trace output action is present
+ * or auto analysis is still enabled.
+ *
+ * cb_data carries the params pointer and the trace output file name; it is
+ * presumably referenced by the RTLA_OPT_* callback entries of the option
+ * table (confirm against the macro definitions).
+ *
+ * Returns a pointer to the common part of the newly allocated parameters, or
+ * NULL if parse_options() returns a negative value.
+ */
+struct common_params *timerlat_top_parse_args(int argc, char **argv)
+{
+	struct timerlat_params *params;
+	struct timerlat_cb_data cb_data;
+
+	params = calloc_fatal(1, sizeof(*params));
+
+	cb_data.params = params;
+	cb_data.trace_output = NULL;
+
+	const struct option timerlat_top_options[] = {
+		OPT_GROUP("Tracing Options:"),
+		TIMERLAT_OPT_PERIOD,
+		RTLA_OPT_STOP('i', "irq", "irq latency"),
+		RTLA_OPT_STOP_TOTAL('T', "thread", "thread latency"),
+		TIMERLAT_OPT_STACK,
+		RTLA_OPT_TRACE_OUTPUT("timerlat", opt_timerlat_trace_output_cb),
+
+		OPT_GROUP("Event Configuration:"),
+		RTLA_OPT_EVENT,
+		RTLA_OPT_FILTER,
+		RTLA_OPT_TRIGGER,
+
+		OPT_GROUP("CPU Configuration:"),
+		RTLA_OPT_CPUS,
+		RTLA_OPT_HOUSEKEEPING,
+
+		OPT_GROUP("Thread Configuration:"),
+		RTLA_OPT_PRIORITY,
+		RTLA_OPT_CGROUP,
+		RTLA_OPT_USER_THREADS,
+		RTLA_OPT_KERNEL_THREADS,
+		RTLA_OPT_USER_LOAD,
+
+		OPT_GROUP("Output:"),
+		TIMERLAT_OPT_NANO,
+		RTLA_OPT_QUIET,
+
+		OPT_GROUP("System Tuning:"),
+		TIMERLAT_OPT_DMA_LATENCY,
+		TIMERLAT_OPT_DEEPEST_IDLE_STATE,
+		RTLA_OPT_TRACE_BUFFER_SIZE,
+		RTLA_OPT_WARM_UP,
+
+		OPT_GROUP("Auto Analysis and Actions:"),
+		RTLA_OPT_AUTO(opt_timerlat_auto_cb),
+		TIMERLAT_OPT_AA_ONLY,
+		TIMERLAT_OPT_NO_AA,
+		TIMERLAT_OPT_DUMPS_TASKS,
+		RTLA_OPT_ON_THRESHOLD("latency", opt_timerlat_on_threshold_cb),
+		RTLA_OPT_ON_END(opt_timerlat_on_end_cb),
+		TIMERLAT_OPT_BPF_ACTION,
+		TIMERLAT_OPT_STACK_FORMAT,
+
+		OPT_GROUP("General:"),
+		RTLA_OPT_DURATION,
+		RTLA_OPT_DEBUG,
+		RTLA_OPT_HELP,
+
+		OPT_END(),
+	};
+
+	actions_init(&params->common.threshold_actions);
+	actions_init(&params->common.end_actions);
+
+	/* disabled by default */
+	params->dma_latency = -1;
+	params->deepest_idle_state = -2;
+
+	/* display data in microseconds */
+	params->common.output_divisor = 1000;
+
+	/* default to BPF mode */
+	params->mode = TRACING_MODE_BPF;
+
+	/* default to truncate stack format */
+	params->stack_format = STACK_FORMAT_TRUNCATE;
+
+	argc = parse_options(argc, (const char **)argv,
+			     timerlat_top_options, timerlat_top_usage,
+			     common_parse_options_flags);
+	if (argc < 0)
+		return NULL;
+
+	/* a trace output file collected during parsing becomes a threshold action */
+	if (cb_data.trace_output)
+		actions_add_trace_output(&params->common.threshold_actions, cb_data.trace_output);
+
+	if (geteuid())
+		fatal("rtla needs root permission");
+
+	/*
+	 * Auto analysis only happens if stop tracing, thus:
+	 */
+	if (!params->common.stop_us && !params->common.stop_total_us)
+		params->no_aa = 1;
+
+	if (params->no_aa && params->common.aa_only)
+		fatal("--no-aa and --aa-only are mutually exclusive!");
+
+	if (params->common.kernel_workload && params->common.user_workload)
+		fatal("--kernel-threads and --user-threads are mutually exclusive!");
+
+	/*
+	 * If auto-analysis or trace output is enabled, switch from BPF mode to
+	 * mixed mode
+	 */
+	if (params->mode == TRACING_MODE_BPF &&
+	    (params->common.threshold_actions.present[ACTION_TRACE_OUTPUT] ||
+	     params->common.end_actions.present[ACTION_TRACE_OUTPUT] ||
+	     !params->no_aa))
+		params->mode = TRACING_MODE_MIXED;
+
+	return &params->common;
+}
+
+/*
+ * timerlat_hist_parse_args - allocate, parse and fill the cmd line parameters
+ *
+ * Defaults set before parsing: dma_latency -1 and deepest_idle_state -2
+ * (both meaning disabled), output divisor 1000 (microseconds), histogram
+ * bucket size 1, 256 histogram entries, BPF tracing mode and the truncate
+ * stack format.
+ *
+ * After parsing, option combinations that make no sense are rejected through
+ * fatal(), auto analysis is turned off when no stop threshold was given, and
+ * the tracing mode is switched from BPF to mixed when a trace output action
+ * is present or auto analysis is still enabled.
+ *
+ * cb_data carries the params pointer and the trace output file name; it is
+ * presumably referenced by the RTLA_OPT_* callback entries of the option
+ * table (confirm against the macro definitions).
+ *
+ * Returns a pointer to the common part of the newly allocated parameters, or
+ * NULL if parse_options() returns a negative value.
+ */
+struct common_params *timerlat_hist_parse_args(int argc, char **argv)
+{
+	struct timerlat_params *params;
+	struct timerlat_cb_data cb_data;
+
+	params = calloc_fatal(1, sizeof(*params));
+
+	cb_data.params = params;
+	cb_data.trace_output = NULL;
+
+	const struct option timerlat_hist_options[] = {
+		OPT_GROUP("Tracing Options:"),
+		TIMERLAT_OPT_PERIOD,
+		RTLA_OPT_STOP('i', "irq", "irq latency"),
+		RTLA_OPT_STOP_TOTAL('T', "thread", "thread latency"),
+		TIMERLAT_OPT_STACK,
+		RTLA_OPT_TRACE_OUTPUT("timerlat", opt_timerlat_trace_output_cb),
+
+		OPT_GROUP("Event Configuration:"),
+		RTLA_OPT_EVENT,
+		RTLA_OPT_FILTER,
+		RTLA_OPT_TRIGGER,
+
+		OPT_GROUP("CPU Configuration:"),
+		RTLA_OPT_CPUS,
+		RTLA_OPT_HOUSEKEEPING,
+
+		OPT_GROUP("Thread Configuration:"),
+		RTLA_OPT_PRIORITY,
+		RTLA_OPT_CGROUP,
+		RTLA_OPT_USER_THREADS,
+		RTLA_OPT_KERNEL_THREADS,
+		RTLA_OPT_USER_LOAD,
+
+		OPT_GROUP("Histogram Options:"),
+		HIST_OPT_BUCKET_SIZE,
+		HIST_OPT_ENTRIES,
+		HIST_OPT_NO_IRQ,
+		HIST_OPT_NO_THREAD,
+		HIST_OPT_NO_HEADER,
+		HIST_OPT_NO_SUMMARY,
+		HIST_OPT_NO_INDEX,
+		HIST_OPT_WITH_ZEROS,
+
+		OPT_GROUP("Output:"),
+		TIMERLAT_OPT_NANO,
+
+		OPT_GROUP("System Tuning:"),
+		TIMERLAT_OPT_DMA_LATENCY,
+		TIMERLAT_OPT_DEEPEST_IDLE_STATE,
+		RTLA_OPT_TRACE_BUFFER_SIZE,
+		RTLA_OPT_WARM_UP,
+
+		OPT_GROUP("Auto Analysis and Actions:"),
+		RTLA_OPT_AUTO(opt_timerlat_auto_cb),
+		TIMERLAT_OPT_NO_AA,
+		TIMERLAT_OPT_DUMPS_TASKS,
+		RTLA_OPT_ON_THRESHOLD("latency", opt_timerlat_on_threshold_cb),
+		RTLA_OPT_ON_END(opt_timerlat_on_end_cb),
+		TIMERLAT_OPT_BPF_ACTION,
+		TIMERLAT_OPT_STACK_FORMAT,
+
+		OPT_GROUP("General:"),
+		RTLA_OPT_DURATION,
+		RTLA_OPT_DEBUG,
+		RTLA_OPT_HELP,
+
+		OPT_END(),
+	};
+
+	actions_init(&params->common.threshold_actions);
+	actions_init(&params->common.end_actions);
+
+	/* disabled by default */
+	params->dma_latency = -1;
+
+	/* disabled by default */
+	params->deepest_idle_state = -2;
+
+	/* display data in microseconds */
+	params->common.output_divisor = 1000;
+	params->common.hist.bucket_size = 1;
+	params->common.hist.entries = 256;
+
+	/* default to BPF mode */
+	params->mode = TRACING_MODE_BPF;
+
+	/* default to truncate stack format */
+	params->stack_format = STACK_FORMAT_TRUNCATE;
+
+	argc = parse_options(argc, (const char **)argv,
+			     timerlat_hist_options, timerlat_hist_usage,
+			     common_parse_options_flags);
+	if (argc < 0)
+		return NULL;
+
+	/* a trace output file collected during parsing becomes a threshold action */
+	if (cb_data.trace_output)
+		actions_add_trace_output(&params->common.threshold_actions, cb_data.trace_output);
+
+	if (geteuid())
+		fatal("rtla needs root permission");
+
+	if (params->common.hist.no_irq && params->common.hist.no_thread)
+		fatal("no-irq and no-thread set, there is nothing to do here");
+
+	/* same wording as the osnoise hist tool uses for this check */
+	if (params->common.hist.no_index && !params->common.hist.with_zeros)
+		fatal("no-index set and with-zeros not set - it does not make sense");
+
+	/*
+	 * Auto analysis only happens if stop tracing, thus:
+	 */
+	if (!params->common.stop_us && !params->common.stop_total_us)
+		params->no_aa = 1;
+
+	if (params->common.kernel_workload && params->common.user_workload)
+		fatal("--kernel-threads and --user-threads are mutually exclusive!");
+
+	/*
+	 * If auto-analysis or trace output is enabled, switch from BPF mode to
+	 * mixed mode
+	 */
+	if (params->mode == TRACING_MODE_BPF &&
+	    (params->common.threshold_actions.present[ACTION_TRACE_OUTPUT] ||
+	     params->common.end_actions.present[ACTION_TRACE_OUTPUT] ||
+	     !params->no_aa))
+		params->mode = TRACING_MODE_MIXED;
+
+	return &params->common;
+}
+
+/*
+ * rtla_usage - print the top-level rtla usage to stderr, then exit
+ *
+ * @err is used as the process exit status; this function does not return.
+ */
+static void rtla_usage(int err)
+{
+	static const char *msg[] = {
+		"",
+		"rtla version " VERSION,
+		"",
+		" usage: rtla COMMAND ...",
+		"",
+		" commands:",
+		" osnoise - gives information about the operating system noise (osnoise)",
+		" hwnoise - gives information about hardware-related noise",
+		" timerlat - measures the timer irq and thread latency",
+		"",
+		NULL,
+	};
+	const char **line;
+
+	/* the table is NULL-terminated; print one entry per line */
+	for (line = msg; *line; line++)
+		fprintf(stderr, "%s\n", *line);
+
+	exit(err);
+}
+
+/*
+ * run_tool_command - try to run a rtla tool command
+ *
+ * argv[start_position] is compared against the known tool names; on a match
+ * the tool's main is called with the argument vector starting at that
+ * position.
+ *
+ * It returns 0 if no tool matches and 1 after a tool's main has returned.
+ * The tool's main will generally not return as they should call exit().
+ */
+int run_tool_command(int argc, char **argv, int start_position)
+{
+	const char *cmd = argv[start_position];
+	char **tool_argv = &argv[start_position];
+	int tool_argc = argc - start_position;
+
+	if (strcmp(cmd, "osnoise") == 0)
+		osnoise_main(tool_argc, tool_argv);
+	else if (strcmp(cmd, "hwnoise") == 0)
+		hwnoise_main(tool_argc, tool_argv);
+	else if (strcmp(cmd, "timerlat") == 0)
+		timerlat_main(tool_argc, tool_argv);
+	else
+		return 0;
+
+	return 1;
+}
+
+/*
+ * main - rtla entry point
+ *
+ * First tries argv[0] as a tool name (alias invocation), then argv[1]. When
+ * argv[1] is -h or --help the usage is printed with exit status 0; when no
+ * tool matches, the usage is printed with exit status 1.
+ */
+int main(int argc, char *argv[])
+{
+	/* is it an alias? */
+	if (run_tool_command(argc, argv, 0))
+		exit(0);
+
+	if (argc >= 2) {
+		if (strcmp(argv[1], "-h") == 0 || strcmp(argv[1], "--help") == 0)
+			rtla_usage(0);
+
+		if (run_tool_command(argc, argv, 1))
+			exit(0);
+	}
+
+	rtla_usage(1);
+	exit(1);
+}
diff --git a/tools/tracing/rtla/src/cli.h b/tools/tracing/rtla/src/cli.h
new file mode 100644
index 000000000000..c49ccb3e92f5
--- /dev/null
+++ b/tools/tracing/rtla/src/cli.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#pragma once
+
+struct common_params *osnoise_top_parse_args(int argc, char **argv);
+struct common_params *osnoise_hist_parse_args(int argc, char **argv);
+struct common_params *timerlat_top_parse_args(int argc, char **argv);
+struct common_params *timerlat_hist_parse_args(int argc, char **argv);
diff --git a/tools/tracing/rtla/src/common.c b/tools/tracing/rtla/src/common.c
index 35e3d3aa922e..7403dcc8f6c1 100644
--- a/tools/tracing/rtla/src/common.c
+++ b/tools/tracing/rtla/src/common.c
@@ -5,7 +5,6 @@
#include <signal.h>
#include <stdlib.h>
#include <string.h>
-#include <getopt.h>
#include <sys/sysinfo.h>
#include "common.h"
@@ -53,114 +52,6 @@ static void unset_signals(struct common_params *params)
}
}
-/*
- * getopt_auto - auto-generates optstring from long_options
- */
-int getopt_auto(int argc, char **argv, const struct option *long_opts)
-{
- char opts[256];
- int n = 0;
-
- for (int i = 0; long_opts[i].name; i++) {
- if (long_opts[i].val < 32 || long_opts[i].val > 127)
- continue;
-
- if (n + 4 >= sizeof(opts))
- fatal("optstring buffer overflow");
-
- opts[n++] = long_opts[i].val;
-
- if (long_opts[i].has_arg == required_argument)
- opts[n++] = ':';
- else if (long_opts[i].has_arg == optional_argument) {
- opts[n++] = ':';
- opts[n++] = ':';
- }
- }
-
- opts[n] = '\0';
-
- return getopt_long(argc, argv, opts, long_opts, NULL);
-}
-
-/*
- * common_parse_options - parse common command line options
- *
- * @argc: argument count
- * @argv: argument vector
- * @common: common parameters structure
- *
- * Parse command line options that are common to all rtla tools.
- *
- * Returns: non zero if a common option was parsed, or 0
- * if the option should be handled by tool-specific parsing.
- */
-int common_parse_options(int argc, char **argv, struct common_params *common)
-{
- struct trace_events *tevent;
- int saved_state = optind;
- int c;
-
- static struct option long_options[] = {
- {"cpus", required_argument, 0, 'c'},
- {"cgroup", optional_argument, 0, 'C'},
- {"debug", no_argument, 0, 'D'},
- {"duration", required_argument, 0, 'd'},
- {"event", required_argument, 0, 'e'},
- {"house-keeping", required_argument, 0, 'H'},
- {"priority", required_argument, 0, 'P'},
- {0, 0, 0, 0}
- };
-
- opterr = 0;
- c = getopt_auto(argc, argv, long_options);
- opterr = 1;
-
- switch (c) {
- case 'c':
- if (parse_cpu_set(optarg, &common->monitored_cpus))
- fatal("Invalid -c cpu list");
- common->cpus = optarg;
- break;
- case 'C':
- common->cgroup = 1;
- common->cgroup_name = parse_optional_arg(argc, argv);
- break;
- case 'D':
- config_debug = 1;
- break;
- case 'd':
- common->duration = parse_seconds_duration(optarg);
- if (!common->duration)
- fatal("Invalid -d duration");
- break;
- case 'e':
- tevent = trace_event_alloc(optarg);
- if (!tevent)
- fatal("Error alloc trace event");
-
- if (common->events)
- tevent->next = common->events;
- common->events = tevent;
- break;
- case 'H':
- common->hk_cpus = 1;
- if (parse_cpu_set(optarg, &common->hk_cpu_set))
- fatal("Error parsing house keeping CPUs");
- break;
- case 'P':
- if (parse_prio(optarg, &common->sched_param) == -1)
- fatal("Invalid -P priority");
- common->set_sched = 1;
- break;
- default:
- optind = saved_state;
- return 0;
- }
-
- return c;
-}
-
/*
* common_apply_config - apply common configs to the initialized tool
*/
diff --git a/tools/tracing/rtla/src/common.h b/tools/tracing/rtla/src/common.h
index 51665db4ffce..27439b10ffd5 100644
--- a/tools/tracing/rtla/src/common.h
+++ b/tools/tracing/rtla/src/common.h
@@ -1,7 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
#pragma once
-#include <getopt.h>
#include "actions.h"
#include "timerlat_u.h"
#include "trace.h"
@@ -58,12 +57,12 @@ extern struct trace_instance *trace_inst;
extern volatile int stop_tracing;
struct hist_params {
- char no_irq;
- char no_thread;
- char no_header;
- char no_summary;
- char no_index;
- char with_zeros;
+ bool no_irq;
+ bool no_thread;
+ bool no_header;
+ bool no_summary;
+ bool no_index;
+ bool with_zeros;
int bucket_size;
int entries;
};
@@ -96,12 +95,12 @@ struct common_params {
/* Other parameters */
struct hist_params hist;
int output_divisor;
- int pretty_output;
- int quiet;
- int user_workload;
- int kernel_workload;
- int user_data;
- int aa_only;
+ bool pretty_output;
+ bool quiet;
+ bool user_workload;
+ bool kernel_workload;
+ bool user_data;
+ bool aa_only;
struct actions threshold_actions;
struct actions end_actions;
@@ -177,7 +176,6 @@ int osnoise_set_stop_us(struct osnoise_context *context, long long stop_us);
int osnoise_set_stop_total_us(struct osnoise_context *context,
long long stop_total_us);
-int getopt_auto(int argc, char **argv, const struct option *long_opts);
int common_parse_options(int argc, char **argv, struct common_params *common);
int common_apply_config(struct osnoise_tool *tool, struct common_params *params);
int top_main_loop(struct osnoise_tool *tool);
diff --git a/tools/tracing/rtla/src/osnoise_hist.c b/tools/tracing/rtla/src/osnoise_hist.c
index 8ad816b80265..dfa91d0681f8 100644
--- a/tools/tracing/rtla/src/osnoise_hist.c
+++ b/tools/tracing/rtla/src/osnoise_hist.c
@@ -4,7 +4,6 @@
*/
#define _GNU_SOURCE
-#include <getopt.h>
#include <stdlib.h>
#include <string.h>
#include <signal.h>
@@ -13,6 +12,7 @@
#include <time.h>
#include "osnoise.h"
+#include "cli.h"
struct osnoise_hist_cpu {
int *samples;
@@ -400,225 +400,6 @@ osnoise_print_stats(struct osnoise_tool *tool)
osnoise_report_missed_events(tool);
}
-/*
- * osnoise_hist_usage - prints osnoise hist usage message
- */
-static void osnoise_hist_usage(void)
-{
- static const char * const msg_start[] = {
- "[-D] [-d s] [-a us] [-p us] [-r us] [-s us] [-S us] \\",
- " [-T us] [-t [file]] [-e sys[:event]] [--filter <filter>] [--trigger <trigger>] \\",
- " [-c cpu-list] [-H cpu-list] [-P priority] [-b N] [-E N] [--no-header] [--no-summary] \\",
- " [--no-index] [--with-zeros] [-C [cgroup_name]] [--warm-up]",
- NULL,
- };
-
- static const char * const msg_opts[] = {
- " -a/--auto: set automatic trace mode, stopping the session if argument in us sample is hit",
- " -p/--period us: osnoise period in us",
- " -r/--runtime us: osnoise runtime in us",
- " -s/--stop us: stop trace if a single sample is higher than the argument in us",
- " -S/--stop-total us: stop trace if the total sample is higher than the argument in us",
- " -T/--threshold us: the minimum delta to be considered a noise",
- " -c/--cpus cpu-list: list of cpus to run osnoise threads",
- " -H/--house-keeping cpus: run rtla control threads only on the given cpus",
- " -C/--cgroup [cgroup_name]: set cgroup, if no cgroup_name is passed, the rtla's cgroup will be inherited",
- " -d/--duration time[s|m|h|d]: duration of the session",
- " -D/--debug: print debug info",
- " -t/--trace [file]: save the stopped trace to [file|osnoise_trace.txt]",
- " -e/--event <sys:event>: enable the <sys:event> in the trace instance, multiple -e are allowed",
- " --filter <filter>: enable a trace event filter to the previous -e event",
- " --trigger <trigger>: enable a trace event trigger to the previous -e event",
- " -b/--bucket-size N: set the histogram bucket size (default 1)",
- " -E/--entries N: set the number of entries of the histogram (default 256)",
- " --no-header: do not print header",
- " --no-summary: do not print summary",
- " --no-index: do not print index",
- " --with-zeros: print zero only entries",
- " -P/--priority o:prio|r:prio|f:prio|d:runtime:period: set scheduling parameters",
- " o:prio - use SCHED_OTHER with prio",
- " r:prio - use SCHED_RR with prio",
- " f:prio - use SCHED_FIFO with prio",
- " d:runtime[us|ms|s]:period[us|ms|s] - use SCHED_DEADLINE with runtime and period",
- " in nanoseconds",
- " --warm-up: let the workload run for s seconds before collecting data",
- " --trace-buffer-size kB: set the per-cpu trace buffer size in kB",
- " --on-threshold <action>: define action to be executed at stop-total threshold, multiple are allowed",
- " --on-end <action>: define action to be executed at measurement end, multiple are allowed",
- NULL,
- };
-
- common_usage("osnoise", "hist", "a per-cpu histogram of the OS noise",
- msg_start, msg_opts);
-}
-
-/*
- * osnoise_hist_parse_args - allocs, parse and fill the cmd line parameters
- */
-static struct common_params
-*osnoise_hist_parse_args(int argc, char *argv[])
-{
- struct osnoise_params *params;
- int retval;
- int c;
- char *trace_output = NULL;
-
- params = calloc_fatal(1, sizeof(*params));
-
- actions_init(&params->common.threshold_actions);
- actions_init(&params->common.end_actions);
-
- /* display data in microseconds */
- params->common.output_divisor = 1000;
- params->common.hist.bucket_size = 1;
- params->common.hist.entries = 256;
-
- while (1) {
- static struct option long_options[] = {
- {"auto", required_argument, 0, 'a'},
- {"bucket-size", required_argument, 0, 'b'},
- {"entries", required_argument, 0, 'E'},
- {"help", no_argument, 0, 'h'},
- {"period", required_argument, 0, 'p'},
- {"runtime", required_argument, 0, 'r'},
- {"stop", required_argument, 0, 's'},
- {"stop-total", required_argument, 0, 'S'},
- {"trace", optional_argument, 0, 't'},
- {"threshold", required_argument, 0, 'T'},
- {"no-header", no_argument, 0, '0'},
- {"no-summary", no_argument, 0, '1'},
- {"no-index", no_argument, 0, '2'},
- {"with-zeros", no_argument, 0, '3'},
- {"trigger", required_argument, 0, '4'},
- {"filter", required_argument, 0, '5'},
- {"warm-up", required_argument, 0, '6'},
- {"trace-buffer-size", required_argument, 0, '7'},
- {"on-threshold", required_argument, 0, '8'},
- {"on-end", required_argument, 0, '9'},
- {0, 0, 0, 0}
- };
-
- if (common_parse_options(argc, argv, &params->common))
- continue;
-
- c = getopt_auto(argc, argv, long_options);
-
- /* detect the end of the options. */
- if (c == -1)
- break;
-
- switch (c) {
- case 'a':
- /* set sample stop to auto_thresh */
- params->common.stop_us = get_llong_from_str(optarg);
-
- /* set sample threshold to 1 */
- params->threshold = 1;
-
- /* set trace */
- if (!trace_output)
- trace_output = "osnoise_trace.txt";
-
- break;
- case 'b':
- params->common.hist.bucket_size = get_llong_from_str(optarg);
- if (params->common.hist.bucket_size == 0 ||
- params->common.hist.bucket_size >= 1000000)
- fatal("Bucket size needs to be > 0 and <= 1000000");
- break;
- case 'E':
- params->common.hist.entries = get_llong_from_str(optarg);
- if (params->common.hist.entries < 10 ||
- params->common.hist.entries > 9999999)
- fatal("Entries must be > 10 and < 9999999");
- break;
- case 'h':
- case '?':
- osnoise_hist_usage();
- break;
- case 'p':
- params->period = get_llong_from_str(optarg);
- if (params->period > 10000000)
- fatal("Period longer than 10 s");
- break;
- case 'r':
- params->runtime = get_llong_from_str(optarg);
- if (params->runtime < 100)
- fatal("Runtime shorter than 100 us");
- break;
- case 's':
- params->common.stop_us = get_llong_from_str(optarg);
- break;
- case 'S':
- params->common.stop_total_us = get_llong_from_str(optarg);
- break;
- case 'T':
- params->threshold = get_llong_from_str(optarg);
- break;
- case 't':
- trace_output = parse_optional_arg(argc, argv);
- if (!trace_output)
- trace_output = "osnoise_trace.txt";
- break;
- case '0': /* no header */
- params->common.hist.no_header = 1;
- break;
- case '1': /* no summary */
- params->common.hist.no_summary = 1;
- break;
- case '2': /* no index */
- params->common.hist.no_index = 1;
- break;
- case '3': /* with zeros */
- params->common.hist.with_zeros = 1;
- break;
- case '4': /* trigger */
- if (params->common.events)
- trace_event_add_trigger(params->common.events, optarg);
- else
- fatal("--trigger requires a previous -e");
- break;
- case '5': /* filter */
- if (params->common.events)
- trace_event_add_filter(params->common.events, optarg);
- else
- fatal("--filter requires a previous -e");
- break;
- case '6':
- params->common.warmup = get_llong_from_str(optarg);
- break;
- case '7':
- params->common.buffer_size = get_llong_from_str(optarg);
- break;
- case '8':
- retval = actions_parse(&params->common.threshold_actions, optarg,
- "osnoise_trace.txt");
- if (retval)
- fatal("Invalid action %s", optarg);
- break;
- case '9':
- retval = actions_parse(&params->common.end_actions, optarg,
- "osnoise_trace.txt");
- if (retval)
- fatal("Invalid action %s", optarg);
- break;
- default:
- fatal("Invalid option");
- }
- }
-
- if (trace_output)
- actions_add_trace_output(&params->common.threshold_actions, trace_output);
-
- if (geteuid())
- fatal("rtla needs root permission");
-
- if (params->common.hist.no_index && !params->common.hist.with_zeros)
- fatal("no-index set and with-zeros not set - it does not make sense");
-
- return &params->common;
-}
-
/*
* osnoise_hist_apply_config - apply the hist configs to the initialized tool
*/
diff --git a/tools/tracing/rtla/src/osnoise_top.c b/tools/tracing/rtla/src/osnoise_top.c
index 244bdce022ad..512a6299cb01 100644
--- a/tools/tracing/rtla/src/osnoise_top.c
+++ b/tools/tracing/rtla/src/osnoise_top.c
@@ -4,7 +4,6 @@
*/
#define _GNU_SOURCE
-#include <getopt.h>
#include <stdlib.h>
#include <string.h>
#include <signal.h>
@@ -13,6 +12,7 @@
#include <time.h>
#include "osnoise.h"
+#include "cli.h"
struct osnoise_top_cpu {
unsigned long long sum_runtime;
@@ -245,204 +245,6 @@ osnoise_print_stats(struct osnoise_tool *top)
osnoise_report_missed_events(top);
}
-/*
- * osnoise_top_usage - prints osnoise top usage message
- */
-static void osnoise_top_usage(struct osnoise_params *params)
-{
- const char *tool, *mode, *desc;
-
- static const char * const msg_start[] = {
- "[-q] [-D] [-d s] [-a us] [-p us] [-r us] [-s us] [-S us] \\",
- " [-T us] [-t [file]] [-e sys[:event]] [--filter <filter>] [--trigger <trigger>] \\",
- " [-c cpu-list] [-H cpu-list] [-P priority] [-C [cgroup_name]] [--warm-up s]",
- NULL,
- };
-
- static const char * const msg_opts[] = {
- " -a/--auto: set automatic trace mode, stopping the session if argument in us sample is hit",
- " -p/--period us: osnoise period in us",
- " -r/--runtime us: osnoise runtime in us",
- " -s/--stop us: stop trace if a single sample is higher than the argument in us",
- " -S/--stop-total us: stop trace if the total sample is higher than the argument in us",
- " -T/--threshold us: the minimum delta to be considered a noise",
- " -c/--cpus cpu-list: list of cpus to run osnoise threads",
- " -H/--house-keeping cpus: run rtla control threads only on the given cpus",
- " -C/--cgroup [cgroup_name]: set cgroup, if no cgroup_name is passed, the rtla's cgroup will be inherited",
- " -d/--duration time[s|m|h|d]: duration of the session",
- " -D/--debug: print debug info",
- " -t/--trace [file]: save the stopped trace to [file|osnoise_trace.txt]",
- " -e/--event <sys:event>: enable the <sys:event> in the trace instance, multiple -e are allowed",
- " --filter <filter>: enable a trace event filter to the previous -e event",
- " --trigger <trigger>: enable a trace event trigger to the previous -e event",
- " -q/--quiet print only a summary at the end",
- " -P/--priority o:prio|r:prio|f:prio|d:runtime:period : set scheduling parameters",
- " o:prio - use SCHED_OTHER with prio",
- " r:prio - use SCHED_RR with prio",
- " f:prio - use SCHED_FIFO with prio",
- " d:runtime[us|ms|s]:period[us|ms|s] - use SCHED_DEADLINE with runtime and period",
- " in nanoseconds",
- " --warm-up s: let the workload run for s seconds before collecting data",
- " --trace-buffer-size kB: set the per-cpu trace buffer size in kB",
- " --on-threshold <action>: define action to be executed at stop-total threshold, multiple are allowed",
- " --on-end: define action to be executed at measurement end, multiple are allowed",
- NULL,
- };
-
- if (params->mode == MODE_OSNOISE) {
- tool = "osnoise";
- mode = "top";
- desc = "a per-cpu summary of the OS noise";
- } else {
- tool = "hwnoise";
- mode = "";
- desc = "a summary of hardware-related noise";
- }
-
- common_usage(tool, mode, desc, msg_start, msg_opts);
-}
-
-/*
- * osnoise_top_parse_args - allocs, parse and fill the cmd line parameters
- */
-struct common_params *osnoise_top_parse_args(int argc, char **argv)
-{
- struct osnoise_params *params;
- int retval;
- int c;
- char *trace_output = NULL;
-
- params = calloc_fatal(1, sizeof(*params));
-
- actions_init(&params->common.threshold_actions);
- actions_init(&params->common.end_actions);
-
- if (strcmp(argv[0], "hwnoise") == 0) {
- params->mode = MODE_HWNOISE;
- /*
- * Reduce CPU usage for 75% to avoid killing the system.
- */
- params->runtime = 750000;
- params->period = 1000000;
- }
-
- while (1) {
- static struct option long_options[] = {
- {"auto", required_argument, 0, 'a'},
- {"help", no_argument, 0, 'h'},
- {"period", required_argument, 0, 'p'},
- {"quiet", no_argument, 0, 'q'},
- {"runtime", required_argument, 0, 'r'},
- {"stop", required_argument, 0, 's'},
- {"stop-total", required_argument, 0, 'S'},
- {"threshold", required_argument, 0, 'T'},
- {"trace", optional_argument, 0, 't'},
- {"trigger", required_argument, 0, '0'},
- {"filter", required_argument, 0, '1'},
- {"warm-up", required_argument, 0, '2'},
- {"trace-buffer-size", required_argument, 0, '3'},
- {"on-threshold", required_argument, 0, '4'},
- {"on-end", required_argument, 0, '5'},
- {0, 0, 0, 0}
- };
-
- if (common_parse_options(argc, argv, &params->common))
- continue;
-
- c = getopt_auto(argc, argv, long_options);
-
- /* Detect the end of the options. */
- if (c == -1)
- break;
-
- switch (c) {
- case 'a':
- /* set sample stop to auto_thresh */
- params->common.stop_us = get_llong_from_str(optarg);
-
- /* set sample threshold to 1 */
- params->threshold = 1;
-
- /* set trace */
- if (!trace_output)
- trace_output = "osnoise_trace.txt";
-
- break;
- case 'h':
- case '?':
- osnoise_top_usage(params);
- break;
- case 'p':
- params->period = get_llong_from_str(optarg);
- if (params->period > 10000000)
- fatal("Period longer than 10 s");
- break;
- case 'q':
- params->common.quiet = 1;
- break;
- case 'r':
- params->runtime = get_llong_from_str(optarg);
- if (params->runtime < 100)
- fatal("Runtime shorter than 100 us");
- break;
- case 's':
- params->common.stop_us = get_llong_from_str(optarg);
- break;
- case 'S':
- params->common.stop_total_us = get_llong_from_str(optarg);
- break;
- case 't':
- trace_output = parse_optional_arg(argc, argv);
- if (!trace_output)
- trace_output = "osnoise_trace.txt";
- break;
- case 'T':
- params->threshold = get_llong_from_str(optarg);
- break;
- case '0': /* trigger */
- if (params->common.events)
- trace_event_add_trigger(params->common.events, optarg);
- else
- fatal("--trigger requires a previous -e");
- break;
- case '1': /* filter */
- if (params->common.events)
- trace_event_add_filter(params->common.events, optarg);
- else
- fatal("--filter requires a previous -e");
- break;
- case '2':
- params->common.warmup = get_llong_from_str(optarg);
- break;
- case '3':
- params->common.buffer_size = get_llong_from_str(optarg);
- break;
- case '4':
- retval = actions_parse(&params->common.threshold_actions, optarg,
- "osnoise_trace.txt");
- if (retval)
- fatal("Invalid action %s", optarg);
- break;
- case '5':
- retval = actions_parse(&params->common.end_actions, optarg,
- "osnoise_trace.txt");
- if (retval)
- fatal("Invalid action %s", optarg);
- break;
- default:
- fatal("Invalid option");
- }
- }
-
- if (trace_output)
- actions_add_trace_output(&params->common.threshold_actions, trace_output);
-
- if (geteuid())
- fatal("osnoise needs root permission");
-
- return &params->common;
-}
-
/*
* osnoise_top_apply_config - apply the top configs to the initialized tool
*/
diff --git a/tools/tracing/rtla/src/rtla.c b/tools/tracing/rtla/src/rtla.c
deleted file mode 100644
index 845932f902ef..000000000000
--- a/tools/tracing/rtla/src/rtla.c
+++ /dev/null
@@ -1,89 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2021 Red Hat Inc, Daniel Bristot de Oliveira <bristot@kernel.org>
- */
-
-#include <getopt.h>
-#include <stdlib.h>
-#include <string.h>
-#include <stdio.h>
-
-#include "osnoise.h"
-#include "timerlat.h"
-
-/*
- * rtla_usage - print rtla usage
- */
-static void rtla_usage(int err)
-{
- int i;
-
- static const char *msg[] = {
- "",
- "rtla version " VERSION,
- "",
- " usage: rtla COMMAND ...",
- "",
- " commands:",
- " osnoise - gives information about the operating system noise (osnoise)",
- " hwnoise - gives information about hardware-related noise",
- " timerlat - measures the timer irq and thread latency",
- "",
- NULL,
- };
-
- for (i = 0; msg[i]; i++)
- fprintf(stderr, "%s\n", msg[i]);
- exit(err);
-}
-
-/*
- * run_tool_command - try to run a rtla tool command
- *
- * It returns 0 if it fails. The tool's main will generally not
- * return as they should call exit().
- */
-int run_tool_command(int argc, char **argv, int start_position)
-{
- if (strcmp(argv[start_position], "osnoise") == 0) {
- osnoise_main(argc-start_position, &argv[start_position]);
- goto ran;
- } else if (strcmp(argv[start_position], "hwnoise") == 0) {
- hwnoise_main(argc-start_position, &argv[start_position]);
- goto ran;
- } else if (strcmp(argv[start_position], "timerlat") == 0) {
- timerlat_main(argc-start_position, &argv[start_position]);
- goto ran;
- }
-
- return 0;
-ran:
- return 1;
-}
-
-int main(int argc, char *argv[])
-{
- int retval;
-
- /* is it an alias? */
- retval = run_tool_command(argc, argv, 0);
- if (retval)
- exit(0);
-
- if (argc < 2)
- goto usage;
-
- if (strcmp(argv[1], "-h") == 0) {
- rtla_usage(0);
- } else if (strcmp(argv[1], "--help") == 0) {
- rtla_usage(0);
- }
-
- retval = run_tool_command(argc, argv, 1);
- if (retval)
- exit(0);
-
-usage:
- rtla_usage(1);
- exit(1);
-}
diff --git a/tools/tracing/rtla/src/timerlat.h b/tools/tracing/rtla/src/timerlat.h
index 364203a29abd..37a808f1611e 100644
--- a/tools/tracing/rtla/src/timerlat.h
+++ b/tools/tracing/rtla/src/timerlat.h
@@ -23,8 +23,8 @@ struct timerlat_params {
long long timerlat_period_us;
long long print_stack;
int dma_latency;
- int no_aa;
- int dump_tasks;
+ bool no_aa;
+ bool dump_tasks;
int deepest_idle_state;
enum timerlat_tracing_mode mode;
const char *bpf_action_program;
diff --git a/tools/tracing/rtla/src/timerlat_hist.c b/tools/tracing/rtla/src/timerlat_hist.c
index 79142af4f566..df7b1398a966 100644
--- a/tools/tracing/rtla/src/timerlat_hist.c
+++ b/tools/tracing/rtla/src/timerlat_hist.c
@@ -4,7 +4,6 @@
*/
#define _GNU_SOURCE
-#include <getopt.h>
#include <stdlib.h>
#include <string.h>
#include <signal.h>
@@ -17,6 +16,7 @@
#include "timerlat.h"
#include "timerlat_aa.h"
#include "timerlat_bpf.h"
+#include "cli.h"
#include "common.h"
struct timerlat_hist_cpu {
@@ -685,321 +685,6 @@ timerlat_print_stats(struct osnoise_tool *tool)
osnoise_report_missed_events(tool);
}
-/*
- * timerlat_hist_usage - prints timerlat top usage message
- */
-static void timerlat_hist_usage(void)
-{
- static const char * const msg_start[] = {
- "[-d s] [-D] [-n] [-a us] [-p us] [-i us] [-T us] [-s us] \\",
- " [-t [file]] [-e sys[:event]] [--filter <filter>] [--trigger <trigger>] [-c cpu-list] [-H cpu-list]\\",
- " [-P priority] [-E N] [-b N] [--no-irq] [--no-thread] [--no-header] [--no-summary] \\",
- " [--no-index] [--with-zeros] [--dma-latency us] [-C [cgroup_name]] [--no-aa] [--dump-task] [-u|-k]",
- " [--warm-up s] [--deepest-idle-state n]",
- NULL,
- };
-
- static const char * const msg_opts[] = {
- " -a/--auto: set automatic trace mode, stopping the session if argument in us latency is hit",
- " -p/--period us: timerlat period in us",
- " -i/--irq us: stop trace if the irq latency is higher than the argument in us",
- " -T/--thread us: stop trace if the thread latency is higher than the argument in us",
- " -s/--stack us: save the stack trace at the IRQ if a thread latency is higher than the argument in us",
- " -c/--cpus cpus: run the tracer only on the given cpus",
- " -H/--house-keeping cpus: run rtla control threads only on the given cpus",
- " -C/--cgroup [cgroup_name]: set cgroup, if no cgroup_name is passed, the rtla's cgroup will be inherited",
- " -d/--duration time[m|h|d]: duration of the session in seconds",
- " --dump-tasks: prints the task running on all CPUs if stop conditions are met (depends on !--no-aa)",
- " -D/--debug: print debug info",
- " -t/--trace [file]: save the stopped trace to [file|timerlat_trace.txt]",
- " -e/--event <sys:event>: enable the <sys:event> in the trace instance, multiple -e are allowed",
- " --filter <filter>: enable a trace event filter to the previous -e event",
- " --trigger <trigger>: enable a trace event trigger to the previous -e event",
- " -n/--nano: display data in nanoseconds",
- " --no-aa: disable auto-analysis, reducing rtla timerlat cpu usage",
- " -b/--bucket-size N: set the histogram bucket size (default 1)",
- " -E/--entries N: set the number of entries of the histogram (default 256)",
- " --no-irq: ignore IRQ latencies",
- " --no-thread: ignore thread latencies",
- " --no-header: do not print header",
- " --no-summary: do not print summary",
- " --no-index: do not print index",
- " --with-zeros: print zero only entries",
- " --dma-latency us: set /dev/cpu_dma_latency latency <us> to reduce exit from idle latency",
- " -P/--priority o:prio|r:prio|f:prio|d:runtime:period : set scheduling parameters",
- " o:prio - use SCHED_OTHER with prio",
- " r:prio - use SCHED_RR with prio",
- " f:prio - use SCHED_FIFO with prio",
- " d:runtime[us|ms|s]:period[us|ms|s] - use SCHED_DEADLINE with runtime and period",
- " in nanoseconds",
- " -u/--user-threads: use rtla user-space threads instead of kernel-space timerlat threads",
- " -k/--kernel-threads: use timerlat kernel-space threads instead of rtla user-space threads",
- " -U/--user-load: enable timerlat for user-defined user-space workload",
- " --warm-up s: let the workload run for s seconds before collecting data",
- " --trace-buffer-size kB: set the per-cpu trace buffer size in kB",
- " --deepest-idle-state n: only go down to idle state n on cpus used by timerlat to reduce exit from idle latency",
- " --on-threshold <action>: define action to be executed at latency threshold, multiple are allowed",
- " --on-end <action>: define action to be executed at measurement end, multiple are allowed",
- " --bpf-action <program>: load and execute BPF program when latency threshold is exceeded",
- " --stack-format <format>: set the stack format (truncate, skip, full)",
- NULL,
- };
-
- common_usage("timerlat", "hist", "a per-cpu histogram of the timer latency",
- msg_start, msg_opts);
-}
-
-/*
- * timerlat_hist_parse_args - allocs, parse and fill the cmd line parameters
- */
-static struct common_params
-*timerlat_hist_parse_args(int argc, char *argv[])
-{
- struct timerlat_params *params;
- int auto_thresh;
- int retval;
- int c;
- char *trace_output = NULL;
-
- params = calloc_fatal(1, sizeof(*params));
-
- actions_init(&params->common.threshold_actions);
- actions_init(&params->common.end_actions);
-
- /* disabled by default */
- params->dma_latency = -1;
-
- /* disabled by default */
- params->deepest_idle_state = -2;
-
- /* display data in microseconds */
- params->common.output_divisor = 1000;
- params->common.hist.bucket_size = 1;
- params->common.hist.entries = 256;
-
- /* default to BPF mode */
- params->mode = TRACING_MODE_BPF;
-
- /* default to truncate stack format */
- params->stack_format = STACK_FORMAT_TRUNCATE;
-
- while (1) {
- static struct option long_options[] = {
- {"auto", required_argument, 0, 'a'},
- {"bucket-size", required_argument, 0, 'b'},
- {"entries", required_argument, 0, 'E'},
- {"help", no_argument, 0, 'h'},
- {"irq", required_argument, 0, 'i'},
- {"nano", no_argument, 0, 'n'},
- {"period", required_argument, 0, 'p'},
- {"stack", required_argument, 0, 's'},
- {"thread", required_argument, 0, 'T'},
- {"trace", optional_argument, 0, 't'},
- {"user-threads", no_argument, 0, 'u'},
- {"kernel-threads", no_argument, 0, 'k'},
- {"user-load", no_argument, 0, 'U'},
- {"no-irq", no_argument, 0, '0'},
- {"no-thread", no_argument, 0, '1'},
- {"no-header", no_argument, 0, '2'},
- {"no-summary", no_argument, 0, '3'},
- {"no-index", no_argument, 0, '4'},
- {"with-zeros", no_argument, 0, '5'},
- {"trigger", required_argument, 0, '6'},
- {"filter", required_argument, 0, '7'},
- {"dma-latency", required_argument, 0, '8'},
- {"no-aa", no_argument, 0, '9'},
- {"dump-task", no_argument, 0, '\1'},
- {"warm-up", required_argument, 0, '\2'},
- {"trace-buffer-size", required_argument, 0, '\3'},
- {"deepest-idle-state", required_argument, 0, '\4'},
- {"on-threshold", required_argument, 0, '\5'},
- {"on-end", required_argument, 0, '\6'},
- {"bpf-action", required_argument, 0, '\7'},
- {"stack-format", required_argument, 0, '\10'},
- {0, 0, 0, 0}
- };
-
- if (common_parse_options(argc, argv, &params->common))
- continue;
-
- c = getopt_auto(argc, argv, long_options);
-
- /* detect the end of the options. */
- if (c == -1)
- break;
-
- switch (c) {
- case 'a':
- auto_thresh = get_llong_from_str(optarg);
-
- /* set thread stop to auto_thresh */
- params->common.stop_total_us = auto_thresh;
- params->common.stop_us = auto_thresh;
-
- /* get stack trace */
- params->print_stack = auto_thresh;
-
- /* set trace */
- if (!trace_output)
- trace_output = "timerlat_trace.txt";
-
- break;
- case 'b':
- params->common.hist.bucket_size = get_llong_from_str(optarg);
- if (params->common.hist.bucket_size == 0 ||
- params->common.hist.bucket_size >= 1000000)
- fatal("Bucket size needs to be > 0 and <= 1000000");
- break;
- case 'E':
- params->common.hist.entries = get_llong_from_str(optarg);
- if (params->common.hist.entries < 10 ||
- params->common.hist.entries > 9999999)
- fatal("Entries must be > 10 and < 9999999");
- break;
- case 'h':
- case '?':
- timerlat_hist_usage();
- break;
- case 'i':
- params->common.stop_us = get_llong_from_str(optarg);
- break;
- case 'k':
- params->common.kernel_workload = 1;
- break;
- case 'n':
- params->common.output_divisor = 1;
- break;
- case 'p':
- params->timerlat_period_us = get_llong_from_str(optarg);
- if (params->timerlat_period_us > 1000000)
- fatal("Period longer than 1 s");
- break;
- case 's':
- params->print_stack = get_llong_from_str(optarg);
- break;
- case 'T':
- params->common.stop_total_us = get_llong_from_str(optarg);
- break;
- case 't':
- trace_output = parse_optional_arg(argc, argv);
- if (!trace_output)
- trace_output = "timerlat_trace.txt";
- break;
- case 'u':
- params->common.user_workload = 1;
- /* fallback: -u implies in -U */
- case 'U':
- params->common.user_data = 1;
- break;
- case '0': /* no irq */
- params->common.hist.no_irq = 1;
- break;
- case '1': /* no thread */
- params->common.hist.no_thread = 1;
- break;
- case '2': /* no header */
- params->common.hist.no_header = 1;
- break;
- case '3': /* no summary */
- params->common.hist.no_summary = 1;
- break;
- case '4': /* no index */
- params->common.hist.no_index = 1;
- break;
- case '5': /* with zeros */
- params->common.hist.with_zeros = 1;
- break;
- case '6': /* trigger */
- if (params->common.events)
- trace_event_add_trigger(params->common.events, optarg);
- else
- fatal("--trigger requires a previous -e");
- break;
- case '7': /* filter */
- if (params->common.events)
- trace_event_add_filter(params->common.events, optarg);
- else
- fatal("--filter requires a previous -e");
- break;
- case '8':
- params->dma_latency = get_llong_from_str(optarg);
- if (params->dma_latency < 0 || params->dma_latency > 10000)
- fatal("--dma-latency needs to be >= 0 and < 10000");
- break;
- case '9':
- params->no_aa = 1;
- break;
- case '\1':
- params->dump_tasks = 1;
- break;
- case '\2':
- params->common.warmup = get_llong_from_str(optarg);
- break;
- case '\3':
- params->common.buffer_size = get_llong_from_str(optarg);
- break;
- case '\4':
- params->deepest_idle_state = get_llong_from_str(optarg);
- break;
- case '\5':
- retval = actions_parse(&params->common.threshold_actions, optarg,
- "timerlat_trace.txt");
- if (retval)
- fatal("Invalid action %s", optarg);
- break;
- case '\6':
- retval = actions_parse(&params->common.end_actions, optarg,
- "timerlat_trace.txt");
- if (retval)
- fatal("Invalid action %s", optarg);
- break;
- case '\7':
- params->bpf_action_program = optarg;
- break;
- case '\10':
- params->stack_format = parse_stack_format(optarg);
- if (params->stack_format == -1)
- fatal("Invalid --stack-format option");
- break;
- default:
- fatal("Invalid option");
- }
- }
-
- if (trace_output)
- actions_add_trace_output(&params->common.threshold_actions, trace_output);
-
- if (geteuid())
- fatal("rtla needs root permission");
-
- if (params->common.hist.no_irq && params->common.hist.no_thread)
- fatal("no-irq and no-thread set, there is nothing to do here");
-
- if (params->common.hist.no_index && !params->common.hist.with_zeros)
- fatal("no-index set with with-zeros is not set - it does not make sense");
-
- /*
- * Auto analysis only happens if stop tracing, thus:
- */
- if (!params->common.stop_us && !params->common.stop_total_us)
- params->no_aa = 1;
-
- if (params->common.kernel_workload && params->common.user_workload)
- fatal("--kernel-threads and --user-threads are mutually exclusive!");
-
- /*
- * If auto-analysis or trace output is enabled, switch from BPF mode to
- * mixed mode
- */
- if (params->mode == TRACING_MODE_BPF &&
- (params->common.threshold_actions.present[ACTION_TRACE_OUTPUT] ||
- params->common.end_actions.present[ACTION_TRACE_OUTPUT] ||
- !params->no_aa))
- params->mode = TRACING_MODE_MIXED;
-
- return &params->common;
-}
-
/*
* timerlat_hist_apply_config - apply the hist configs to the initialized tool
*/
diff --git a/tools/tracing/rtla/src/timerlat_top.c b/tools/tracing/rtla/src/timerlat_top.c
index 64cbdcc878b0..18e1071a2e24 100644
--- a/tools/tracing/rtla/src/timerlat_top.c
+++ b/tools/tracing/rtla/src/timerlat_top.c
@@ -4,7 +4,6 @@
*/
#define _GNU_SOURCE
-#include <getopt.h>
#include <stdlib.h>
#include <string.h>
#include <signal.h>
@@ -17,6 +16,7 @@
#include "timerlat.h"
#include "timerlat_aa.h"
#include "timerlat_bpf.h"
+#include "cli.h"
#include "common.h"
struct timerlat_top_cpu {
@@ -459,289 +459,6 @@ timerlat_print_stats(struct osnoise_tool *top)
osnoise_report_missed_events(top);
}
-/*
- * timerlat_top_usage - prints timerlat top usage message
- */
-static void timerlat_top_usage(void)
-{
- static const char *const msg_start[] = {
- "[-q] [-a us] [-d s] [-D] [-n] [-p us] [-i us] [-T us] [-s us] \\",
- " [[-t [file]] [-e sys[:event]] [--filter <filter>] [--trigger <trigger>] [-c cpu-list] [-H cpu-list]\\",
- " [-P priority] [--dma-latency us] [--aa-only us] [-C [cgroup_name]] [-u|-k] [--warm-up s] [--deepest-idle-state n]",
- NULL,
- };
-
- static const char *const msg_opts[] = {
- " -a/--auto: set automatic trace mode, stopping the session if argument in us latency is hit",
- " --aa-only us: stop if <us> latency is hit, only printing the auto analysis (reduces CPU usage)",
- " -p/--period us: timerlat period in us",
- " -i/--irq us: stop trace if the irq latency is higher than the argument in us",
- " -T/--thread us: stop trace if the thread latency is higher than the argument in us",
- " -s/--stack us: save the stack trace at the IRQ if a thread latency is higher than the argument in us",
- " -c/--cpus cpus: run the tracer only on the given cpus",
- " -H/--house-keeping cpus: run rtla control threads only on the given cpus",
- " -C/--cgroup [cgroup_name]: set cgroup, if no cgroup_name is passed, the rtla's cgroup will be inherited",
- " -d/--duration time[s|m|h|d]: duration of the session",
- " -D/--debug: print debug info",
- " --dump-tasks: prints the task running on all CPUs if stop conditions are met (depends on !--no-aa)",
- " -t/--trace [file]: save the stopped trace to [file|timerlat_trace.txt]",
- " -e/--event <sys:event>: enable the <sys:event> in the trace instance, multiple -e are allowed",
- " --filter <command>: enable a trace event filter to the previous -e event",
- " --trigger <command>: enable a trace event trigger to the previous -e event",
- " -n/--nano: display data in nanoseconds",
- " --no-aa: disable auto-analysis, reducing rtla timerlat cpu usage",
- " -q/--quiet print only a summary at the end",
- " --dma-latency us: set /dev/cpu_dma_latency latency <us> to reduce exit from idle latency",
- " -P/--priority o:prio|r:prio|f:prio|d:runtime:period : set scheduling parameters",
- " o:prio - use SCHED_OTHER with prio",
- " r:prio - use SCHED_RR with prio",
- " f:prio - use SCHED_FIFO with prio",
- " d:runtime[us|ms|s]:period[us|ms|s] - use SCHED_DEADLINE with runtime and period",
- " in nanoseconds",
- " -u/--user-threads: use rtla user-space threads instead of kernel-space timerlat threads",
- " -k/--kernel-threads: use timerlat kernel-space threads instead of rtla user-space threads",
- " -U/--user-load: enable timerlat for user-defined user-space workload",
- " --warm-up s: let the workload run for s seconds before collecting data",
- " --trace-buffer-size kB: set the per-cpu trace buffer size in kB",
- " --deepest-idle-state n: only go down to idle state n on cpus used by timerlat to reduce exit from idle latency",
- " --on-threshold <action>: define action to be executed at latency threshold, multiple are allowed",
- " --on-end: define action to be executed at measurement end, multiple are allowed",
- " --bpf-action <program>: load and execute BPF program when latency threshold is exceeded",
- " --stack-format <format>: set the stack format (truncate, skip, full)",
- NULL,
- };
-
- common_usage("timerlat", "top", "a per-cpu summary of the timer latency",
- msg_start, msg_opts);
-}
-
-/*
- * timerlat_top_parse_args - allocs, parse and fill the cmd line parameters
- */
-static struct common_params
-*timerlat_top_parse_args(int argc, char **argv)
-{
- struct timerlat_params *params;
- long long auto_thresh;
- int retval;
- int c;
- char *trace_output = NULL;
-
- params = calloc_fatal(1, sizeof(*params));
-
- actions_init(&params->common.threshold_actions);
- actions_init(&params->common.end_actions);
-
- /* disabled by default */
- params->dma_latency = -1;
-
- /* disabled by default */
- params->deepest_idle_state = -2;
-
- /* display data in microseconds */
- params->common.output_divisor = 1000;
-
- /* default to BPF mode */
- params->mode = TRACING_MODE_BPF;
-
- /* default to truncate stack format */
- params->stack_format = STACK_FORMAT_TRUNCATE;
-
- while (1) {
- static struct option long_options[] = {
- {"auto", required_argument, 0, 'a'},
- {"help", no_argument, 0, 'h'},
- {"irq", required_argument, 0, 'i'},
- {"nano", no_argument, 0, 'n'},
- {"period", required_argument, 0, 'p'},
- {"quiet", no_argument, 0, 'q'},
- {"stack", required_argument, 0, 's'},
- {"thread", required_argument, 0, 'T'},
- {"trace", optional_argument, 0, 't'},
- {"user-threads", no_argument, 0, 'u'},
- {"kernel-threads", no_argument, 0, 'k'},
- {"user-load", no_argument, 0, 'U'},
- {"trigger", required_argument, 0, '0'},
- {"filter", required_argument, 0, '1'},
- {"dma-latency", required_argument, 0, '2'},
- {"no-aa", no_argument, 0, '3'},
- {"dump-tasks", no_argument, 0, '4'},
- {"aa-only", required_argument, 0, '5'},
- {"warm-up", required_argument, 0, '6'},
- {"trace-buffer-size", required_argument, 0, '7'},
- {"deepest-idle-state", required_argument, 0, '8'},
- {"on-threshold", required_argument, 0, '9'},
- {"on-end", required_argument, 0, '\1'},
- {"bpf-action", required_argument, 0, '\2'},
- {"stack-format", required_argument, 0, '\3'},
- {0, 0, 0, 0}
- };
-
- if (common_parse_options(argc, argv, &params->common))
- continue;
-
- c = getopt_auto(argc, argv, long_options);
-
- /* detect the end of the options. */
- if (c == -1)
- break;
-
- switch (c) {
- case 'a':
- auto_thresh = get_llong_from_str(optarg);
-
- /* set thread stop to auto_thresh */
- params->common.stop_total_us = auto_thresh;
- params->common.stop_us = auto_thresh;
-
- /* get stack trace */
- params->print_stack = auto_thresh;
-
- /* set trace */
- if (!trace_output)
- trace_output = "timerlat_trace.txt";
-
- break;
- case '5':
- /* it is here because it is similar to -a */
- auto_thresh = get_llong_from_str(optarg);
-
- /* set thread stop to auto_thresh */
- params->common.stop_total_us = auto_thresh;
- params->common.stop_us = auto_thresh;
-
- /* get stack trace */
- params->print_stack = auto_thresh;
-
- /* set aa_only to avoid parsing the trace */
- params->common.aa_only = 1;
- break;
- case 'h':
- case '?':
- timerlat_top_usage();
- break;
- case 'i':
- params->common.stop_us = get_llong_from_str(optarg);
- break;
- case 'k':
- params->common.kernel_workload = true;
- break;
- case 'n':
- params->common.output_divisor = 1;
- break;
- case 'p':
- params->timerlat_period_us = get_llong_from_str(optarg);
- if (params->timerlat_period_us > 1000000)
- fatal("Period longer than 1 s");
- break;
- case 'q':
- params->common.quiet = 1;
- break;
- case 's':
- params->print_stack = get_llong_from_str(optarg);
- break;
- case 'T':
- params->common.stop_total_us = get_llong_from_str(optarg);
- break;
- case 't':
- trace_output = parse_optional_arg(argc, argv);
- if (!trace_output)
- trace_output = "timerlat_trace.txt";
- break;
- case 'u':
- params->common.user_workload = true;
- /* fallback: -u implies -U */
- case 'U':
- params->common.user_data = true;
- break;
- case '0': /* trigger */
- if (params->common.events)
- trace_event_add_trigger(params->common.events, optarg);
- else
- fatal("--trigger requires a previous -e");
- break;
- case '1': /* filter */
- if (params->common.events)
- trace_event_add_filter(params->common.events, optarg);
- else
- fatal("--filter requires a previous -e");
- break;
- case '2': /* dma-latency */
- params->dma_latency = get_llong_from_str(optarg);
- if (params->dma_latency < 0 || params->dma_latency > 10000)
- fatal("--dma-latency needs to be >= 0 and < 10000");
- break;
- case '3': /* no-aa */
- params->no_aa = 1;
- break;
- case '4':
- params->dump_tasks = 1;
- break;
- case '6':
- params->common.warmup = get_llong_from_str(optarg);
- break;
- case '7':
- params->common.buffer_size = get_llong_from_str(optarg);
- break;
- case '8':
- params->deepest_idle_state = get_llong_from_str(optarg);
- break;
- case '9':
- retval = actions_parse(&params->common.threshold_actions, optarg,
- "timerlat_trace.txt");
- if (retval)
- fatal("Invalid action %s", optarg);
- break;
- case '\1':
- retval = actions_parse(&params->common.end_actions, optarg,
- "timerlat_trace.txt");
- if (retval)
- fatal("Invalid action %s", optarg);
- break;
- case '\2':
- params->bpf_action_program = optarg;
- break;
- case '\3':
- params->stack_format = parse_stack_format(optarg);
- if (params->stack_format == -1)
- fatal("Invalid --stack-format option");
- break;
- default:
- fatal("Invalid option");
- }
- }
-
- if (trace_output)
- actions_add_trace_output(&params->common.threshold_actions, trace_output);
-
- if (geteuid())
- fatal("rtla needs root permission");
-
- /*
- * Auto analysis only happens if stop tracing, thus:
- */
- if (!params->common.stop_us && !params->common.stop_total_us)
- params->no_aa = 1;
-
- if (params->no_aa && params->common.aa_only)
- fatal("--no-aa and --aa-only are mutually exclusive!");
-
- if (params->common.kernel_workload && params->common.user_workload)
- fatal("--kernel-threads and --user-threads are mutually exclusive!");
-
- /*
- * If auto-analysis or trace output is enabled, switch from BPF mode to
- * mixed mode
- */
- if (params->mode == TRACING_MODE_BPF &&
- (params->common.threshold_actions.present[ACTION_TRACE_OUTPUT] ||
- params->common.end_actions.present[ACTION_TRACE_OUTPUT] ||
- !params->no_aa))
- params->mode = TRACING_MODE_MIXED;
-
- return &params->common;
-}
-
/*
* timerlat_top_apply_config - apply the top configs to the initialized tool
*/
diff --git a/tools/tracing/rtla/src/utils.c b/tools/tracing/rtla/src/utils.c
index 9cec5b3e02c8..cb187e7d48d1 100644
--- a/tools/tracing/rtla/src/utils.c
+++ b/tools/tracing/rtla/src/utils.c
@@ -22,7 +22,7 @@
#include "common.h"
#define MAX_MSG_LENGTH 1024
-int config_debug;
+bool config_debug;
/*
* err_msg - print an error message to the stderr
@@ -1011,32 +1011,6 @@ int auto_house_keeping(cpu_set_t *monitored_cpus)
return 1;
}
-/**
- * parse_optional_arg - Parse optional argument value
- *
- * Parse optional argument value, which can be in the form of:
- * -sarg, -s/--long=arg, -s/--long arg
- *
- * Returns arg value if found, NULL otherwise.
- */
-char *parse_optional_arg(int argc, char **argv)
-{
- if (optarg) {
- if (optarg[0] == '=') {
- /* skip the = */
- return &optarg[1];
- } else {
- return optarg;
- }
- /* parse argument of form -s [arg] and --long [arg]*/
- } else if (optind < argc && argv[optind][0] != '-') {
- /* consume optind */
- return argv[optind++];
- } else {
- return NULL;
- }
-}
-
/*
* strtoi - convert string to integer with error checking
*
diff --git a/tools/tracing/rtla/src/utils.h b/tools/tracing/rtla/src/utils.h
index 96fd72042717..2ba3333669bb 100644
--- a/tools/tracing/rtla/src/utils.h
+++ b/tools/tracing/rtla/src/utils.h
@@ -39,7 +39,7 @@ static inline bool str_has_prefix(const char *str, const char *prefix)
return strncmp(str, prefix, strlen(prefix)) == 0;
}
-extern int config_debug;
+extern bool config_debug;
void debug_msg(const char *fmt, ...);
void err_msg(const char *fmt, ...);
void fatal(const char *fmt, ...);
@@ -47,7 +47,6 @@ void fatal(const char *fmt, ...);
long parse_seconds_duration(char *val);
void get_duration(time_t start_time, char *output, int output_size);
-char *parse_optional_arg(int argc, char **argv);
long long get_llong_from_str(char *start);
static inline void
diff --git a/tools/tracing/rtla/tests/hwnoise.t b/tools/tracing/rtla/tests/hwnoise.t
index 23ce250a6852..cfe687ff5ee1 100644
--- a/tools/tracing/rtla/tests/hwnoise.t
+++ b/tools/tracing/rtla/tests/hwnoise.t
@@ -6,7 +6,7 @@ test_begin
set_timeout 2m
check "verify help page" \
- "hwnoise --help" 0 "summary of hardware-related noise"
+ "hwnoise --help" 129 "Usage: rtla hwnoise"
check "detect noise higher than one microsecond" \
"hwnoise -c 0 -T 1 -d 5s -q" 0
check "set the automatic trace mode" \
--
2.53.0
^ permalink raw reply related
* [PATCH 2/3] tools subcmd: support optarg as separate argument
From: Tomas Glozar @ 2026-03-20 15:06 UTC (permalink / raw)
To: Steven Rostedt, Tomas Glozar
Cc: John Kacur, Luis Goncalves, Crystal Wood, Costa Shulyupin,
Wander Lairson Costa, Ivan Pravdin, Namhyung Kim, Ian Rogers,
Arnaldo Carvalho de Melo, LKML, linux-trace-kernel,
linux-perf-users
In-Reply-To: <20260320150651.51057-1-tglozar@redhat.com>
In addition to the "-ovalue" and "--opt=value" syntax, also allow "-o value"
and "--opt value" for options with an optional argument when the newly
added PARSE_OPT_OPTARG_ALLOW_NEXT flag is set.
This behavior is turned off by default since it does not make sense for
tools using non-option command line arguments. Consider the ambiguity
of "cmd -d x", where "-d x" can mean either "-d with argument of x" or
"-d without argument, followed by non-option argument x". This is not an
issue in the case that the tool takes no non-option arguments.
To implement this, a new local variable, force_defval, is created in
get_value(), along with a comment explaining the logic.
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
---
tools/lib/subcmd/parse-options.c | 53 +++++++++++++++++++++++++++-----
tools/lib/subcmd/parse-options.h | 1 +
2 files changed, 46 insertions(+), 8 deletions(-)
diff --git a/tools/lib/subcmd/parse-options.c b/tools/lib/subcmd/parse-options.c
index 555d617c1f50..664b2053bb77 100644
--- a/tools/lib/subcmd/parse-options.c
+++ b/tools/lib/subcmd/parse-options.c
@@ -72,6 +72,7 @@ static int get_value(struct parse_opt_ctx_t *p,
const char *s, *arg = NULL;
const int unset = flags & OPT_UNSET;
int err;
+ bool force_defval = false;
if (unset && p->opt)
return opterror(opt, "takes no value", flags);
@@ -123,6 +124,42 @@ static int get_value(struct parse_opt_ctx_t *p,
}
}
+ if (opt->flags & PARSE_OPT_OPTARG && !p->opt) {
+ if (!(p->flags & PARSE_OPT_OPTARG_ALLOW_NEXT)) {
+ /*
+ * If the option has an optional argument, and the argument is not
+ * provided in the option itself, do not attempt to get it from
+ * the next argument, unless PARSE_OPT_OPTARG_ALLOW_NEXT is set.
+ *
+ * This prevents a non-option argument from being interpreted as an
+ * optional argument of a preceding option, for example:
+ *
+ * $ cmd --opt val
+ * -> is "val" argument of "--opt" or a separate non-option
+ * argument?
+ *
+ * With PARSE_OPT_OPTARG_ALLOW_NEXT, "val" is interpreted as
+ * the argument of "--opt", i.e. the same as "--opt=val".
+ * Without PARSE_OPT_OPTARG_ALLOW_NEXT, --opt is interpreted
+ * as having the default value, and "val" as a separate non-option
+ * argument.
+ *
+ * PARSE_OPT_OPTARG_ALLOW_NEXT is useful for commands that take no
+ * non-option arguments and want to allow more flexibility in
+ * optional argument passing.
+ */
+ force_defval = true;
+ }
+
+ if (p->argc <= 1 || p->argv[1][0] == '-') {
+ /*
+ * If next argument is an option or does not exist,
+ * use the default value.
+ */
+ force_defval = true;
+ }
+ }
+
if (opt->flags & PARSE_OPT_NOBUILD) {
char reason[128];
bool noarg = false;
@@ -148,7 +185,7 @@ static int get_value(struct parse_opt_ctx_t *p,
noarg = true;
if (opt->flags & PARSE_OPT_NOARG)
noarg = true;
- if (opt->flags & PARSE_OPT_OPTARG && !p->opt)
+ if (force_defval)
noarg = true;
switch (opt->type) {
@@ -212,7 +249,7 @@ static int get_value(struct parse_opt_ctx_t *p,
err = 0;
if (unset)
*(const char **)opt->value = NULL;
- else if (opt->flags & PARSE_OPT_OPTARG && !p->opt)
+ else if (force_defval)
*(const char **)opt->value = (const char *)opt->defval;
else
err = get_arg(p, opt, flags, (const char **)opt->value);
@@ -244,7 +281,7 @@ static int get_value(struct parse_opt_ctx_t *p,
return (*opt->callback)(opt, NULL, 1) ? (-1) : 0;
if (opt->flags & PARSE_OPT_NOARG)
return (*opt->callback)(opt, NULL, 0) ? (-1) : 0;
- if (opt->flags & PARSE_OPT_OPTARG && !p->opt)
+ if (force_defval)
return (*opt->callback)(opt, NULL, 0) ? (-1) : 0;
if (get_arg(p, opt, flags, &arg))
return -1;
@@ -255,7 +292,7 @@ static int get_value(struct parse_opt_ctx_t *p,
*(int *)opt->value = 0;
return 0;
}
- if (opt->flags & PARSE_OPT_OPTARG && !p->opt) {
+ if (force_defval) {
*(int *)opt->value = opt->defval;
return 0;
}
@@ -271,7 +308,7 @@ static int get_value(struct parse_opt_ctx_t *p,
*(unsigned int *)opt->value = 0;
return 0;
}
- if (opt->flags & PARSE_OPT_OPTARG && !p->opt) {
+ if (force_defval) {
*(unsigned int *)opt->value = opt->defval;
return 0;
}
@@ -289,7 +326,7 @@ static int get_value(struct parse_opt_ctx_t *p,
*(long *)opt->value = 0;
return 0;
}
- if (opt->flags & PARSE_OPT_OPTARG && !p->opt) {
+ if (force_defval) {
*(long *)opt->value = opt->defval;
return 0;
}
@@ -305,7 +342,7 @@ static int get_value(struct parse_opt_ctx_t *p,
*(unsigned long *)opt->value = 0;
return 0;
}
- if (opt->flags & PARSE_OPT_OPTARG && !p->opt) {
+ if (force_defval) {
*(unsigned long *)opt->value = opt->defval;
return 0;
}
@@ -321,7 +358,7 @@ static int get_value(struct parse_opt_ctx_t *p,
*(u64 *)opt->value = 0;
return 0;
}
- if (opt->flags & PARSE_OPT_OPTARG && !p->opt) {
+ if (force_defval) {
*(u64 *)opt->value = opt->defval;
return 0;
}
diff --git a/tools/lib/subcmd/parse-options.h b/tools/lib/subcmd/parse-options.h
index 8e9147358a28..c573a0ca5ca6 100644
--- a/tools/lib/subcmd/parse-options.h
+++ b/tools/lib/subcmd/parse-options.h
@@ -33,6 +33,7 @@ enum parse_opt_flags {
PARSE_OPT_KEEP_ARGV0 = 4,
PARSE_OPT_KEEP_UNKNOWN = 8,
PARSE_OPT_NO_INTERNAL_HELP = 16,
+ PARSE_OPT_OPTARG_ALLOW_NEXT = 32,
};
enum parse_opt_option_flags {
--
2.53.0
^ permalink raw reply related
* [PATCH 1/3] rtla: Add libsubcmd dependency
From: Tomas Glozar @ 2026-03-20 15:06 UTC (permalink / raw)
To: Steven Rostedt, Tomas Glozar
Cc: John Kacur, Luis Goncalves, Crystal Wood, Costa Shulyupin,
Wander Lairson Costa, Ivan Pravdin, Namhyung Kim, Ian Rogers,
Arnaldo Carvalho de Melo, LKML, linux-trace-kernel,
linux-perf-users
In-Reply-To: <20260320150651.51057-1-tglozar@redhat.com>
In preparation for migrating RTLA to libsubcmd, build libsubcmd from the
appropriate directory next to the RTLA build proper, and link the
resulting object to RTLA.
libsubcmd uses str_error_r() and strlcpy() in several places. To support
these, also link the respective libraries from tools/lib.
For completeness, also add tools/include to the include path. This will
allow other userspace functions and macros shipped with the kernel to be
used in RTLA; perf and bpftool, two other users of libsubcmd, already do
that.
To prevent a name conflict, rename RTLA's run_command() function to
run_tool_command(), and replace RTLA's own container_of implementation
with the one in tools/include/linux/container_of.h.
Assisted-by: Composer:composer-1
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
---
tools/tracing/rtla/.gitignore | 1 +
tools/tracing/rtla/Makefile | 53 +++++++++++++++++++++++++++++-----
tools/tracing/rtla/src/rtla.c | 8 ++---
tools/tracing/rtla/src/utils.h | 6 ++--
4 files changed, 53 insertions(+), 15 deletions(-)
diff --git a/tools/tracing/rtla/.gitignore b/tools/tracing/rtla/.gitignore
index 4d39d64ac08c..123c2d5ed7ac 100644
--- a/tools/tracing/rtla/.gitignore
+++ b/tools/tracing/rtla/.gitignore
@@ -9,3 +9,4 @@ custom_filename.txt
osnoise_irq_noise_hist.txt
osnoise_trace.txt
timerlat_trace.txt
+libsubcmd/
diff --git a/tools/tracing/rtla/Makefile b/tools/tracing/rtla/Makefile
index 45690ee14544..289e44c9664b 100644
--- a/tools/tracing/rtla/Makefile
+++ b/tools/tracing/rtla/Makefile
@@ -27,6 +27,24 @@ endif
RTLA := $(OUTPUT)rtla
RTLA_IN := $(RTLA)-in.o
+LIBSUBCMD_DIR = $(srctree)/tools/lib/subcmd/
+ifneq ($(OUTPUT),)
+ LIBSUBCMD_OUTPUT = $(abspath $(OUTPUT))/libsubcmd
+else
+ LIBSUBCMD_OUTPUT = $(CURDIR)/libsubcmd
+endif
+LIBSUBCMD = $(LIBSUBCMD_OUTPUT)/libsubcmd.a
+LIBSUBCMD_INCLUDES = -I$(LIBSUBCMD_OUTPUT)/include
+LIBSUBCMD_MAKEFLAGS = O=$(LIBSUBCMD_OUTPUT) DESTDIR=$(LIBSUBCMD_OUTPUT) prefix= subdir=
+
+TOOLS_INCLUDES = -I$(srctree)/tools/include
+
+LIB_STRING = $(OUTPUT)string.o
+LIB_STRING_SRC = $(srctree)/tools/lib/string.c
+
+LIB_STR_ERROR_R = $(OUTPUT)str_error_r.o
+LIB_STR_ERROR_R_SRC = $(srctree)/tools/lib/str_error_r.c
+
VERSION := $(shell sh -c "make -sC ../../.. kernelversion | grep -v make")
DOCSRC := ../../../Documentation/tools/rtla/
@@ -66,7 +84,7 @@ ifeq ($(config),1)
include Makefile.config
endif
-CFLAGS += $(INCLUDES) $(LIB_INCLUDES)
+CFLAGS += $(INCLUDES) $(LIB_INCLUDES) $(TOOLS_INCLUDES) $(LIBSUBCMD_INCLUDES)
export CFLAGS OUTPUT srctree
@@ -93,20 +111,41 @@ tests/bpf/bpf_action_map.o: tests/bpf/bpf_action_map.c
$(Q)echo "BPF skeleton support is disabled, skipping tests/bpf/bpf_action_map.o"
endif
-$(RTLA): $(RTLA_IN)
- $(QUIET_LINK)$(CC) $(LDFLAGS) -o $(RTLA) $(RTLA_IN) $(EXTLIBS)
+$(RTLA): $(RTLA_IN) $(LIBSUBCMD) $(LIB_STRING) $(LIB_STR_ERROR_R)
+ $(QUIET_LINK)$(CC) $(LDFLAGS) -o $(RTLA) $(RTLA_IN) $(LIBSUBCMD) $(LIB_STRING) $(LIB_STR_ERROR_R) $(EXTLIBS)
-static: $(RTLA_IN)
+static: $(RTLA_IN) $(LIBSUBCMD) $(LIB_STRING) $(LIB_STR_ERROR_R)
$(eval LDFLAGS += -static)
- $(QUIET_LINK)$(CC) -static $(LDFLAGS) -o $(RTLA)-static $(RTLA_IN) $(EXTLIBS)
+ $(QUIET_LINK)$(CC) -static $(LDFLAGS) -o $(RTLA)-static $(RTLA_IN) $(LIBSUBCMD) $(LIB_STRING) $(LIB_STR_ERROR_R) $(EXTLIBS)
rtla.%: fixdep FORCE
make -f $(srctree)/tools/build/Makefile.build dir=. $@
-$(RTLA_IN): fixdep FORCE src/timerlat.skel.h
+$(RTLA_IN): fixdep FORCE src/timerlat.skel.h $(LIBSUBCMD_INCLUDES)
make $(build)=rtla
-clean: doc_clean fixdep-clean
+$(LIBSUBCMD_OUTPUT):
+ $(Q)$(MKDIR) -p $@
+
+$(LIBSUBCMD_INCLUDES): $(LIBSUBCMD_OUTPUT)
+ $(Q)$(MAKE) -C $(LIBSUBCMD_DIR) $(LIBSUBCMD_MAKEFLAGS) \
+ install_headers
+
+$(LIBSUBCMD): fixdep $(LIBSUBCMD_OUTPUT)
+ $(Q)$(MAKE) -C $(LIBSUBCMD_DIR) $(LIBSUBCMD_MAKEFLAGS) \
+ $@
+
+$(LIB_STR_ERROR_R): $(LIB_STR_ERROR_R_SRC)
+ $(QUIET_CC)$(CC) $(CFLAGS) -c -o $@ $<
+
+$(LIB_STRING): $(LIB_STRING_SRC)
+ $(QUIET_CC)$(CC) $(CFLAGS) -c -o $@ $<
+
+$(LIBSUBCMD)-clean:
+ $(call QUIET_CLEAN, libsubcmd)
+ $(Q)$(RM) -r -- $(LIBSUBCMD_OUTPUT)
+
+clean: doc_clean fixdep-clean $(LIBSUBCMD)-clean
$(call QUIET_CLEAN, rtla)
$(Q)find . -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete
$(Q)rm -f rtla rtla-static fixdep FEATURE-DUMP rtla-*
diff --git a/tools/tracing/rtla/src/rtla.c b/tools/tracing/rtla/src/rtla.c
index 7635c70123ab..845932f902ef 100644
--- a/tools/tracing/rtla/src/rtla.c
+++ b/tools/tracing/rtla/src/rtla.c
@@ -38,12 +38,12 @@ static void rtla_usage(int err)
}
/*
- * run_command - try to run a rtla tool command
+ * run_tool_command - try to run a rtla tool command
*
* It returns 0 if it fails. The tool's main will generally not
* return as they should call exit().
*/
-int run_command(int argc, char **argv, int start_position)
+int run_tool_command(int argc, char **argv, int start_position)
{
if (strcmp(argv[start_position], "osnoise") == 0) {
osnoise_main(argc-start_position, &argv[start_position]);
@@ -66,7 +66,7 @@ int main(int argc, char *argv[])
int retval;
/* is it an alias? */
- retval = run_command(argc, argv, 0);
+ retval = run_tool_command(argc, argv, 0);
if (retval)
exit(0);
@@ -79,7 +79,7 @@ int main(int argc, char *argv[])
rtla_usage(0);
}
- retval = run_command(argc, argv, 1);
+ retval = run_tool_command(argc, argv, 1);
if (retval)
exit(0);
diff --git a/tools/tracing/rtla/src/utils.h b/tools/tracing/rtla/src/utils.h
index e794ede64b2c..96fd72042717 100644
--- a/tools/tracing/rtla/src/utils.h
+++ b/tools/tracing/rtla/src/utils.h
@@ -7,6 +7,8 @@
#include <stdbool.h>
#include <stdlib.h>
+#include <linux/container_of.h>
+
/*
* '18446744073709551615\0'
*/
@@ -37,10 +39,6 @@ static inline bool str_has_prefix(const char *str, const char *prefix)
return strncmp(str, prefix, strlen(prefix)) == 0;
}
-#define container_of(ptr, type, member)({ \
- const typeof(((type *)0)->member) *__mptr = (ptr); \
- (type *)((char *)__mptr - offsetof(type, member)) ; })
-
extern int config_debug;
void debug_msg(const char *fmt, ...);
void err_msg(const char *fmt, ...);
--
2.53.0
^ permalink raw reply related
* [PATCH 0/3] rtla: Migrate to libsubcmd for command line option parsing
From: Tomas Glozar @ 2026-03-20 15:06 UTC (permalink / raw)
To: Steven Rostedt, Tomas Glozar
Cc: John Kacur, Luis Goncalves, Crystal Wood, Costa Shulyupin,
Wander Lairson Costa, Ivan Pravdin, Namhyung Kim, Ian Rogers,
Arnaldo Carvalho de Melo, LKML, linux-trace-kernel,
linux-perf-users
[ CC to linux-perf-users for the libsubcmd code change (second commit) ]
rtla currently uses its own implementation that uses getopt_long() to
parse command-line arguments.
Migrate rtla to use libsubcmd for command line argument parsing,
similarly to what is already done by other tools like perf, bpftool,
and objtool. Among other benefits, this allows help messages to be
generated automatically rather than having to be typed out manually
for each tool.
libsubcmd is extended with an option to parse optarg from a separate
argument if a new flag is turned on. Without the flag, the old behavior
is preserved. That keeps the parsing working for tools that use
positional arguments, and allows RTLA to keep its flexible syntax for -C
and -t options and their long variants, --cgroup and --trace-output.
The new implementation is moved into a separate file, cli.c, together
with a tiny header counterpart, cli.h. This helps separate the parsing
logic, which has little in common with the rest of RTLA, into a separate
module.
Macros to generate struct option array fields for libsubcmd's
parse_args() are used to preserve the consolidation of argument parsing
code across different RTLA tools. Kernel and user threads are, as
an exception, treated as common, although they are currently implemented
for timerlat only, in line with earlier consolidation changes.
I expect more improvements to the code to be possible in the future,
like creating macros for option groups to further deduplicate the code,
or reducing the amount of extra code in the _parse_args() functions.
Tomas Glozar (3):
rtla: Add libsubcmd dependency
tools subcmd: support optarg as separate argument
rtla: Parse cmdline using libsubcmd
tools/lib/subcmd/parse-options.c | 53 +-
tools/lib/subcmd/parse-options.h | 1 +
tools/tracing/rtla/.gitignore | 1 +
tools/tracing/rtla/Makefile | 53 +-
tools/tracing/rtla/src/Build | 2 +-
tools/tracing/rtla/src/cli.c | 1207 ++++++++++++++++++++++++
tools/tracing/rtla/src/cli.h | 7 +
tools/tracing/rtla/src/common.c | 109 ---
tools/tracing/rtla/src/common.h | 26 +-
tools/tracing/rtla/src/osnoise_hist.c | 221 +----
tools/tracing/rtla/src/osnoise_top.c | 200 +---
tools/tracing/rtla/src/rtla.c | 89 --
tools/tracing/rtla/src/timerlat.h | 4 +-
tools/tracing/rtla/src/timerlat_hist.c | 317 +------
tools/tracing/rtla/src/timerlat_top.c | 285 +-----
tools/tracing/rtla/src/utils.c | 28 +-
tools/tracing/rtla/src/utils.h | 9 +-
tools/tracing/rtla/tests/hwnoise.t | 2 +-
18 files changed, 1331 insertions(+), 1283 deletions(-)
create mode 100644 tools/tracing/rtla/src/cli.c
create mode 100644 tools/tracing/rtla/src/cli.h
delete mode 100644 tools/tracing/rtla/src/rtla.c
--
2.53.0
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox