Linux Trace Kernel
 help / color / mirror / Atom feed
* [PATCH v3 2/7] mm/page_owner: add MR_NEVER to enum migrate_reason and use it for last_migrate_reason
From: Ye Liu @ 2026-06-30  1:53 UTC (permalink / raw)
  To: Andrew Morton, David Hildenbrand, Steven Rostedt,
	Masami Hiramatsu, Vlastimil Babka
  Cc: Ye Liu, Zi Yan, Matthew Brost, Joshua Hahn, Rakie Kim,
	Byungchul Park, Gregory Price, Ying Huang, Alistair Popple,
	Mathieu Desnoyers, Suren Baghdasaryan, Michal Hocko,
	Brendan Jackman, Johannes Weiner, linux-mm, linux-kernel,
	linux-trace-kernel
In-Reply-To: <20260630015331.147174-1-ye.liu@linux.dev>

The last_migrate_reason field uses -1 as a sentinel value to mean "no
migration has happened".  Replace the four bare -1 occurrences by
adding a proper MR_NEVER member to enum migrate_reason, defining a
corresponding "never_migrated" string in the MIGRATE_REASON trace
macro, and removing the local MIGRATE_REASON_NONE define.

No functional change.

Signed-off-by: Ye Liu <ye.liu@linux.dev>
---
 include/linux/migrate_mode.h   | 1 +
 include/trace/events/migrate.h | 3 ++-
 mm/page_owner.c                | 8 ++++----
 3 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/include/linux/migrate_mode.h b/include/linux/migrate_mode.h
index 265c4328b36a..05102d4d2490 100644
--- a/include/linux/migrate_mode.h
+++ b/include/linux/migrate_mode.h
@@ -25,6 +25,7 @@ enum migrate_reason {
 	MR_LONGTERM_PIN,
 	MR_DEMOTION,
 	MR_DAMON,
+	MR_NEVER,		/* page has never been migrated */
 	MR_TYPES
 };
 
diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h
index cd01dd7b3640..11bc0aa14c7e 100644
--- a/include/trace/events/migrate.h
+++ b/include/trace/events/migrate.h
@@ -23,7 +23,8 @@
 	EM( MR_CONTIG_RANGE,	"contig_range")			\
 	EM( MR_LONGTERM_PIN,	"longterm_pin")			\
 	EM( MR_DEMOTION,	"demotion")			\
-	EMe(MR_DAMON,		"damon")
+	EM( MR_DAMON,		"damon")			\
+	EMe(MR_NEVER,		"never_migrated")
 
 /*
  * First define the enums in the above macros to be exported to userspace
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 342549891a8d..c2f43ab860eb 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -339,7 +339,7 @@ noinline void __set_page_owner(struct page *page, unsigned short order,
 	depot_stack_handle_t handle;
 
 	handle = save_stack(gfp_mask);
-	__update_page_owner_handle(page, handle, order, gfp_mask, -1,
+	__update_page_owner_handle(page, handle, order, gfp_mask, MR_NEVER,
 				   ts_nsec, current->pid, current->tgid,
 				   current->comm);
 	inc_stack_record_count(handle, gfp_mask, 1 << order);
@@ -596,7 +596,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
 	if (ret >= count)
 		goto err;
 
-	if (page_owner->last_migrate_reason != -1) {
+	if (page_owner->last_migrate_reason != MR_NEVER) {
 		ret += scnprintf(kbuf + ret, count - ret,
 			"Page has been migrated, last migrate reason: %s\n",
 			migrate_reason_names[page_owner->last_migrate_reason]);
@@ -667,7 +667,7 @@ void __dump_page_owner(const struct page *page)
 		stack_depot_print(handle);
 	}
 
-	if (page_owner->last_migrate_reason != -1)
+	if (page_owner->last_migrate_reason != MR_NEVER)
 		pr_alert("page has been migrated, last migrate reason: %s\n",
 			migrate_reason_names[page_owner->last_migrate_reason]);
 	page_ext_put(page_ext);
@@ -826,7 +826,7 @@ static void init_pages_in_zone(struct zone *zone)
 
 			/* Found early allocated page */
 			__update_page_owner_handle(page, early_handle, 0, 0,
-						   -1, local_clock(), current->pid,
+						   MR_NEVER, local_clock(), current->pid,
 						   current->tgid, current->comm);
 			count++;
 ext_put_continue:
-- 
2.43.0


^ permalink raw reply related

* [PATCH v3 3/7] mm: use enum migrate_reason instead of int for migration reason parameters
From: Ye Liu @ 2026-06-30  1:53 UTC (permalink / raw)
  To: Muchun Song, Oscar Salvador, Andrew Morton, David Hildenbrand,
	Steven Rostedt, Masami Hiramatsu, Vlastimil Babka
  Cc: Ye Liu, Zi Yan, Matthew Brost, Joshua Hahn, Rakie Kim,
	Byungchul Park, Gregory Price, Ying Huang, Alistair Popple,
	Lorenzo Stoakes, Liam R. Howlett, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Mathieu Desnoyers,
	Brendan Jackman, Johannes Weiner, linux-mm, linux-kernel,
	linux-trace-kernel
In-Reply-To: <20260630015331.147174-1-ye.liu@linux.dev>

Replace all 'int reason' function parameters that carry migrate_reason
values with the proper 'enum migrate_reason' type.  This makes the
intent explicit and leverages compiler type checking.  The affected
subsystems are:

  - page_owner: __folio_set_owner_migrate_reason(),
                folio_set_owner_migrate_reason()
  - migrate: migrate_pages(), migrate_pages_sync(),
             migrate_pages_batch(), migrate_folios_move(),
             migrate_hugetlbs(), unmap_and_move_huge_page()
  - hugetlb: move_hugetlb_state(), htlb_allow_alloc_fallback()
  - trace: mm_migrate_pages and mm_migrate_pages_start events

The 'short last_migrate_reason' struct field and internal helper
parameter in page_owner are intentionally left as 'short' since they
store per-page metadata where size matters.

No functional change.

Signed-off-by: Ye Liu <ye.liu@linux.dev>
---
 include/linux/hugetlb.h        |  9 +++++----
 include/linux/migrate.h        |  6 ++++--
 include/linux/page_owner.h     |  7 ++++---
 include/trace/events/migrate.h |  8 ++++----
 mm/hugetlb.c                   |  3 ++-
 mm/migrate.c                   | 12 ++++++------
 mm/page_owner.c                |  2 +-
 7 files changed, 26 insertions(+), 21 deletions(-)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 2abaf99321e9..fa828232dfcc 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -154,7 +154,8 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
 bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list);
 int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison);
 void folio_putback_hugetlb(struct folio *folio);
-void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason);
+void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio,
+			enum migrate_reason reason);
 void hugetlb_fix_reserve_counts(struct inode *inode);
 extern struct mutex *hugetlb_fault_mutex_table;
 u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx);
@@ -424,7 +425,7 @@ static inline void folio_putback_hugetlb(struct folio *folio)
 }
 
 static inline void move_hugetlb_state(struct folio *old_folio,
-					struct folio *new_folio, int reason)
+					struct folio *new_folio, enum migrate_reason reason)
 {
 }
 
@@ -956,7 +957,7 @@ static inline gfp_t htlb_modify_alloc_mask(struct hstate *h, gfp_t gfp_mask)
 	return modified_mask;
 }
 
-static inline bool htlb_allow_alloc_fallback(int reason)
+static inline bool htlb_allow_alloc_fallback(enum migrate_reason reason)
 {
 	bool allowed_fallback = false;
 
@@ -1238,7 +1239,7 @@ static inline gfp_t htlb_modify_alloc_mask(struct hstate *h, gfp_t gfp_mask)
 	return 0;
 }
 
-static inline bool htlb_allow_alloc_fallback(int reason)
+static inline bool htlb_allow_alloc_fallback(enum migrate_reason reason)
 {
 	return false;
 }
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index d5af2b7f577b..1f83924615d6 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -57,7 +57,8 @@ void putback_movable_pages(struct list_head *l);
 int migrate_folio(struct address_space *mapping, struct folio *dst,
 		struct folio *src, enum migrate_mode mode);
 int migrate_pages(struct list_head *l, new_folio_t new, free_folio_t free,
-		  unsigned long private, enum migrate_mode mode, int reason,
+		  unsigned long private, enum migrate_mode mode,
+		  enum migrate_reason reason,
 		  unsigned int *ret_succeeded);
 struct folio *alloc_migration_target(struct folio *src, unsigned long private);
 bool isolate_movable_ops_page(struct page *page, isolate_mode_t mode);
@@ -77,7 +78,8 @@ int set_movable_ops(const struct movable_operations *ops, enum pagetype type);
 static inline void putback_movable_pages(struct list_head *l) {}
 static inline int migrate_pages(struct list_head *l, new_folio_t new,
 		free_folio_t free, unsigned long private,
-		enum migrate_mode mode, int reason, unsigned int *ret_succeeded)
+		enum migrate_mode mode, enum migrate_reason reason,
+		unsigned int *ret_succeeded)
 	{ return -ENOSYS; }
 static inline struct folio *alloc_migration_target(struct folio *src,
 		unsigned long private)
diff --git a/include/linux/page_owner.h b/include/linux/page_owner.h
index 3328357f6dba..9fe51dfccf26 100644
--- a/include/linux/page_owner.h
+++ b/include/linux/page_owner.h
@@ -3,6 +3,7 @@
 #define __LINUX_PAGE_OWNER_H
 
 #include <linux/jump_label.h>
+#include <linux/migrate_mode.h>
 
 #ifdef CONFIG_PAGE_OWNER
 extern struct static_key_false page_owner_inited;
@@ -14,7 +15,7 @@ extern void __set_page_owner(struct page *page,
 extern void __split_page_owner(struct page *page, int old_order,
 			int new_order);
 extern void __folio_copy_owner(struct folio *newfolio, struct folio *old);
-extern void __folio_set_owner_migrate_reason(struct folio *folio, int reason);
+extern void __folio_set_owner_migrate_reason(struct folio *folio, enum migrate_reason reason);
 extern void __dump_page_owner(const struct page *page);
 extern void pagetypeinfo_showmixedcount_print(struct seq_file *m,
 					pg_data_t *pgdat, struct zone *zone);
@@ -43,7 +44,7 @@ static inline void folio_copy_owner(struct folio *newfolio, struct folio *old)
 	if (static_branch_unlikely(&page_owner_inited))
 		__folio_copy_owner(newfolio, old);
 }
-static inline void folio_set_owner_migrate_reason(struct folio *folio, int reason)
+static inline void folio_set_owner_migrate_reason(struct folio *folio, enum migrate_reason reason)
 {
 	if (static_branch_unlikely(&page_owner_inited))
 		__folio_set_owner_migrate_reason(folio, reason);
@@ -68,7 +69,7 @@ static inline void split_page_owner(struct page *page, int old_order,
 static inline void folio_copy_owner(struct folio *newfolio, struct folio *folio)
 {
 }
-static inline void folio_set_owner_migrate_reason(struct folio *folio, int reason)
+static inline void folio_set_owner_migrate_reason(struct folio *folio, enum migrate_reason reason)
 {
 }
 static inline void dump_page_owner(const struct page *page)
diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h
index 11bc0aa14c7e..15ee2ef201b5 100644
--- a/include/trace/events/migrate.h
+++ b/include/trace/events/migrate.h
@@ -52,7 +52,7 @@ TRACE_EVENT(mm_migrate_pages,
 	TP_PROTO(unsigned long succeeded, unsigned long failed,
 		 unsigned long thp_succeeded, unsigned long thp_failed,
 		 unsigned long thp_split, unsigned long large_folio_split,
-		 enum migrate_mode mode, int reason),
+		 enum migrate_mode mode, enum migrate_reason reason),
 
 	TP_ARGS(succeeded, failed, thp_succeeded, thp_failed,
 		thp_split, large_folio_split, mode, reason),
@@ -65,7 +65,7 @@ TRACE_EVENT(mm_migrate_pages,
 		__field(	unsigned long,		thp_split)
 		__field(	unsigned long,		large_folio_split)
 		__field(	enum migrate_mode,	mode)
-		__field(	int,			reason)
+		__field(	enum migrate_reason,	reason)
 	),
 
 	TP_fast_assign(
@@ -92,13 +92,13 @@ TRACE_EVENT(mm_migrate_pages,
 
 TRACE_EVENT(mm_migrate_pages_start,
 
-	TP_PROTO(enum migrate_mode mode, int reason),
+	TP_PROTO(enum migrate_mode mode, enum migrate_reason reason),
 
 	TP_ARGS(mode, reason),
 
 	TP_STRUCT__entry(
 		__field(enum migrate_mode, mode)
-		__field(int, reason)
+		__field(enum migrate_reason, reason)
 	),
 
 	TP_fast_assign(
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 571212b80835..17732d1fdc5e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -7182,7 +7182,8 @@ void folio_putback_hugetlb(struct folio *folio)
 	folio_put(folio);
 }
 
-void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason)
+void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio,
+			enum migrate_reason reason)
 {
 	struct hstate *h = folio_hstate(old_folio);
 
diff --git a/mm/migrate.c b/mm/migrate.c
index d9b23909d716..49e10feeb094 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1469,7 +1469,7 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private,
 static int unmap_and_move_huge_page(new_folio_t get_new_folio,
 		free_folio_t put_new_folio, unsigned long private,
 		struct folio *src, int force, enum migrate_mode mode,
-		int reason, struct list_head *ret)
+		enum migrate_reason reason, struct list_head *ret)
 {
 	struct folio *dst;
 	int rc = -EAGAIN;
@@ -1626,7 +1626,7 @@ struct migrate_pages_stats {
  */
 static int migrate_hugetlbs(struct list_head *from, new_folio_t get_new_folio,
 			    free_folio_t put_new_folio, unsigned long private,
-			    enum migrate_mode mode, int reason,
+			    enum migrate_mode mode, enum migrate_reason reason,
 			    struct migrate_pages_stats *stats,
 			    struct list_head *ret_folios)
 {
@@ -1716,7 +1716,7 @@ static int migrate_hugetlbs(struct list_head *from, new_folio_t get_new_folio,
 static void migrate_folios_move(struct list_head *src_folios,
 		struct list_head *dst_folios,
 		free_folio_t put_new_folio, unsigned long private,
-		enum migrate_mode mode, int reason,
+		enum migrate_mode mode, enum migrate_reason reason,
 		struct list_head *ret_folios,
 		struct migrate_pages_stats *stats,
 		int *retry, int *thp_retry, int *nr_failed,
@@ -1799,7 +1799,7 @@ static void migrate_folios_undo(struct list_head *src_folios,
  */
 static int migrate_pages_batch(struct list_head *from,
 		new_folio_t get_new_folio, free_folio_t put_new_folio,
-		unsigned long private, enum migrate_mode mode, int reason,
+		unsigned long private, enum migrate_mode mode, enum migrate_reason reason,
 		struct list_head *ret_folios, struct list_head *split_folios,
 		struct migrate_pages_stats *stats, int nr_pass)
 {
@@ -2011,7 +2011,7 @@ static int migrate_pages_batch(struct list_head *from,
 
 static int migrate_pages_sync(struct list_head *from, new_folio_t get_new_folio,
 		free_folio_t put_new_folio, unsigned long private,
-		enum migrate_mode mode, int reason,
+		enum migrate_mode mode, enum migrate_reason reason,
 		struct list_head *ret_folios, struct list_head *split_folios,
 		struct migrate_pages_stats *stats)
 {
@@ -2088,7 +2088,7 @@ static int migrate_pages_sync(struct list_head *from, new_folio_t get_new_folio,
  */
 int migrate_pages(struct list_head *from, new_folio_t get_new_folio,
 		free_folio_t put_new_folio, unsigned long private,
-		enum migrate_mode mode, int reason, unsigned int *ret_succeeded)
+		enum migrate_mode mode, enum migrate_reason reason, unsigned int *ret_succeeded)
 {
 	int rc, rc_gather;
 	int nr_pages;
diff --git a/mm/page_owner.c b/mm/page_owner.c
index c2f43ab860eb..4e352941a6e2 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -345,7 +345,7 @@ noinline void __set_page_owner(struct page *page, unsigned short order,
 	inc_stack_record_count(handle, gfp_mask, 1 << order);
 }
 
-void __folio_set_owner_migrate_reason(struct folio *folio, int reason)
+void __folio_set_owner_migrate_reason(struct folio *folio, enum migrate_reason reason)
 {
 	struct page_ext *page_ext = page_ext_get(&folio->page);
 	struct page_owner *page_owner;
-- 
2.43.0


^ permalink raw reply related

* Re: [PATCH v3 2/7] mm/page_owner: add MR_NEVER to enum migrate_reason and use it for last_migrate_reason
From: Zi Yan @ 2026-06-30  1:58 UTC (permalink / raw)
  To: Ye Liu, Andrew Morton, David Hildenbrand, Steven Rostedt,
	Masami Hiramatsu, Vlastimil Babka
  Cc: Matthew Brost, Joshua Hahn, Rakie Kim, Byungchul Park,
	Gregory Price, Ying Huang, Alistair Popple, Mathieu Desnoyers,
	Suren Baghdasaryan, Michal Hocko, Brendan Jackman,
	Johannes Weiner, linux-mm, linux-kernel, linux-trace-kernel
In-Reply-To: <20260630015331.147174-3-ye.liu@linux.dev>

On Mon Jun 29, 2026 at 9:53 PM EDT, Ye Liu wrote:
> The last_migrate_reason field uses -1 as a sentinel value to mean "no
> migration has happened".  Replace the four bare -1 occurrences by
> adding a proper MR_NEVER member to enum migrate_reason, defining a
> corresponding "never_migrated" string in the MIGRATE_REASON trace
> macro, and removing the local MIGRATE_REASON_NONE define.
>
> No functional change.
>
> Signed-off-by: Ye Liu <ye.liu@linux.dev>
> ---
>  include/linux/migrate_mode.h   | 1 +
>  include/trace/events/migrate.h | 3 ++-
>  mm/page_owner.c                | 8 ++++----
>  3 files changed, 7 insertions(+), 5 deletions(-)
>

LGTM. Thanks.

Reviewed-by: Zi Yan <ziy@nvidia.com>


-- 
Best Regards,
Yan, Zi


^ permalink raw reply

* Re: [PATCH v3 3/7] mm: use enum migrate_reason instead of int for migration reason parameters
From: Zi Yan @ 2026-06-30  2:01 UTC (permalink / raw)
  To: Ye Liu, Muchun Song, Oscar Salvador, Andrew Morton,
	David Hildenbrand, Steven Rostedt, Masami Hiramatsu,
	Vlastimil Babka
  Cc: Matthew Brost, Joshua Hahn, Rakie Kim, Byungchul Park,
	Gregory Price, Ying Huang, Alistair Popple, Lorenzo Stoakes,
	Liam R. Howlett, Mike Rapoport, Suren Baghdasaryan, Michal Hocko,
	Mathieu Desnoyers, Brendan Jackman, Johannes Weiner, linux-mm,
	linux-kernel, linux-trace-kernel
In-Reply-To: <20260630015331.147174-4-ye.liu@linux.dev>

On Mon Jun 29, 2026 at 9:53 PM EDT, Ye Liu wrote:
> Replace all 'int reason' function parameters that carry migrate_reason
> values with the proper 'enum migrate_reason' type.  This makes the
> intent explicit and leverages compiler type checking.  The affected
> subsystems are:
>
>   - page_owner: __folio_set_owner_migrate_reason(),
>                 folio_set_owner_migrate_reason()
>   - migrate: migrate_pages(), migrate_pages_sync(),
>              migrate_pages_batch(), migrate_folios_move(),
>              migrate_hugetlbs(), unmap_and_move_huge_page()
>   - hugetlb: move_hugetlb_state(), htlb_allow_alloc_fallback()
>   - trace: mm_migrate_pages and mm_migrate_pages_start events
>
> The 'short last_migrate_reason' struct field and internal helper
> parameter in page_owner are intentionally left as 'short' since they
> store per-page metadata where size matters.
>
> No functional change.
>
> Signed-off-by: Ye Liu <ye.liu@linux.dev>
> ---
>  include/linux/hugetlb.h        |  9 +++++----
>  include/linux/migrate.h        |  6 ++++--
>  include/linux/page_owner.h     |  7 ++++---
>  include/trace/events/migrate.h |  8 ++++----
>  mm/hugetlb.c                   |  3 ++-
>  mm/migrate.c                   | 12 ++++++------
>  mm/page_owner.c                |  2 +-
>  7 files changed, 26 insertions(+), 21 deletions(-)
>

LGTM. Thanks.
Reviewed-by: Zi Yan <ziy@nvidia.com>

-- 
Best Regards,
Yan, Zi


^ permalink raw reply

* Re: [PATCH v8 23/46] KVM: TDX: Make source page optional for KVM_TDX_INIT_MEM_REGION
From: Yan Zhao @ 2026-06-30  2:09 UTC (permalink / raw)
  To: Ackerley Tng
  Cc: Sean Christopherson, aik, andrew.jones, binbin.wu, brauner,
	chao.p.peng, david, jmattson, jthoughton, michael.roth, oupton,
	pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
	steven.price, tabba, willy, wyihan, forkloop, pratyush,
	suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	H. Peter Anvin, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers, Jonathan Corbet, Shuah Khan, Shuah Khan,
	Vishal Annapurve, Andrew Morton, Chris Li, Kairui Song,
	Kemeng Shi, Nhat Pham, Barry Song, Axel Rasmussen, Yuanchu Xie,
	Wei Xu, Youngjun Park, Qi Zheng, Shakeel Butt, Kiryl Shutsemau,
	Baoquan He, Jason Gunthorpe, Vlastimil Babka, kvm, linux-kernel,
	linux-trace-kernel, linux-doc, linux-kselftest, linux-mm,
	linux-coco
In-Reply-To: <CAEvNRgHO3T6pKDP7ye-RdqbGhAzVC7a=8uBUyaPxwbSuj9khqA@mail.gmail.com>

On Mon, Jun 29, 2026 at 05:00:02PM -0700, Ackerley Tng wrote:
> >> > The original uAPI did not explicitly define 0 as an invalid uaddr. Whether 0 was
> >> > rejected depended on whether the user mmap()'d address 0. If 0 was a valid
> >> > mapping, populate() could proceed.
> >> >
> >> > commit 2a62345b3052 ("KVM: guest_memfd: GUP source pages prior to populating
> >> > guest memory") changed the behavior though. It would return -EOPNOTSUPP for a 0
> >> > uaddr.
> >> >
> >>
> >> I see, I only looked at this after commit 2a62345b3052.
> >>
> >> > But if a user configures 0 uaddr as valid, writes to it, and then passes 0 as
> >> > source_addr(not from gmem), I'm not sure if it's good for the kernel to silently
> >> > treat 0 uaddr as an identifier for in-place copy from the private PFN in gmem.
> >> >
> >>
> >> I'd say the original uAPI perhaps just didn't document 0 as an
> >> unsupported uaddr. Given that commit 2a62345b3052 already merged, uAPI
> >> was perhaps accidentally changed and no customer complained, I think we
> >> can move forward with 0 as an invalid src_address? I wouldn't think
> >> anyone relies on 0 intentionally being a valid address.
> >>
> >> I could document that, if it helps?
> > What about just documenting that 0 is an unsupported uaddr which will be
> > re-purposed as an indicator to use the target pfn as the source, regardless of
> > whether gmem_in_place_conversion is true? i.e.,
> >
> > if (!src_page)
> > 	src_page = pfn_to_page(pfn);
> >
> > I don't get why the two scenarios should be treated differently:
> > 1. gmem_in_place_conversion==true, shared memory is not from gmem
> > 2. gmem_in_place_conversion==false, shared memory is not from gmem
> >
> > In both case, a 0 uaddr could be mapped to a valid page not from gmem.
> 
> This is true, but this check isn't about whether the page is from gmem.
Hmm. TDX's in-place add does not rely on gmem in-place conversion, which means
when gmem_in_place_conversion==false, TDX's in-place add can still be successful.

Since checking gmem_in_place_conversion==true also can't guarantee the share
memory is from gmem, it makes me feel odd to reject scenario 2 while turning
scenario 1 to in-place add.

> > So why not update the uAPI to handle both cases consistently? :)
> >
> 
> Wait, but before this series, if region.src_address = 0, src_page = NULL
> and that's not supported so it returns -EOPNOTSUPP.
As in our previous discussion, no customer complaining about the previous change
to -EOPNOTSUPP means no one uses 0 uaddr today.

> If that's dropped, then suddenly if region.src_address = 0 and
> !gmem_in_place_conversion, tdx_gmem_post_populate() will now load the
> memory (zeroed) after [1] into the guest? I don't think we want to
> change that behavior.
>
> I could document that 0 is an unsupported uaddr only for TDX, and only
> when gmem_in_place_conversion = false.
>
> Since it is unsupported only when gmem_in_place_conversion = false, the
> check two lines marked with <<==== can't go away?
> 
> 	if (!src_page) {
> 		if (!gmem_in_place_conversion)  <<====
> 			return -EOPNOTSUPP;     <<====
This rejecting scenario 2.

> 		src_page = pfn_to_page(pfn);
This turning scenario 1 to in-place add.

> 	}
> 
> Also, for SNP, src_address == 0 is permitted (and desired, I believe, to
> avoid a pointless kernel memcpy) if the type of population is
> KVM_SEV_SNP_PAGE_TYPE_ZERO.
> 
> >> >> >> getting pfn" patch, ends up with the guest silently having a zero page?
> >> >> >> I think that would be found quite early in userspace VMM testing...
> >>
> >> [...snip...]
> >>
> 

^ permalink raw reply

* Re: [PATCH v8 23/46] KVM: TDX: Make source page optional for KVM_TDX_INIT_MEM_REGION
From: Yan Zhao @ 2026-06-30  2:21 UTC (permalink / raw)
  To: Sean Christopherson
  Cc: Ackerley Tng, aik@amd.com, andrew.jones@linux.dev,
	binbin.wu@linux.intel.com, brauner@kernel.org,
	chao.p.peng@linux.intel.com, david@kernel.org,
	jmattson@google.com, jthoughton@google.com, michael.roth@amd.com,
	oupton@kernel.org, pankaj.gupta@amd.com, qperret@google.com,
	Edgecombe, Rick P, rientjes@google.com, shivankg@amd.com,
	steven.price@arm.com, tabba@google.com, willy@infradead.org,
	wyihan@google.com, forkloop@google.com, pratyush@kernel.org,
	suzuki.poulose@arm.com, aneesh.kumar@kernel.org,
	liam@infradead.org, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86@kernel.org, H. Peter Anvin,
	Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers,
	Jonathan Corbet, Shuah Khan, Shuah Khan, Annapurve, Vishal,
	Andrew Morton, Chris Li, Kairui Song, Kemeng Shi, Nhat Pham,
	Barry Song, Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park,
	Qi Zheng, Shakeel Butt, Kiryl Shutsemau, Baoquan He,
	Jason Gunthorpe, Vlastimil Babka, kvm@vger.kernel.org,
	linux-kernel@vger.kernel.org, linux-trace-kernel@vger.kernel.org,
	linux-doc@vger.kernel.org, linux-kselftest@vger.kernel.org,
	linux-mm@kvack.org, linux-coco@lists.linux.dev
In-Reply-To: <akMPZePBdwQlD74H@google.com>

On Tue, Jun 30, 2026 at 08:35:49AM +0800, Sean Christopherson wrote:
> Gah, I thought I had sent this out this morning, long before Ackerley's response.
> But I got distracted by a meeting and forgot to get back to this... *sigh*
> 
> Sending what I already wrote, even though there's a lot of overlap with Ackerley's
> mail.
> 
> On Mon, Jun 29, 2026, Yan Zhao wrote:
> > On Fri, Jun 26, 2026 at 08:28:32AM -0700, Ackerley Tng wrote:
> > > Yan Zhao <yan.y.zhao@intel.com> writes:
> > > > But if a user configures 0 uaddr as valid, writes to it, and then passes 0 as
> > > > source_addr(not from gmem), I'm not sure if it's good for the kernel to silently
> > > > treat 0 uaddr as an identifier for in-place copy from the private PFN in gmem.
> > > >
> > > 
> > > I'd say the original uAPI perhaps just didn't document 0 as an
> > > unsupported uaddr. Given that commit 2a62345b3052 already merged, uAPI
> > > was perhaps accidentally changed and no customer complained, I think we
> > > can move forward with 0 as an invalid src_address? I wouldn't think
> > > anyone relies on 0 intentionally being a valid address.
> > > 
> > > I could document that, if it helps?
> > What about just documenting that 0 is an unsupported uaddr which will be
> > re-purposed as an indicator to use the target pfn as the source, regardless of
> > whether gmem_in_place_conversion is true? i.e.,
> > 
> > if (!src_page) 
> > 	src_page = pfn_to_page(pfn);
> 
> Because KVM can't generally use the target page as the source without in-place
> conversion, it's not supported today, and out-of-place conversion is being
> deprecated.
By "out-of-place conversion", do you mean using per-VM memory attribute
conversion?

> > I don't get why the two scenarios should be treated differently:
> > 1. gmem_in_place_conversion==true, shared memory is not from gmem 
> > 2. gmem_in_place_conversion==false, shared memory is not from gmem
> > 
> > In both case, a 0 uaddr could be mapped to a valid page not from gmem.
> 
> That's immaterial.  KVM's ABI (that we're solidifying) is that an address of '0'
> for the source means NULL.  The fact that userspace could have a valid mapping
> at virtual address '0' is irrelevant.
So, I'm wondering if we can document that 0 uaddr could always mean using target
PFN.
i.e., for both scenarios 1 and 2, al long as 0 uaddr is specified, we always
use target PFN as source for in-place add.

> Again, just because something is technically possible doesn't mean it needs to
> be supported by every piece of KVM's uAPI.
> 
> > So why not update the uAPI to handle both cases consistently? :)
> 
> Because retroactively adding support for out-of-place conversion is pointless
> (requires a userspace update for a feature that's being deprecated), KVM can't
> generally support using the source for out-of-place conversion (it's effectively
> an obscure zero-page optimization), and IMO rejecting the out-of-place conversion
> scenario is valuable for KVM developers, e.g. to help newcomers understand what
> exactly is and isn't possible.
Ok. You mean per-VM memory attribute is deprecating, and source page from !gmem
backend is also deprecating, so we don't want to change uAPI for scenarios under
gmem_in_place_conversion==false. Right?

> Side topic, isn't TDX broken if target page has already been added to the TD?
> IIUC, kvm_tdp_mmu_map_private_pfn() will be a glorified nop due to the page
> already having a valid S-EPT mapping, and so KVM will incorrectly allow a double
Not sure if my understand out-of-place conversion correctly.
Given target PFNs and GFNs are not duplicated, what would cause double add? :)

> add.  Ahhh, no, because KVM will return RET_PF_SPURIOUS and
> kvm_tdp_mmu_map_private_pfn() will then return -EIO.
My asking was if we could document uaddr always means using target PFN, since
TDX's in-place add does not rely on gmem in-place conversion.





^ permalink raw reply

* [PATCH 00/18] mm/damon: optimize out nr_accesses_bp
From: SJ Park @ 2026-06-30  4:07 UTC (permalink / raw)
  To: Andrew Morton
  Cc: SJ Park, Brendan Higgins, David Gow, Masami Hiramatsu,
	Mathieu Desnoyers, Shuah Khan, Steven Rostedt, damon, kunit-dev,
	linux-kernel, linux-kselftest, linux-mm, linux-trace-kernel

TLDR: Replace damon_region->nr_accesses_bp, which is easy to be wrong,
with a simpler on-demand moving sum function, damon_nr_accesses_mvsum().

Background
==========

DAMON's monitoring output (access pattern snapshot, or more technically
speaking, damon_region->nr_accesses) is completed once per aggregation
interval, which is 100 ms by default.  Users can arbitrarily increase
the interval for demand.  Under the suggested intervals auto-tuning
setup, it can span up to 200 seconds.  If the aggregation interval is
too long, the snapshot users cannot use it in reasonable time.  To
mitigate this, we introduced a new field of damon_region, namely
nr_accesses_bp.  It contains a pseudo moving sum of nr_accesses in bp
units and is updated for each sampling interval.

It turned out keeping it correctly updated every sampling interval is
not that easy.  From online parameter update feature development and
more experimental hacks, we found it is easy to be corrupted.  Once it
is corrupted, DAMON's monitoring outputs become quite insane.  Hence we
added a few validation checks.  It is easy  to be corrupted because it
requires every update per sampling interval to be correct.

Solution
========

There is no real reason to keep it updated every sampling interval.  Due
to the simple pseudo-moving sum mechanism and existing helper field
(last_nr_accesses), we can also calculate the pseudo moving sum on
demand in a much simpler way.

Implement a function for getting the pseudo moving sum on demand, and
replace nr_accessses_bp uses with the new function.  Also remove no more
needed tests for nr_accesses_bp and the per-sampling interval update
functions.  Finally, remove the nr_accesses_bp.  The new function is
quite simple.

Discussion
==========

Depending on the use case, multiple nr_accesses readers could be
executed in the same kdamond_fn() main loop iteration, which is executed
once per sampling interval.  Such readers include DAMON region exporting
tracepoints (damon_[region_]aggregated and damos_before_apply), DAMOS,
and DAMON sysfs interface logic for update_schemes_tried_regions
command.  In this case, the new function will be called multiple times
and this could be overhead compared to the old logic, which simply reads
the field without any additional work.  Nonetheless, the new function is
quite simple.  And the new approach does nothing while there is no need
to read.  The old approach had to execute its update function for each
region for every sampling interval.  Hence the new approach is believed
to be even more lightweight in common case, and the overhead is anyway
negligible.

One more advantage of this change is that one field from the
damon_region struct is removed.  On setups that uses a high number of
DAMON regions, this could be a potential memory space benefit.

Patches Sequence
================

Patch 1 introduces the new function for getting the pseudo moving sum of
nr_accesses on demands.  Patch 2 implements a unit test for the new
function's internal logic.  Patch 3 and 4 update monitoring logic and
the new function to ready for safe use on the existing logic.  Patches
5-7 replace uses of nr_accesses_bp in DAMOS, tracepoints and DAMON sysfs
interface with the new function, respectively.  Patches 8-10 removes
nr_accesses_bp validation functions in DAMON core, one by one.  Patches
11 and 12 further remove tests and test helper for nr_accesses_bp,
respectively.  Patches 13 removes the setups and updates or
nr_accesses_bp field.  Patches 14-16 cleans up function parameters that
are no more being used due to the previous patch.  Patch 17 removes the
function that was used for updating nr_accesses_bp field with its unit
test, which is the single remaining caller of the function.  Finally,
patch 18 removes damon_region->nr_accesses_bp field.

Changes from RFC v1.3
- RFC v1.3: https://lore.kernel.org/20260622142139.30269-1-sj@kernel.org
- Wordsmith and fixup commit subjects.
- Drop RFC tag.
- Rebase to latest mm-new.
Changes from RFC v1.2
- RFC v1.2: https://lore.kernel.org/20260621155715.87932-1-sj@kernel.org
- Explicitly ignore nr_accesses from mvsum at the beginning of
  aggregation.
- Fix a typo in a commit message.
Changes from RFC v1.1
- RFC v1.1: https://lore.kernel.org/20260620172244.90953-1-sj@kernel.org
- Handle next_aggregation_sis < passed_sample_intervals in
  nr_accesses_mvsum().
- Always rescale ->last_nr_accesss for parameter changes.
- Remove unused attrs params from damon_update_region_access_rate() and
  its callers.
Changes from RFC v1
- RFC v1: https://lore.kernel.org/20260619193415.73833-1-sj@kernel.org
- Avoid divide-by-zero from zero aggregation interval.
- Call damon_nr_accesses_mvsum() for damos tracing only when it is enabled.
- Remove obsolete mentions of nr_accesses_bp in comments.

SJ Park (18):
  mm/damon/core: introduce damon_nr_accesses_mvsum()
  mm/damon/tests/core-kunit: test damon_mvsum()
  mm/damon/core: always update ->last_nr_accesses for intervals change
  mm/damon/core: handle unreset nr_accesses in damon_nr_accesses_mvsum()
  mm/damon/core: use damon_nr_accesses_mvsum() in __damos_valid_target()
  mm/damon/core: use damon_nr_accesses_mvsum() for damos region tracing
  mm/damon/sysfs-schemes: use damon_nr_accesses_mvsum() for damo regions
  mm/damon/core: remove damon_warn_fix_nr_accesses_corruption()
  mm/damon/core: remove damon_verify_reset_aggregated()
  mm/damon/core: remove damon_verify_merge_regions_of()
  mm/damon/tests/core-kunit: remove nr_accesses_bp setup and tests
  selftests/damon/drgn_dump_damon_status: do not dump nr_accesses_bp
  mm/damon/core: remove nr_accesses_bp setups and updates
  mm/damon/core: remove attrs param from
    damon_update_region_access_rate()
  mm/damon/paddr: remove attrs param from __damon_pa_check_access()
  mm/damon/vaddr: remove attrs param from __damon_va_check_access()
  mm/damon/core: remove damon_moving_sum() and its unit test
  mm/damon/core: remove damon_region->nr_accesses_bp

 include/linux/damon.h                         |  15 +-
 include/trace/events/damon.h                  |   8 +-
 mm/damon/core.c                               | 201 +++++++-----------
 mm/damon/paddr.c                              |   9 +-
 mm/damon/sysfs-schemes.c                      |   6 +-
 mm/damon/tests/core-kunit.h                   |  37 ++--
 mm/damon/vaddr.c                              |  12 +-
 .../selftests/damon/drgn_dump_damon_status.py |   1 -
 8 files changed, 119 insertions(+), 170 deletions(-)


base-commit: d74f4c01f930b32580e172a786efbcb352c312f1
-- 
2.47.3

^ permalink raw reply

* [PATCH 06/18] mm/damon/core: use damon_nr_accesses_mvsum() for damos region tracing
From: SJ Park @ 2026-06-30  4:07 UTC (permalink / raw)
  To: Andrew Morton
  Cc: SJ Park, Masami Hiramatsu, Mathieu Desnoyers, Steven Rostedt,
	damon, linux-kernel, linux-mm, linux-trace-kernel
In-Reply-To: <20260630040812.149729-1-sj@kernel.org>

damon_nr_accesses_mvsum() returns a value same to nr_accesses_bp.  Also
the function is more simple and therefore more tolerant to errors.
Execution of the function would be more expensive than the simple read
of the field, but because the function is quite simple, the overhead
should be negligible.  Use it in the DAMON region exporting trace points
instead of the nr_accesses_bp.

Signed-off-by: SJ Park <sj@kernel.org>
---
 include/trace/events/damon.h | 8 +++++---
 mm/damon/core.c              | 5 +++--
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/include/trace/events/damon.h b/include/trace/events/damon.h
index 78388538acf44..8851727ae1627 100644
--- a/include/trace/events/damon.h
+++ b/include/trace/events/damon.h
@@ -78,9 +78,11 @@ TRACE_EVENT_CONDITION(damos_before_apply,
 
 	TP_PROTO(unsigned int context_idx, unsigned int scheme_idx,
 		unsigned int target_idx, struct damon_region *r,
-		unsigned int nr_regions, bool do_trace),
+		unsigned int nr_accesses, unsigned int nr_regions,
+		bool do_trace),
 
-	TP_ARGS(context_idx, scheme_idx, target_idx, r, nr_regions, do_trace),
+	TP_ARGS(context_idx, scheme_idx, target_idx, r, nr_accesses,
+		nr_regions, do_trace),
 
 	TP_CONDITION(do_trace),
 
@@ -101,7 +103,7 @@ TRACE_EVENT_CONDITION(damos_before_apply,
 		__entry->target_idx = target_idx;
 		__entry->start = r->ar.start;
 		__entry->end = r->ar.end;
-		__entry->nr_accesses = r->nr_accesses_bp / 10000;
+		__entry->nr_accesses = nr_accesses;
 		__entry->age = r->age;
 		__entry->nr_regions = nr_regions;
 	),
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 612762490e866..6423e7417c24b 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -2446,7 +2446,7 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t,
 	struct damos *siter;		/* schemes iterator */
 	unsigned int sidx = 0;
 	struct damon_target *titer;	/* targets iterator */
-	unsigned int tidx = 0;
+	unsigned int tidx = 0, nr_accesses = 0;
 	bool do_trace = false;
 
 	/* get indices for trace_damos_before_apply() */
@@ -2461,6 +2461,7 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t,
 				break;
 			tidx++;
 		}
+		nr_accesses = damon_nr_accesses_mvsum(r, c);
 		do_trace = true;
 	}
 
@@ -2476,7 +2477,7 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t,
 		if (damos_core_filter_out(c, t, r, s))
 			return;
 		ktime_get_coarse_ts64(&begin);
-		trace_damos_before_apply(cidx, sidx, tidx, r,
+		trace_damos_before_apply(cidx, sidx, tidx, r, nr_accesses,
 				damon_nr_regions(t), do_trace);
 		sz_applied = c->ops.apply_scheme(c, t, r, s,
 				&sz_ops_filter_passed);
-- 
2.47.3

^ permalink raw reply related

* Re: [PATCH 00/18] mm/damon: optimize out nr_accesses_bp
From: SJ Park @ 2026-06-30  4:52 UTC (permalink / raw)
  To: SJ Park
  Cc: Andrew Morton, Brendan Higgins, David Gow, Masami Hiramatsu,
	Mathieu Desnoyers, Shuah Khan, Steven Rostedt, damon, kunit-dev,
	linux-kernel, linux-kselftest, linux-mm, linux-trace-kernel
In-Reply-To: <20260630040812.149729-1-sj@kernel.org>

On Mon, 29 Jun 2026 21:07:53 -0700 SJ Park <sj@kernel.org> wrote:

> TLDR: Replace damon_region->nr_accesses_bp, which is easy to be wrong,
> with a simpler on-demand moving sum function, damon_nr_accesses_mvsum().

Sashiko found no blocker for this series.  All Sashiko findings and my
responses can be found from the complete thread [1] at lore.kernel.org.

It found a typo in the commit message of patch 14, though.  I added the detail
about the typo as a reply [2] to the patch.  Andrew, could you please fix the
commit message when you pick it up?  If you prefer sending a new revision with
the fixup, pleae feel free to let me know.

[1] https://lore.kernel.org/20260630040812.149729-1-sj@kernel.org
[2] https://lore.kernel.org/20260630044859.152347-1-sj@kernel.org/

Thanks,
SJ

[...]

^ permalink raw reply

* [PATCH 0/2] Add trace events for Qualcomm GENI I2C drivers
From: Praveen Talari @ 2026-06-30  6:02 UTC (permalink / raw)
  To: Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers,
	Mukesh Kumar Savaliya, Viken Dadhaniya, Andi Shyti
  Cc: linux-kernel, linux-trace-kernel, linux-arm-msm, linux-i2c,
	aniket.randive, chandana.chiluveru, Praveen Talari

Add a new trace event header providing tracepoints for the Qualcomm
GENI I2C controller.

The trace events cover controller bus setup, interrupt status and 
error reporting. These events enable structured debugging and performance
analysis using ftrace.

Usage examples:

Enable all I2C traces:
echo 1 > /sys/kernel/tracing/events/i2c/enable
echo 1 > /sys/kernel/tracing/events/qcom_geni_i2c/enable

cat /sys/kernel/debug/tracing/trace_pipe

Example trace output:
79.737075: i2c_write: i2c-11 #0 a=057 f=0200 l=3 [00-00-3f]
79.737075: geni_i2c_bus_setup: a90000.i2c: clk_freq=400000 clk_div=2
   t_high=5 t_low=11 t_cycle=22
79.737084: geni_i2c_irq: a90000.i2c: m_stat=0x40000000 rx_st=0x00000000
   dm_tx=0x00000000 dm_rx=0x00000000
79.737201: geni_i2c_irq: a90000.i2c: m_stat=0x00000001 rx_st=0x00000000
   dm_tx=0x00000000 dm_rx=0x00000000
79.737211: i2c_result: i2c-11 n=1 ret=1

Signed-off-by: Praveen Talari <praveen.talari@oss.qualcomm.com>
---
Praveen Talari (2):
      i2c: qcom-geni: trace: Add trace events for Qualcomm GENI I2C
      i2c: qcom-geni: Add trace events for Qualcomm GENI I2C driver

 drivers/i2c/busses/i2c-qcom-geni.c   | 15 +++++++
 include/trace/events/qcom_geni_i2c.h | 82 ++++++++++++++++++++++++++++++++++++
 2 files changed, 97 insertions(+)
---
base-commit: 3d5670d672ae08b8c534b7beed6f57c8b44e7b43
change-id: 20260629-add-tracepoints-for-qcom-geni-i2c-33d14584228b

Best regards,
--  
Praveen Talari <praveen.talari@oss.qualcomm.com>


^ permalink raw reply

* [PATCH 1/2] i2c: qcom-geni: trace: Add trace events for Qualcomm GENI I2C
From: Praveen Talari @ 2026-06-30  6:02 UTC (permalink / raw)
  To: Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers,
	Mukesh Kumar Savaliya, Viken Dadhaniya, Andi Shyti
  Cc: linux-kernel, linux-trace-kernel, linux-arm-msm, linux-i2c,
	aniket.randive, chandana.chiluveru, Praveen Talari
In-Reply-To: <20260630-add-tracepoints-for-qcom-geni-i2c-v1-0-474cd6cdbe27@oss.qualcomm.com>

Add trace event support to the Qualcomm GENI I2C driver to enable
detailed runtime debugging and analysis.

The trace events capture I2C clock configuration, interrupt status
and error code and message.

Signed-off-by: Praveen Talari <praveen.talari@oss.qualcomm.com>
---
 include/trace/events/qcom_geni_i2c.h | 82 ++++++++++++++++++++++++++++++++++++
 1 file changed, 82 insertions(+)

diff --git a/include/trace/events/qcom_geni_i2c.h b/include/trace/events/qcom_geni_i2c.h
new file mode 100644
index 000000000000..c7e7984f3620
--- /dev/null
+++ b/include/trace/events/qcom_geni_i2c.h
@@ -0,0 +1,82 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM qcom_geni_i2c
+
+#if !defined(_TRACE_QCOM_GENI_I2C_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_QCOM_GENI_I2C_H
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(geni_i2c_bus_setup,
+	    TP_PROTO(struct device *dev, u32 clk_freq, u8 clk_div,
+		     u8 t_high_cnt, u8 t_low_cnt, u8 t_cycle_cnt),
+	    TP_ARGS(dev, clk_freq, clk_div, t_high_cnt, t_low_cnt, t_cycle_cnt),
+
+	    TP_STRUCT__entry(__string(name, dev_name(dev))
+			     __field(u32, clk_freq)
+			     __field(u8,  clk_div)
+			     __field(u8,  t_high_cnt)
+			     __field(u8,  t_low_cnt)
+			     __field(u8,  t_cycle_cnt)
+	    ),
+
+	    TP_fast_assign(__assign_str(name);
+			   __entry->clk_freq   = clk_freq;
+			   __entry->clk_div    = clk_div;
+			   __entry->t_high_cnt = t_high_cnt;
+			   __entry->t_low_cnt  = t_low_cnt;
+			   __entry->t_cycle_cnt = t_cycle_cnt;
+	    ),
+
+	    TP_printk("%s: clk_freq=%u clk_div=%u t_high=%u t_low=%u t_cycle=%u",
+		      __get_str(name), __entry->clk_freq, __entry->clk_div,
+		      __entry->t_high_cnt, __entry->t_low_cnt,
+		      __entry->t_cycle_cnt)
+);
+
+TRACE_EVENT(geni_i2c_irq,
+	    TP_PROTO(struct device *dev, u32 m_stat, u32 rx_st,
+		     u32 dm_tx_st, u32 dm_rx_st),
+	    TP_ARGS(dev, m_stat, rx_st, dm_tx_st, dm_rx_st),
+
+	    TP_STRUCT__entry(__string(name, dev_name(dev))
+			     __field(u32, m_stat)
+			     __field(u32, rx_st)
+			     __field(u32, dm_tx_st)
+			     __field(u32, dm_rx_st)
+	    ),
+
+	    TP_fast_assign(__assign_str(name);
+			   __entry->m_stat = m_stat;
+			   __entry->rx_st = rx_st;
+			   __entry->dm_tx_st = dm_tx_st;
+			   __entry->dm_rx_st = dm_rx_st;
+	    ),
+
+	    TP_printk("%s: m_stat=0x%08x rx_st=0x%08x dm_tx=0x%08x dm_rx=0x%08x",
+		      __get_str(name), __entry->m_stat, __entry->rx_st,
+		      __entry->dm_tx_st, __entry->dm_rx_st)
+);
+
+TRACE_EVENT(geni_i2c_err,
+	    TP_PROTO(struct device *dev, int err, const char *msg),
+	    TP_ARGS(dev, err, msg),
+
+	    TP_STRUCT__entry(__string(name, dev_name(dev))
+			     __field(int, err)
+			     __string(msg, msg)
+	    ),
+
+	    TP_fast_assign(__assign_str(name);
+			   __entry->err = err;
+			   __assign_str(msg);
+	    ),
+
+	    TP_printk("%s: err=%d msg=%s",
+		      __get_str(name), __entry->err, __get_str(msg))
+);
+
+#endif /* _TRACE_QCOM_GENI_I2C_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>

-- 
2.34.1


^ permalink raw reply related

* [PATCH 2/2] i2c: qcom-geni: Add trace events for Qualcomm GENI I2C driver
From: Praveen Talari @ 2026-06-30  6:02 UTC (permalink / raw)
  To: Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers,
	Mukesh Kumar Savaliya, Viken Dadhaniya, Andi Shyti
  Cc: linux-kernel, linux-trace-kernel, linux-arm-msm, linux-i2c,
	aniket.randive, chandana.chiluveru, Praveen Talari
In-Reply-To: <20260630-add-tracepoints-for-qcom-geni-i2c-v1-0-474cd6cdbe27@oss.qualcomm.com>

Add trace event definitions for the Qualcomm GENI (Generic Interface)
I2C driver. These trace events enable runtime debugging and performance
analysis of I2C operations.

The trace events capture I2C clock configuration, interrupt status and
error code and message.

Signed-off-by: Praveen Talari <praveen.talari@oss.qualcomm.com>
---
 drivers/i2c/busses/i2c-qcom-geni.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/drivers/i2c/busses/i2c-qcom-geni.c b/drivers/i2c/busses/i2c-qcom-geni.c
index 96dbf04138be..3227fab6d76e 100644
--- a/drivers/i2c/busses/i2c-qcom-geni.c
+++ b/drivers/i2c/busses/i2c-qcom-geni.c
@@ -1,6 +1,9 @@
 // SPDX-License-Identifier: GPL-2.0
 // Copyright (c) 2017-2018, The Linux Foundation. All rights reserved.
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/qcom_geni_i2c.h>
+
 #include <linux/acpi.h>
 #include <linux/clk.h>
 #include <linux/dmaengine.h>
@@ -219,6 +222,9 @@ static int qcom_geni_i2c_conf(struct geni_se *se, unsigned long freq)
 	val |= itr->t_low_cnt << LOW_COUNTER_SHFT;
 	val |= itr->t_cycle_cnt;
 	writel_relaxed(val, gi2c->se.base + SE_I2C_SCL_COUNTERS);
+	trace_geni_i2c_bus_setup(gi2c->se.dev, gi2c->clk_freq_out,
+				 itr->clk_div, itr->t_high_cnt,
+				 itr->t_low_cnt, itr->t_cycle_cnt);
 	return 0;
 }
 
@@ -252,6 +258,8 @@ static void geni_i2c_err(struct geni_i2c_dev *gi2c, int err)
 		dev_dbg(gi2c->se.dev, "len:%d, slv-addr:0x%x, RD/WR:%d\n",
 			gi2c->cur->len, gi2c->cur->addr, gi2c->cur->flags);
 
+	trace_geni_i2c_err(gi2c->se.dev, gi2c_log[err].err, gi2c_log[err].msg);
+
 	switch (err) {
 	case GENI_ABORT_DONE:
 		gi2c->abort_done = true;
@@ -288,6 +296,8 @@ static irqreturn_t geni_i2c_irq(int irq, void *dev)
 	dma = readl_relaxed(base + SE_GENI_DMA_MODE_EN);
 	cur = gi2c->cur;
 
+	trace_geni_i2c_irq(gi2c->se.dev, m_stat, rx_st, dm_tx_st, dm_rx_st);
+
 	if (!cur ||
 	    m_stat & (M_CMD_FAILURE_EN | M_CMD_ABORT_EN) ||
 	    dm_rx_st & (DM_I2C_CB_ERR)) {
@@ -788,6 +798,10 @@ static int geni_i2c_gpi_xfer(struct geni_i2c_dev *gi2c, struct i2c_msg msgs[], i
 	peripheral.set_config = 1;
 	peripheral.multi_msg = false;
 
+	trace_geni_i2c_bus_setup(gi2c->se.dev, gi2c->clk_freq_out,
+				 itr->clk_div, itr->t_high_cnt,
+				 itr->t_low_cnt, itr->t_cycle_cnt);
+
 	gi2c->num_msgs = num;
 	gi2c->is_tx_multi_desc_xfer = false;
 
@@ -895,6 +909,7 @@ static int geni_i2c_fifo_xfer(struct geni_i2c_dev *gi2c,
 		m_param |= ((msgs[i].addr << SLV_ADDR_SHFT) & SLV_ADDR_MSK);
 
 		gi2c->cur = &msgs[i];
+
 		if (msgs[i].flags & I2C_M_RD)
 			ret = geni_i2c_rx_one_msg(gi2c, &msgs[i], m_param);
 		else

-- 
2.34.1


^ permalink raw reply related

* [PATCH v2 5.15.y] ring-buffer: Remove ring_buffer_read_prepare_sync()
From: Bjoern Doebel @ 2026-06-30  6:03 UTC (permalink / raw)
  To: stable
  Cc: sashal, rostedt, mhiramat, mathieu.desnoyers, dhowells,
	linux-trace-kernel, linux-kernel, doebel
In-Reply-To: <20260625054005.0014.ringbuf-515@kernel.org>

[ Upstream commit 119a5d573622ae90ba730d18acfae9bb75d77b9a ]

When the ring buffer was first introduced, reading the non-consuming
"trace" file required disabling the writing of the ring buffer. To make
sure the writing was fully disabled before iterating the buffer with a
non-consuming read, it would set the disable flag of the buffer and then
call an RCU synchronization to make sure all the buffers were
synchronized.

The function ring_buffer_read_start() originally  would initialize the
iterator and call an RCU synchronization, but this was for each individual
per CPU buffer where this would get called many times on a machine with
many CPUs before the trace file could be read. The commit 72c9ddfd4c5bf
("ring-buffer: Make non-consuming read less expensive with lots of cpus.")
separated ring_buffer_read_start into ring_buffer_read_prepare(),
ring_buffer_read_sync() and then ring_buffer_read_start() to allow each of
the per CPU buffers to be prepared, call the read_buffer_read_sync() once,
and then the ring_buffer_read_start() for each of the CPUs which made
things much faster.

The commit 1039221cc278 ("ring-buffer: Do not disable recording when there
is an iterator") removed the requirement of disabling the recording of the
ring buffer in order to iterate it, but it did not remove the
synchronization that was happening that was required to wait for all the
buffers to have no more writers. It's now OK for the buffers to have
writers and no synchronization is needed.

Remove the synchronization and put back the interface for the ring buffer
iterator back before commit 72c9ddfd4c5bf was applied.

Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://lore.kernel.org/20250630180440.3eabb514@batman.local.home
Reported-by: David Howells <dhowells@redhat.com>
Fixes: 1039221cc278 ("ring-buffer: Do not disable recording when there is an iterator")
Tested-by: David Howells <dhowells@redhat.com>
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>

Assisted-by: Kiro:claude-opus-4.8
[doebel@amazon.de: move patch section using guard() macro into a
separate block to address declaration after statement warning.]
Signed-off-by: Bjoern Doebel <doebel@amazon.de>
---
 include/linux/ring_buffer.h |  4 +--
 kernel/trace/ring_buffer.c  | 72 ++++++++-----------------------------
 kernel/trace/trace.c        | 14 +++-----
 kernel/trace/trace_kdb.c    |  8 ++---
 4 files changed, 22 insertions(+), 76 deletions(-)

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 3e7bfc0f65aee..b53335ed2d0ef 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -130,9 +130,7 @@ ring_buffer_consume(struct trace_buffer *buffer, int cpu, u64 *ts,
 		    unsigned long *lost_events);
 
 struct ring_buffer_iter *
-ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags);
-void ring_buffer_read_prepare_sync(void);
-void ring_buffer_read_start(struct ring_buffer_iter *iter);
+ring_buffer_read_start(struct trace_buffer *buffer, int cpu, gfp_t flags);
 void ring_buffer_read_finish(struct ring_buffer_iter *iter);
 
 struct ring_buffer_event *
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index e44115db0efe3..e7c472729542d 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -5037,28 +5037,20 @@ ring_buffer_consume(struct trace_buffer *buffer, int cpu, u64 *ts,
 EXPORT_SYMBOL_GPL(ring_buffer_consume);
 
 /**
- * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer
+ * ring_buffer_read_start - start a non consuming read of the buffer
  * @buffer: The ring buffer to read from
  * @cpu: The cpu buffer to iterate over
  * @flags: gfp flags to use for memory allocation
  *
- * This performs the initial preparations necessary to iterate
- * through the buffer.  Memory is allocated, buffer recording
- * is disabled, and the iterator pointer is returned to the caller.
- *
- * Disabling buffer recording prevents the reading from being
- * corrupted. This is not a consuming read, so a producer is not
- * expected.
- *
- * After a sequence of ring_buffer_read_prepare calls, the user is
- * expected to make at least one call to ring_buffer_read_prepare_sync.
- * Afterwards, ring_buffer_read_start is invoked to get things going
- * for real.
+ * This creates an iterator to allow non-consuming iteration through
+ * the buffer. If the buffer is disabled for writing, it will produce
+ * the same information each time, but if the buffer is still writing
+ * then the first hit of a write will cause the iteration to stop.
  *
- * This overall must be paired with ring_buffer_read_finish.
+ * Must be paired with ring_buffer_read_finish.
  */
 struct ring_buffer_iter *
-ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
+ring_buffer_read_start(struct trace_buffer *buffer, int cpu, gfp_t flags)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
 	struct ring_buffer_iter *iter;
@@ -5083,51 +5075,15 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
 
 	atomic_inc(&cpu_buffer->resize_disabled);
 
-	return iter;
-}
-EXPORT_SYMBOL_GPL(ring_buffer_read_prepare);
+	{
+		guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock);
 
-/**
- * ring_buffer_read_prepare_sync - Synchronize a set of prepare calls
- *
- * All previously invoked ring_buffer_read_prepare calls to prepare
- * iterators will be synchronized.  Afterwards, read_buffer_read_start
- * calls on those iterators are allowed.
- */
-void
-ring_buffer_read_prepare_sync(void)
-{
-	synchronize_rcu();
-}
-EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync);
-
-/**
- * ring_buffer_read_start - start a non consuming read of the buffer
- * @iter: The iterator returned by ring_buffer_read_prepare
- *
- * This finalizes the startup of an iteration through the buffer.
- * The iterator comes from a call to ring_buffer_read_prepare and
- * an intervening ring_buffer_read_prepare_sync must have been
- * performed.
- *
- * Must be paired with ring_buffer_read_finish.
- */
-void
-ring_buffer_read_start(struct ring_buffer_iter *iter)
-{
-	struct ring_buffer_per_cpu *cpu_buffer;
-	unsigned long flags;
-
-	if (!iter)
-		return;
-
-	cpu_buffer = iter->cpu_buffer;
+		arch_spin_lock(&cpu_buffer->lock);
+		rb_iter_reset(iter);
+		arch_spin_unlock(&cpu_buffer->lock);
+	}
 
-	raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
-	arch_spin_lock(&cpu_buffer->lock);
-	rb_iter_reset(iter);
-	arch_spin_unlock(&cpu_buffer->lock);
-	raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+	return iter;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_read_start);
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 537360be8e4e0..1a29a9d9e8685 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4803,21 +4803,15 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
 	if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
 		for_each_tracing_cpu(cpu) {
 			iter->buffer_iter[cpu] =
-				ring_buffer_read_prepare(iter->array_buffer->buffer,
-							 cpu, GFP_KERNEL);
-		}
-		ring_buffer_read_prepare_sync();
-		for_each_tracing_cpu(cpu) {
-			ring_buffer_read_start(iter->buffer_iter[cpu]);
+				ring_buffer_read_start(iter->array_buffer->buffer,
+						       cpu, GFP_KERNEL);
 			tracing_iter_reset(iter, cpu);
 		}
 	} else {
 		cpu = iter->cpu_file;
 		iter->buffer_iter[cpu] =
-			ring_buffer_read_prepare(iter->array_buffer->buffer,
-						 cpu, GFP_KERNEL);
-		ring_buffer_read_prepare_sync();
-		ring_buffer_read_start(iter->buffer_iter[cpu]);
+			ring_buffer_read_start(iter->array_buffer->buffer,
+					       cpu, GFP_KERNEL);
 		tracing_iter_reset(iter, cpu);
 	}
 
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index 59857a1ee44cd..628c25693cef2 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -43,17 +43,15 @@ static void ftrace_dump_buf(int skip_entries, long cpu_file)
 	if (cpu_file == RING_BUFFER_ALL_CPUS) {
 		for_each_tracing_cpu(cpu) {
 			iter.buffer_iter[cpu] =
-			ring_buffer_read_prepare(iter.array_buffer->buffer,
-						 cpu, GFP_ATOMIC);
-			ring_buffer_read_start(iter.buffer_iter[cpu]);
+			ring_buffer_read_start(iter.array_buffer->buffer,
+					       cpu, GFP_ATOMIC);
 			tracing_iter_reset(&iter, cpu);
 		}
 	} else {
 		iter.cpu_file = cpu_file;
 		iter.buffer_iter[cpu_file] =
-			ring_buffer_read_prepare(iter.array_buffer->buffer,
+			ring_buffer_read_start(iter.array_buffer->buffer,
 						 cpu_file, GFP_ATOMIC);
-		ring_buffer_read_start(iter.buffer_iter[cpu_file]);
 		tracing_iter_reset(&iter, cpu_file);
 	}
 
-- 
2.50.1




Amazon Web Services Development Center Germany GmbH
Tamara-Danz-Str. 13
10243 Berlin
Geschaeftsfuehrung: Christof Hellmis, Andreas Stieger
Eingetragen am Amtsgericht Charlottenburg unter HRB 257764 B
Sitz: Berlin
Ust-ID: DE 365 538 597


^ permalink raw reply related

* [PATCH v2 5.10.y] ring-buffer: Remove ring_buffer_read_prepare_sync()
From: Bjoern Doebel @ 2026-06-30  6:06 UTC (permalink / raw)
  To: stable
  Cc: sashal, rostedt, mhiramat, mathieu.desnoyers, dhowells,
	linux-trace-kernel, linux-kernel, doebel
In-Reply-To: <20260625054005.0015.ringbuf-510@kernel.org>

[ Upstream commit 119a5d573622ae90ba730d18acfae9bb75d77b9a ]

When the ring buffer was first introduced, reading the non-consuming
"trace" file required disabling the writing of the ring buffer. To make
sure the writing was fully disabled before iterating the buffer with a
non-consuming read, it would set the disable flag of the buffer and then
call an RCU synchronization to make sure all the buffers were
synchronized.

The function ring_buffer_read_start() originally  would initialize the
iterator and call an RCU synchronization, but this was for each individual
per CPU buffer where this would get called many times on a machine with
many CPUs before the trace file could be read. The commit 72c9ddfd4c5bf
("ring-buffer: Make non-consuming read less expensive with lots of cpus.")
separated ring_buffer_read_start into ring_buffer_read_prepare(),
ring_buffer_read_sync() and then ring_buffer_read_start() to allow each of
the per CPU buffers to be prepared, call the read_buffer_read_sync() once,
and then the ring_buffer_read_start() for each of the CPUs which made
things much faster.

The commit 1039221cc278 ("ring-buffer: Do not disable recording when there
is an iterator") removed the requirement of disabling the recording of the
ring buffer in order to iterate it, but it did not remove the
synchronization that was happening that was required to wait for all the
buffers to have no more writers. It's now OK for the buffers to have
writers and no synchronization is needed.

Remove the synchronization and put back the interface for the ring buffer
iterator back before commit 72c9ddfd4c5bf was applied.

Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://lore.kernel.org/20250630180440.3eabb514@batman.local.home
Reported-by: David Howells <dhowells@redhat.com>
Fixes: 1039221cc278 ("ring-buffer: Do not disable recording when there is an iterator")
Tested-by: David Howells <dhowells@redhat.com>
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>

Assisted-by: Kiro:claude-opus-4.8
[doebel@amazon.de: move patch section using guard() macro into a
separate block to address declaration after statement warning.]
Signed-off-by: Bjoern Doebel <doebel@amazon.de>
---
 include/linux/ring_buffer.h |  4 +--
 kernel/trace/ring_buffer.c  | 72 ++++++++-----------------------------
 kernel/trace/trace.c        | 14 +++-----
 kernel/trace/trace_kdb.c    |  8 ++---
 4 files changed, 22 insertions(+), 76 deletions(-)

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 7d5a78f49d43d..be5c120922469 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -128,9 +128,7 @@ ring_buffer_consume(struct trace_buffer *buffer, int cpu, u64 *ts,
 		    unsigned long *lost_events);
 
 struct ring_buffer_iter *
-ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags);
-void ring_buffer_read_prepare_sync(void);
-void ring_buffer_read_start(struct ring_buffer_iter *iter);
+ring_buffer_read_start(struct trace_buffer *buffer, int cpu, gfp_t flags);
 void ring_buffer_read_finish(struct ring_buffer_iter *iter);
 
 struct ring_buffer_event *
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 03a7127efc5a8..1089daa17b09e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -4858,28 +4858,20 @@ ring_buffer_consume(struct trace_buffer *buffer, int cpu, u64 *ts,
 EXPORT_SYMBOL_GPL(ring_buffer_consume);
 
 /**
- * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer
+ * ring_buffer_read_start - start a non consuming read of the buffer
  * @buffer: The ring buffer to read from
  * @cpu: The cpu buffer to iterate over
  * @flags: gfp flags to use for memory allocation
  *
- * This performs the initial preparations necessary to iterate
- * through the buffer.  Memory is allocated, buffer recording
- * is disabled, and the iterator pointer is returned to the caller.
- *
- * Disabling buffer recording prevents the reading from being
- * corrupted. This is not a consuming read, so a producer is not
- * expected.
- *
- * After a sequence of ring_buffer_read_prepare calls, the user is
- * expected to make at least one call to ring_buffer_read_prepare_sync.
- * Afterwards, ring_buffer_read_start is invoked to get things going
- * for real.
+ * This creates an iterator to allow non-consuming iteration through
+ * the buffer. If the buffer is disabled for writing, it will produce
+ * the same information each time, but if the buffer is still writing
+ * then the first hit of a write will cause the iteration to stop.
  *
- * This overall must be paired with ring_buffer_read_finish.
+ * Must be paired with ring_buffer_read_finish.
  */
 struct ring_buffer_iter *
-ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
+ring_buffer_read_start(struct trace_buffer *buffer, int cpu, gfp_t flags)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
 	struct ring_buffer_iter *iter;
@@ -4904,51 +4896,15 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
 
 	atomic_inc(&cpu_buffer->resize_disabled);
 
-	return iter;
-}
-EXPORT_SYMBOL_GPL(ring_buffer_read_prepare);
+	{
+		guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock);
 
-/**
- * ring_buffer_read_prepare_sync - Synchronize a set of prepare calls
- *
- * All previously invoked ring_buffer_read_prepare calls to prepare
- * iterators will be synchronized.  Afterwards, read_buffer_read_start
- * calls on those iterators are allowed.
- */
-void
-ring_buffer_read_prepare_sync(void)
-{
-	synchronize_rcu();
-}
-EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync);
-
-/**
- * ring_buffer_read_start - start a non consuming read of the buffer
- * @iter: The iterator returned by ring_buffer_read_prepare
- *
- * This finalizes the startup of an iteration through the buffer.
- * The iterator comes from a call to ring_buffer_read_prepare and
- * an intervening ring_buffer_read_prepare_sync must have been
- * performed.
- *
- * Must be paired with ring_buffer_read_finish.
- */
-void
-ring_buffer_read_start(struct ring_buffer_iter *iter)
-{
-	struct ring_buffer_per_cpu *cpu_buffer;
-	unsigned long flags;
-
-	if (!iter)
-		return;
-
-	cpu_buffer = iter->cpu_buffer;
+		arch_spin_lock(&cpu_buffer->lock);
+		rb_iter_reset(iter);
+		arch_spin_unlock(&cpu_buffer->lock);
+	}
 
-	raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
-	arch_spin_lock(&cpu_buffer->lock);
-	rb_iter_reset(iter);
-	arch_spin_unlock(&cpu_buffer->lock);
-	raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+	return iter;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_read_start);
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 5bcd4cbeeb4fe..ed32d3c4f0e76 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4480,21 +4480,15 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
 	if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
 		for_each_tracing_cpu(cpu) {
 			iter->buffer_iter[cpu] =
-				ring_buffer_read_prepare(iter->array_buffer->buffer,
-							 cpu, GFP_KERNEL);
-		}
-		ring_buffer_read_prepare_sync();
-		for_each_tracing_cpu(cpu) {
-			ring_buffer_read_start(iter->buffer_iter[cpu]);
+				ring_buffer_read_start(iter->array_buffer->buffer,
+						       cpu, GFP_KERNEL);
 			tracing_iter_reset(iter, cpu);
 		}
 	} else {
 		cpu = iter->cpu_file;
 		iter->buffer_iter[cpu] =
-			ring_buffer_read_prepare(iter->array_buffer->buffer,
-						 cpu, GFP_KERNEL);
-		ring_buffer_read_prepare_sync();
-		ring_buffer_read_start(iter->buffer_iter[cpu]);
+			ring_buffer_read_start(iter->array_buffer->buffer,
+					       cpu, GFP_KERNEL);
 		tracing_iter_reset(iter, cpu);
 	}
 
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index 9da76104f7a28..18d1551db2b0d 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -43,17 +43,15 @@ static void ftrace_dump_buf(int skip_entries, long cpu_file)
 	if (cpu_file == RING_BUFFER_ALL_CPUS) {
 		for_each_tracing_cpu(cpu) {
 			iter.buffer_iter[cpu] =
-			ring_buffer_read_prepare(iter.array_buffer->buffer,
-						 cpu, GFP_ATOMIC);
-			ring_buffer_read_start(iter.buffer_iter[cpu]);
+			ring_buffer_read_start(iter.array_buffer->buffer,
+					       cpu, GFP_ATOMIC);
 			tracing_iter_reset(&iter, cpu);
 		}
 	} else {
 		iter.cpu_file = cpu_file;
 		iter.buffer_iter[cpu_file] =
-			ring_buffer_read_prepare(iter.array_buffer->buffer,
+			ring_buffer_read_start(iter.array_buffer->buffer,
 						 cpu_file, GFP_ATOMIC);
-		ring_buffer_read_start(iter.buffer_iter[cpu_file]);
 		tracing_iter_reset(&iter, cpu_file);
 	}
 
-- 
2.50.1




Amazon Web Services Development Center Germany GmbH
Tamara-Danz-Str. 13
10243 Berlin
Geschaeftsfuehrung: Christof Hellmis, Andreas Stieger
Eingetragen am Amtsgericht Charlottenburg unter HRB 257764 B
Sitz: Berlin
Ust-ID: DE 365 538 597


^ permalink raw reply related

* Re: [PATCH v3 2/7] mm/page_owner: add MR_NEVER to enum migrate_reason and use it for last_migrate_reason
From: Vlastimil Babka (SUSE) @ 2026-06-30  7:28 UTC (permalink / raw)
  To: Ye Liu, Andrew Morton, David Hildenbrand, Steven Rostedt,
	Masami Hiramatsu
  Cc: Zi Yan, Matthew Brost, Joshua Hahn, Rakie Kim, Byungchul Park,
	Gregory Price, Ying Huang, Alistair Popple, Mathieu Desnoyers,
	Suren Baghdasaryan, Michal Hocko, Brendan Jackman,
	Johannes Weiner, linux-mm, linux-kernel, linux-trace-kernel
In-Reply-To: <20260630015331.147174-3-ye.liu@linux.dev>

On 6/30/26 03:53, Ye Liu wrote:
> The last_migrate_reason field uses -1 as a sentinel value to mean "no
> migration has happened".  Replace the four bare -1 occurrences by
> adding a proper MR_NEVER member to enum migrate_reason, defining a
> corresponding "never_migrated" string in the MIGRATE_REASON trace
> macro, and removing the local MIGRATE_REASON_NONE define.
> 
> No functional change.
> 
> Signed-off-by: Ye Liu <ye.liu@linux.dev>

Reviewed-by: Vlastimil Babka (SUSE) <vbabka@kernel.org>

> ---
>  include/linux/migrate_mode.h   | 1 +
>  include/trace/events/migrate.h | 3 ++-
>  mm/page_owner.c                | 8 ++++----
>  3 files changed, 7 insertions(+), 5 deletions(-)
> 
> diff --git a/include/linux/migrate_mode.h b/include/linux/migrate_mode.h
> index 265c4328b36a..05102d4d2490 100644
> --- a/include/linux/migrate_mode.h
> +++ b/include/linux/migrate_mode.h
> @@ -25,6 +25,7 @@ enum migrate_reason {
>  	MR_LONGTERM_PIN,
>  	MR_DEMOTION,
>  	MR_DAMON,
> +	MR_NEVER,		/* page has never been migrated */
>  	MR_TYPES
>  };
>  
> diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h
> index cd01dd7b3640..11bc0aa14c7e 100644
> --- a/include/trace/events/migrate.h
> +++ b/include/trace/events/migrate.h
> @@ -23,7 +23,8 @@
>  	EM( MR_CONTIG_RANGE,	"contig_range")			\
>  	EM( MR_LONGTERM_PIN,	"longterm_pin")			\
>  	EM( MR_DEMOTION,	"demotion")			\
> -	EMe(MR_DAMON,		"damon")
> +	EM( MR_DAMON,		"damon")			\
> +	EMe(MR_NEVER,		"never_migrated")
>  
>  /*
>   * First define the enums in the above macros to be exported to userspace
> diff --git a/mm/page_owner.c b/mm/page_owner.c
> index 342549891a8d..c2f43ab860eb 100644
> --- a/mm/page_owner.c
> +++ b/mm/page_owner.c
> @@ -339,7 +339,7 @@ noinline void __set_page_owner(struct page *page, unsigned short order,
>  	depot_stack_handle_t handle;
>  
>  	handle = save_stack(gfp_mask);
> -	__update_page_owner_handle(page, handle, order, gfp_mask, -1,
> +	__update_page_owner_handle(page, handle, order, gfp_mask, MR_NEVER,
>  				   ts_nsec, current->pid, current->tgid,
>  				   current->comm);
>  	inc_stack_record_count(handle, gfp_mask, 1 << order);
> @@ -596,7 +596,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
>  	if (ret >= count)
>  		goto err;
>  
> -	if (page_owner->last_migrate_reason != -1) {
> +	if (page_owner->last_migrate_reason != MR_NEVER) {
>  		ret += scnprintf(kbuf + ret, count - ret,
>  			"Page has been migrated, last migrate reason: %s\n",
>  			migrate_reason_names[page_owner->last_migrate_reason]);
> @@ -667,7 +667,7 @@ void __dump_page_owner(const struct page *page)
>  		stack_depot_print(handle);
>  	}
>  
> -	if (page_owner->last_migrate_reason != -1)
> +	if (page_owner->last_migrate_reason != MR_NEVER)
>  		pr_alert("page has been migrated, last migrate reason: %s\n",
>  			migrate_reason_names[page_owner->last_migrate_reason]);
>  	page_ext_put(page_ext);
> @@ -826,7 +826,7 @@ static void init_pages_in_zone(struct zone *zone)
>  
>  			/* Found early allocated page */
>  			__update_page_owner_handle(page, early_handle, 0, 0,
> -						   -1, local_clock(), current->pid,
> +						   MR_NEVER, local_clock(), current->pid,
>  						   current->tgid, current->comm);
>  			count++;
>  ext_put_continue:


^ permalink raw reply

* Re: [PATCH v3 3/7] mm: use enum migrate_reason instead of int for migration reason parameters
From: Vlastimil Babka (SUSE) @ 2026-06-30  7:30 UTC (permalink / raw)
  To: Ye Liu, Muchun Song, Oscar Salvador, Andrew Morton,
	David Hildenbrand, Steven Rostedt, Masami Hiramatsu
  Cc: Zi Yan, Matthew Brost, Joshua Hahn, Rakie Kim, Byungchul Park,
	Gregory Price, Ying Huang, Alistair Popple, Lorenzo Stoakes,
	Liam R. Howlett, Mike Rapoport, Suren Baghdasaryan, Michal Hocko,
	Mathieu Desnoyers, Brendan Jackman, Johannes Weiner, linux-mm,
	linux-kernel, linux-trace-kernel
In-Reply-To: <20260630015331.147174-4-ye.liu@linux.dev>

On 6/30/26 03:53, Ye Liu wrote:
> Replace all 'int reason' function parameters that carry migrate_reason
> values with the proper 'enum migrate_reason' type.  This makes the
> intent explicit and leverages compiler type checking.  The affected
> subsystems are:
> 
>   - page_owner: __folio_set_owner_migrate_reason(),
>                 folio_set_owner_migrate_reason()
>   - migrate: migrate_pages(), migrate_pages_sync(),
>              migrate_pages_batch(), migrate_folios_move(),
>              migrate_hugetlbs(), unmap_and_move_huge_page()
>   - hugetlb: move_hugetlb_state(), htlb_allow_alloc_fallback()
>   - trace: mm_migrate_pages and mm_migrate_pages_start events
> 
> The 'short last_migrate_reason' struct field and internal helper
> parameter in page_owner are intentionally left as 'short' since they
> store per-page metadata where size matters.
> 
> No functional change.
> 
> Signed-off-by: Ye Liu <ye.liu@linux.dev>

Reviewed-by: Vlastimil Babka (SUSE) <vbabka@kernel.org>


^ permalink raw reply

* Re: [PATCH v2 1/2] x86/uprobes: Keep shadow stack in sync for emulated CALLs
From: Oleg Nesterov @ 2026-06-30  8:39 UTC (permalink / raw)
  To: David Windsor
  Cc: mhiramat, peterz, tglx, mingo, bp, dave.hansen, x86, shuah,
	rick.p.edgecombe, jolsa, linux-trace-kernel, linux-kselftest,
	linux-kernel
In-Reply-To: <8b5b1c7407b98f31664ad7b6a6faf20d2d4a6cad.1782777969.git.dwindsor@gmail.com>

On 06/29, David Windsor wrote:
>
> Uprobe CALL emulation updates the normal user stack, but not the CET user
> shadow stack. The subsequent RET then sees a stale shadow stack entry and
> raises #CP.
>
> Update the relative CALL emulation and XOL CALL fixup paths to keep the
> shadow stack in sync.
>
> Fixes: 488af8ea7131 ("x86/shstk: Wire in shadow stack interface")
> Signed-off-by: David Windsor <dwindsor@gmail.com>

Acked-by: Oleg Nesterov <oleg@redhat.com>


^ permalink raw reply

* Re: [PATCH v3 2/9] rv: add generic uprobe infrastructure for RV monitors
From: Gabriele Monaco @ 2026-06-30  8:48 UTC (permalink / raw)
  To: Wen Yang; +Cc: linux-trace-kernel, linux-kernel
In-Reply-To: <878be1d4-2f93-4fbd-a1f6-b2b7836c9c44@linux.dev>

On Mon, 2026-06-29 at 00:47 +0800, Wen Yang wrote:
> On [2-1]~[2-5]: embedded consumer causes UAF on PREEMPT_RT
> -----------------------------------------------------------
> 
> The uprobe_bind selftest oopses on PREEMPT_RT(full):
> 
>    handler_chain+0xc9: mov rax, [r15+0x18]  ; advance list iterator
>    RAX: 000015ec00001f28   ; garbage — &uprobe->consumers after kfree
> 
> handler_chain() reads uc->cons_node.next after uc->handler() returns,
> still inside rcu_read_lock_trace().  That pointer is &uprobe->consumers
> (the list head embedded in struct uprobe), which gets freed through:
> 
>    put_uprobe()
>      -> schedule_work(uprobe_free_deferred)   /* async */
>         -> call_srcu(&uretprobes_srcu, ...)
>            -> call_rcu_tasks_trace(kfree_uprobe)
>               -> kfree(uprobe)
> 
> rv_uprobe_sync() calls uprobe_unregister_sync() which calls
> synchronize_srcu(&uretprobes_srcu), but that only matters after the
> kworker has submitted work to uretprobes_srcu.  On a loaded PREEMPT_RT
> box the kworker may not have run yet, so synchronize_srcu() returns
> immediately and kfree(uprobe) races with the still-iterating
> handler_chain():
> 
>    CPU A                                CPU B
>    consumer_del() → list_del_rcu        rcu_read_lock_trace()
>    put_uprobe()   → schedule_work         uc->handler() returns
>    rv_uprobe_sync():                       reading cons_node.next...
>      synchronize_srcu(&uretprobes_srcu)

If CPU B is in an RCU trace read-side critical section this doesn't
return immediately, it returns after readers are done, doesn't it?
(well, what you really care here is the synchronize_rcu_tasks_trace()
but we do both).

>      ← idle; returns immediately
>    [kworker fires later]
>      kfree(uprobe)  ← frees &uprobe->consumers
>                                          cons_node.next = freed mem → CRASH
> 

So I had a bit of a look and as far as I understand, we don't have any
control over the uprobe allocation pattern (workqueues and whatnot) and
we don't really care as long as we deregister it appropriately.

What we do control is the uprobe_consumer, that must be freed only after
the uprobe was synchronously deregistered, that should guarantee no
reader is going to reference it, shouldn't it?

uprobe_unregister_nosync() removes it from the cons_node list, so it's
safe to assume any handler_chain() after the next RCU-trace grace period
won't see it.

In the sketch I sent this is happening (all kfree(b) are after
rv_uprobe_unregister() which does the sync). What am I missing here?


uprobe is also freed after both grace periods, so no reader should use
that either.

The situation you're seeing isn't fully clear to me, I applied my sketch
and don't see any splat on a vng box.

> With uc embedded in the binding (as [2-1] suggests), no amount of
> delaying kfree(binding) helps: uprobe->consumers is freed by a chain we
> don't control.  The fix is to keep uc->cons_node in memory that outlives
> the handler_chain() iteration, which means a separate allocation freed
> only after rv_uprobe_sync():
> 
>    rv_uprobe_unregister_nosync()  /* list_del_rcu + schedule_work */
>    rv_uprobe_sync()               /* waits for any already-submitted 
> srcu work */

We of course need to do any free after this sync, but I don't get why we
need an additional allocation since the following are both plain
synchronous frees, why isn't a single one (on b) enough?

I'm a bit lost on this..

Thanks,
Gabriele

>    rv_uprobe_free()               /* kfree(impl) — safe, iteration is 
> done */
>    kfree(b)                       /* binding; never contained uc */
> 
> v4 keeps the public API shape you suggested with impl private:
> 
>    struct rv_uprobe {
>        struct rv_uprobe_impl  *impl;  /* allocated by 
> rv_uprobe_register() */
>        struct inode           *inode;
>    };
>    #define DECLARE_RV_UPROBE(name)  struct rv_uprobe name        /* [2-2] */
> 
>    int   rv_uprobe_register(const char *binpath, loff_t offset,
>                             struct rv_uprobe *p, handler_fn,
>                             ret_handler_fn, void *priv);         /* [2-4] */
>    void  rv_uprobe_unregister(struct rv_uprobe *p);
>    void  rv_uprobe_unregister_nosync(struct rv_uprobe *p);
>    void  rv_uprobe_sync(void);
>    void  rv_uprobe_free(struct rv_uprobe *p);                    /* 
> [2-5] restored */
>    bool  rv_uprobe_is_registered(const struct rv_uprobe *p);     /* 
> [2-7] added */
>    void *rv_uprobe_get_priv(struct uprobe_consumer *uc);
> 
> Handler signature is (struct uprobe_consumer *self, ...) [2-3]; private
> data is retrieved via rv_uprobe_get_priv(self) instead of container_of().
> 
> Then, rv_uprobe_free() is restored ([2-5] partially reverted).
> 
> 
> All other v3 items have been resolved, we are waiting for your comments 
> on the above(embedded consumer causes UAF):
> 
> [1-1,1-3]  `# define` / `# error` space removed
> [1-4]      `⟹` -> `=>`
> [1-6,1-7]  #if/#else in da_destroy_storage() and da_monitor_init()
>             replaced with plain if (DA_MON_ALLOCATION_STRATEGY == 
> DA_ALLOC_POOL)
> [1-8]      tracepoint_synchronize_unregister() lifted to 
> da_monitor_destroy before the pool/kmalloc branch
> 
> [2-2]  DECLARE_RV_UPROBE(name) — kept
> [2-3]  handler sig (struct uprobe_consumer *self, ...) — kept
> [2-4]  rv_uprobe_register() returns int — kept
> [2-5]  rv_uprobe_attach/detach removed; rv_uprobe_free() restored (see 
> above)
> [2-6]  offset, path fields removed; inode used for identity
> [2-7]  rv_uprobe_is_registered() added
> 
> [6-1]  Suggested-by removed
> [6-2]  tlob Kconfig entry placed after the deadline monitors marker
> [6-3]  Unnecessary includes removed (hrtimer.h, ktime.h, sched.h, tracefs.h)
> [6-4]  `#include "../../rv.h"` -> `<rv.h>`
> [6-5,6-6]  Verbose comments around DA_MON_POOL_SIZE and da_extra_cleanup
>             simplified; early return in tlob_reset_notify() on
>             !trace_detail_env_tlob_enabled()
> [6-7]  ha_setup_invariants():
>           if (atomic_read_acquire(&target->stopping)) return;
>           if (next_state < state_max_tlob) ha_start_timer_ns(...);
>           else ha_cancel_timer(ha_mon);
> [6-8]  offsetof() arithmetic -> enum-indexed array:
>           enum tlob_acc_idx { TLOB_ACC_RUNNING, TLOB_ACC_WAITING,
>                               TLOB_ACC_SLEEPING, TLOB_ACC_MAX };
>           u64 accs_ns[TLOB_ACC_MAX];
> [6-9]  1000 → TLOB_MIN_THRESHOLD_NS
> [6-10] WARN_ON_ONCE(!da_mon) -> if (unlikely(WARN_ON_ONCE(!da_mon)))
> [6-11] __free(kfree) + list_add_tail(&no_free_ptr(b)->list, ...) in
>         tlob_add_uprobe(); note that "b = no_free_ptr(b)" would restore b
>         and re-trigger __free on return — the correct pattern is to use
>         no_free_ptr() as the argument to list_add_tail() directly
> [6-15] tlob.h enum/struct order aligned with rvgen output
> [6-16] tracefs_create_file() -> rv_create_file()
> 
> [7-1]  KUnit tests call tlob_parse_uprobe_line/tlob_parse_remove_line
>         directly; no real uprobe creation from unit tests
> [7-2]  KUNIT_EXPECT_EQ(test, ..., 0) for valid-line cases
> 
> [8-1]  ftracetest: walk-up algorithm to locate test.d/functions
> [9-4]  tlob_busy/sleep/preempt_work unified to duration_ms
> 
> 
> --
> Best wishes,
> Wen
> 
> 
> > Gabriele
> > 
> > **Suggested simplification:**
> > 
> > ---
> > diff --git a/include/rv/rv_uprobe.h b/include/rv/rv_uprobe.h
> > index 9106c5c927..4fb3f50a63 100644
> > --- a/include/rv/rv_uprobe.h
> > +++ b/include/rv/rv_uprobe.h
> > @@ -7,83 +7,56 @@
> >   #ifndef _RV_UPROBE_H
> >   #define _RV_UPROBE_H
> >   
> > -#include <linux/path.h>
> >   #include <linux/types.h>
> > +#include <linux/uprobes.h>
> >   
> >   struct pt_regs;
> > +struct inode;
> >   
> >   /**
> >    * struct rv_uprobe - a single uprobe registered on behalf of an RV
> > monitor
> >    *
> > - * @offset:   byte offset within the ELF binary where the probe is
> > installed
> > - * @priv:     monitor-private pointer; set at attach time, never touched by
> > - *            this layer; passed unchanged to entry_fn / ret_fn
> > - * @path:     resolved path of the probed binary (read-only after attach);
> > - *            callers may use path.dentry for identity comparisons
> > - *
> > - * The implementation fields (uprobe_consumer, uprobe handle, callbacks)
> > are
> > - * private to rv_uprobe.c and are not exposed here; monitors must not
> > access
> > - * them directly.
> > + * @uc:       underlying uprobe consumer (publicly visible)
> > + * @uprobe:   active uprobe structure handle
> > + * @inode:    inode of the target binary (read-only after registration)
> >    */
> >   struct rv_uprobe {
> > -	/* public: read-only after rv_uprobe_attach*() */
> > -	loff_t		 offset;
> > -	void		*priv;
> > -	struct path	 path;
> > +	struct uprobe_consumer	uc;
> > +	struct uprobe		*uprobe;
> > +	struct inode		*inode;
> >   };
> >   
> > -/**
> > - * rv_uprobe_attach_path - register an uprobe given an already-resolved
> > path
> > - * @path:     path of the target binary; rv_uprobe takes its own reference
> > - * @offset:   byte offset within the binary
> > - * @entry_fn: called on probe hit (entry); may be NULL
> > - * @ret_fn:   called on function return (uretprobe); may be NULL
> > - * @priv:     opaque pointer forwarded to callbacks unchanged
> > - *
> > - * Use this variant when the caller has already resolved the path (e.g. to
> > - * register multiple probes on the same binary with a single kern_path
> > call).
> > - * The inode is derived internally via d_real_inode(), so inode and path
> > are
> > - * always consistent.
> > - *
> > - * Returns a pointer to the new rv_uprobe on success, ERR_PTR on failure.
> > - */
> > -struct rv_uprobe *rv_uprobe_attach_path(struct path *path, loff_t offset,
> > -	int (*entry_fn)(struct rv_uprobe *p, struct pt_regs *regs, __u64
> > *data),
> > -	int (*ret_fn)(struct rv_uprobe *p, unsigned long func,
> > -			struct pt_regs *regs, __u64 *data),
> > -	void *priv);
> > +/* Seamless inline declaration of a named uprobe inside user structs */
> > +#define DECLARE_RV_UPROBE(name) \
> > +	struct rv_uprobe name
> >   
> >   /**
> > - * rv_uprobe_attach - resolve binpath and register an uprobe
> > - * @binpath:  absolute path to the target binary
> > - * @offset:   byte offset within the binary
> > - * @entry_fn: called on probe hit (entry); may be NULL
> > - * @ret_fn:   called on function return (uretprobe); may be NULL
> > - * @priv:     opaque pointer forwarded to callbacks unchanged
> > + * rv_uprobe_register - resolve binpath and register an uprobe
> > + * @binpath:     absolute path to the target binary
> > + * @offset:      byte offset within the binary
> > + * @p:           pointer to the allocated/embedded rv_uprobe structure
> > + * @handler:     called on probe hit (entry); may be NULL
> > + * @ret_handler: called on function return (uretprobe); may be NULL
> >    *
> > - * Resolves binpath via kern_path(), then delegates to
> > rv_uprobe_attach_path().
> > + * Resolves binpath via kern_path(), registers the uprobe directly using
> > the
> > + * embedded `uprobe_consumer`, and immediately releases the path reference.
> >    *
> > - * Returns a pointer to the new rv_uprobe on success, ERR_PTR on failure.
> > + * Returns 0 on success, or a negative error code on failure.
> >    */
> > -struct rv_uprobe *rv_uprobe_attach(const char *binpath, loff_t offset,
> > -	int (*entry_fn)(struct rv_uprobe *p, struct pt_regs *regs, __u64
> > *data),
> > -	int (*ret_fn)(struct rv_uprobe *p, unsigned long func,
> > -			struct pt_regs *regs, __u64 *data),
> > -	void *priv);
> > +int rv_uprobe_register(const char *binpath, loff_t offset, struct rv_uprobe
> > *p,
> > +	int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs,
> > __u64 *data),
> > +	int (*ret_handler)(struct uprobe_consumer *self, unsigned long
> > func,
> > +			   struct pt_regs *regs, __u64 *data));
> >   
> >   /**
> > - * rv_uprobe_detach - synchronously unregister an uprobe and free it
> > - * @p:  probe to detach; may be NULL (no-op)
> > - *
> > - * Calls uprobe_unregister_nosync(), then uprobe_unregister_sync() to wait
> > - * for any in-progress handler to finish, then releases the path reference
> > - * and frees the rv_uprobe struct.  The caller's priv data is NOT freed.
> > + * rv_uprobe_unregister - synchronously unregister a uprobe
> > + * @p:  probe to unregister; may be NULL (no-op)
> >    *
> > - * When removing a single probe, prefer this over the three-phase API.
> > - * Safe to call from process context only (uprobe_unregister_sync() may
> > - * schedule).
> > + * Dequeues the uprobe and waits synchronously for all in-flight handlers
> > + * to complete.
> >    */
> > -void rv_uprobe_detach(struct rv_uprobe *p);
> > +void rv_uprobe_unregister(struct rv_uprobe *p);
> >   
> >   /**
> >    * rv_uprobe_unregister_nosync - dequeue an uprobe without waiting
> > @@ -91,9 +64,7 @@ void rv_uprobe_detach(struct rv_uprobe *p);
> >    *
> >    * Removes the uprobe from the uprobe subsystem but does NOT wait for
> >    * in-flight handlers to complete.  The caller must call rv_uprobe_sync()
> > - * before calling rv_uprobe_free() on the same probe.
> > - *
> > - * Use this to batch multiple deregistrations before a single
> > rv_uprobe_sync().
> > + * before freeing any container holding this probe.
> >    */
> >   void rv_uprobe_unregister_nosync(struct rv_uprobe *p);
> >   
> > @@ -101,19 +72,8 @@ void rv_uprobe_unregister_nosync(struct rv_uprobe *p);
> >    * rv_uprobe_sync - wait for all in-flight uprobe handlers to complete
> >    *
> >    * Global barrier: waits for every in-flight uprobe handler across the
> > system
> > - * to finish.  Call once after a batch of rv_uprobe_unregister_nosync()
> > calls
> > - * and before any rv_uprobe_free() call.
> > + * to finish.
> >    */
> >   void rv_uprobe_sync(void);
> >   
> > -/**
> > - * rv_uprobe_free - release resources of a previously deregistered probe
> > - * @p:  probe to free; may be NULL (no-op)
> > - *
> > - * Releases the path reference and frees the rv_uprobe struct.  Must only
> > - * be called after rv_uprobe_sync() has returned.  The caller's priv data
> > - * is NOT freed.
> > - */
> > -void rv_uprobe_free(struct rv_uprobe *p);
> > -
> >   #endif /* _RV_UPROBE_H */
> > diff --git a/kernel/trace/rv/monitors/tlob/tlob.c
> > b/kernel/trace/rv/monitors/tlob/tlob.c
> > index d8e0c47947..28a6c740c7 100644
> > --- a/kernel/trace/rv/monitors/tlob/tlob.c
> > +++ b/kernel/trace/rv/monitors/tlob/tlob.c
> > @@ -252,8 +252,8 @@ struct tlob_uprobe_binding {
> >   	char			binpath[TLOB_MAX_PATH];
> >   	loff_t			offset_start;
> >   	loff_t			offset_stop;
> > -	struct rv_uprobe	*start_probe;
> > -	struct rv_uprobe	*stop_probe;
> > +	DECLARE_RV_UPROBE(start_probe);
> > +	DECLARE_RV_UPROBE(stop_probe);
> >   };
> >   
> >   /* RCU callback: free the slab once no readers remain. */
> > @@ -512,16 +512,16 @@ int tlob_stop_task(struct task_struct *task)
> >   EXPORT_SYMBOL_GPL(tlob_stop_task);
> >   
> >   
> > -static int tlob_uprobe_entry_handler(struct rv_uprobe *p, struct pt_regs
> > *regs,
> > +static int tlob_uprobe_entry_handler(struct uprobe_consumer *self, struct
> > pt_regs *regs,
> >   				     __u64 *data)
> >   {
> > -	struct tlob_uprobe_binding *b = p->priv;
> > +	struct tlob_uprobe_binding *b = container_of(self, struct
> > tlob_uprobe_binding, start_probe.uc);
> >   
> >   	tlob_start_task(current, b->threshold_ns);
> >   	return 0;
> >   }
> >   
> > -static int tlob_uprobe_stop_handler(struct rv_uprobe *p, struct pt_regs
> > *regs,
> > +static int tlob_uprobe_stop_handler(struct uprobe_consumer *self, struct
> > pt_regs *regs,
> >   				    __u64 *data)
> >   {
> >   	tlob_stop_task(current);
> > @@ -537,6 +537,7 @@ static int tlob_add_uprobe(u64 threshold_ns, const char
> > *binpath,
> >   {
> >   	struct tlob_uprobe_binding *b, *tmp_b;
> >   	char pathbuf[TLOB_MAX_PATH];
> > +	struct inode *inode;
> >   	struct path path;
> >   	char *canon;
> >   	int ret;
> > @@ -561,10 +562,12 @@ static int tlob_add_uprobe(u64 threshold_ns, const
> > char *binpath,
> >   		goto err_path;
> >   	}
> >   
> > -	/* Reject duplicate start offset for the same binary. */
> > +	inode = d_real_inode(path.dentry);
> > +
> > +	/* Reject duplicate start offset for the same binary inode. */
> >   	list_for_each_entry(tmp_b, &tlob_uprobe_list, list) {
> >   		if (tmp_b->offset_start == offset_start &&
> > -		    tmp_b->start_probe->path.dentry == path.dentry) {
> > +		    tmp_b->start_probe.inode == inode) {
> >   			ret = -EEXIST;
> >   			goto err_path;
> >   		}
> > @@ -577,29 +580,25 @@ static int tlob_add_uprobe(u64 threshold_ns, const
> > char *binpath,
> >   	}
> >   	strscpy(b->binpath, canon, sizeof(b->binpath));
> >   
> > -	/* Both probes share b (priv) and path; attach_path refs path
> > itself. */
> > -	b->start_probe = rv_uprobe_attach_path(&path, offset_start,
> > -					       tlob_uprobe_entry_handler,
> > NULL, b);
> > -	if (IS_ERR(b->start_probe)) {
> > -		ret = PTR_ERR(b->start_probe);
> > -		b->start_probe = NULL;
> > -		goto err_path;
> > -	}
> > +	path_put(&path);
> > +
> > +	/* Both probes are registered directly on the embedded fields */
> > +	ret = rv_uprobe_register(b->binpath, offset_start, &b->start_probe,
> > +				 tlob_uprobe_entry_handler, NULL);
> > +	if (ret)
> > +		goto err_free;
> >   
> > -	b->stop_probe = rv_uprobe_attach_path(&path, offset_stop,
> > -					      tlob_uprobe_stop_handler,
> > NULL, b);
> > -	if (IS_ERR(b->stop_probe)) {
> > -		ret = PTR_ERR(b->stop_probe);
> > -		b->stop_probe = NULL;
> > +	ret = rv_uprobe_register(b->binpath, offset_stop, &b->stop_probe,
> > +				 tlob_uprobe_stop_handler, NULL);
> > +	if (ret)
> >   		goto err_start;
> > -	}
> >   
> > -	path_put(&path);
> >   	list_add_tail(&b->list, &tlob_uprobe_list);
> >   	return 0;
> >   
> >   err_start:
> > -	rv_uprobe_detach(b->start_probe);
> > +	rv_uprobe_unregister(&b->start_probe);
> > +	goto err_free;
> >   err_path:
> >   	path_put(&path);
> >   err_free:
> > @@ -611,21 +610,24 @@ static int tlob_remove_uprobe_by_key(loff_t
> > offset_start, const char *binpath)
> >   {
> >   	struct tlob_uprobe_binding *b, *tmp;
> >   	struct path remove_path;
> > +	struct inode *inode;
> >   	int ret;
> >   
> >   	ret = kern_path(binpath, LOOKUP_FOLLOW, &remove_path);
> >   	if (ret)
> >   		return ret;
> >   
> > +	inode = d_real_inode(remove_path.dentry);
> > +
> >   	ret = -ENOENT;
> >   	list_for_each_entry_safe(b, tmp, &tlob_uprobe_list, list) {
> >   		if (b->offset_start != offset_start)
> >   			continue;
> > -		if (b->start_probe->path.dentry != remove_path.dentry)
> > +		if (b->start_probe.inode != inode)
> >   			continue;
> >   		list_del(&b->list);
> > -		rv_uprobe_detach(b->start_probe);
> > -		rv_uprobe_detach(b->stop_probe);
> > +		rv_uprobe_unregister(&b->start_probe);
> > +		rv_uprobe_unregister(&b->stop_probe);
> >   		kfree(b);
> >   		ret = 0;
> >   		break;
> > @@ -643,8 +645,8 @@ static void tlob_remove_all_uprobes(void)
> >   	mutex_lock(&tlob_uprobe_mutex);
> >   	list_for_each_entry_safe(b, tmp, &tlob_uprobe_list, list) {
> >   		list_move(&b->list, &pending);
> > -		rv_uprobe_unregister_nosync(b->start_probe);
> > -		rv_uprobe_unregister_nosync(b->stop_probe);
> > +		rv_uprobe_unregister_nosync(&b->start_probe);
> > +		rv_uprobe_unregister_nosync(&b->stop_probe);
> >   	}
> >   	mutex_unlock(&tlob_uprobe_mutex);
> >   
> > @@ -658,8 +660,6 @@ static void tlob_remove_all_uprobes(void)
> >   	rv_uprobe_sync();
> >   
> >   	list_for_each_entry_safe(b, tmp, &pending, list) {
> > -		rv_uprobe_free(b->start_probe);
> > -		rv_uprobe_free(b->stop_probe);
> >   		kfree(b);
> >   	}
> >   }
> > diff --git a/kernel/trace/rv/rv_uprobe.c b/kernel/trace/rv/rv_uprobe.c
> > index 3d8b764dde..69b8b0c27e 100644
> > --- a/kernel/trace/rv/rv_uprobe.c
> > +++ b/kernel/trace/rv/rv_uprobe.c
> > @@ -10,149 +10,74 @@
> >   #include <linux/uprobes.h>
> >   #include <rv/rv_uprobe.h>
> >   
> > -/*
> > - * Private extension of struct rv_uprobe.  Allocated by rv_uprobe_attach*()
> > - * and returned to callers as &impl->pub.
> > - */
> > -struct rv_uprobe_impl {
> > -	struct rv_uprobe	pub;	/* must be first; callers hold &pub
> > */
> > -	struct uprobe_consumer	uc;
> > -	struct uprobe		*uprobe;
> > -	int (*entry_fn)(struct rv_uprobe *p, struct pt_regs *regs, __u64
> > *data);
> > -	int (*ret_fn)(struct rv_uprobe *p, unsigned long func,
> > -			struct pt_regs *regs, __u64 *data);
> > -};
> > -
> > -static int rv_uprobe_handler(struct uprobe_consumer *uc,
> > -			     struct pt_regs *regs, __u64 *data)
> > -{
> > -	struct rv_uprobe_impl *impl = container_of(uc, struct
> > rv_uprobe_impl, uc);
> > -
> > -	if (impl->entry_fn)
> > -		return impl->entry_fn(&impl->pub, regs, data);
> > -	return 0;
> > -}
> > -
> > -static int rv_uprobe_ret_handler(struct uprobe_consumer *uc,
> > -				 unsigned long func,
> > -				 struct pt_regs *regs, __u64 *data)
> > -{
> > -	struct rv_uprobe_impl *impl = container_of(uc, struct
> > rv_uprobe_impl, uc);
> > -
> > -	if (impl->ret_fn)
> > -		return impl->ret_fn(&impl->pub, func, regs, data);
> > -	return 0;
> > -}
> > -
> > -static struct rv_uprobe *
> > -__rv_uprobe_attach(struct inode *inode, struct path *path, loff_t offset,
> > -		   int (*entry_fn)(struct rv_uprobe *p, struct pt_regs
> > *regs, __u64 *data),
> > -		   int (*ret_fn)(struct rv_uprobe *p, unsigned long func,
> > -				   struct pt_regs *regs, __u64 *data),
> > -		   void *priv)
> > -{
> > -	struct rv_uprobe_impl *impl;
> > -	int ret;
> > -
> > -	if (!entry_fn && !ret_fn)
> > -		return ERR_PTR(-EINVAL);
> > -
> > -	impl = kzalloc_obj(*impl, GFP_KERNEL);
> > -	if (!impl)
> > -		return ERR_PTR(-ENOMEM);
> > -
> > -	impl->pub.offset = offset;
> > -	impl->pub.priv   = priv;
> > -	impl->entry_fn   = entry_fn;
> > -	impl->ret_fn     = ret_fn;
> > -	path_get(path);
> > -	impl->pub.path   = *path;
> > -
> > -	if (entry_fn)
> > -		impl->uc.handler     = rv_uprobe_handler;
> > -	if (ret_fn)
> > -		impl->uc.ret_handler = rv_uprobe_ret_handler;
> > -
> > -	impl->uprobe = uprobe_register(inode, offset, 0, &impl->uc);
> > -	if (IS_ERR(impl->uprobe)) {
> > -		ret = PTR_ERR(impl->uprobe);
> > -		path_put(&impl->pub.path);
> > -		kfree(impl);
> > -		return ERR_PTR(ret);
> > -	}
> > -
> > -	return &impl->pub;
> > -}
> > -
> > -/**
> > - * rv_uprobe_attach_path - register an uprobe given an already-resolved
> > path
> > - */
> > -struct rv_uprobe *rv_uprobe_attach_path(struct path *path, loff_t offset,
> > -	int (*entry_fn)(struct rv_uprobe *p, struct pt_regs *regs, __u64
> > *data),
> > -	int (*ret_fn)(struct rv_uprobe *p, unsigned long func,
> > -			struct pt_regs *regs, __u64 *data),
> > -	void *priv)
> > -{
> > -	struct inode *inode = d_real_inode(path->dentry);
> > -
> > -	return __rv_uprobe_attach(inode, path, offset, entry_fn, ret_fn,
> > priv);
> > -}
> > -EXPORT_SYMBOL_GPL(rv_uprobe_attach_path);
> > -
> >   /**
> > - * rv_uprobe_attach - resolve binpath and register an uprobe
> > + * rv_uprobe_register - resolve binpath and register an uprobe
> >    */
> > -struct rv_uprobe *rv_uprobe_attach(const char *binpath, loff_t offset,
> > -	int (*entry_fn)(struct rv_uprobe *p, struct pt_regs *regs, __u64
> > *data),
> > -	int (*ret_fn)(struct rv_uprobe *p, unsigned long func,
> > -			struct pt_regs *regs, __u64 *data),
> > -	void *priv)
> > +int rv_uprobe_register(const char *binpath, loff_t offset, struct rv_uprobe
> > *p,
> > +	int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs,
> > __u64 *data),
> > +	int (*ret_handler)(struct uprobe_consumer *self, unsigned long
> > func,
> > +			   struct pt_regs *regs, __u64 *data))
> >   {
> > -	struct rv_uprobe *p;
> > +	struct inode *inode;
> >   	struct path path;
> >   	int ret;
> >   
> > +	if (!handler && !ret_handler)
> > +		return -EINVAL;
> > +
> >   	ret = kern_path(binpath, LOOKUP_FOLLOW, &path);
> >   	if (ret)
> > -		return ERR_PTR(ret);
> > +		return ret;
> >   
> >   	if (!d_is_reg(path.dentry)) {
> >   		path_put(&path);
> > -		return ERR_PTR(-EINVAL);
> > +		return -EINVAL;
> >   	}
> >   
> > -	p = rv_uprobe_attach_path(&path, offset, entry_fn, ret_fn, priv);
> > +	inode = d_real_inode(path.dentry);
> > +
> > +	p->uc.handler     = handler;
> > +	p->uc.ret_handler = ret_handler;
> > +	p->inode          = inode;
> > +
> > +	p->uprobe = uprobe_register(inode, offset, 0, &p->uc);
> >   	path_put(&path);
> > -	return p;
> > +
> > +	if (IS_ERR(p->uprobe)) {
> > +		ret = PTR_ERR(p->uprobe);
> > +		p->uprobe = NULL;
> > +		p->inode  = NULL;
> > +		return ret;
> > +	}
> > +
> > +	return 0;
> >   }
> > -EXPORT_SYMBOL_GPL(rv_uprobe_attach);
> > +EXPORT_SYMBOL_GPL(rv_uprobe_register);
> >   
> >   /**
> > - * rv_uprobe_detach - synchronously unregister an uprobe and free it
> > + * rv_uprobe_unregister - synchronously unregister a uprobe
> >    */
> > -void rv_uprobe_detach(struct rv_uprobe *p)
> > +void rv_uprobe_unregister(struct rv_uprobe *p)
> >   {
> > -	if (!p)
> > +	if (!p || IS_ERR_OR_NULL(p->uprobe))
> >   		return;
> >   
> >   	rv_uprobe_unregister_nosync(p);
> >   	rv_uprobe_sync();
> > -	rv_uprobe_free(p);
> >   }
> > -EXPORT_SYMBOL_GPL(rv_uprobe_detach);
> > +EXPORT_SYMBOL_GPL(rv_uprobe_unregister);
> >   
> >   /**
> >    * rv_uprobe_unregister_nosync - dequeue an uprobe without waiting
> >    */
> >   void rv_uprobe_unregister_nosync(struct rv_uprobe *p)
> >   {
> > -	struct rv_uprobe_impl *impl;
> > -
> > -	if (!p)
> > +	if (!p || IS_ERR_OR_NULL(p->uprobe))
> >   		return;
> >   
> > -	impl = container_of(p, struct rv_uprobe_impl, pub);
> > -	uprobe_unregister_nosync(impl->uprobe, &impl->uc);
> > +	uprobe_unregister_nosync(p->uprobe, &p->uc);
> > +	p->uprobe = NULL;
> > +	p->inode  = NULL;
> >   }
> >   EXPORT_SYMBOL_GPL(rv_uprobe_unregister_nosync);
> >   
> > @@ -164,19 +89,3 @@ void rv_uprobe_sync(void)
> >   	uprobe_unregister_sync();
> >   }
> >   EXPORT_SYMBOL_GPL(rv_uprobe_sync);
> > -
> > -/**
> > - * rv_uprobe_free - release resources of a previously deregistered probe
> > - */
> > -void rv_uprobe_free(struct rv_uprobe *p)
> > -{
> > -	struct rv_uprobe_impl *impl;
> > -
> > -	if (!p)
> > -		return;
> > -
> > -	impl = container_of(p, struct rv_uprobe_impl, pub);
> > -	path_put(&p->path);
> > -	kfree(impl);
> > -}
> > -EXPORT_SYMBOL_GPL(rv_uprobe_free);
> > 


^ permalink raw reply

* Re: [PATCH 1/2] tracing: Embed 'char comm[16]' in a structure
From: David Laight @ 2026-06-30  9:26 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Masami Hiramatsu, Mathieu Desnoyers, linux-kernel,
	linux-trace-kernel, Michal Koutný
In-Reply-To: <20260629162634.0327fa25@robin>

On Mon, 29 Jun 2026 16:26:34 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> On Fri, 26 Jun 2026 22:23:55 +0100
> David Laight <david.laight.linux@gmail.com> wrote:
> 
> \> --- a/kernel/trace/trace.c
> > +++ b/kernel/trace/trace.c
> > @@ -52,6 +52,7 @@
> >  #include <linux/sort.h>
> >  #include <linux/io.h> /* vmap_page_range() */
> >  #include <linux/fs_context.h>
> > +#include <linux/trace_printk.h>  
> 
> Left over debugging? ;-)

Possibly from a build where I'd taken it out of kernel.h.
I think this file needs that header from somewhere.
Probably for the prototypes for tracing_on() etc.
It really ought to have an explicit include rather than relying on
a 'proxy' include.

I did resolve a lot of those include issues by putting it into the
main trace headers (which is big an non-trivial).

	David

> 
> -- Steve


^ permalink raw reply

* Re: [PATCH 2/2] tracing: Keep pid and comm[] in the same structure
From: David Laight @ 2026-06-30 10:01 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Masami Hiramatsu, Mathieu Desnoyers, linux-kernel,
	linux-trace-kernel, Michal Koutný
In-Reply-To: <20260629164912.4c1c2855@robin>

On Mon, 29 Jun 2026 16:49:12 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> On Fri, 26 Jun 2026 22:23:56 +0100
> David Laight <david.laight.linux@gmail.com> wrote:
> 
> > Rather than have two separate dynamic arrays on the end of struct
> > saved_commandlines_buffer have a single dynamic array where each
> > entry contains the pid and associated task->comm[].
> > This simplifies the initialisation and lookup.
> > 
> > Don't bother trying to initialise the pid field no a non-zero value,
> > it only matters in the tracing_saved_cmdlines_seq_ops code.
> > Allocate entry [0] first so that the tracing_saved_cmdlines_seq_ops
> > code can just index the array with the file offset.
> > 
> > The code now uses the correct size when determining the page 'order'
> > to free the structure. The smaller size will always give the same
> > 'order'.
> > 
> > Signed-off-by: David Laight <david.laight.linux@gmail.com>
> > ---
> > 
> > Is there any reason why this code uses alloc_pages() rather
> > than vmalloc()?  
> 
> It's been a long time since I worked on this, but IIRC, it was to keep
> the pressure down on the TLB when tracing. It updates at every
> sched_switch that has a trace event occurring so, I likely used normal
> pages which are part of the huge pages the kernel sets up and doesn't
> affect the TLB as much. vmalloc does have impact on the TLB pressure,
> and tracing should always try to avoid that.

Isn't this a cache so that the pid numbers can be converted to strings
when the trace is read out after the actual process has exited?
That does mean that cache doesn't need to be updated on every trace
request - it might be enough to just save on process exit and lookup the
pid itself for running processes (the whole thing relies on pids not
being reused).

> 
> > map_pid_to_cmdline[] is 64k*sizeof(int) so the whole structure
> > expands to 512k with about 64k/20 (about 3200) pid entries even
> > though the default is 128.  
> 
> That's because it is not dynamic. That array needs to be able to hold
> most PIDs. The default is 128 but it will expand to how much it can
> hold to allocate the full map_pid_to_cmdline. The real default for 4098
> page sized architectures is 6552 entries.

That is double my 'quick calculation' - but both are a lot of entries.

> > AFAICT there is only one copy of the data - so it could be static.
> > Perhaps with pointers to map_pid_cmdline[] and (after this patch)
> > pid_comm[], both of which could be separately resized.  
> 
> map_pid_t_cmdline[] is to hold the PID_MAX_DEFAULT amount of PIDs to
> avoid collisions. I wouldn't resize it.

If comm[] is only saved on process exit you'd likely get away with far
fewer entries - getting collisions for processes that have exited is
rather unlikely.
(I wonder if I could make that work.)

Does that memory get allocated at boot time?
512k is a lot to allocated for a feature that won't usually be used.
OTOH you won't reliably get that much contiguous memory later on.
Deferring to a later time (maybe as late as the first tracing_on())
might be more reasonable - but that would have to use vmalloc().

I'm also not sure about the code that lets you trace from boot.
That must be able to initialise early - but I'm not sure how early.

	David

> 
> > 
> > I also noticed that map_pid_to_cmdline[] contains indexes into
> > pid_comm[], restricting these to 16bits would half the data area.  
> 
> Hmm, yeah, this could be useful, as it doesn't appear one could make
> saved_cmdline_size greater than 65536 (or even close to that).
> 
> -- Steve


^ permalink raw reply

* Re: [RFC PATCH v2 4/4] rtla/osnoise: Leverage IPI event filters when tracing a subset of CPUs
From: Tomas Glozar @ 2026-06-30 10:14 UTC (permalink / raw)
  To: Valentin Schneider
  Cc: linux-kernel, linux-trace-kernel, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Costa Shulyupin,
	Crystal Wood, John Kacur, Ivan Pravdin, Jonathan Corbet
In-Reply-To: <20260617131803.2988989-5-vschneid@redhat.com>

st 17. 6. 2026 v 15:18 odesílatel Valentin Schneider
<vschneid@redhat.com> napsal:
>
> Instead of post-processing the events in the tracefs_iterate_raw_events()
> callbacks, leverage the kernel event filtering infrastructure to only emit
> IPI events if they target CPUs that are being traced, as specified by the
> -c cmdline option.
>
> Note that some post-processing is still required for the ipi_send_cpumask
> event, as the event being emitted means *some* CPUs targeted by that event
> are monitored, but not all of them - userspace has to recompute that
> intersection.
>

Nit: I'd drop the "Instead of post-processing the events in the
tracefs_iterate_raw_events() callbacks" sentence. I find it a bit
confusing, as "instead of" is quite a strong wording implying
post-processing is removed (at least to my perception), but in the
next paragraph, you contradict it by saying that some post-processing
is still done. Also the commit message is perfectly understandable
without it.

> Signed-off-by: Valentin Schneider <vschneid@redhat.com>
> ---
>  tools/tracing/rtla/src/osnoise_top.c | 37 +++++++++++++++++++++++++---
>  1 file changed, 33 insertions(+), 4 deletions(-)
>
> diff --git a/tools/tracing/rtla/src/osnoise_top.c b/tools/tracing/rtla/src/osnoise_top.c
> index 5b462a3543b97..8040521710884 100644
> --- a/tools/tracing/rtla/src/osnoise_top.c
> +++ b/tools/tracing/rtla/src/osnoise_top.c
> @@ -93,18 +93,15 @@ osnoise_ipi_cpu_handler(struct trace_seq *s, struct tep_record *record,
>                      struct tep_event *event, void *context)
>  {
>         struct osnoise_tool *tool;
> -       struct osnoise_params *params;
>         unsigned long long src_cpu, dst_cpu;
>         struct trace_instance *trace = context;
>
>         tool = container_of(trace, struct osnoise_tool, trace);
> -       params = to_osnoise_params(tool->params);
>
>         src_cpu = record->cpu;
>         tep_get_field_val(s, event, "cpu", record, &dst_cpu, 1);
>
> -       if (CPU_ISSET(dst_cpu, &params->common.monitored_cpus))
> -               account_ipi(tool, src_cpu, dst_cpu);
> +       account_ipi(tool, src_cpu, dst_cpu);
>
>         return 0;
>  }
> @@ -141,6 +138,11 @@ osnoise_ipi_cpumask_handler(struct trace_seq *s, struct tep_record *record,
>                 return 0;
>         }
>
> +       /*
> +        * Despite already filtering for such an intersection, we need to compute
> +        * the intersection here as the @cpumask field may contain non-monitered

Typo: non-monitered -> non-monitored

> +        * CPUs.
> +        */
>         CPU_AND(&cpumask_tmp_cpus, event_cpus, &params->common.monitored_cpus);
>
>         /*
> @@ -406,6 +408,33 @@ struct osnoise_tool *osnoise_init_top(struct common_params *params)
>                 goto out_err;
>         }
>
> +       /*
> +        * If tracing on a subset of possible CPUs, leverage the kernel filtering
> +        * infrastructure to only generate events on traced CPUs.
> +        */
> +       if (params->cpus) {
> +               char filter[MAX_PATH];
> +
> +               snprintf(filter, ARRAY_SIZE(filter), "cpu & CPUS{%s}\n", params->cpus);
> +               retval = tracefs_event_file_write(tool->trace.inst,
> +                                                 "ipi", "ipi_send_cpu", "filter",
> +                                                 filter);
> +               if (retval) {

retval is the number of bytes written here, so this should be "retval
< 0" like in trace_event_enable_filter() in trace.c. Same below.

> +                       err_msg("Could not set ipi_send_cpu CPU filter\n");
> +                       goto out_err;

It would be useful to have --ipi work even on older kernels that don't
yet have your cpumask trace event filter patchset [1], for example, by
printing a debug message that filtering is disabled and setting a flag
instead of erroring out here. Then the code in
osnoise_ipi_cpu_handler() can preserve the CPU_ISSET check if the flag
is set.

As --ipi is optional, we can choose to only support it on newer
kernels, but it would be nice to have it working without the filter,
too.

[1] https://lore.kernel.org/linux-trace-kernel/20230707172155.70873-1-vschneid@redhat.com/T/#u

> +               }
> +
> +
> +               snprintf(filter, ARRAY_SIZE(filter), "cpumask & CPUS{%s}\n", params->cpus);
> +               retval = tracefs_event_file_write(tool->trace.inst,
> +                                                 "ipi", "ipi_send_cpumask", "filter",
> +                                                 filter);
> +               if (retval) {
> +                       err_msg("Could not set ipi_send_cpumask CPU filter\n");
> +                       goto out_err;
> +               }

Same two comments above apply here.

> +       }
> +
>         tep_register_event_handler(tool->trace.tep, -1, "ipi", "ipi_send_cpu",
>                                    osnoise_ipi_cpu_handler, NULL);
>
> --
> 2.54.0
>

I was thinking that it might make sense to enable the filters also for
the trace output instance. On the other hand, it would make it
difficult to enable the event without the filter then, as specifying
"-e ipi" or similar only re-enables the event but does not remove the
filter. Maybe the better idea is to implement an option to filter any
event enabled through -e/--event only to the measurement CPU, as a
separate feature.

Tomas


^ permalink raw reply

* Re: [PATCH v8 02/46] KVM: Rename KVM_GENERIC_MEMORY_ATTRIBUTES to KVM_VM_MEMORY_ATTRIBUTES
From: Xiaoyao Li @ 2026-06-30 10:45 UTC (permalink / raw)
  To: ackerleytng, aik, andrew.jones, binbin.wu, brauner, chao.p.peng,
	david, jmattson, jthoughton, michael.roth, oupton, pankaj.gupta,
	qperret, rick.p.edgecombe, rientjes, shivankg, steven.price,
	tabba, willy, wyihan, yan.y.zhao, forkloop, pratyush,
	suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
	Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Barry Song, Axel Rasmussen,
	Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng, Shakeel Butt,
	Kiryl Shutsemau, Baoquan He, Jason Gunthorpe, Vlastimil Babka
  Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <20260618-gmem-inplace-conversion-v8-2-9d2959357853@google.com>

On 6/19/2026 8:31 AM, Ackerley Tng via B4 Relay wrote:
> From: Sean Christopherson <seanjc@google.com>
> 
> Rename the per-VM memory attributes Kconfig to make it explicitly about
> per-VM attributes in anticipation of adding memory attributes support to
> guest_memfd, at which point it will be possible (and desirable) to have
> memory attributes without the per-VM support, even in x86.
> 
> No functional change intended.
> 
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> Reviewed-by: Fuad Tabba <tabba@google.com>
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>

Reviewed-by: Xiaoyao Li <xiaoyao.li@intel.com>

^ permalink raw reply

* Re: [PATCH v8 03/46] KVM: Move KVM_VM_MEMORY_ATTRIBUTES config definition to x86
From: Xiaoyao Li @ 2026-06-30 10:45 UTC (permalink / raw)
  To: ackerleytng, aik, andrew.jones, binbin.wu, brauner, chao.p.peng,
	david, jmattson, jthoughton, michael.roth, oupton, pankaj.gupta,
	qperret, rick.p.edgecombe, rientjes, shivankg, steven.price,
	tabba, willy, wyihan, yan.y.zhao, forkloop, pratyush,
	suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
	Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Barry Song, Axel Rasmussen,
	Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng, Shakeel Butt,
	Kiryl Shutsemau, Baoquan He, Jason Gunthorpe, Vlastimil Babka
  Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <20260618-gmem-inplace-conversion-v8-3-9d2959357853@google.com>

On 6/19/2026 8:31 AM, Ackerley Tng via B4 Relay wrote:
> From: Sean Christopherson<seanjc@google.com>
> 
> Bury KVM_VM_MEMORY_ATTRIBUTES in x86 to discourage other architectures
> from adding support for per-VM memory attributes, because tracking private
> vs. shared memory on a per-VM basis is now deprecated in favor of tracking
> on a per-guest_memfd basis, and while RWX memory attributes are on the
> horizon, they too are expected to be x86-only.
> 
> This will also allow modifying KVM_VM_MEMORY_ATTRIBUTES to be
> user-selectable (in x86) without creating weirdness in KVM's Kconfigs.
> Now that guest_memfd supports in-place conversions, it's entirely possible
> to run x86 CoCo VMs without support for KVM_VM_MEMORY_ATTRIBUTES.
> 
> Leave the code itself in common KVM so that it's trivial to undo this
> change if new per-VM attributes do come along.
> 
> Signed-off-by: Sean Christopherson<seanjc@google.com>
> Reviewed-by: Fuad Tabba<tabba@google.com>
> Signed-off-by: Ackerley Tng<ackerleytng@google.com>

Reviewed-by: Xiaoyao Li <xiaoyao.li@intel.com>

^ permalink raw reply

* Re: [PATCH v8 04/46] KVM: Decouple kvm_has_arch_private_mem from CONFIG_KVM_VM_MEMORY_ATTRIBUTES
From: Xiaoyao Li @ 2026-06-30 10:47 UTC (permalink / raw)
  To: ackerleytng, aik, andrew.jones, binbin.wu, brauner, chao.p.peng,
	david, jmattson, jthoughton, michael.roth, oupton, pankaj.gupta,
	qperret, rick.p.edgecombe, rientjes, shivankg, steven.price,
	tabba, willy, wyihan, yan.y.zhao, forkloop, pratyush,
	suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
	Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Barry Song, Axel Rasmussen,
	Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng, Shakeel Butt,
	Kiryl Shutsemau, Baoquan He, Jason Gunthorpe, Vlastimil Babka
  Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <20260618-gmem-inplace-conversion-v8-4-9d2959357853@google.com>

On 6/19/2026 8:31 AM, Ackerley Tng via B4 Relay wrote:
> From: Sean Christopherson <seanjc@google.com>
> 
> When memory attributes become trackable in guest_memfd, the concept of
> having private memory is no longer dependent on
> CONFIG_KVM_VM_MEMORY_ATTRIBUTES.
> 
> With this, on x86, kvm_arch_has_private_mem() is defined if some CoCo
> platform support (or the testing CONFIG_KVM_SW_PROTECTED_VM) is compiled
> in.
> 
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> Co-developed-by: Ackerley Tng <ackerleytng@google.com>
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>

Reviewed-by: Xiaoyao Li <xiaoyao.li@intel.com>

> ---
>   arch/x86/include/asm/kvm_host.h | 4 +++-
>   include/linux/kvm_host.h        | 2 +-
>   2 files changed, 4 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 8e8eb8a5e8a6b..1bde67cf6eb0e 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -2394,7 +2394,9 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
>   		       int tdp_max_root_level, int tdp_huge_page_level);
>   
>   
> -#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
> +#if defined(CONFIG_KVM_SW_PROTECTED_VM) ||	\
> +	defined(CONFIG_KVM_INTEL_TDX) ||	\
> +	defined(CONFIG_KVM_AMD_SEV)

Maybe we can just remove the #ifdef and make it always avaiable?

>   #define kvm_arch_has_private_mem(kvm) ((kvm)->arch.has_private_mem)
>   #endif
>   
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 201d0f2143976..d370e834d619e 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -722,7 +722,7 @@ static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu)
>   }
>   #endif
>   
> -#ifndef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
> +#ifndef kvm_arch_has_private_mem
>   static inline bool kvm_arch_has_private_mem(struct kvm *kvm)
>   {
>   	return false;
> 


^ permalink raw reply

* Re: [PATCH v8 05/46] KVM: Make CONFIG_KVM_VM_MEMORY_ATTRIBUTES selectable
From: Xiaoyao Li @ 2026-06-30 10:55 UTC (permalink / raw)
  To: ackerleytng, aik, andrew.jones, binbin.wu, brauner, chao.p.peng,
	david, jmattson, jthoughton, michael.roth, oupton, pankaj.gupta,
	qperret, rick.p.edgecombe, rientjes, shivankg, steven.price,
	tabba, willy, wyihan, yan.y.zhao, forkloop, pratyush,
	suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
	Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Barry Song, Axel Rasmussen,
	Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng, Shakeel Butt,
	Kiryl Shutsemau, Baoquan He, Jason Gunthorpe, Vlastimil Babka
  Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <20260618-gmem-inplace-conversion-v8-5-9d2959357853@google.com>

On 6/19/2026 8:31 AM, Ackerley Tng via B4 Relay wrote:
> From: Ackerley Tng <ackerleytng@google.com>
> 
> Make CONFIG_KVM_VM_MEMORY_ATTRIBUTES selectable, only for (CoCo) VM types
> that might use vm_memory_attributes.
> 
> Also document CONFIG_KVM_VM_MEMORY_ATTRIBUTES to specifically be about the
> private/shared attribute.

I think this patch needs to be moved later after per-gmem shared/private 
attribute is implemented. Because so far, TDX/SEV indeed depend on 
CONFIG_KVM_VM_MEMORY_ATTRIBUTES.

Not to discuss if it makes sense to report TDX as supported VM TYPE when 
CONFIG_KVM_VM_MEMORY_ATTRIBUTES is not enabled, this patch just fails 
the compilation when

   CONFIG_KVM_VM_MEMORY_ATTRIBUTES = n

and KVM_INTEL_TDX/KVM_AMD_SEV is enabled:

arch/x86/kvm/../../../virt/kvm/guest_memfd.c: In function 
‘__kvm_gmem_populate’:
arch/x86/kvm/../../../virt/kvm/guest_memfd.c:918:14: error: implicit 
declaration of function ‘kvm_range_has_memory_attributes’ 
[-Werror=implicit-function-declaration]
   918 |         if (!kvm_range_has_memory_attributes(kvm, gfn, gfn + 1,
       |              ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
>   arch/x86/kvm/Kconfig | 9 +++++----
>   1 file changed, 5 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
> index 24f96396cfa1c..c28393dc664eb 100644
> --- a/arch/x86/kvm/Kconfig
> +++ b/arch/x86/kvm/Kconfig
> @@ -81,13 +81,16 @@ config KVM_WERROR
>   	  If in doubt, say "N".
>   
>   config KVM_VM_MEMORY_ATTRIBUTES
> -	bool
> +	depends on KVM_SW_PROTECTED_VM || KVM_INTEL_TDX || KVM_AMD_SEV
> +	bool "Enable per-VM PRIVATE vs. SHARED attributes (for CoCo VMs)"
> +	help
> +	  Enable support for tracking PRIVATE vs. SHARED memory using per-VM
> +	  memory attributes.
>   
>   config KVM_SW_PROTECTED_VM
>   	bool "Enable support for KVM software-protected VMs"
>   	depends on EXPERT
>   	depends on KVM_X86 && X86_64
> -	select KVM_VM_MEMORY_ATTRIBUTES
>   	help
>   	  Enable support for KVM software-protected VMs.  Currently, software-
>   	  protected VMs are purely a development and testing vehicle for
> @@ -138,7 +141,6 @@ config KVM_INTEL_TDX
>   	bool "Intel Trust Domain Extensions (TDX) support"
>   	default y
>   	depends on INTEL_TDX_HOST
> -	select KVM_VM_MEMORY_ATTRIBUTES
>   	select HAVE_KVM_ARCH_GMEM_POPULATE
>   	help
>   	  Provides support for launching Intel Trust Domain Extensions (TDX)
> @@ -162,7 +164,6 @@ config KVM_AMD_SEV
>   	depends on KVM_AMD && X86_64
>   	depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m)
>   	select ARCH_HAS_CC_PLATFORM
> -	select KVM_VM_MEMORY_ATTRIBUTES
>   	select HAVE_KVM_ARCH_GMEM_PREPARE
>   	select HAVE_KVM_ARCH_GMEM_INVALIDATE
>   	select HAVE_KVM_ARCH_GMEM_POPULATE
> 


^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox