Linux Trace Kernel
 help / color / mirror / Atom feed
* [PATCH RFC v4 02/44] KVM: Rename KVM_GENERIC_MEMORY_ATTRIBUTES to KVM_VM_MEMORY_ATTRIBUTES
From: Ackerley Tng @ 2026-03-26 22:24 UTC (permalink / raw)
  To: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	ira.weiny, jmattson, jroedel, jthoughton, michael.roth, oupton,
	pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
	steven.price, tabba, willy, wyihan, yan.y.zhao, forkloop,
	pratyush, suzuki.poulose, aneesh.kumar, Paolo Bonzini,
	Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
	Axel Rasmussen, Yuanchu Xie, Wei Xu, Jason Gunthorpe,
	Vlastimil Babka
  Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, Ackerley Tng
In-Reply-To: <20260326-gmem-inplace-conversion-v4-0-e202fe950ffd@google.com>

From: Sean Christopherson <seanjc@google.com>

Rename the per-VM memory attributes Kconfig to make it explicitly about
per-VM attributes in anticipation of adding memory attributes support to
guest_memfd, at which point it will be possible (and desirable) to have
memory attributes without the per-VM support, even in x86.

No functional change intended.

Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/include/asm/kvm_host.h |  2 +-
 arch/x86/kvm/Kconfig            |  6 +++---
 arch/x86/kvm/mmu/mmu.c          |  2 +-
 arch/x86/kvm/x86.c              |  2 +-
 include/linux/kvm_host.h        |  8 ++++----
 include/trace/events/kvm.h      |  4 ++--
 virt/kvm/Kconfig                |  2 +-
 virt/kvm/kvm_main.c             | 14 +++++++-------
 8 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6e4e3ef9b8c72..cf3d2bdababc7 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -2329,7 +2329,7 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
 		       int tdp_max_root_level, int tdp_huge_page_level);
 
 
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
 #define kvm_arch_has_private_mem(kvm) ((kvm)->arch.has_private_mem)
 #endif
 
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 801bf9e520db3..26f6afd51bbdc 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -84,7 +84,7 @@ config KVM_SW_PROTECTED_VM
 	bool "Enable support for KVM software-protected VMs"
 	depends on EXPERT
 	depends on KVM_X86 && X86_64
-	select KVM_GENERIC_MEMORY_ATTRIBUTES
+	select KVM_VM_MEMORY_ATTRIBUTES
 	help
 	  Enable support for KVM software-protected VMs.  Currently, software-
 	  protected VMs are purely a development and testing vehicle for
@@ -135,7 +135,7 @@ config KVM_INTEL_TDX
 	bool "Intel Trust Domain Extensions (TDX) support"
 	default y
 	depends on INTEL_TDX_HOST
-	select KVM_GENERIC_MEMORY_ATTRIBUTES
+	select KVM_VM_MEMORY_ATTRIBUTES
 	select HAVE_KVM_ARCH_GMEM_POPULATE
 	help
 	  Provides support for launching Intel Trust Domain Extensions (TDX)
@@ -159,7 +159,7 @@ config KVM_AMD_SEV
 	depends on KVM_AMD && X86_64
 	depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m)
 	select ARCH_HAS_CC_PLATFORM
-	select KVM_GENERIC_MEMORY_ATTRIBUTES
+	select KVM_VM_MEMORY_ATTRIBUTES
 	select HAVE_KVM_ARCH_GMEM_PREPARE
 	select HAVE_KVM_ARCH_GMEM_INVALIDATE
 	select HAVE_KVM_ARCH_GMEM_POPULATE
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index b922a8b000577..792701b093234 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -7889,7 +7889,7 @@ void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
 		vhost_task_stop(kvm->arch.nx_huge_page_recovery_thread);
 }
 
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
 static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
 				int level)
 {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fd1c4a36b5936..7e133a9da11f0 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -13509,7 +13509,7 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm,
 		}
 	}
 
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
 	kvm_mmu_init_memslot_memory_attributes(kvm, slot);
 #endif
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 6b76e7a6f4c22..e75f7295af5d0 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -721,7 +721,7 @@ static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu)
 }
 #endif
 
-#ifndef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifndef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
 static inline bool kvm_arch_has_private_mem(struct kvm *kvm)
 {
 	return false;
@@ -870,7 +870,7 @@ struct kvm {
 #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
 	struct notifier_block pm_notifier;
 #endif
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
 	/* Protected by slots_lock (for writes) and RCU (for reads) */
 	struct xarray mem_attr_array;
 #endif
@@ -2513,7 +2513,7 @@ static inline bool kvm_memslot_is_gmem_only(const struct kvm_memory_slot *slot)
 	return slot->flags & KVM_MEMSLOT_GMEM_ONLY;
 }
 
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
 static inline unsigned long kvm_get_memory_attributes(struct kvm *kvm, gfn_t gfn)
 {
 	return xa_to_value(xa_load(&kvm->mem_attr_array, gfn));
@@ -2535,7 +2535,7 @@ static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
 {
 	return false;
 }
-#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
+#endif /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
 
 #ifdef CONFIG_KVM_GUEST_MEMFD
 int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index b282e3a867696..1ba72bd73ea2f 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -358,7 +358,7 @@ TRACE_EVENT(kvm_dirty_ring_exit,
 	TP_printk("vcpu %d", __entry->vcpu_id)
 );
 
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
 /*
  * @start:	Starting address of guest memory range
  * @end:	End address of guest memory range
@@ -383,7 +383,7 @@ TRACE_EVENT(kvm_vm_set_mem_attributes,
 	TP_printk("%#016llx -- %#016llx [0x%lx]",
 		  __entry->start, __entry->end, __entry->attr)
 );
-#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
+#endif /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
 
 TRACE_EVENT(kvm_unmap_hva_range,
 	TP_PROTO(unsigned long start, unsigned long end),
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 794976b88c6f9..5119cb37145fc 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -100,7 +100,7 @@ config KVM_ELIDE_TLB_FLUSH_IF_YOUNG
 config KVM_MMU_LOCKLESS_AGING
        bool
 
-config KVM_GENERIC_MEMORY_ATTRIBUTES
+config KVM_VM_MEMORY_ATTRIBUTES
        bool
 
 config KVM_GUEST_MEMFD
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 9093251beb398..301d7ddac6ba6 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1122,7 +1122,7 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
 	spin_lock_init(&kvm->mn_invalidate_lock);
 	rcuwait_init(&kvm->mn_memslots_update_rcuwait);
 	xa_init(&kvm->vcpu_array);
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
 	xa_init(&kvm->mem_attr_array);
 #endif
 
@@ -1307,7 +1307,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
 	cleanup_srcu_struct(&kvm->irq_srcu);
 	srcu_barrier(&kvm->srcu);
 	cleanup_srcu_struct(&kvm->srcu);
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
 	xa_destroy(&kvm->mem_attr_array);
 #endif
 	kvm_arch_free_vm(kvm);
@@ -2425,7 +2425,7 @@ static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
 }
 #endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
 
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
 static u64 kvm_supported_mem_attributes(struct kvm *kvm)
 {
 	if (!kvm || kvm_arch_has_private_mem(kvm))
@@ -2630,7 +2630,7 @@ static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
 
 	return kvm_vm_set_mem_attributes(kvm, start, end, attrs->attributes);
 }
-#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
+#endif /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
 
 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
 {
@@ -4928,7 +4928,7 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
 	case KVM_CAP_SYSTEM_EVENT_DATA:
 	case KVM_CAP_DEVICE_CTRL:
 		return 1;
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
 	case KVM_CAP_MEMORY_ATTRIBUTES:
 		return kvm_supported_mem_attributes(kvm);
 #endif
@@ -5332,7 +5332,7 @@ static long kvm_vm_ioctl(struct file *filp,
 		break;
 	}
 #endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
 	case KVM_SET_MEMORY_ATTRIBUTES: {
 		struct kvm_memory_attributes attrs;
 
@@ -5343,7 +5343,7 @@ static long kvm_vm_ioctl(struct file *filp,
 		r = kvm_vm_ioctl_set_mem_attributes(kvm, &attrs);
 		break;
 	}
-#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
+#endif /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
 	case KVM_CREATE_DEVICE: {
 		struct kvm_create_device cd;
 

-- 
2.53.0.1018.g2bb0e51243-goog


^ permalink raw reply related

* [PATCH RFC v4 01/44] KVM: guest_memfd: Introduce per-gmem attributes, use to guard user mappings
From: Ackerley Tng @ 2026-03-26 22:24 UTC (permalink / raw)
  To: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	ira.weiny, jmattson, jroedel, jthoughton, michael.roth, oupton,
	pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
	steven.price, tabba, willy, wyihan, yan.y.zhao, forkloop,
	pratyush, suzuki.poulose, aneesh.kumar, Paolo Bonzini,
	Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
	Axel Rasmussen, Yuanchu Xie, Wei Xu, Jason Gunthorpe,
	Vlastimil Babka
  Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, Ackerley Tng
In-Reply-To: <20260326-gmem-inplace-conversion-v4-0-e202fe950ffd@google.com>

From: Sean Christopherson <seanjc@google.com>

Start plumbing in guest_memfd support for in-place private<=>shared
conversions by tracking attributes via a maple tree.  KVM currently tracks
private vs. shared attributes on a per-VM basis, which made sense when a
guest_memfd _only_ supported private memory, but tracking per-VM simply
can't work for in-place conversions as the shareability of a given page
needs to be per-gmem_inode, not per-VM.

Use the filemap invalidation lock to protect the maple tree, as taking the
lock for read when faulting in memory (for userspace or the guest) isn't
expected to result in meaningful contention, and using a separate lock
would add significant complexity (avoid deadlock is quite difficult).

Signed-off-by: Sean Christopherson <seanjc@google.com>
Co-developed-by: Ackerley Tng <ackerleytng@google.com>
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
Co-developed-by: Vishal Annapurve <vannapurve@google.com>
Signed-off-by: Vishal Annapurve <vannapurve@google.com>
Co-developed-by: Fuad Tabba <tabba@google.com>
Signed-off-by: Fuad Tabba <tabba@google.com>
---
 virt/kvm/guest_memfd.c | 139 +++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 123 insertions(+), 16 deletions(-)

diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 017d84a7adf37..aa2caf5114da2 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -4,6 +4,7 @@
 #include <linux/falloc.h>
 #include <linux/fs.h>
 #include <linux/kvm_host.h>
+#include <linux/maple_tree.h>
 #include <linux/mempolicy.h>
 #include <linux/pseudo_fs.h>
 #include <linux/pagemap.h>
@@ -32,6 +33,12 @@ struct gmem_inode {
 	struct inode vfs_inode;
 
 	u64 flags;
+	/*
+	 * Every index in this inode, whether memory is populated or
+	 * not, is tracked in attributes. There are no gaps in this
+	 * maple tree.
+	 */
+	struct maple_tree attributes;
 };
 
 static __always_inline struct gmem_inode *GMEM_I(struct inode *inode)
@@ -59,6 +66,31 @@ static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn)
 	return gfn - slot->base_gfn + slot->gmem.pgoff;
 }
 
+static u64 kvm_gmem_get_attributes(struct inode *inode, pgoff_t index)
+{
+	struct maple_tree *mt = &GMEM_I(inode)->attributes;
+	void *entry = mtree_load(mt, index);
+
+	/*
+	 * The lock _must_ be held for lookups, as some maple tree operations,
+	 * e.g. append, are unsafe (return inaccurate information) with respect
+	 * to concurrent RCU-protected lookups.
+	 */
+	lockdep_assert(mt_lock_is_held(mt));
+
+	return WARN_ON_ONCE(!entry) ? 0 : xa_to_value(entry);
+}
+
+static bool kvm_gmem_is_private_mem(struct inode *inode, pgoff_t index)
+{
+	return kvm_gmem_get_attributes(inode, index) & KVM_MEMORY_ATTRIBUTE_PRIVATE;
+}
+
+static bool kvm_gmem_is_shared_mem(struct inode *inode, pgoff_t index)
+{
+	return !kvm_gmem_is_private_mem(inode, index);
+}
+
 static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
 				    pgoff_t index, struct folio *folio)
 {
@@ -397,10 +429,13 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
 	if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode))
 		return VM_FAULT_SIGBUS;
 
-	if (!(GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED))
-		return VM_FAULT_SIGBUS;
+	filemap_invalidate_lock_shared(inode->i_mapping);
+	if (kvm_gmem_is_shared_mem(inode, vmf->pgoff))
+		folio = kvm_gmem_get_folio(inode, vmf->pgoff);
+	else
+		folio = ERR_PTR(-EACCES);
+	filemap_invalidate_unlock_shared(inode->i_mapping);
 
-	folio = kvm_gmem_get_folio(inode, vmf->pgoff);
 	if (IS_ERR(folio)) {
 		if (PTR_ERR(folio) == -EAGAIN)
 			return VM_FAULT_RETRY;
@@ -556,6 +591,51 @@ bool __weak kvm_arch_supports_gmem_init_shared(struct kvm *kvm)
 	return true;
 }
 
+static int kvm_gmem_init_inode(struct inode *inode, loff_t size, u64 flags)
+{
+	struct gmem_inode *gi = GMEM_I(inode);
+	MA_STATE(mas, &gi->attributes, 0, (size >> PAGE_SHIFT) - 1);
+	u64 attrs;
+	int r;
+
+	inode->i_op = &kvm_gmem_iops;
+	inode->i_mapping->a_ops = &kvm_gmem_aops;
+	inode->i_mode |= S_IFREG;
+	inode->i_size = size;
+	mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
+
+	/*
+	 * guest_memfd memory is neither migratable nor swappable: set
+	 * inaccessible to gate off both.
+	 */
+	mapping_set_inaccessible(inode->i_mapping);
+	WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
+
+	gi->flags = flags;
+
+	mt_set_external_lock(&gi->attributes,
+			     &inode->i_mapping->invalidate_lock);
+
+	/*
+	 * Store default attributes for the entire gmem instance. Ensuring every
+	 * index is represented in the maple tree at all times simplifies the
+	 * conversion and merging logic.
+	 */
+	attrs = gi->flags & GUEST_MEMFD_FLAG_INIT_SHARED ? 0 : KVM_MEMORY_ATTRIBUTE_PRIVATE;
+
+	/*
+	 * Acquire the invalidation lock purely to make lockdep happy.  The
+	 * maple tree library expects all stores to be protected via the lock,
+	 * and the library can't know when the tree is reachable only by the
+	 * caller, as is the case here.
+	 */
+	filemap_invalidate_lock(inode->i_mapping);
+	r = mas_store_gfp(&mas, xa_mk_value(attrs), GFP_KERNEL);
+	filemap_invalidate_unlock(inode->i_mapping);
+
+	return r;
+}
+
 static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
 {
 	static const char *name = "[kvm-gmem]";
@@ -586,16 +666,9 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
 		goto err_fops;
 	}
 
-	inode->i_op = &kvm_gmem_iops;
-	inode->i_mapping->a_ops = &kvm_gmem_aops;
-	inode->i_mode |= S_IFREG;
-	inode->i_size = size;
-	mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
-	mapping_set_inaccessible(inode->i_mapping);
-	/* Unmovable mappings are supposed to be marked unevictable as well. */
-	WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
-
-	GMEM_I(inode)->flags = flags;
+	err = kvm_gmem_init_inode(inode, size, flags);
+	if (err)
+		goto err_inode;
 
 	file = alloc_file_pseudo(inode, kvm_gmem_mnt, name, O_RDWR, &kvm_gmem_fops);
 	if (IS_ERR(file)) {
@@ -797,9 +870,13 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
 	if (!file)
 		return -EFAULT;
 
+	filemap_invalidate_lock_shared(file_inode(file)->i_mapping);
+
 	folio = __kvm_gmem_get_pfn(file, slot, index, pfn, max_order);
-	if (IS_ERR(folio))
-		return PTR_ERR(folio);
+	if (IS_ERR(folio)) {
+		r = PTR_ERR(folio);
+		goto out;
+	}
 
 	if (!folio_test_uptodate(folio)) {
 		clear_highpage(folio_page(folio, 0));
@@ -815,6 +892,8 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
 	else
 		folio_put(folio);
 
+out:
+	filemap_invalidate_unlock_shared(file_inode(file)->i_mapping);
 	return r;
 }
 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn);
@@ -944,13 +1023,41 @@ static struct inode *kvm_gmem_alloc_inode(struct super_block *sb)
 
 	mpol_shared_policy_init(&gi->policy, NULL);
 
+	/*
+	 * Memory attributes are protected by the filemap invalidation lock, but
+	 * the lock structure isn't available at this time.  Immediately mark
+	 * maple tree as using external locking so that accessing the tree
+	 * before it's fully initialized results in NULL pointer dereferences
+	 * and not more subtle bugs.
+	 */
+	mt_init_flags(&gi->attributes, MT_FLAGS_LOCK_EXTERN);
+
 	gi->flags = 0;
 	return &gi->vfs_inode;
 }
 
 static void kvm_gmem_destroy_inode(struct inode *inode)
 {
-	mpol_free_shared_policy(&GMEM_I(inode)->policy);
+	struct gmem_inode *gi = GMEM_I(inode);
+
+	mpol_free_shared_policy(&gi->policy);
+
+	/*
+	 * Note!  Checking for an empty tree is functionally necessary
+	 * to avoid explosions if the tree hasn't been fully
+	 * initialized, i.e. if the inode is being destroyed before
+	 * guest_memfd can set the external lock, lockdep would find
+	 * that the tree's internal ma_lock was not held.
+	 */
+	if (!mtree_empty(&gi->attributes)) {
+		/*
+		 * Acquire the invalidation lock purely to make lockdep happy,
+		 * the inode is unreachable at this point.
+		 */
+		filemap_invalidate_lock(inode->i_mapping);
+		__mt_destroy(&gi->attributes);
+		filemap_invalidate_unlock(inode->i_mapping);
+	}
 }
 
 static void kvm_gmem_free_inode(struct inode *inode)

-- 
2.53.0.1018.g2bb0e51243-goog


^ permalink raw reply related

* [PATCH RFC v4 00/44] guest_memfd: In-place conversion support
From: Ackerley Tng @ 2026-03-26 22:24 UTC (permalink / raw)
  To: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	ira.weiny, jmattson, jroedel, jthoughton, michael.roth, oupton,
	pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
	steven.price, tabba, willy, wyihan, yan.y.zhao, forkloop,
	pratyush, suzuki.poulose, aneesh.kumar, Paolo Bonzini,
	Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
	Axel Rasmussen, Yuanchu Xie, Wei Xu, Jason Gunthorpe,
	Vlastimil Babka
  Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, Ackerley Tng

This is RFC v4 of guest_memfd in-place conversion support.

Up till now, guest_memfd supports the entire inode worth of memory being
used as all-shared, or all-private. CoCo VMs may request guest memory to be
converted between private and shared states, and the only way to support
that currently would be to have the userspace VMM provide two sources of
backing memory from completely different areas of physical memory.

pKVM has a use case for in-place sharing: the guest and host may be
cooperating on given data, and pKVM doesn't protect data through
encryption, so copying that given data between different areas of physical
memory as part of conversions would be unnecessary work.

This series also serves as a foundation for guest_memfd huge page
support. Now, guest_memfd only supports PAGE_SIZE pages, so if two sources
of backing memory are used, the userspace VMM could maintain a steady total
memory utilized by punching out the pages that are not used. When huge
pages are available in guest_memfd, even if the backing memory source
supports hole punching within a huge page, punching out pages to maintain
the total memory utilized by a VM would be introducing lots of
fragmentation.

In-place conversion avoids fragmentation by allowing the same physical
memory to be used for both shared and private memory, with guest_memfd
tracks the shared/private status of all the pages at a per-page
granularity.

The central principle, which guest_memfd continues to uphold, is that any
guest-private page will not be mappable to host userspace. All pages will
be mmap()-able in host userspace, but accesses to guest-private pages (as
tracked by guest_memfd) will result in a SIGBUS.

This series introduces a guest_memfd ioctl (not kvm, vm or vcpu, but
guest_memfd ioctl) that allows userspace to set memory
attributes (shared/private) directly through the guest_memfd. This is the
appropriate interface because shared/private-ness is a property of memory
and hence the request should be sent directly to the memory provider -
guest_memfd.

RFC v4 integrates comments from RFC v3:

+ ZERO is not supported on shared to private conversions
+ Adds KVM_CAP_GUEST_MEMFD_SET_MEMORY_ATTRIBUTES2_FLAGS to enumerate
  supported content modes for a given VM, or all supported content modes if
  no VM is provided
+ Uses flags and not values to specify content modes for conversion
+ Allows architectures to override the content mode application for the
  entire range rather than per-folio: so if actions can be skipped, folio
  iteration can be skipped entirely.
+ Addresses comments from Sashiko [7]

I would like feedback on:

+ Content modes: 0 (MODE_UNSPECIFIED), ZERO, and PRESERVE. Is that all
  good, or does anyone think there is a use case for something else?
+ Should the content modes apply even if no attribute changes are required?
    + See notes added in "KVM: guest_memfd: Apply content modes while
      setting memory attributes"
    + Possibly related: should setting attributes be allowed if some
      sub-range requested already has the requested attribute?
+ Structure of how various content modes are checked for support or
  applied? I used overridable weak functions for architectures that haven't
  defined support, and defined overrides for x86 to show how I think it would
  work. For CoCo platforms, I only implemented TDX for illustration purposes
  and might need help with the other platforms. Should I have used
  kvm_x86_ops? I tried and found myself defining lots of boilerplate.
+ The use of private_mem_conversions_test.sh to run different options in
  private_mem_conversions_test. If this makes sense, I'll adjust the
  Makefile to have private_mem_conversions_test tested only via the script.

TODOs

+ Address locking issue when kvm_gmem_get_attribute() is called from
  kvm_mmu_zap_collapsible_spte(). In this path, KVM's MMU lock is held
  while guest_memfd tries to take filemap_invalidate_lock while looking up
  the attributes xarray.
+ Move guest_memfd_conversions_test.c to only be compiled and tested for
  x86, since it depends so heavily on KVM_X86_SW_PROTECTED_VM's as a
  testing vehicle

This series is based on kvm/next, and here's the tree for your convenience:

https://github.com/googleprodkernel/linux-cc/commits/guest_memfd-inplace-conversion-v4

Older series:

+ RFCv3 is at [6]
+ RFCv2 is at [5]
+ RFCv1 is at [4]
+ Previous versions of this feature, part of other series, are available at
  [1][2][3].

[1] https://lore.kernel.org/all/bd163de3118b626d1005aa88e71ef2fb72f0be0f.1726009989.git.ackerleytng@google.com/
[2] https://lore.kernel.org/all/20250117163001.2326672-6-tabba@google.com/
[3] https://lore.kernel.org/all/b784326e9ccae6a08388f1bf39db70a2204bdc51.1747264138.git.ackerleytng@google.com/
[4] https://lore.kernel.org/all/cover.1760731772.git.ackerleytng@google.com/T/
[5] https://lore.kernel.org/all/cover.1770071243.git.ackerleytng@google.com/T/
[6] https://lore.kernel.org/r/20260313-gmem-inplace-conversion-v3-0-5fc12a70ec89@google.com
[7] https://sashiko.dev/#/patchset/20260313-gmem-inplace-conversion-v3-0-5fc12a70ec89%40google.com

Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
Ackerley Tng (26):
      KVM: guest_memfd: Update kvm_gmem_populate() to use gmem attributes
      KVM: guest_memfd: Only prepare folios for private pages
      KVM: Introduce KVM_SET_MEMORY_ATTRIBUTES2
      KVM: guest_memfd: Add support for KVM_SET_MEMORY_ATTRIBUTES2
      KVM: guest_memfd: Handle lru_add fbatch refcounts during conversion safety check
      KVM: guest_memfd: Introduce default handlers for content modes
      KVM: guest_memfd: Apply content modes while setting memory attributes
      KVM: x86: Add support for applying content modes
      KVM: Add CAP to enumerate supported SET_MEMORY_ATTRIBUTES2 flags
      KVM: selftests: Update framework to use KVM_SET_MEMORY_ATTRIBUTES2
      KVM: selftests: Test using guest_memfd for guest private memory
      KVM: selftests: Test basic single-page conversion flow
      KVM: selftests: Test conversion flow when INIT_SHARED
      KVM: selftests: Test conversion precision in guest_memfd
      KVM: selftests: Test conversion before allocation
      KVM: selftests: Convert with allocated folios in different layouts
      KVM: selftests: Test that truncation does not change shared/private status
      KVM: selftests: Test conversion with elevated page refcount
      KVM: selftests: Test that conversion to private does not support ZERO
      KVM: selftests: Support checking that data not equal expected
      KVM: selftests: Test that not specifying a conversion flag scrambles memory contents
      KVM: selftests: Reset shared memory after hole-punching
      KVM: selftests: Provide function to look up guest_memfd details from gpa
      KVM: selftests: Make TEST_EXPECT_SIGBUS thread-safe
      KVM: selftests: Update private_mem_conversions_test to mmap() guest_memfd
      KVM: selftests: Add script to exercise private_mem_conversions_test

Sean Christopherson (18):
      KVM: guest_memfd: Introduce per-gmem attributes, use to guard user mappings
      KVM: Rename KVM_GENERIC_MEMORY_ATTRIBUTES to KVM_VM_MEMORY_ATTRIBUTES
      KVM: Enumerate support for PRIVATE memory iff kvm_arch_has_private_mem is defined
      KVM: Stub in ability to disable per-VM memory attribute tracking
      KVM: guest_memfd: Wire up kvm_get_memory_attributes() to per-gmem attributes
      KVM: guest_memfd: Enable INIT_SHARED on guest_memfd for x86 Coco VMs
      KVM: Move KVM_VM_MEMORY_ATTRIBUTES config definition to x86
      KVM: Let userspace disable per-VM mem attributes, enable per-gmem attributes
      KVM: selftests: Create gmem fd before "regular" fd when adding memslot
      KVM: selftests: Rename guest_memfd{,_offset} to gmem_{fd,offset}
      KVM: selftests: Add support for mmap() on guest_memfd in core library
      KVM: selftests: Add selftests global for guest memory attributes capability
      KVM: selftests: Add helpers for calling ioctls on guest_memfd
      KVM: selftests: Test that shared/private status is consistent across processes
      KVM: selftests: Provide common function to set memory attributes
      KVM: selftests: Check fd/flags provided to mmap() when setting up memslot
      KVM: selftests: Update pre-fault test to work with per-guest_memfd attributes
      KVM: selftests: Update private memory exits test to work with per-gmem attributes

 Documentation/virt/kvm/api.rst                     | 136 ++++-
 arch/x86/include/asm/kvm_host.h                    |   2 +-
 arch/x86/kvm/Kconfig                               |  15 +-
 arch/x86/kvm/mmu/mmu.c                             |   4 +-
 arch/x86/kvm/x86.c                                 | 114 ++++-
 include/linux/kvm_host.h                           |  77 ++-
 include/trace/events/kvm.h                         |   4 +-
 include/uapi/linux/kvm.h                           |  22 +
 mm/swap.c                                          |   2 +
 tools/testing/selftests/kvm/Makefile.kvm           |   5 +
 .../selftests/kvm/guest_memfd_conversions_test.c   | 552 ++++++++++++++++++++
 tools/testing/selftests/kvm/guest_memfd_test.c     |  57 ++-
 tools/testing/selftests/kvm/include/kvm_util.h     | 144 +++++-
 tools/testing/selftests/kvm/include/test_util.h    |  34 +-
 .../selftests/kvm/kvm_has_gmem_attributes.c        |  17 +
 tools/testing/selftests/kvm/lib/kvm_util.c         | 130 +++--
 tools/testing/selftests/kvm/lib/test_util.c        |   7 -
 tools/testing/selftests/kvm/lib/x86/sev.c          |   2 +-
 .../testing/selftests/kvm/pre_fault_memory_test.c  |   4 +-
 .../kvm/x86/private_mem_conversions_test.c         |  55 +-
 .../kvm/x86/private_mem_conversions_test.sh        | 128 +++++
 .../selftests/kvm/x86/private_mem_kvm_exits_test.c |  38 +-
 virt/kvm/Kconfig                                   |   3 +-
 virt/kvm/guest_memfd.c                             | 562 ++++++++++++++++++++-
 virt/kvm/kvm_main.c                                | 116 ++++-
 25 files changed, 2047 insertions(+), 183 deletions(-)
---
base-commit: d2ea4ff1ce50787a98a3900b3fb1636f3620b7cf
change-id: 20260225-gmem-inplace-conversion-bd0dbd39753a

Best regards,
-- 
Ackerley Tng <ackerleytng@google.com>


^ permalink raw reply

* [PATCH next] tracing: Remove spurious default precision from show_event_trigger/filter formats
From: david.laight.linux @ 2026-03-26 20:18 UTC (permalink / raw)
  To: Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers, linux-kernel,
	linux-trace-kernel, Aaron Tomlin
  Cc: David Laight, Petr Mladek, Rasmus Villemoes, Andy Shevchenko,
	Sergey Senozhatsky, Andrew Morton

From: David Laight <david.laight.linux@gmail.com>

Change 2d8b7f9bf8e6e ("tracing: Have show_event_trigger/filter format a bit more in columns")
added space padding to align the output.
However it used ("%*.s", len, "") which requests the default precision.
It doesn't matter here whether the userspace default (0) or kernel
default (no precision) is used, but the format should be "%*s".

Signed-off-by: David Laight <david.laight.linux@gmail.com>
---
 kernel/trace/trace_events.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 249d1cba72c0..6b54c10f9ba4 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1718,7 +1718,7 @@ static int t_show_filters(struct seq_file *m, void *v)
 
 	len = get_call_len(call);
 
-	seq_printf(m, "%s:%s%*.s%s\n", call->class->system,
+	seq_printf(m, "%s:%s%*s%s\n", call->class->system,
 		   trace_event_name(call), len, "", filter->filter_string);
 
 	return 0;
@@ -1750,7 +1750,7 @@ static int t_show_triggers(struct seq_file *m, void *v)
 	len = get_call_len(call);
 
 	list_for_each_entry_rcu(data, &file->triggers, list) {
-		seq_printf(m, "%s:%s%*.s", call->class->system,
+		seq_printf(m, "%s:%s%*s", call->class->system,
 			   trace_event_name(call), len, "");
 
 		data->cmd_ops->print(m, data);
-- 
2.39.5


^ permalink raw reply related

* Re: [PATCH v4 0/5] locking: contended_release tracepoint instrumentation
From: Dmitry Ilvokhin @ 2026-03-26 17:47 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Peter Zijlstra, Ingo Molnar, Will Deacon, Boqun Feng, Waiman Long,
	Thomas Bogendoerfer, Juergen Gross, Ajay Kaher, Alexey Makhalov,
	Broadcom internal kernel review list, Thomas Gleixner,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Arnd Bergmann,
	Dennis Zhou, Tejun Heo, Christoph Lameter, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, linux-kernel, linux-mips,
	virtualization, linux-arch, linux-mm, linux-trace-kernel,
	kernel-team
In-Reply-To: <acVW6eFGwqACD91Y@casper.infradead.org>

On Thu, Mar 26, 2026 at 03:55:21PM +0000, Matthew Wilcox wrote:
> On Thu, Mar 26, 2026 at 03:09:59PM +0000, Dmitry Ilvokhin wrote:
> > The existing contention_begin/contention_end tracepoints fire on the
> > waiter side. The lock holder's identity and stack can be captured at
> > contention_begin time (e.g. perf lock contention --lock-owner), but
> > this reflects the holder's state when a waiter arrives, not when the
> > lock is actually released.
> > 
> > This series adds a contended_release tracepoint that fires on the
> > holder side when a lock with waiters is released. This provides:
> > 
> > - Hold time estimation: when the holder's own acquisition was
> >   contended, its contention_end (acquisition) and contended_release
> >   can be correlated to measure how long the lock was held under
> >   contention.
> > 
> > - The holder's stack at release time, which may differ from what perf lock
> >   contention --lock-owner captures if the holder does significant work between
> >   the waiter's arrival and the unlock.
> 
> As someone who's not an expert in this area (so please use short words
> to explain it to me), why do we want to know how long this holder took
> to release the lock from when it became contended?
> 
> I understand why we want to know how long any given waiter had to wait
> to gain the lock (but we already have tracepoints which show that).

I think the simplest way to think about it is the following. Waiter time
is the symptom, while holder time is the cause.

The waiter-side contention_begin/contention_end tells us how long a
waiter waited, but that time can span multiple holders.

If a waiter waited 10 ms, we can not tell whether one holder held the
lock for 10 ms or five holders held it for 2 ms each. These need
different treatments: the first means shrink the critical section, the
second means reduce lock frequency or split the lock. Today we can not
distinguish between these cases from waiter-side data alone.

> 
> I also don't understand why we want to know the holder's stack at
> release time.  The stack at contention-begin time will include
> the point at which the lock was acquired which should be correlated
> with where the lock was released.
> 
> Perhaps examples might help me understand why we want this?

Holder's stack allows us to understand who exactly waiters were waiting
for to release the lock.

The stack at contention_begin time does not always include the holder's
stack. The --lock-owner feature works by reading the owner field from
the lock struct, but it only supports mutex and rwsem. For spinlocks,
queued rwlocks, semaphores, and several others, the waiter has no
visibility into the holder whatsoever.

contended_release fires in the holder's context, so we get the holder's
stack at release time. For spinlocks, this is the only way to get any
holder-side information.

Original motivation was zone lock contention (a spinlock) in Meta
production workloads. We could see waiters were blocked, but had no way
to identify the holders or what they were doing.

^ permalink raw reply

* Re: [PATCH v4 0/5] locking: contended_release tracepoint instrumentation
From: Steven Rostedt @ 2026-03-26 16:46 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Dmitry Ilvokhin, Peter Zijlstra, Ingo Molnar, Will Deacon,
	Boqun Feng, Waiman Long, Thomas Bogendoerfer, Juergen Gross,
	Ajay Kaher, Alexey Makhalov, Broadcom internal kernel review list,
	Thomas Gleixner, Borislav Petkov, Dave Hansen, x86,
	H. Peter Anvin, Arnd Bergmann, Dennis Zhou, Tejun Heo,
	Christoph Lameter, Masami Hiramatsu, Mathieu Desnoyers,
	linux-kernel, linux-mips, virtualization, linux-arch, linux-mm,
	linux-trace-kernel, kernel-team
In-Reply-To: <acVW6eFGwqACD91Y@casper.infradead.org>

On Thu, 26 Mar 2026 15:55:21 +0000
Matthew Wilcox <willy@infradead.org> wrote:

> > - The holder's stack at release time, which may differ from what perf lock
> >   contention --lock-owner captures if the holder does significant work between
> >   the waiter's arrival and the unlock.  
> 
> As someone who's not an expert in this area (so please use short words
> to explain it to me), why do we want to know how long this holder took
> to release the lock from when it became contended?
> 
> I understand why we want to know how long any given waiter had to wait
> to gain the lock (but we already have tracepoints which show that).
> 
> I also don't understand why we want to know the holder's stack at
> release time.  The stack at contention-begin time will include
> the point at which the lock was acquired which should be correlated
> with where the lock was released.
> 
> Perhaps examples might help me understand why we want this?

Dmitry could give his own rationale for this, but I have my only use case.

This would be useful to find out how long the critical section is. If a
lock is highly contended by many tasks, you could get a high contention
time simply because other tasks are causing the delay for the waiter.
Seeing the release time and location would let you also know how long the
critical section was held, and if the length of the critical section is
causing the contention.

Having a stack trace of the release would differentiate the path that
released the lock, as there can be many places that release them. Although,
I have to admit, I'm not sure there are many different places locks are
released. Especially now that we have guard(), which will make all the
releases in a function at the same location.

-- Steve


^ permalink raw reply

* Re: [PATCH v4 0/5] locking: contended_release tracepoint instrumentation
From: Matthew Wilcox @ 2026-03-26 15:55 UTC (permalink / raw)
  To: Dmitry Ilvokhin
  Cc: Peter Zijlstra, Ingo Molnar, Will Deacon, Boqun Feng, Waiman Long,
	Thomas Bogendoerfer, Juergen Gross, Ajay Kaher, Alexey Makhalov,
	Broadcom internal kernel review list, Thomas Gleixner,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Arnd Bergmann,
	Dennis Zhou, Tejun Heo, Christoph Lameter, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, linux-kernel, linux-mips,
	virtualization, linux-arch, linux-mm, linux-trace-kernel,
	kernel-team
In-Reply-To: <cover.1774536681.git.d@ilvokhin.com>

On Thu, Mar 26, 2026 at 03:09:59PM +0000, Dmitry Ilvokhin wrote:
> The existing contention_begin/contention_end tracepoints fire on the
> waiter side. The lock holder's identity and stack can be captured at
> contention_begin time (e.g. perf lock contention --lock-owner), but
> this reflects the holder's state when a waiter arrives, not when the
> lock is actually released.
> 
> This series adds a contended_release tracepoint that fires on the
> holder side when a lock with waiters is released. This provides:
> 
> - Hold time estimation: when the holder's own acquisition was
>   contended, its contention_end (acquisition) and contended_release
>   can be correlated to measure how long the lock was held under
>   contention.
> 
> - The holder's stack at release time, which may differ from what perf lock
>   contention --lock-owner captures if the holder does significant work between
>   the waiter's arrival and the unlock.

As someone who's not an expert in this area (so please use short words
to explain it to me), why do we want to know how long this holder took
to release the lock from when it became contended?

I understand why we want to know how long any given waiter had to wait
to gain the lock (but we already have tracepoints which show that).

I also don't understand why we want to know the holder's stack at
release time.  The stack at contention-begin time will include
the point at which the lock was acquired which should be correlated
with where the lock was released.

Perhaps examples might help me understand why we want this?

^ permalink raw reply

* [PATCH v4 3/5] locking: Add contended_release tracepoint to sleepable locks
From: Dmitry Ilvokhin @ 2026-03-26 15:10 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Will Deacon, Boqun Feng, Waiman Long,
	Thomas Bogendoerfer, Juergen Gross, Ajay Kaher, Alexey Makhalov,
	Broadcom internal kernel review list, Thomas Gleixner,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Arnd Bergmann,
	Dennis Zhou, Tejun Heo, Christoph Lameter, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers
  Cc: linux-kernel, linux-mips, virtualization, linux-arch, linux-mm,
	linux-trace-kernel, kernel-team, Dmitry Ilvokhin
In-Reply-To: <cover.1774536681.git.d@ilvokhin.com>

Add the contended_release trace event. This tracepoint fires on the
holder side when a contended lock is released, complementing the
existing contention_begin/contention_end tracepoints which fire on the
waiter side.

This enables correlating lock hold time under contention with waiter
events by lock address.

Add trace_contended_release() calls to the slowpath unlock paths of
sleepable locks: mutex, rtmutex, semaphore, rwsem, percpu-rwsem, and
RT-specific rwbase locks.

Where possible, trace_contended_release() fires before the lock is
released and before the waiter is woken. For some lock types, the
tracepoint fires after the release but before the wake. Making the
placement consistent across all lock types is not worth the added
complexity.

For reader/writer locks, the tracepoint fires for every reader releasing
while a writer is waiting, not only for the last reader.

Signed-off-by: Dmitry Ilvokhin <d@ilvokhin.com>
---
 include/trace/events/lock.h   | 17 +++++++++++++++++
 kernel/locking/mutex.c        |  4 ++++
 kernel/locking/percpu-rwsem.c | 11 +++++++++++
 kernel/locking/rtmutex.c      |  1 +
 kernel/locking/rwbase_rt.c    |  6 ++++++
 kernel/locking/rwsem.c        | 10 ++++++++--
 kernel/locking/semaphore.c    |  4 ++++
 7 files changed, 51 insertions(+), 2 deletions(-)

diff --git a/include/trace/events/lock.h b/include/trace/events/lock.h
index da978f2afb45..1ded869cd619 100644
--- a/include/trace/events/lock.h
+++ b/include/trace/events/lock.h
@@ -137,6 +137,23 @@ TRACE_EVENT(contention_end,
 	TP_printk("%p (ret=%d)", __entry->lock_addr, __entry->ret)
 );
 
+TRACE_EVENT(contended_release,
+
+	TP_PROTO(void *lock),
+
+	TP_ARGS(lock),
+
+	TP_STRUCT__entry(
+		__field(void *, lock_addr)
+	),
+
+	TP_fast_assign(
+		__entry->lock_addr = lock;
+	),
+
+	TP_printk("%p", __entry->lock_addr)
+);
+
 #endif /* _TRACE_LOCK_H */
 
 /* This part must be outside protection */
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 427187ff02db..6c2c9312eb8f 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -997,6 +997,9 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
 		wake_q_add(&wake_q, next);
 	}
 
+	if (trace_contended_release_enabled() && waiter)
+		trace_contended_release(lock);
+
 	if (owner & MUTEX_FLAG_HANDOFF)
 		__mutex_handoff(lock, next);
 
@@ -1194,6 +1197,7 @@ EXPORT_SYMBOL(ww_mutex_lock_interruptible);
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(contention_begin);
 EXPORT_TRACEPOINT_SYMBOL_GPL(contention_end);
+EXPORT_TRACEPOINT_SYMBOL_GPL(contended_release);
 
 /**
  * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index f3ee7a0d6047..46b5903989b8 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -263,6 +263,9 @@ void percpu_up_write(struct percpu_rw_semaphore *sem)
 {
 	rwsem_release(&sem->dep_map, _RET_IP_);
 
+	if (trace_contended_release_enabled() && wq_has_sleeper(&sem->waiters))
+		trace_contended_release(sem);
+
 	/*
 	 * Signal the writer is done, no fast path yet.
 	 *
@@ -292,6 +295,14 @@ EXPORT_SYMBOL_GPL(percpu_up_write);
 void __percpu_up_read(struct percpu_rw_semaphore *sem)
 {
 	lockdep_assert_preemption_disabled();
+	/*
+	 * After percpu_up_write() completes, rcu_sync_is_idle() can still
+	 * return false during the grace period, forcing readers into this
+	 * slowpath. Only trace when a writer is actually waiting for
+	 * readers to drain.
+	 */
+	if (trace_contended_release_enabled() && rcuwait_active(&sem->writer))
+		trace_contended_release(sem);
 	/*
 	 * slowpath; reader will only ever wake a single blocked
 	 * writer.
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index ccaba6148b61..3db8a840b4e8 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1466,6 +1466,7 @@ static void __sched rt_mutex_slowunlock(struct rt_mutex_base *lock)
 		raw_spin_lock_irqsave(&lock->wait_lock, flags);
 	}
 
+	trace_contended_release(lock);
 	/*
 	 * The wakeup next waiter path does not suffer from the above
 	 * race. See the comments there.
diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c
index 82e078c0665a..74da5601018f 100644
--- a/kernel/locking/rwbase_rt.c
+++ b/kernel/locking/rwbase_rt.c
@@ -174,6 +174,8 @@ static void __sched __rwbase_read_unlock(struct rwbase_rt *rwb,
 static __always_inline void rwbase_read_unlock(struct rwbase_rt *rwb,
 					       unsigned int state)
 {
+	if (trace_contended_release_enabled() && rt_mutex_owner(&rwb->rtmutex))
+		trace_contended_release(rwb);
 	/*
 	 * rwb->readers can only hit 0 when a writer is waiting for the
 	 * active readers to leave the critical section.
@@ -205,6 +207,8 @@ static inline void rwbase_write_unlock(struct rwbase_rt *rwb)
 	unsigned long flags;
 
 	raw_spin_lock_irqsave(&rtm->wait_lock, flags);
+	if (trace_contended_release_enabled() && rt_mutex_has_waiters(rtm))
+		trace_contended_release(rwb);
 	__rwbase_write_unlock(rwb, WRITER_BIAS, flags);
 }
 
@@ -214,6 +218,8 @@ static inline void rwbase_write_downgrade(struct rwbase_rt *rwb)
 	unsigned long flags;
 
 	raw_spin_lock_irqsave(&rtm->wait_lock, flags);
+	if (trace_contended_release_enabled() && rt_mutex_has_waiters(rtm))
+		trace_contended_release(rwb);
 	/* Release it and account current as reader */
 	__rwbase_write_unlock(rwb, WRITER_BIAS - 1, flags);
 }
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index bf647097369c..602d5fd3c91a 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -1387,6 +1387,8 @@ static inline void __up_read(struct rw_semaphore *sem)
 	rwsem_clear_reader_owned(sem);
 	tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count);
 	DEBUG_RWSEMS_WARN_ON(tmp < 0, sem);
+	if (trace_contended_release_enabled() && (tmp & RWSEM_FLAG_WAITERS))
+		trace_contended_release(sem);
 	if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) ==
 		      RWSEM_FLAG_WAITERS)) {
 		clear_nonspinnable(sem);
@@ -1413,8 +1415,10 @@ static inline void __up_write(struct rw_semaphore *sem)
 	preempt_disable();
 	rwsem_clear_owner(sem);
 	tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count);
-	if (unlikely(tmp & RWSEM_FLAG_WAITERS))
+	if (unlikely(tmp & RWSEM_FLAG_WAITERS)) {
+		trace_contended_release(sem);
 		rwsem_wake(sem);
+	}
 	preempt_enable();
 }
 
@@ -1437,8 +1441,10 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
 	tmp = atomic_long_fetch_add_release(
 		-RWSEM_WRITER_LOCKED+RWSEM_READER_BIAS, &sem->count);
 	rwsem_set_reader_owned(sem);
-	if (tmp & RWSEM_FLAG_WAITERS)
+	if (tmp & RWSEM_FLAG_WAITERS) {
+		trace_contended_release(sem);
 		rwsem_downgrade_wake(sem);
+	}
 	preempt_enable();
 }
 
diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
index 74d41433ba13..35ac3498dca5 100644
--- a/kernel/locking/semaphore.c
+++ b/kernel/locking/semaphore.c
@@ -230,6 +230,10 @@ void __sched up(struct semaphore *sem)
 		sem->count++;
 	else
 		__up(sem, &wake_q);
+
+	if (trace_contended_release_enabled() && !wake_q_empty(&wake_q))
+		trace_contended_release(sem);
+
 	raw_spin_unlock_irqrestore(&sem->lock, flags);
 	if (!wake_q_empty(&wake_q))
 		wake_up_q(&wake_q);
-- 
2.52.0


^ permalink raw reply related

* [PATCH v4 0/5] locking: contended_release tracepoint instrumentation
From: Dmitry Ilvokhin @ 2026-03-26 15:09 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Will Deacon, Boqun Feng, Waiman Long,
	Thomas Bogendoerfer, Juergen Gross, Ajay Kaher, Alexey Makhalov,
	Broadcom internal kernel review list, Thomas Gleixner,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Arnd Bergmann,
	Dennis Zhou, Tejun Heo, Christoph Lameter, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers
  Cc: linux-kernel, linux-mips, virtualization, linux-arch, linux-mm,
	linux-trace-kernel, kernel-team, Dmitry Ilvokhin

The existing contention_begin/contention_end tracepoints fire on the
waiter side. The lock holder's identity and stack can be captured at
contention_begin time (e.g. perf lock contention --lock-owner), but
this reflects the holder's state when a waiter arrives, not when the
lock is actually released.

This series adds a contended_release tracepoint that fires on the
holder side when a lock with waiters is released. This provides:

- Hold time estimation: when the holder's own acquisition was
  contended, its contention_end (acquisition) and contended_release
  can be correlated to measure how long the lock was held under
  contention.

- The holder's stack at release time, which may differ from what perf lock
  contention --lock-owner captures if the holder does significant work between
  the waiter's arrival and the unlock.

Note: for reader/writer locks, the tracepoint fires for every reader
releasing while a writer is waiting, not only for the last reader.

v3 -> v4:

- Fix spurious events in __percpu_up_read(): guard with
  rcuwait_active(&sem->writer) to avoid tracing during the RCU grace
  period after a writer releases (Sashiko).
- Fix possible use-after-free in semaphore up(): move
  trace_contended_release() inside the sem->lock critical section
  (Sashiko).
- Fix build failure with CONFIG_PARAVIRT_SPINLOCKS=y: introduce
  queued_spin_release() as the arch-overridable unlock primitive,
  so queued_spin_unlock() can be a generic tracing wrapper. Convert
  x86 (paravirt) and MIPS overrides (Sashiko).
- Add EXPORT_TRACEPOINT_SYMBOL_GPL(contended_release) for module
  support (Sashiko).
- Split spinning locks patch: factor out queued_spin_release() as a
  separate preparatory commit (Sashiko).
- Make read unlock tracepoint behavior consistent across all
  reader/writer lock types: fire for every reader releasing while
  a writer is waiting (rwsem, rwbase_rt were previously last-reader
  only).

v2 -> v3:

- Added new patch: extend contended_release tracepoint to queued spinlocks
  and queued rwlocks (marked as RFC, requesting feedback). This is prompted by
  Matthew Wilcox's suggestion to try to come up with generic instrumentation,
  instead of instrumenting each "special" lock manually. See [1] for the
  discussion.
- Reworked tracepoint placement to fire before the lock is released and
  before the waiter is woken where possible, for consistency with
  spinning locks where there is no explicit wake (inspired by Usama Arif's
  suggestion).
- Remove unnecessary linux/sched.h include from trace/events/lock.h.

RFC -> v2:

- Add trace_contended_release_enabled() guard before waiter checks that
  exist only for the tracepoint (Steven Rostedt).
- Rename __percpu_up_read_slowpath() to __percpu_up_read() (Peter
  Zijlstra).
- Add extern for __percpu_up_read() (Peter Zijlstra).
- Squashed tracepoint introduction and usage commits (Masami Hiramatsu).

v3: https://lore.kernel.org/all/cover.1773858853.git.d@ilvokhin.com/
v2: https://lore.kernel.org/all/cover.1773164180.git.d@ilvokhin.com/
RFC: https://lore.kernel.org/all/cover.1772642407.git.d@ilvokhin.com/

[1]: https://lore.kernel.org/all/aa7G1nD7Rd9F4eBH@casper.infradead.org/

Dmitry Ilvokhin (5):
  tracing/lock: Remove unnecessary linux/sched.h include
  locking/percpu-rwsem: Extract __percpu_up_read()
  locking: Add contended_release tracepoint to sleepable locks
  locking: Factor out queued_spin_release()
  locking: Add contended_release tracepoint to spinning locks

 arch/mips/include/asm/spinlock.h         |  6 +--
 arch/x86/include/asm/paravirt-spinlock.h |  6 +--
 include/asm-generic/qrwlock.h            | 48 ++++++++++++++++++++----
 include/asm-generic/qspinlock.h          | 33 ++++++++++++++--
 include/linux/percpu-rwsem.h             | 15 ++------
 include/trace/events/lock.h              | 18 ++++++++-
 kernel/locking/mutex.c                   |  4 ++
 kernel/locking/percpu-rwsem.c            | 29 ++++++++++++++
 kernel/locking/qrwlock.c                 | 16 ++++++++
 kernel/locking/qspinlock.c               |  8 ++++
 kernel/locking/rtmutex.c                 |  1 +
 kernel/locking/rwbase_rt.c               |  6 +++
 kernel/locking/rwsem.c                   | 10 ++++-
 kernel/locking/semaphore.c               |  4 ++
 14 files changed, 172 insertions(+), 32 deletions(-)

-- 
2.52.0


^ permalink raw reply

* [PATCH v4 1/5] tracing/lock: Remove unnecessary linux/sched.h include
From: Dmitry Ilvokhin @ 2026-03-26 15:10 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Will Deacon, Boqun Feng, Waiman Long,
	Thomas Bogendoerfer, Juergen Gross, Ajay Kaher, Alexey Makhalov,
	Broadcom internal kernel review list, Thomas Gleixner,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Arnd Bergmann,
	Dennis Zhou, Tejun Heo, Christoph Lameter, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers
  Cc: linux-kernel, linux-mips, virtualization, linux-arch, linux-mm,
	linux-trace-kernel, kernel-team, Dmitry Ilvokhin
In-Reply-To: <cover.1774536681.git.d@ilvokhin.com>

None of the trace events in lock.h reference anything from
linux/sched.h. Remove the unnecessary include.

Signed-off-by: Dmitry Ilvokhin <d@ilvokhin.com>
---
 include/trace/events/lock.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/trace/events/lock.h b/include/trace/events/lock.h
index 8e89baa3775f..da978f2afb45 100644
--- a/include/trace/events/lock.h
+++ b/include/trace/events/lock.h
@@ -5,7 +5,6 @@
 #if !defined(_TRACE_LOCK_H) || defined(TRACE_HEADER_MULTI_READ)
 #define _TRACE_LOCK_H
 
-#include <linux/sched.h>
 #include <linux/tracepoint.h>
 
 /* flags for lock:contention_begin */
-- 
2.52.0


^ permalink raw reply related

* [PATCH v4 4/5] locking: Factor out queued_spin_release()
From: Dmitry Ilvokhin @ 2026-03-26 15:10 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Will Deacon, Boqun Feng, Waiman Long,
	Thomas Bogendoerfer, Juergen Gross, Ajay Kaher, Alexey Makhalov,
	Broadcom internal kernel review list, Thomas Gleixner,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Arnd Bergmann,
	Dennis Zhou, Tejun Heo, Christoph Lameter, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers
  Cc: linux-kernel, linux-mips, virtualization, linux-arch, linux-mm,
	linux-trace-kernel, kernel-team, Dmitry Ilvokhin
In-Reply-To: <cover.1774536681.git.d@ilvokhin.com>

Introduce queued_spin_release() as an arch-overridable unlock primitive,
and make queued_spin_unlock() a generic wrapper around it. This is a
preparatory refactoring for the next commit, which adds
contended_release tracepoint instrumentation to queued_spin_unlock().

Rename the existing arch-specific queued_spin_unlock() overrides on
x86 (paravirt) and MIPS to queued_spin_release().

No functional change.

Signed-off-by: Dmitry Ilvokhin <d@ilvokhin.com>
---
 arch/mips/include/asm/spinlock.h         |  6 +++---
 arch/x86/include/asm/paravirt-spinlock.h |  6 +++---
 include/asm-generic/qspinlock.h          | 15 ++++++++++++---
 3 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/arch/mips/include/asm/spinlock.h b/arch/mips/include/asm/spinlock.h
index 6ce2117e49f6..c349162f15eb 100644
--- a/arch/mips/include/asm/spinlock.h
+++ b/arch/mips/include/asm/spinlock.h
@@ -13,12 +13,12 @@
 
 #include <asm-generic/qspinlock_types.h>
 
-#define	queued_spin_unlock queued_spin_unlock
+#define	queued_spin_release queued_spin_release
 /**
- * queued_spin_unlock - release a queued spinlock
+ * queued_spin_release - release a queued spinlock
  * @lock : Pointer to queued spinlock structure
  */
-static inline void queued_spin_unlock(struct qspinlock *lock)
+static inline void queued_spin_release(struct qspinlock *lock)
 {
 	/* This could be optimised with ARCH_HAS_MMIOWB */
 	mmiowb();
diff --git a/arch/x86/include/asm/paravirt-spinlock.h b/arch/x86/include/asm/paravirt-spinlock.h
index 7beffcb08ed6..ac75e0736198 100644
--- a/arch/x86/include/asm/paravirt-spinlock.h
+++ b/arch/x86/include/asm/paravirt-spinlock.h
@@ -49,9 +49,9 @@ static __always_inline bool pv_vcpu_is_preempted(long cpu)
 				ALT_NOT(X86_FEATURE_VCPUPREEMPT));
 }
 
-#define queued_spin_unlock queued_spin_unlock
+#define queued_spin_release queued_spin_release
 /**
- * queued_spin_unlock - release a queued spinlock
+ * queued_spin_release - release a queued spinlock
  * @lock : Pointer to queued spinlock structure
  *
  * A smp_store_release() on the least-significant byte.
@@ -66,7 +66,7 @@ static inline void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
 	pv_queued_spin_lock_slowpath(lock, val);
 }
 
-static inline void queued_spin_unlock(struct qspinlock *lock)
+static inline void queued_spin_release(struct qspinlock *lock)
 {
 	kcsan_release();
 	pv_queued_spin_unlock(lock);
diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h
index bf47cca2c375..df76f34645a0 100644
--- a/include/asm-generic/qspinlock.h
+++ b/include/asm-generic/qspinlock.h
@@ -115,12 +115,12 @@ static __always_inline void queued_spin_lock(struct qspinlock *lock)
 }
 #endif
 
-#ifndef queued_spin_unlock
+#ifndef queued_spin_release
 /**
- * queued_spin_unlock - release a queued spinlock
+ * queued_spin_release - release a queued spinlock
  * @lock : Pointer to queued spinlock structure
  */
-static __always_inline void queued_spin_unlock(struct qspinlock *lock)
+static __always_inline void queued_spin_release(struct qspinlock *lock)
 {
 	/*
 	 * unlock() needs release semantics:
@@ -129,6 +129,15 @@ static __always_inline void queued_spin_unlock(struct qspinlock *lock)
 }
 #endif
 
+/**
+ * queued_spin_unlock - unlock a queued spinlock
+ * @lock : Pointer to queued spinlock structure
+ */
+static __always_inline void queued_spin_unlock(struct qspinlock *lock)
+{
+	queued_spin_release(lock);
+}
+
 #ifndef virt_spin_lock
 static __always_inline bool virt_spin_lock(struct qspinlock *lock)
 {
-- 
2.52.0


^ permalink raw reply related

* [PATCH v4 2/5] locking/percpu-rwsem: Extract __percpu_up_read()
From: Dmitry Ilvokhin @ 2026-03-26 15:10 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Will Deacon, Boqun Feng, Waiman Long,
	Thomas Bogendoerfer, Juergen Gross, Ajay Kaher, Alexey Makhalov,
	Broadcom internal kernel review list, Thomas Gleixner,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Arnd Bergmann,
	Dennis Zhou, Tejun Heo, Christoph Lameter, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers
  Cc: linux-kernel, linux-mips, virtualization, linux-arch, linux-mm,
	linux-trace-kernel, kernel-team, Dmitry Ilvokhin, Usama Arif
In-Reply-To: <cover.1774536681.git.d@ilvokhin.com>

Move the percpu_up_read() slowpath out of the inline function into a new
__percpu_up_read() to avoid binary size increase from adding a
tracepoint to an inlined function.

Signed-off-by: Dmitry Ilvokhin <d@ilvokhin.com>
Acked-by: Usama Arif <usama.arif@linux.dev>
---
 include/linux/percpu-rwsem.h  | 15 +++------------
 kernel/locking/percpu-rwsem.c | 18 ++++++++++++++++++
 2 files changed, 21 insertions(+), 12 deletions(-)

diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
index c8cb010d655e..39d5bf8e6562 100644
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -107,6 +107,8 @@ static inline bool percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
 	return ret;
 }
 
+extern void __percpu_up_read(struct percpu_rw_semaphore *sem);
+
 static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
 {
 	rwsem_release(&sem->dep_map, _RET_IP_);
@@ -118,18 +120,7 @@ static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
 	if (likely(rcu_sync_is_idle(&sem->rss))) {
 		this_cpu_dec(*sem->read_count);
 	} else {
-		/*
-		 * slowpath; reader will only ever wake a single blocked
-		 * writer.
-		 */
-		smp_mb(); /* B matches C */
-		/*
-		 * In other words, if they see our decrement (presumably to
-		 * aggregate zero, as that is the only time it matters) they
-		 * will also see our critical section.
-		 */
-		this_cpu_dec(*sem->read_count);
-		rcuwait_wake_up(&sem->writer);
+		__percpu_up_read(sem);
 	}
 	preempt_enable();
 }
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index ef234469baac..f3ee7a0d6047 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -288,3 +288,21 @@ void percpu_up_write(struct percpu_rw_semaphore *sem)
 	rcu_sync_exit(&sem->rss);
 }
 EXPORT_SYMBOL_GPL(percpu_up_write);
+
+void __percpu_up_read(struct percpu_rw_semaphore *sem)
+{
+	lockdep_assert_preemption_disabled();
+	/*
+	 * slowpath; reader will only ever wake a single blocked
+	 * writer.
+	 */
+	smp_mb(); /* B matches C */
+	/*
+	 * In other words, if they see our decrement (presumably to
+	 * aggregate zero, as that is the only time it matters) they
+	 * will also see our critical section.
+	 */
+	this_cpu_dec(*sem->read_count);
+	rcuwait_wake_up(&sem->writer);
+}
+EXPORT_SYMBOL_GPL(__percpu_up_read);
-- 
2.52.0


^ permalink raw reply related

* [PATCH v4 5/5] locking: Add contended_release tracepoint to spinning locks
From: Dmitry Ilvokhin @ 2026-03-26 15:10 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Will Deacon, Boqun Feng, Waiman Long,
	Thomas Bogendoerfer, Juergen Gross, Ajay Kaher, Alexey Makhalov,
	Broadcom internal kernel review list, Thomas Gleixner,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Arnd Bergmann,
	Dennis Zhou, Tejun Heo, Christoph Lameter, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers
  Cc: linux-kernel, linux-mips, virtualization, linux-arch, linux-mm,
	linux-trace-kernel, kernel-team, Dmitry Ilvokhin
In-Reply-To: <cover.1774536681.git.d@ilvokhin.com>

Extend the contended_release tracepoint to queued spinlocks and queued
rwlocks.

Use the arch-overridable queued_spin_release(), introduced in the
previous commit, to ensure the tracepoint works correctly across all
architectures, including those with custom unlock implementations (e.g.
x86 paravirt).

When the tracepoint is disabled, the only addition to the hot path is a
single NOP instruction (the static branch). When enabled, the contention
check, trace call, and unlock are combined in an out-of-line function to
minimize hot path impact, avoiding the compiler needing to preserve the
lock pointer in a callee-saved register across the trace call.

Binary size impact (x86_64, defconfig):
  uninlined unlock (common case): +983 bytes  (+0.00%)
  inlined unlock (worst case):    +58165 bytes (+0.24%)

The inlined unlock case could not be achieved through Kconfig options on
x86_64 as PREEMPT_BUILD unconditionally selects UNINLINE_SPIN_UNLOCK on
x86_64. The UNINLINE_SPIN_UNLOCK guards were manually inverted to force
inline the unlock path and estimate the worst case binary size increase.

Architectures with fully custom qspinlock implementations (e.g.
PowerPC) are not covered by this change.

Signed-off-by: Dmitry Ilvokhin <d@ilvokhin.com>
---
 include/asm-generic/qrwlock.h   | 48 +++++++++++++++++++++++++++------
 include/asm-generic/qspinlock.h | 18 +++++++++++++
 kernel/locking/qrwlock.c        | 16 +++++++++++
 kernel/locking/qspinlock.c      |  8 ++++++
 4 files changed, 82 insertions(+), 8 deletions(-)

diff --git a/include/asm-generic/qrwlock.h b/include/asm-generic/qrwlock.h
index 75b8f4601b28..e24dc537fd66 100644
--- a/include/asm-generic/qrwlock.h
+++ b/include/asm-generic/qrwlock.h
@@ -14,6 +14,7 @@
 #define __ASM_GENERIC_QRWLOCK_H
 
 #include <linux/atomic.h>
+#include <linux/tracepoint-defs.h>
 #include <asm/barrier.h>
 #include <asm/processor.h>
 
@@ -35,6 +36,10 @@
  */
 extern void queued_read_lock_slowpath(struct qrwlock *lock);
 extern void queued_write_lock_slowpath(struct qrwlock *lock);
+extern void queued_read_unlock_traced(struct qrwlock *lock);
+extern void queued_write_unlock_traced(struct qrwlock *lock);
+
+DECLARE_TRACEPOINT(contended_release);
 
 /**
  * queued_read_trylock - try to acquire read lock of a queued rwlock
@@ -102,10 +107,16 @@ static inline void queued_write_lock(struct qrwlock *lock)
 }
 
 /**
- * queued_read_unlock - release read lock of a queued rwlock
+ * queued_rwlock_is_contended - check if the lock is contended
  * @lock : Pointer to queued rwlock structure
+ * Return: 1 if lock contended, 0 otherwise
  */
-static inline void queued_read_unlock(struct qrwlock *lock)
+static inline int queued_rwlock_is_contended(struct qrwlock *lock)
+{
+	return arch_spin_is_locked(&lock->wait_lock);
+}
+
+static __always_inline void __queued_read_unlock(struct qrwlock *lock)
 {
 	/*
 	 * Atomically decrement the reader count
@@ -114,22 +125,43 @@ static inline void queued_read_unlock(struct qrwlock *lock)
 }
 
 /**
- * queued_write_unlock - release write lock of a queued rwlock
+ * queued_read_unlock - release read lock of a queued rwlock
  * @lock : Pointer to queued rwlock structure
  */
-static inline void queued_write_unlock(struct qrwlock *lock)
+static inline void queued_read_unlock(struct qrwlock *lock)
+{
+	/*
+	 * Trace and unlock are combined in the traced unlock variant so
+	 * the compiler does not need to preserve the lock pointer across
+	 * the function call, avoiding callee-saved register save/restore
+	 * on the hot path.
+	 */
+	if (tracepoint_enabled(contended_release)) {
+		queued_read_unlock_traced(lock);
+		return;
+	}
+
+	__queued_read_unlock(lock);
+}
+
+static __always_inline void __queued_write_unlock(struct qrwlock *lock)
 {
 	smp_store_release(&lock->wlocked, 0);
 }
 
 /**
- * queued_rwlock_is_contended - check if the lock is contended
+ * queued_write_unlock - release write lock of a queued rwlock
  * @lock : Pointer to queued rwlock structure
- * Return: 1 if lock contended, 0 otherwise
  */
-static inline int queued_rwlock_is_contended(struct qrwlock *lock)
+static inline void queued_write_unlock(struct qrwlock *lock)
 {
-	return arch_spin_is_locked(&lock->wait_lock);
+	/* See comment in queued_read_unlock(). */
+	if (tracepoint_enabled(contended_release)) {
+		queued_write_unlock_traced(lock);
+		return;
+	}
+
+	__queued_write_unlock(lock);
 }
 
 /*
diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h
index df76f34645a0..915a4c2777f6 100644
--- a/include/asm-generic/qspinlock.h
+++ b/include/asm-generic/qspinlock.h
@@ -41,6 +41,7 @@
 
 #include <asm-generic/qspinlock_types.h>
 #include <linux/atomic.h>
+#include <linux/tracepoint-defs.h>
 
 #ifndef queued_spin_is_locked
 /**
@@ -129,12 +130,29 @@ static __always_inline void queued_spin_release(struct qspinlock *lock)
 }
 #endif
 
+DECLARE_TRACEPOINT(contended_release);
+
+extern void queued_spin_release_traced(struct qspinlock *lock);
+
 /**
  * queued_spin_unlock - unlock a queued spinlock
  * @lock : Pointer to queued spinlock structure
+ *
+ * Generic tracing wrapper around the arch-overridable
+ * queued_spin_release().
  */
 static __always_inline void queued_spin_unlock(struct qspinlock *lock)
 {
+	/*
+	 * Trace and release are combined in queued_spin_release_traced() so
+	 * the compiler does not need to preserve the lock pointer across the
+	 * function call, avoiding callee-saved register save/restore on the
+	 * hot path.
+	 */
+	if (tracepoint_enabled(contended_release)) {
+		queued_spin_release_traced(lock);
+		return;
+	}
 	queued_spin_release(lock);
 }
 
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index d2ef312a8611..5f7a0fc2b27a 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -90,3 +90,19 @@ void __lockfunc queued_write_lock_slowpath(struct qrwlock *lock)
 	trace_contention_end(lock, 0);
 }
 EXPORT_SYMBOL(queued_write_lock_slowpath);
+
+void __lockfunc queued_read_unlock_traced(struct qrwlock *lock)
+{
+	if (queued_rwlock_is_contended(lock))
+		trace_contended_release(lock);
+	__queued_read_unlock(lock);
+}
+EXPORT_SYMBOL(queued_read_unlock_traced);
+
+void __lockfunc queued_write_unlock_traced(struct qrwlock *lock)
+{
+	if (queued_rwlock_is_contended(lock))
+		trace_contended_release(lock);
+	__queued_write_unlock(lock);
+}
+EXPORT_SYMBOL(queued_write_unlock_traced);
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index af8d122bb649..c72610980ec7 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -104,6 +104,14 @@ static __always_inline u32  __pv_wait_head_or_lock(struct qspinlock *lock,
 #define queued_spin_lock_slowpath	native_queued_spin_lock_slowpath
 #endif
 
+void __lockfunc queued_spin_release_traced(struct qspinlock *lock)
+{
+	if (queued_spin_is_contended(lock))
+		trace_contended_release(lock);
+	queued_spin_release(lock);
+}
+EXPORT_SYMBOL(queued_spin_release_traced);
+
 #endif /* _GEN_PV_LOCK_SLOWPATH */
 
 /**
-- 
2.52.0


^ permalink raw reply related

* Re: [PATCH v2] bootconfig: Apply early options from embedded config
From: Masami Hiramatsu @ 2026-03-26 14:30 UTC (permalink / raw)
  To: Masami Hiramatsu
  Cc: Breno Leitao, Jonathan Corbet, Shuah Khan, linux-kernel,
	linux-trace-kernel, linux-doc, oss, paulmck, rostedt, kernel-team
In-Reply-To: <20260325232204.05edbb21c7602b6408ca007b@kernel.org>

On Wed, 25 Mar 2026 23:22:04 +0900
Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:

> > +	/*
> > +	 * Keys that do not match any early_param() handler are silently
> > +	 * ignored — do_early_param() always returns 0.
> > +	 */
> > +	xbc_node_for_each_key_value(root, knode, val) {
> 
> [sashiko comment]
> | Does this loop handle array values correctly?
> | xbc_node_for_each_key_value() only assigns the first value of an array to
> | the val pointer before advancing to the next key. It does not iterate over
> | the child nodes of the array.
> | If the bootconfig contains a multi-value key like
> | kernel.console = "ttyS0", "tty0", will the subsequent values in the array
> | be silently dropped instead of passed to the early_param handlers?
> 
> Also, good catch :) we need to use xbc_node_for_each_array_value()
> for inner loop.

FYI, xbc_snprint_cmdline() translates the arraied parameter as
multiple parameters. For example,

foo = bar, buz;

will be converted to

foo=bar foo=buz

Thus, I think we should do the same thing below;

> 
> > +		if (xbc_node_compose_key_after(root, knode, xbc_namebuf, XBC_KEYLEN_MAX) < 0)
> > +			continue;
> > +
> > +		/*
> > +		 * We need to copy const char *val to a char pointer,
> > +		 * which is what do_early_param() need, given it might
> > +		 * call strsep(), strtok() later.
> > +		 */
> > +		ret = strscpy(val_buf, val, sizeof(val_buf));
> > +		if (ret < 0) {
> > +			pr_warn("ignoring bootconfig value '%s', too long\n",
> > +				xbc_namebuf);
> > +			continue;
> > +		}
> > +		do_early_param(xbc_namebuf, val_buf, NULL, NULL);

So instead of this;

xbc_array_for_each_value(vnode, val) {
	do_early_param(xbc_namebuf, val, NULL, NULL);
}

Maybe it is a good timing to recondier unifying kernel cmdline and bootconfig
from API viewpoint.

Thanks,

-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

* [syzbot] Monthly trace report (Mar 2026)
From: syzbot @ 2026-03-26 12:34 UTC (permalink / raw)
  To: linux-kernel, linux-trace-kernel, syzkaller-bugs

Hello trace maintainers/developers,

This is a 31-day syzbot report for the trace subsystem.
All related reports/information can be found at:
https://syzkaller.appspot.com/upstream/s/trace

During the period, 3 new issues were detected and 1 were fixed.
In total, 17 issues are still open and 64 have already been fixed.

Some of the still happening issues:

Ref  Crashes Repro Title
<1>  277     Yes   WARNING in tracepoint_probe_unregister (3)
                   https://syzkaller.appspot.com/bug?extid=a1d25e53cd4a10f7f2d3
<2>  107     Yes   WARNING in blk_register_tracepoints
                   https://syzkaller.appspot.com/bug?extid=c54ded83396afee31eb1
<3>  76      Yes   KASAN: slab-use-after-free Read in bpf_trace_run2 (3)
                   https://syzkaller.appspot.com/bug?extid=59701a78e84b0bccfe1b
<4>  75      Yes   INFO: task hung in blk_trace_ioctl (4)
                   https://syzkaller.appspot.com/bug?extid=ed812ed461471ab17a0c
<5>  46      Yes   INFO: task hung in blk_trace_startstop
                   https://syzkaller.appspot.com/bug?extid=774863666ef5b025c9d0
<6>  42      No    WARNING in ring_buffer_map_get_reader (2)
                   https://syzkaller.appspot.com/bug?extid=c7143161d8215214a993
<7>  27      Yes   INFO: task hung in relay_open (2)
                   https://syzkaller.appspot.com/bug?extid=16ea22f26882d7e46f35
<8>  27      Yes   WARNING in get_probe_ref
                   https://syzkaller.appspot.com/bug?extid=8672dcb9d10011c0a160
<9>  18      Yes   INFO: task hung in blk_trace_setup (4)
                   https://syzkaller.appspot.com/bug?extid=9c1ebb9957045e00ac63
<10> 2       Yes   possible deadlock in down_trylock (3)
                   https://syzkaller.appspot.com/bug?extid=c3740bc819eb55460ec3

---
This report is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.

To disable reminders for individual bugs, reply with the following command:
#syz set <Ref> no-reminders

To change bug's subsystems, reply with:
#syz set <Ref> subsystems: new-subsystem

You may send multiple commands in a single email message.

^ permalink raw reply

* Re: [PATCH v2 06/19] cpufreq: Use trace_call__##name() at guarded tracepoint call sites
From: Rafael J. Wysocki @ 2026-03-26 10:24 UTC (permalink / raw)
  To: Vineeth Pillai (Google)
  Cc: Steven Rostedt, Peter Zijlstra, Huang Rui, Gautham R. Shenoy,
	Mario Limonciello, Perry Yuan, Rafael J. Wysocki, Viresh Kumar,
	Srinivas Pandruvada, Len Brown, linux-pm, linux-kernel,
	linux-trace-kernel
In-Reply-To: <20260323160052.17528-7-vineeth@bitbyteword.org>

On Mon, Mar 23, 2026 at 5:01 PM Vineeth Pillai (Google)
<vineeth@bitbyteword.org> wrote:
>
> Replace trace_foo() with the new trace_call__foo() at sites already
> guarded by trace_foo_enabled(), avoiding a redundant
> static_branch_unlikely() re-evaluation inside the tracepoint.
> trace_call__foo() calls the tracepoint callbacks directly without
> utilizing the static branch again.
>
> Suggested-by: Steven Rostedt <rostedt@goodmis.org>
> Suggested-by: Peter Zijlstra <peterz@infradead.org>
> Signed-off-by: Vineeth Pillai (Google) <vineeth@bitbyteword.org>
> Assisted-by: Claude:claude-sonnet-4-6

Acked-by: Rafael J. Wysocki (Intel) <rafael@kernel.org> # cpufreq core
& intel_pstate

> ---
>  drivers/cpufreq/amd-pstate.c   | 10 +++++-----
>  drivers/cpufreq/cpufreq.c      |  2 +-
>  drivers/cpufreq/intel_pstate.c |  2 +-
>  3 files changed, 7 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c
> index 5aa9fcd80cf51..4c47324aa2f73 100644
> --- a/drivers/cpufreq/amd-pstate.c
> +++ b/drivers/cpufreq/amd-pstate.c
> @@ -247,7 +247,7 @@ static int msr_update_perf(struct cpufreq_policy *policy, u8 min_perf,
>         if (trace_amd_pstate_epp_perf_enabled()) {
>                 union perf_cached perf = READ_ONCE(cpudata->perf);
>
> -               trace_amd_pstate_epp_perf(cpudata->cpu,
> +               trace_call__amd_pstate_epp_perf(cpudata->cpu,
>                                           perf.highest_perf,
>                                           epp,
>                                           min_perf,
> @@ -298,7 +298,7 @@ static int msr_set_epp(struct cpufreq_policy *policy, u8 epp)
>         if (trace_amd_pstate_epp_perf_enabled()) {
>                 union perf_cached perf = cpudata->perf;
>
> -               trace_amd_pstate_epp_perf(cpudata->cpu, perf.highest_perf,
> +               trace_call__amd_pstate_epp_perf(cpudata->cpu, perf.highest_perf,
>                                           epp,
>                                           FIELD_GET(AMD_CPPC_MIN_PERF_MASK,
>                                                     cpudata->cppc_req_cached),
> @@ -343,7 +343,7 @@ static int shmem_set_epp(struct cpufreq_policy *policy, u8 epp)
>         if (trace_amd_pstate_epp_perf_enabled()) {
>                 union perf_cached perf = cpudata->perf;
>
> -               trace_amd_pstate_epp_perf(cpudata->cpu, perf.highest_perf,
> +               trace_call__amd_pstate_epp_perf(cpudata->cpu, perf.highest_perf,
>                                           epp,
>                                           FIELD_GET(AMD_CPPC_MIN_PERF_MASK,
>                                                     cpudata->cppc_req_cached),
> @@ -507,7 +507,7 @@ static int shmem_update_perf(struct cpufreq_policy *policy, u8 min_perf,
>         if (trace_amd_pstate_epp_perf_enabled()) {
>                 union perf_cached perf = READ_ONCE(cpudata->perf);
>
> -               trace_amd_pstate_epp_perf(cpudata->cpu,
> +               trace_call__amd_pstate_epp_perf(cpudata->cpu,
>                                           perf.highest_perf,
>                                           epp,
>                                           min_perf,
> @@ -588,7 +588,7 @@ static void amd_pstate_update(struct amd_cpudata *cpudata, u8 min_perf,
>         }
>
>         if (trace_amd_pstate_perf_enabled() && amd_pstate_sample(cpudata)) {
> -               trace_amd_pstate_perf(min_perf, des_perf, max_perf, cpudata->freq,
> +               trace_call__amd_pstate_perf(min_perf, des_perf, max_perf, cpudata->freq,
>                         cpudata->cur.mperf, cpudata->cur.aperf, cpudata->cur.tsc,
>                                 cpudata->cpu, fast_switch);
>         }
> diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
> index 277884d91913c..58901047eae5a 100644
> --- a/drivers/cpufreq/cpufreq.c
> +++ b/drivers/cpufreq/cpufreq.c
> @@ -2222,7 +2222,7 @@ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy,
>
>         if (trace_cpu_frequency_enabled()) {
>                 for_each_cpu(cpu, policy->cpus)
> -                       trace_cpu_frequency(freq, cpu);
> +                       trace_call__cpu_frequency(freq, cpu);
>         }
>
>         return freq;
> diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
> index 11c58af419006..70be952209144 100644
> --- a/drivers/cpufreq/intel_pstate.c
> +++ b/drivers/cpufreq/intel_pstate.c
> @@ -3132,7 +3132,7 @@ static void intel_cpufreq_trace(struct cpudata *cpu, unsigned int trace_type, in
>                 return;
>
>         sample = &cpu->sample;
> -       trace_pstate_sample(trace_type,
> +       trace_call__pstate_sample(trace_type,
>                 0,
>                 old_pstate,
>                 cpu->pstate.current_pstate,
> --
> 2.53.0
>

^ permalink raw reply

* Re: [PATCH v2] tracing/osnoise: fix potential deadlock in cpu hotplug
From: Masami Hiramatsu @ 2026-03-26  7:18 UTC (permalink / raw)
  To: hu.shengming
  Cc: rostedt, mathieu.desnoyers, linux-kernel, linux-trace-kernel,
	zhang.run, yang.tao172, ran.xiaokai, luo.haiyang
In-Reply-To: <20260326141953414bVSj33dAYktqp9Oiyizq8@zte.com.cn>

On Thu, 26 Mar 2026 14:19:53 +0800 (CST)
<hu.shengming@zte.com.cn> wrote:

> From: Luo Haiyang <luo.haiyang@zte.com.cn>
> 
> The following sequence may leads deadlock in cpu hotplug:
> 
>     task1        task2        task3
>     -----        -----        -----
> 
>  mutex_lock(&interface_lock)
> 
>             [CPU GOING OFFLINE]
> 
>             cpus_write_lock();
>             osnoise_cpu_die();
>               kthread_stop(task3);
>                 wait_for_completion();
> 
>                       osnoise_sleep();
>                         mutex_lock(&interface_lock);
> 
>  cpus_read_lock();
> 
>  [DEAD LOCK]
> 
> Fix by swap the order of cpus_read_lock() and mutex_lock(&interface_lock).
> 
> Signed-off-by: Luo Haiyang <luo.haiyang@zte.com.cn>
> 

This looks good to me.

Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>

Thanks,


> ---
> Changes in v2:
> - update change log
> - Link to v1: https://lore.kernel.org/all/20260324150616953rMo1BWtAZ1nXTNrEFP6hr@zte.com.cn/
> ---
>  kernel/trace/trace_osnoise.c | 10 +++++-----
>  1 file changed, 5 insertions(+), 5 deletions(-)
> 
> diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
> index dee610e465b9..be6cf0bb3c03 100644
> --- a/kernel/trace/trace_osnoise.c
> +++ b/kernel/trace/trace_osnoise.c
> @@ -2073,8 +2073,8 @@ static void osnoise_hotplug_workfn(struct work_struct *dummy)
>  	if (!osnoise_has_registered_instances())
>  		return;
>  
> -	guard(mutex)(&interface_lock);
>  	guard(cpus_read_lock)();
> +	guard(mutex)(&interface_lock);
>  
>  	if (!cpu_online(cpu))
>  		return;
> @@ -2237,11 +2237,11 @@ static ssize_t osnoise_options_write(struct file *filp, const char __user *ubuf,
>  	if (running)
>  		stop_per_cpu_kthreads();
>  
> -	mutex_lock(&interface_lock);
>  	/*
>  	 * avoid CPU hotplug operations that might read options.
>  	 */
>  	cpus_read_lock();
> +	mutex_lock(&interface_lock);
>  
>  	retval = cnt;
>  
> @@ -2257,8 +2257,8 @@ static ssize_t osnoise_options_write(struct file *filp, const char __user *ubuf,
>  			clear_bit(option, &osnoise_options);
>  	}
>  
> -	cpus_read_unlock();
>  	mutex_unlock(&interface_lock);
> +	cpus_read_unlock();
>  
>  	if (running)
>  		start_per_cpu_kthreads();
> @@ -2345,16 +2345,16 @@ osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count,
>  	if (running)
>  		stop_per_cpu_kthreads();
>  
> -	mutex_lock(&interface_lock);
>  	/*
>  	 * osnoise_cpumask is read by CPU hotplug operations.
>  	 */
>  	cpus_read_lock();
> +	mutex_lock(&interface_lock);
>  
>  	cpumask_copy(&osnoise_cpumask, osnoise_cpumask_new);
>  
> -	cpus_read_unlock();
>  	mutex_unlock(&interface_lock);
> +	cpus_read_unlock();
>  
>  	if (running)
>  		start_per_cpu_kthreads();
> -- 
> 2.25.1

-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

* Re: [PATCH v5 3/3] PCI: dw-rockchip: Add pcie_ltssm_state_transition trace support
From: Anand Moon @ 2026-03-26  7:13 UTC (permalink / raw)
  To: Shawn Lin
  Cc: Manivannan Sadhasivam, Bjorn Helgaas, linux-rockchip, linux-pci,
	linux-trace-kernel, linux-doc, Steven Rostedt
In-Reply-To: <1774403912-210670-4-git-send-email-shawn.lin@rock-chips.com>

Hi Shawn,

On Wed, 25 Mar 2026 at 07:29, Shawn Lin <shawn.lin@rock-chips.com> wrote:
>
> Rockchip platforms provide a 64x4 bytes debug FIFO to trace the
> LTSSM history. Any LTSSM change will be recorded. It's useful
> for debug purpose, for example link failure, etc.
>
> Signed-off-by: Shawn Lin <shawn.lin@rock-chips.com>
> ---
Tested-by: Anand Moon <linux.amoon@gmail.com>

Thanks
-Anand

^ permalink raw reply

* Re: [PATCH v5 1/3] PCI: trace: Add PCI controller LTSSM transition tracepoint
From: Anand Moon @ 2026-03-26  7:13 UTC (permalink / raw)
  To: Shawn Lin
  Cc: Manivannan Sadhasivam, Bjorn Helgaas, linux-rockchip, linux-pci,
	linux-trace-kernel, linux-doc, Steven Rostedt
In-Reply-To: <1774403912-210670-2-git-send-email-shawn.lin@rock-chips.com>

Hi Shawn,

On Wed, 25 Mar 2026 at 07:28, Shawn Lin <shawn.lin@rock-chips.com> wrote:
>
> Some platforms may provide LTSSM trace functionality, recording historical
> LTSSM state transition information. This is very useful for debugging, such
> as when certain devices cannot be recognized or link broken during test.
> Implement the pci controller tracepoint for recording LTSSM and rate.
>
> Signed-off-by: Shawn Lin <shawn.lin@rock-chips.com>
> ---
Tested-by: Anand Moon <linux.amoon@gmail.com>

Thanks
-Anand

^ permalink raw reply

* [PATCH v2] tracing/osnoise: fix potential deadlock in cpu hotplug
From: hu.shengming @ 2026-03-26  6:19 UTC (permalink / raw)
  To: rostedt, mhiramat, mathieu.desnoyers
  Cc: linux-kernel, linux-trace-kernel, zhang.run, yang.tao172,
	ran.xiaokai, luo.haiyang


[-- Attachment #1.1.1: Type: text/plain, Size: 2491 bytes --]

From: Luo Haiyang <luo.haiyang@zte.com.cn>

The following sequence may leads deadlock in cpu hotplug:

    task1        task2        task3
    -----        -----        -----

 mutex_lock(&interface_lock)

            [CPU GOING OFFLINE]

            cpus_write_lock();
            osnoise_cpu_die();
              kthread_stop(task3);
                wait_for_completion();

                      osnoise_sleep();
                        mutex_lock(&interface_lock);

 cpus_read_lock();

 [DEAD LOCK]

Fix by swap the order of cpus_read_lock() and mutex_lock(&interface_lock).

Signed-off-by: Luo Haiyang <luo.haiyang@zte.com.cn>

---
Changes in v2:
- update change log
- Link to v1: https://lore.kernel.org/all/20260324150616953rMo1BWtAZ1nXTNrEFP6hr@zte.com.cn/
---
 kernel/trace/trace_osnoise.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index dee610e465b9..be6cf0bb3c03 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -2073,8 +2073,8 @@ static void osnoise_hotplug_workfn(struct work_struct *dummy)
 	if (!osnoise_has_registered_instances())
 		return;
 
-	guard(mutex)(&interface_lock);
 	guard(cpus_read_lock)();
+	guard(mutex)(&interface_lock);
 
 	if (!cpu_online(cpu))
 		return;
@@ -2237,11 +2237,11 @@ static ssize_t osnoise_options_write(struct file *filp, const char __user *ubuf,
 	if (running)
 		stop_per_cpu_kthreads();
 
-	mutex_lock(&interface_lock);
 	/*
 	 * avoid CPU hotplug operations that might read options.
 	 */
 	cpus_read_lock();
+	mutex_lock(&interface_lock);
 
 	retval = cnt;
 
@@ -2257,8 +2257,8 @@ static ssize_t osnoise_options_write(struct file *filp, const char __user *ubuf,
 			clear_bit(option, &osnoise_options);
 	}
 
-	cpus_read_unlock();
 	mutex_unlock(&interface_lock);
+	cpus_read_unlock();
 
 	if (running)
 		start_per_cpu_kthreads();
@@ -2345,16 +2345,16 @@ osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count,
 	if (running)
 		stop_per_cpu_kthreads();
 
-	mutex_lock(&interface_lock);
 	/*
 	 * osnoise_cpumask is read by CPU hotplug operations.
 	 */
 	cpus_read_lock();
+	mutex_lock(&interface_lock);
 
 	cpumask_copy(&osnoise_cpumask, osnoise_cpumask_new);
 
-	cpus_read_unlock();
 	mutex_unlock(&interface_lock);
+	cpus_read_unlock();
 
 	if (running)
 		start_per_cpu_kthreads();
-- 
2.25.1

[-- Attachment #1.1.2: Type: text/html , Size: 7713 bytes --]

^ permalink raw reply related

* Re: [PATCH v2 02/19] kernel: Use trace_call__##name() at guarded tracepoint call sites
From: Masami Hiramatsu @ 2026-03-26  1:29 UTC (permalink / raw)
  To: Vineeth Pillai (Google)
  Cc: Steven Rostedt, Peter Zijlstra, Tejun Heo, David Vernet,
	Andrea Righi, Changwoo Min, Ingo Molnar, Juri Lelli,
	Vincent Guittot, Dietmar Eggemann, Ben Segall, Mel Gorman,
	Valentin Schneider, Thomas Gleixner, Yury Norov [NVIDIA],
	Paul E. McKenney, Rik van Riel, Roman Kisel, Joel Fernandes,
	Rafael J. Wysocki, Ulf Hansson, linux-kernel, sched-ext,
	linux-trace-kernel
In-Reply-To: <20260323160052.17528-3-vineeth@bitbyteword.org>

On Mon, 23 Mar 2026 12:00:21 -0400
"Vineeth Pillai (Google)" <vineeth@bitbyteword.org> wrote:

> Replace trace_foo() with the new trace_call__foo() at sites already
> guarded by trace_foo_enabled(), avoiding a redundant
> static_branch_unlikely() re-evaluation inside the tracepoint.
> trace_call__foo() calls the tracepoint callbacks directly without
> utilizing the static branch again.

This looks good to me.

Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>

Thanks,

> 
> Suggested-by: Steven Rostedt <rostedt@goodmis.org>
> Suggested-by: Peter Zijlstra <peterz@infradead.org>
> Signed-off-by: Vineeth Pillai (Google) <vineeth@bitbyteword.org>
> Assisted-by: Claude:claude-sonnet-4-6
> ---
>  kernel/irq_work.c  | 2 +-
>  kernel/sched/ext.c | 2 +-
>  kernel/smp.c       | 2 +-
>  3 files changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git a/kernel/irq_work.c b/kernel/irq_work.c
> index 73f7e1fd4ab4d..120fd7365fbe2 100644
> --- a/kernel/irq_work.c
> +++ b/kernel/irq_work.c
> @@ -79,7 +79,7 @@ void __weak arch_irq_work_raise(void)
>  static __always_inline void irq_work_raise(struct irq_work *work)
>  {
>  	if (trace_ipi_send_cpu_enabled() && arch_irq_work_has_interrupt())
> -		trace_ipi_send_cpu(smp_processor_id(), _RET_IP_, work->func);
> +		trace_call__ipi_send_cpu(smp_processor_id(), _RET_IP_, work->func);
>  
>  	arch_irq_work_raise();
>  }
> diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
> index 1594987d637b0..cfbac9cf62f84 100644
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -4494,7 +4494,7 @@ static __printf(2, 3) void dump_line(struct seq_buf *s, const char *fmt, ...)
>  		vscnprintf(line_buf, sizeof(line_buf), fmt, args);
>  		va_end(args);
>  
> -		trace_sched_ext_dump(line_buf);
> +		trace_call__sched_ext_dump(line_buf);
>  	}
>  #endif
>  	/* @s may be zero sized and seq_buf triggers WARN if so */
> diff --git a/kernel/smp.c b/kernel/smp.c
> index f349960f79cad..537cf1f461d75 100644
> --- a/kernel/smp.c
> +++ b/kernel/smp.c
> @@ -394,7 +394,7 @@ void __smp_call_single_queue(int cpu, struct llist_node *node)
>  		func = CSD_TYPE(csd) == CSD_TYPE_TTWU ?
>  			sched_ttwu_pending : csd->func;
>  
> -		trace_csd_queue_cpu(cpu, _RET_IP_, func, csd);
> +		trace_call__csd_queue_cpu(cpu, _RET_IP_, func, csd);
>  	}
>  
>  	/*
> -- 
> 2.53.0
> 
> 


-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

* Re: [PATCH v2 01/19] tracepoint: Add trace_call__##name() API
From: Masami Hiramatsu @ 2026-03-26  1:28 UTC (permalink / raw)
  To: Vineeth Pillai (Google)
  Cc: Steven Rostedt, Peter Zijlstra, Dmitry Ilvokhin, Masami Hiramatsu,
	Mathieu Desnoyers, Ingo Molnar, Jens Axboe, io-uring,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Alexei Starovoitov, Daniel Borkmann, Marcelo Ricardo Leitner,
	Xin Long, Jon Maloy, Aaron Conole, Eelco Chaudron, Ilya Maximets,
	netdev, bpf, linux-sctp, tipc-discussion, dev, Jiri Pirko,
	Oded Gabbay, Koby Elbaz, dri-devel, Rafael J. Wysocki,
	Viresh Kumar, Gautham R. Shenoy, Huang Rui, Mario Limonciello,
	Len Brown, Srinivas Pandruvada, linux-pm, MyungJoo Ham,
	Kyungmin Park, Chanwoo Choi, Christian König, Sumit Semwal,
	linaro-mm-sig, Eddie James, Andrew Jeffery, Joel Stanley,
	linux-fsi, David Airlie, Simona Vetter, Alex Deucher,
	Danilo Krummrich, Matthew Brost, Philipp Stanner, Harry Wentland,
	Leo Li, amd-gfx, Jiri Kosina, Benjamin Tissoires, linux-input,
	Wolfram Sang, linux-i2c, Mark Brown, Michael Hennerich,
	Nuno Sá, linux-spi, James E.J. Bottomley, Martin K. Petersen,
	linux-scsi, Chris Mason, David Sterba, linux-btrfs,
	Thomas Gleixner, Andrew Morton, SeongJae Park, linux-mm,
	Borislav Petkov, Dave Hansen, x86, linux-trace-kernel,
	linux-kernel
In-Reply-To: <20260323160052.17528-2-vineeth@bitbyteword.org>

On Mon, 23 Mar 2026 12:00:20 -0400
"Vineeth Pillai (Google)" <vineeth@bitbyteword.org> wrote:

> Add trace_call__##name() as a companion to trace_##name().  When a
> caller already guards a tracepoint with an explicit enabled check:
> 
>   if (trace_foo_enabled() && cond)
>       trace_foo(args);
> 
> trace_foo() internally repeats the static_branch_unlikely() test, which
> the compiler cannot fold since static branches are patched binary
> instructions.  This results in two static-branch evaluations for every
> guarded call site.
> 
> trace_call__##name() calls __do_trace_##name() directly, skipping the
> redundant static-branch re-check.  This avoids leaking the internal
> __do_trace_##name() symbol into call sites while still eliminating the
> double evaluation:
> 
>   if (trace_foo_enabled() && cond)
>       trace_invoke_foo(args);   /* calls __do_trace_foo() directly */

nit: trace_call_foo() instead of trace_invoke_foo()?

Anyway looks good to me.

Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>


> 
> Three locations are updated:
> - __DECLARE_TRACE: invoke form omits static_branch_unlikely, retains
>   the LOCKDEP RCU-watching assertion.
> - __DECLARE_TRACE_SYSCALL: same, plus retains might_fault().
> - !TRACEPOINTS_ENABLED stub: empty no-op so callers compile cleanly
>   when tracepoints are compiled out.
> 
> Suggested-by: Steven Rostedt <rostedt@goodmis.org>
> Suggested-by: Peter Zijlstra <peterz@infradead.org>
> Signed-off-by: Vineeth Pillai (Google) <vineeth@bitbyteword.org>
> Assisted-by: Claude:claude-sonnet-4-6
> ---
>  include/linux/tracepoint.h | 11 +++++++++++
>  1 file changed, 11 insertions(+)
> 
> diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
> index 22ca1c8b54f32..ed969705341f1 100644
> --- a/include/linux/tracepoint.h
> +++ b/include/linux/tracepoint.h
> @@ -294,6 +294,10 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
>  			WARN_ONCE(!rcu_is_watching(),			\
>  				  "RCU not watching for tracepoint");	\
>  		}							\
> +	}								\
> +	static inline void trace_call__##name(proto)			\
> +	{								\
> +		__do_trace_##name(args);				\
>  	}
>  
>  #define __DECLARE_TRACE_SYSCALL(name, proto, args, data_proto)		\
> @@ -313,6 +317,11 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
>  			WARN_ONCE(!rcu_is_watching(),			\
>  				  "RCU not watching for tracepoint");	\
>  		}							\
> +	}								\
> +	static inline void trace_call__##name(proto)			\
> +	{								\
> +		might_fault();						\
> +		__do_trace_##name(args);				\
>  	}
>  
>  /*
> @@ -398,6 +407,8 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
>  #define __DECLARE_TRACE_COMMON(name, proto, args, data_proto)		\
>  	static inline void trace_##name(proto)				\
>  	{ }								\
> +	static inline void trace_call__##name(proto)			\
> +	{ }								\
>  	static inline int						\
>  	register_trace_##name(void (*probe)(data_proto),		\
>  			      void *data)				\
> -- 
> 2.53.0
> 


-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

* Re: [PATCHv4 bpf-next 25/25] selftests/bpf: Add tracing multi attach rollback tests
From: Jiri Olsa @ 2026-03-25 21:49 UTC (permalink / raw)
  To: Leon Hwang
  Cc: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko, bpf,
	linux-trace-kernel, Martin KaFai Lau, Eduard Zingerman, Song Liu,
	Yonghong Song, Menglong Dong, Steven Rostedt
In-Reply-To: <f222748c-69ce-4c63-826b-4f2d67fd4ec9@linux.dev>

On Wed, Mar 25, 2026 at 02:45:59PM +0800, Leon Hwang wrote:

SNIP

> > +static void test_rollback_unlink(void)
> > +{
> > +	struct tracing_multi_rollback *skel, *extra;
> > +	LIBBPF_OPTS(bpf_tracing_multi_opts, opts);
> > +	struct tracing_multi_rollback **fillers;
> > +	size_t cnt = FUNCS_CNT;
> > +	__u32 *ids = NULL;
> > +	int err, max;
> > +
> > +	max = get_bpf_max_tramp_links();
> > +	if (!ASSERT_GE(max, 1, "bpf_max_tramp_links"))
> > +		return;
> > +
> > +	/* Attach maximum allowed programs to bpf_fentry_test10 */
> > +	fillers = fillers_load_and_link(max);
> > +	if (!ASSERT_OK_PTR(fillers, "fillers_load_and_link"))
> > +		return;
> > +
> > +	extra = extra_load_and_link();
> > +	if (!ASSERT_OK_PTR(extra, "extra_load_and_link"))
> 
> Should cleanup fillers here?

yep, should jump to cleanup, thanks

jirka

^ permalink raw reply

* Re: [PATCHv4 bpf-next 13/25] bpf: Add support for tracing_multi link fdinfo
From: Jiri Olsa @ 2026-03-25 21:49 UTC (permalink / raw)
  To: Leon Hwang
  Cc: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko, bpf,
	linux-trace-kernel, Martin KaFai Lau, Eduard Zingerman, Song Liu,
	Yonghong Song, Menglong Dong, Steven Rostedt
In-Reply-To: <03163163-fae9-4980-8526-a1353cdabcfc@linux.dev>

On Wed, Mar 25, 2026 at 02:43:19PM +0800, Leon Hwang wrote:
> On 24/3/26 16:18, Jiri Olsa wrote:
> > Adding tracing_multi link fdinfo support with following output:
> > 
> > pos:    0
> > flags:  02000000
> > mnt_id: 19
> > ino:    3091
> > link_type:      tracing_multi
> > link_id:        382
> 
> Would better to add attach_type?
> 
> attach_type:	[fentry,fexit,fsession]_multi

that's seems ok, will add

thanks,
jirka

^ permalink raw reply

* Re: [PATCHv4 bpf-next 24/25] selftests/bpf: Add tracing multi attach benchmark test
From: Jiri Olsa @ 2026-03-25 21:48 UTC (permalink / raw)
  To: Leon Hwang
  Cc: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko, bpf,
	linux-trace-kernel, Martin KaFai Lau, Eduard Zingerman, Song Liu,
	Yonghong Song, Menglong Dong, Steven Rostedt
In-Reply-To: <7a119223-9994-4edc-af0b-f1ee9876cd20@linux.dev>

On Wed, Mar 25, 2026 at 02:45:31PM +0800, Leon Hwang wrote:

SNIP

> > +
> > +	attach_delta = (attach_end_ns - attach_start_ns) / 1000000000.0;
> > +	detach_delta = (detach_end_ns - detach_start_ns) / 1000000000.0;
> > +
> > +	printf("%s: found %lu functions\n", __func__, cnt);
> > +	printf("%s: attached in %7.3lfs\n", __func__, attach_delta);
> > +	printf("%s: detached in %7.3lfs\n", __func__, detach_delta);
> > +
> > +cleanup:
> > +	tracing_multi_bench__destroy(skel);
> > +	tdestroy(root, tdestroy_free_nop);
> > +	free_kallsyms_local(ksyms);
> > +	free(ids);
> 
> Is btf__free(btf) missing here? Since 'btf' was calloc inner
> btf__load_vmlinux_btf().

ah yea, will add, thanks

jirka

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox