* [PATCH v6 02/43] KVM: Rename KVM_GENERIC_MEMORY_ATTRIBUTES to KVM_VM_MEMORY_ATTRIBUTES
From: Ackerley Tng via B4 Relay @ 2026-05-07 20:22 UTC (permalink / raw)
To: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
ira.weiny, jmattson, jthoughton, michael.roth, oupton,
pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
steven.price, tabba, willy, wyihan, yan.y.zhao, forkloop,
pratyush, suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
Sean Christopherson, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka
Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
linux-mm, linux-coco, Ackerley Tng
In-Reply-To: <20260507-gmem-inplace-conversion-v6-0-91ab5a8b19a4@google.com>
From: Sean Christopherson <seanjc@google.com>
Rename the per-VM memory attributes Kconfig to make it explicitly about
per-VM attributes in anticipation of adding memory attributes support to
guest_memfd, at which point it will be possible (and desirable) to have
memory attributes without the per-VM support, even in x86.
No functional change intended.
Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
arch/x86/include/asm/kvm_host.h | 2 +-
arch/x86/kvm/Kconfig | 6 +++---
arch/x86/kvm/mmu/mmu.c | 2 +-
arch/x86/kvm/x86.c | 2 +-
include/linux/kvm_host.h | 8 ++++----
include/trace/events/kvm.h | 4 ++--
virt/kvm/Kconfig | 2 +-
virt/kvm/kvm_main.c | 14 +++++++-------
8 files changed, 20 insertions(+), 20 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c470e40a00aa4..60b997764beef 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -2369,7 +2369,7 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
int tdp_max_root_level, int tdp_huge_page_level);
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
#define kvm_arch_has_private_mem(kvm) ((kvm)->arch.has_private_mem)
#endif
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 801bf9e520db3..26f6afd51bbdc 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -84,7 +84,7 @@ config KVM_SW_PROTECTED_VM
bool "Enable support for KVM software-protected VMs"
depends on EXPERT
depends on KVM_X86 && X86_64
- select KVM_GENERIC_MEMORY_ATTRIBUTES
+ select KVM_VM_MEMORY_ATTRIBUTES
help
Enable support for KVM software-protected VMs. Currently, software-
protected VMs are purely a development and testing vehicle for
@@ -135,7 +135,7 @@ config KVM_INTEL_TDX
bool "Intel Trust Domain Extensions (TDX) support"
default y
depends on INTEL_TDX_HOST
- select KVM_GENERIC_MEMORY_ATTRIBUTES
+ select KVM_VM_MEMORY_ATTRIBUTES
select HAVE_KVM_ARCH_GMEM_POPULATE
help
Provides support for launching Intel Trust Domain Extensions (TDX)
@@ -159,7 +159,7 @@ config KVM_AMD_SEV
depends on KVM_AMD && X86_64
depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m)
select ARCH_HAS_CC_PLATFORM
- select KVM_GENERIC_MEMORY_ATTRIBUTES
+ select KVM_VM_MEMORY_ATTRIBUTES
select HAVE_KVM_ARCH_GMEM_PREPARE
select HAVE_KVM_ARCH_GMEM_INVALIDATE
select HAVE_KVM_ARCH_GMEM_POPULATE
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 892246204435c..a80a876ab4ad6 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -7899,7 +7899,7 @@ void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
vhost_task_stop(kvm->arch.nx_huge_page_recovery_thread);
}
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
int level)
{
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0a1b63c63d1a9..1560de1e95be0 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -13625,7 +13625,7 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm,
}
}
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
kvm_mmu_init_memslot_memory_attributes(kvm, slot);
#endif
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 4c14aee1fb063..7b9faa3545300 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -722,7 +722,7 @@ static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu)
}
#endif
-#ifndef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifndef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
static inline bool kvm_arch_has_private_mem(struct kvm *kvm)
{
return false;
@@ -871,7 +871,7 @@ struct kvm {
#ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
struct notifier_block pm_notifier;
#endif
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
/* Protected by slots_lock (for writes) and RCU (for reads) */
struct xarray mem_attr_array;
#endif
@@ -2528,7 +2528,7 @@ static inline bool kvm_memslot_is_gmem_only(const struct kvm_memory_slot *slot)
return slot->flags & KVM_MEMSLOT_GMEM_ONLY;
}
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
static inline unsigned long kvm_get_memory_attributes(struct kvm *kvm, gfn_t gfn)
{
return xa_to_value(xa_load(&kvm->mem_attr_array, gfn));
@@ -2550,7 +2550,7 @@ static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
{
return false;
}
-#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
+#endif /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
#ifdef CONFIG_KVM_GUEST_MEMFD
int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index b282e3a867696..1ba72bd73ea2f 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -358,7 +358,7 @@ TRACE_EVENT(kvm_dirty_ring_exit,
TP_printk("vcpu %d", __entry->vcpu_id)
);
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
/*
* @start: Starting address of guest memory range
* @end: End address of guest memory range
@@ -383,7 +383,7 @@ TRACE_EVENT(kvm_vm_set_mem_attributes,
TP_printk("%#016llx -- %#016llx [0x%lx]",
__entry->start, __entry->end, __entry->attr)
);
-#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
+#endif /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
TRACE_EVENT(kvm_unmap_hva_range,
TP_PROTO(unsigned long start, unsigned long end),
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 794976b88c6f9..5119cb37145fc 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -100,7 +100,7 @@ config KVM_ELIDE_TLB_FLUSH_IF_YOUNG
config KVM_MMU_LOCKLESS_AGING
bool
-config KVM_GENERIC_MEMORY_ATTRIBUTES
+config KVM_VM_MEMORY_ATTRIBUTES
bool
config KVM_GUEST_MEMFD
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 89489996fbc1e..306153abbafa5 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1115,7 +1115,7 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
spin_lock_init(&kvm->mn_invalidate_lock);
rcuwait_init(&kvm->mn_memslots_update_rcuwait);
xa_init(&kvm->vcpu_array);
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
xa_init(&kvm->mem_attr_array);
#endif
@@ -1300,7 +1300,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
cleanup_srcu_struct(&kvm->irq_srcu);
srcu_barrier(&kvm->srcu);
cleanup_srcu_struct(&kvm->srcu);
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
xa_destroy(&kvm->mem_attr_array);
#endif
kvm_arch_free_vm(kvm);
@@ -2418,7 +2418,7 @@ static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
}
#endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
static u64 kvm_supported_mem_attributes(struct kvm *kvm)
{
if (!kvm || kvm_arch_has_private_mem(kvm))
@@ -2623,7 +2623,7 @@ static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
return kvm_vm_set_mem_attributes(kvm, start, end, attrs->attributes);
}
-#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
+#endif /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
{
@@ -4921,7 +4921,7 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
case KVM_CAP_SYSTEM_EVENT_DATA:
case KVM_CAP_DEVICE_CTRL:
return 1;
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
case KVM_CAP_MEMORY_ATTRIBUTES:
return kvm_supported_mem_attributes(kvm);
#endif
@@ -5325,7 +5325,7 @@ static long kvm_vm_ioctl(struct file *filp,
break;
}
#endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
case KVM_SET_MEMORY_ATTRIBUTES: {
struct kvm_memory_attributes attrs;
@@ -5336,7 +5336,7 @@ static long kvm_vm_ioctl(struct file *filp,
r = kvm_vm_ioctl_set_mem_attributes(kvm, &attrs);
break;
}
-#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
+#endif /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
case KVM_CREATE_DEVICE: {
struct kvm_create_device cd;
--
2.54.0.563.g4f69b47b94-goog
^ permalink raw reply related
* [PATCH v6 01/43] KVM: guest_memfd: Introduce per-gmem attributes, use to guard user mappings
From: Ackerley Tng via B4 Relay @ 2026-05-07 20:22 UTC (permalink / raw)
To: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
ira.weiny, jmattson, jthoughton, michael.roth, oupton,
pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
steven.price, tabba, willy, wyihan, yan.y.zhao, forkloop,
pratyush, suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
Sean Christopherson, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka
Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
linux-mm, linux-coco, Ackerley Tng
In-Reply-To: <20260507-gmem-inplace-conversion-v6-0-91ab5a8b19a4@google.com>
From: Sean Christopherson <seanjc@google.com>
Start plumbing in guest_memfd support for in-place private<=>shared
conversions by tracking attributes via a maple tree. KVM currently tracks
private vs. shared attributes on a per-VM basis, which made sense when a
guest_memfd _only_ supported private memory, but tracking per-VM simply
can't work for in-place conversions as the shareability of a given page
needs to be per-gmem_inode, not per-VM.
Use the filemap invalidation lock to protect the maple tree, as taking the
lock for read when faulting in memory (for userspace or the guest) isn't
expected to result in meaningful contention, and using a separate lock
would add significant complexity (avoiding deadlock is quite difficult).
Signed-off-by: Sean Christopherson <seanjc@google.com>
Co-developed-by: Ackerley Tng <ackerleytng@google.com>
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
Co-developed-by: Vishal Annapurve <vannapurve@google.com>
Signed-off-by: Vishal Annapurve <vannapurve@google.com>
Co-developed-by: Fuad Tabba <tabba@google.com>
Signed-off-by: Fuad Tabba <tabba@google.com>
---
virt/kvm/guest_memfd.c | 133 +++++++++++++++++++++++++++++++++++++++++++------
1 file changed, 117 insertions(+), 16 deletions(-)
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 69c9d6d546b28..5011d38820d0d 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -4,6 +4,7 @@
#include <linux/falloc.h>
#include <linux/fs.h>
#include <linux/kvm_host.h>
+#include <linux/maple_tree.h>
#include <linux/mempolicy.h>
#include <linux/pseudo_fs.h>
#include <linux/pagemap.h>
@@ -33,6 +34,13 @@ struct gmem_inode {
struct list_head gmem_file_list;
u64 flags;
+ /*
+ * Every index in this inode, whether memory is populated or
+ * not, is tracked in attributes. The entire range of indices,
+ * corresponding to the size of this inode, is represented in
+ * this maple tree.
+ */
+ struct maple_tree attributes;
};
static __always_inline struct gmem_inode *GMEM_I(struct inode *inode)
@@ -60,6 +68,24 @@ static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn)
return gfn - slot->base_gfn + slot->gmem.pgoff;
}
+static u64 kvm_gmem_get_attributes(struct inode *inode, pgoff_t index)
+{
+ struct maple_tree *mt = &GMEM_I(inode)->attributes;
+ void *entry = mtree_load(mt, index);
+
+ return WARN_ON_ONCE(!entry) ? 0 : xa_to_value(entry);
+}
+
+static bool kvm_gmem_is_private_mem(struct inode *inode, pgoff_t index)
+{
+ return kvm_gmem_get_attributes(inode, index) & KVM_MEMORY_ATTRIBUTE_PRIVATE;
+}
+
+static bool kvm_gmem_is_shared_mem(struct inode *inode, pgoff_t index)
+{
+ return !kvm_gmem_is_private_mem(inode, index);
+}
+
static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
pgoff_t index, struct folio *folio)
{
@@ -397,10 +423,13 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode))
return VM_FAULT_SIGBUS;
- if (!(GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED))
- return VM_FAULT_SIGBUS;
+ filemap_invalidate_lock_shared(inode->i_mapping);
+ if (kvm_gmem_is_shared_mem(inode, vmf->pgoff))
+ folio = kvm_gmem_get_folio(inode, vmf->pgoff);
+ else
+ folio = ERR_PTR(-EACCES);
+ filemap_invalidate_unlock_shared(inode->i_mapping);
- folio = kvm_gmem_get_folio(inode, vmf->pgoff);
if (IS_ERR(folio)) {
if (PTR_ERR(folio) == -EAGAIN)
return VM_FAULT_RETRY;
@@ -556,6 +585,51 @@ bool __weak kvm_arch_supports_gmem_init_shared(struct kvm *kvm)
return true;
}
+static int kvm_gmem_init_inode(struct inode *inode, loff_t size, u64 flags)
+{
+ struct gmem_inode *gi = GMEM_I(inode);
+ MA_STATE(mas, &gi->attributes, 0, (size >> PAGE_SHIFT) - 1);
+ u64 attrs;
+ int r;
+
+ inode->i_op = &kvm_gmem_iops;
+ inode->i_mapping->a_ops = &kvm_gmem_aops;
+ inode->i_mode |= S_IFREG;
+ inode->i_size = size;
+ mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
+
+ /*
+ * guest_memfd memory is neither migratable nor swappable: set
+ * inaccessible to gate off both.
+ */
+ mapping_set_inaccessible(inode->i_mapping);
+ WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
+
+ gi->flags = flags;
+
+ mt_set_external_lock(&gi->attributes,
+ &inode->i_mapping->invalidate_lock);
+
+ /*
+ * Store default attributes for the entire gmem instance. Ensuring every
+ * index is represented in the maple tree at all times simplifies the
+ * conversion and merging logic.
+ */
+ attrs = gi->flags & GUEST_MEMFD_FLAG_INIT_SHARED ? 0 : KVM_MEMORY_ATTRIBUTE_PRIVATE;
+
+ /*
+ * Acquire the invalidation lock purely to make lockdep happy. The
+ * maple tree library expects all stores to be protected via the lock,
+ * and the library can't know when the tree is reachable only by the
+ * caller, as is the case here.
+ */
+ filemap_invalidate_lock(inode->i_mapping);
+ r = mas_store_gfp(&mas, xa_mk_value(attrs), GFP_KERNEL);
+ filemap_invalidate_unlock(inode->i_mapping);
+
+ return r;
+}
+
static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
{
static const char *name = "[kvm-gmem]";
@@ -586,16 +660,9 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
goto err_fops;
}
- inode->i_op = &kvm_gmem_iops;
- inode->i_mapping->a_ops = &kvm_gmem_aops;
- inode->i_mode |= S_IFREG;
- inode->i_size = size;
- mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
- mapping_set_inaccessible(inode->i_mapping);
- /* Unmovable mappings are supposed to be marked unevictable as well. */
- WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
-
- GMEM_I(inode)->flags = flags;
+ err = kvm_gmem_init_inode(inode, size, flags);
+ if (err)
+ goto err_inode;
file = alloc_file_pseudo(inode, kvm_gmem_mnt, name, O_RDWR, &kvm_gmem_fops);
if (IS_ERR(file)) {
@@ -797,9 +864,13 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
if (!file)
return -EFAULT;
+ filemap_invalidate_lock_shared(file_inode(file)->i_mapping);
+
folio = __kvm_gmem_get_pfn(file, slot, index, pfn, max_order);
- if (IS_ERR(folio))
- return PTR_ERR(folio);
+ if (IS_ERR(folio)) {
+ r = PTR_ERR(folio);
+ goto out;
+ }
if (!folio_test_uptodate(folio)) {
clear_highpage(folio_page(folio, 0));
@@ -815,6 +886,8 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
else
folio_put(folio);
+out:
+ filemap_invalidate_unlock_shared(file_inode(file)->i_mapping);
return r;
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn);
@@ -944,6 +1017,15 @@ static struct inode *kvm_gmem_alloc_inode(struct super_block *sb)
mpol_shared_policy_init(&gi->policy, NULL);
+ /*
+ * Memory attributes are protected by the filemap invalidation lock, but
+ * the lock structure isn't available at this time. Immediately mark
+ * maple tree as using external locking so that accessing the tree
+ * before it's fully initialized results in NULL pointer dereferences
+ * and not more subtle bugs.
+ */
+ mt_init_flags(&gi->attributes, MT_FLAGS_LOCK_EXTERN | MT_FLAGS_USE_RCU);
+
gi->flags = 0;
INIT_LIST_HEAD(&gi->gmem_file_list);
return &gi->vfs_inode;
@@ -951,7 +1033,26 @@ static struct inode *kvm_gmem_alloc_inode(struct super_block *sb)
static void kvm_gmem_destroy_inode(struct inode *inode)
{
- mpol_free_shared_policy(&GMEM_I(inode)->policy);
+ struct gmem_inode *gi = GMEM_I(inode);
+
+ mpol_free_shared_policy(&gi->policy);
+
+ /*
+ * Note! Checking for an empty tree is functionally necessary
+ * to avoid explosions if the tree hasn't been fully
+ * initialized, i.e. if the inode is being destroyed before
+ * guest_memfd can set the external lock, lockdep would find
+ * that the tree's internal ma_lock was not held.
+ */
+ if (!mtree_empty(&gi->attributes)) {
+ /*
+ * Acquire the invalidation lock purely to make lockdep happy,
+ * the inode is unreachable at this point.
+ */
+ filemap_invalidate_lock(inode->i_mapping);
+ __mt_destroy(&gi->attributes);
+ filemap_invalidate_unlock(inode->i_mapping);
+ }
}
static void kvm_gmem_free_inode(struct inode *inode)
--
2.54.0.563.g4f69b47b94-goog
^ permalink raw reply related
* [PATCH v6 00/43] guest_memfd: In-place conversion support
From: Ackerley Tng via B4 Relay @ 2026-05-07 20:22 UTC (permalink / raw)
To: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
ira.weiny, jmattson, jthoughton, michael.roth, oupton,
pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
steven.price, tabba, willy, wyihan, yan.y.zhao, forkloop,
pratyush, suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
Sean Christopherson, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka
Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
linux-mm, linux-coco, Ackerley Tng
This is v6 of guest_memfd in-place conversion support, now out of RFC.
Up till now, guest_memfd supports the entire inode worth of memory being
used as all-shared, or all-private. CoCo VMs may request guest memory to be
converted between private and shared states, and the only way to support
that currently would be to have the userspace VMM provide two sources of
backing memory from completely different areas of physical memory.
pKVM has a use case for in-place sharing: the guest and host may be
cooperating on given data, and pKVM doesn't protect data through
encryption, so copying that given data between different areas of physical
memory as part of conversions would be unnecessary work.
This series also serves as a foundation for guest_memfd huge page
support. Now, guest_memfd only supports PAGE_SIZE pages, so if two sources
of backing memory are used, the userspace VMM could maintain a steady total
memory utilized by punching out the pages that are not used. When huge
pages are available in guest_memfd, even if the backing memory source
supports hole punching within a huge page, punching out pages to maintain
the total memory utilized by a VM would be introducing lots of
fragmentation.
In-place conversion avoids fragmentation by allowing the same physical
memory to be used for both shared and private memory, with guest_memfd
tracking the shared/private status of all the pages at a per-page
granularity.
The central principle, which guest_memfd continues to uphold, is that any
guest-private page will not be mappable to host userspace. All pages will
be mmap()-able in host userspace, but accesses to guest-private pages (as
tracked by guest_memfd) will result in a SIGBUS.
This series introduces a guest_memfd ioctl (not kvm, vm or vcpu, but
guest_memfd ioctl) that allows userspace to set memory
attributes (shared/private) directly through the guest_memfd. This is the
appropriate interface because shared/private-ness is a property of memory
and hence the request should be sent directly to the memory provider -
guest_memfd.
Tested with both CONFIG_KVM_VM_MEMORY_ATTRIBUTES enabled and disabled:
+ tools/testing/selftests/kvm/guest_memfd_test.c
+ tools/testing/selftests/kvm/pre_fault_memory_test.c
+ tools/testing/selftests/kvm/x86/guest_memfd_conversions_test.c
+ tools/testing/selftests/kvm/x86/private_mem_conversions_test.c
+ tools/testing/selftests/kvm/x86/private_mem_conversions_test.sh
+ tools/testing/selftests/kvm/x86/private_mem_kvm_exits_test.c
Updates for this revision:
+ Dropped everything to do with content modes (ZERO/PRESERVE) (thanks
Michael and Sean!)
+ Discussed this at PUCK on 2026-05-06
+ guest_memfd was taking on too much complexity to try to paper
over differences in how trusted firmware handles conversions
+ Addressed Liam's comments about usage of the maple_tree
TODOs
+ Resolve issue where guest_memfd_conversions_test, which uses the
kselftest framework, doesn't perform teardown on assertion
failure. Please see proposal at [9]
+ Test with TDX selftests. We're in the process of rebasing TDX selftests
on this series and will post updates when that's tested.
I would like feedback on:
+ The use of private_mem_conversions_test.sh to run different options in
private_mem_conversions_test. If this makes sense, I'll adjust the
Makefile to have private_mem_conversions_test tested only via the script.
This series is based on kvm/next, and here's the tree for your convenience:
https://github.com/googleprodkernel/linux-cc/commits/guest_memfd-inplace-conversion-v6
Older series:
+ RFCv5 is at [8]
+ RFCv4 is at [7]
+ RFCv3 is at [6]
+ RFCv2 is at [5]
+ RFCv1 is at [4]
+ Previous versions of this feature, part of other series, are available at
[1][2][3].
[1] https://lore.kernel.org/all/bd163de3118b626d1005aa88e71ef2fb72f0be0f.1726009989.git.ackerleytng@google.com/
[2] https://lore.kernel.org/all/20250117163001.2326672-6-tabba@google.com/
[3] https://lore.kernel.org/all/b784326e9ccae6a08388f1bf39db70a2204bdc51.1747264138.git.ackerleytng@google.com/
[4] https://lore.kernel.org/all/cover.1760731772.git.ackerleytng@google.com/T/
[5] https://lore.kernel.org/all/cover.1770071243.git.ackerleytng@google.com/T/
[6] https://lore.kernel.org/r/20260313-gmem-inplace-conversion-v3-0-5fc12a70ec89@google.com/T/
[7] https://lore.kernel.org/all/20260326-gmem-inplace-conversion-v4-0-e202fe950ffd@google.com/T/
[8] https://lore.kernel.org/r/20260428-gmem-inplace-conversion-v5-0-d8608ccfca22@google.com
[9] https://lore.kernel.org/all/20260414-selftest-global-metadata-v1-0-fd223922bc57@google.com/T/
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
Ackerley Tng (25):
KVM: x86/mmu: Bug the VM if gmem attributes are queried to determine max mapping level
KVM: guest_memfd: Update kvm_gmem_populate() to use gmem attributes
KVM: guest_memfd: Only prepare folios for private pages
KVM: Move kvm_supported_mem_attributes() to kvm_host.h
KVM: guest_memfd: Add base support for KVM_SET_MEMORY_ATTRIBUTES2
KVM: guest_memfd: Ensure pages are not in use before conversion
KVM: guest_memfd: Call arch invalidate hooks on conversion
KVM: guest_memfd: Return early if range already has requested attributes
KVM: guest_memfd: Advertise KVM_SET_MEMORY_ATTRIBUTES2 ioctl
KVM: guest_memfd: Handle lru_add fbatch refcounts during conversion safety check
KVM: guest_memfd: Use actual size for invalidation in kvm_gmem_release()
KVM: guest_memfd: Determine invalidation filter from memory attributes
KVM: TDX: Make source page optional for KVM_TDX_INIT_MEM_REGION
KVM: selftests: Test basic single-page conversion flow
KVM: selftests: Test conversion flow when INIT_SHARED
KVM: selftests: Test conversion precision in guest_memfd
KVM: selftests: Test conversion before allocation
KVM: selftests: Convert with allocated folios in different layouts
KVM: selftests: Test that truncation does not change shared/private status
KVM: selftests: Test conversion with elevated page refcount
KVM: selftests: Reset shared memory after hole-punching
KVM: selftests: Provide function to look up guest_memfd details from gpa
KVM: selftests: Make TEST_EXPECT_SIGBUS thread-safe
KVM: selftests: Update private_mem_conversions_test to mmap() guest_memfd
KVM: selftests: Add script to exercise private_mem_conversions_test
Michael Roth (1):
KVM: SEV: Make 'uaddr' parameter optional for KVM_SEV_SNP_LAUNCH_UPDATE
Sean Christopherson (17):
KVM: guest_memfd: Introduce per-gmem attributes, use to guard user mappings
KVM: Rename KVM_GENERIC_MEMORY_ATTRIBUTES to KVM_VM_MEMORY_ATTRIBUTES
KVM: Enumerate support for PRIVATE memory iff kvm_arch_has_private_mem is defined
KVM: Stub in ability to disable per-VM memory attribute tracking
KVM: guest_memfd: Wire up kvm_get_memory_attributes() to per-gmem attributes
KVM: Move KVM_VM_MEMORY_ATTRIBUTES config definition to x86
KVM: Let userspace disable per-VM mem attributes, enable per-gmem attributes
KVM: guest_memfd: Enable INIT_SHARED on guest_memfd for x86 Coco VMs
KVM: selftests: Create gmem fd before "regular" fd when adding memslot
KVM: selftests: Rename guest_memfd{,_offset} to gmem_{fd,offset}
KVM: selftests: Add support for mmap() on guest_memfd in core library
KVM: selftests: Add selftests global for guest memory attributes capability
KVM: selftests: Add helpers for calling ioctls on guest_memfd
KVM: selftests: Test that shared/private status is consistent across processes
KVM: selftests: Provide common function to set memory attributes
KVM: selftests: Check fd/flags provided to mmap() when setting up memslot
KVM: selftests: Update private memory exits test to work with per-gmem attributes
Documentation/virt/kvm/api.rst | 78 +++-
.../virt/kvm/x86/amd-memory-encryption.rst | 15 +-
Documentation/virt/kvm/x86/intel-tdx.rst | 4 +
arch/x86/include/asm/kvm_host.h | 2 +-
arch/x86/kvm/Kconfig | 15 +-
arch/x86/kvm/mmu/mmu.c | 13 +-
arch/x86/kvm/svm/sev.c | 18 +-
arch/x86/kvm/vmx/tdx.c | 11 +-
arch/x86/kvm/x86.c | 13 +-
include/linux/kvm_host.h | 53 ++-
include/trace/events/kvm.h | 4 +-
include/uapi/linux/kvm.h | 16 +
mm/swap.c | 2 +
tools/testing/selftests/kvm/Makefile.kvm | 5 +
tools/testing/selftests/kvm/include/kvm_util.h | 138 +++++-
tools/testing/selftests/kvm/include/test_util.h | 34 +-
.../selftests/kvm/kvm_has_gmem_attributes.c | 17 +
tools/testing/selftests/kvm/lib/kvm_util.c | 133 +++---
tools/testing/selftests/kvm/lib/test_util.c | 7 -
.../kvm/x86/guest_memfd_conversions_test.c | 487 +++++++++++++++++++++
.../kvm/x86/private_mem_conversions_test.c | 53 ++-
.../kvm/x86/private_mem_conversions_test.sh | 128 ++++++
.../selftests/kvm/x86/private_mem_kvm_exits_test.c | 36 +-
virt/kvm/Kconfig | 3 +-
virt/kvm/guest_memfd.c | 457 +++++++++++++++++--
virt/kvm/kvm_main.c | 82 +++-
26 files changed, 1636 insertions(+), 188 deletions(-)
---
base-commit: 6d35786de28116ecf78797a62b84e6bf3c45aa5a
change-id: 20260225-gmem-inplace-conversion-bd0dbd39753a
Best regards,
--
Ackerley Tng <ackerleytng@google.com>
^ permalink raw reply
* Re: [PATCH RFC v5 01/53] KVM: guest_memfd: Introduce per-gmem attributes, use to guard user mappings
From: Ackerley Tng @ 2026-05-07 18:31 UTC (permalink / raw)
To: Liam R. Howlett
Cc: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
ira.weiny, jmattson, jthoughton, michael.roth, oupton,
pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
steven.price, tabba, willy, wyihan, yan.y.zhao, forkloop,
pratyush, suzuki.poulose, aneesh.kumar, Paolo Bonzini,
Sean Christopherson, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka,
kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
linux-mm, linux-coco
In-Reply-To: <byywwfin2aenobdwuesqihm6nzxyx6ecedwgbt7f5tvaaul6fi@u7bmexpavwdn>
"Liam R. Howlett" <liam@infradead.org> writes:
>
> [...snip...]
>
>> +static u64 kvm_gmem_get_attributes(struct inode *inode, pgoff_t index)
>> +{
>> + struct maple_tree *mt = &GMEM_I(inode)->attributes;
>> + void *entry = mtree_load(mt, index);
>> +
>> + /*
>> + * The lock _must_ be held for lookups, as some maple tree operations,
>> + * e.g. append, are unsafe (return inaccurate information) with respect
>> + * to concurrent RCU-protected lookups.
>> + */
>
> Can you please elaborate how you see inaccurate information and which
> information is inaccurate?
>
> Your comment is incorrect and misleading as append will not be used in
> rcu mode. Note that you have not set this tree up in rcu mode.
>
My bad. Thanks for clarifying about usage of rcu mode.
>> + lockdep_assert(mt_lock_is_held(mt));
>> +
In the next revision I'll remove this lockdep and use RCU mode, and
kvm_gmem_get_memory_attributes() should get a stable result.
The other lookups using mt_for_each() in kvm_gmem_range_has_attributes()
and kvm_gmem_get_invalidate_filter() retain the lockdep since those
operate over multiple ranges. Those are called from paths that require
holding the lock to exclude other operations anyway, so the lockdep
requirement does not cost anything more.
>> + return WARN_ON_ONCE(!entry) ? 0 : xa_to_value(entry);
>> +}
>> +
>>
>> [...snip...]
>>
^ permalink raw reply
* CONFIG_FTRACE=y significantly slowing down module loading
From: Hans de Goede @ 2026-05-07 17:33 UTC (permalink / raw)
To: Steven Rostedt, Masami Hiramatsu; +Cc: Linux trace kernel, Mario Limonciello
Hi Everyone,
On some old hw CONFIG_FTRACE=y causes "modprobe amdgpu" to take
25-30 seconds, halting the entire boot process for that time.
Whereas with "# CONFIG_FTRACE is not set" this takes only 2 seconds.
I believe this was reported before by Mario (in the Cc) and there was
a fix of sorts merged for 6.15? But there still is a significant
slowdown (especially on slow hardware).
I was also the reason Mario originally reported this, and Mario
asked me to test 6.15, but I was busy with other stuff, so from
the better-late-than-never department, I only got around to this now...
I've tested this on 2 admittedly quite old and slow machines,
but the impact is big enough that fixing this might also help
speeding up modprobe on modern machines.
This was tested on:
MSI S270 laptop, AMD Turion 64 MT-30 CPU, RS480 iGPU
FUJITSU D3003-S2 industrial PC, AMD G-T56N CPU, Radeon HD 6320 iGPU
Both old machines where amdgpu.ko auto-loads.
Kernel/modprobe real-/sys-time/FTRACE Kconfig enabled/machine
6.19.6 32.5s/31.8s yes MSI S270
7.0.0 25.5s/22.8s yes FUJITSU D3003-S2
6.15-rc7 27.7s/26.5s yes FUJITSU D3003-S2
7.0.0 1.9s/1.1s no FUJITSU D3003-S2
Regards,
Hans
^ permalink raw reply
* Re: [PATCH v1 1/2] spi: qcom-geni: trace: Add trace events for Qualcomm GENI SPI
From: Praveen Talari @ 2026-05-07 17:33 UTC (permalink / raw)
To: Mark Brown
Cc: Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers, linux-kernel,
linux-trace-kernel, linux-arm-msm, linux-spi,
MukeshKumarSavaliyamukesh.savaliya, AniketRandiveaniket.randive,
chandana.chiluveru, jyothi.seerapu
In-Reply-To: <afxJmZ9MkP5eJkQC@sirena.co.uk>
Hi Mark,
On 07-05-2026 13:43, Mark Brown wrote:
> On Thu, May 07, 2026 at 08:58:02AM +0530, Praveen Talari wrote:
>> On 07-05-2026 06:32, Mark Brown wrote:
>>> At least these feel like they really should be generic events, there
>>> hopefully isn't anything driver specific about them.
>> Initially implemented as a generic event; however, splitting into separate
>> TX and RX events may be more appropriate.
>> Which approach would you prefer?
> By generic I mean this should not be driver specific at all.
I hope these changes are fine. Please let me know if you have any
concerns or feedback.
diff --git a/drivers/spi/spi-geni-qcom.c b/drivers/spi/spi-geni-qcom.c
index 4da888359cfc..9abb5f4f719b 100644
--- a/drivers/spi/spi-geni-qcom.c
+++ b/drivers/spi/spi-geni-qcom.c
@@ -1,6 +1,8 @@
// SPDX-License-Identifier: GPL-2.0
// Copyright (c) 2017-2018, The Linux foundation. All rights reserved.
+#include <trace/events/spi.h>
+
#define CREATE_TRACE_POINTS
#include <trace/events/qcom_geni_spi.h>
@@ -709,6 +711,7 @@ static unsigned int geni_byte_per_fifo_word(struct
spi_geni_master *mas)
static bool geni_spi_handle_tx(struct spi_geni_master *mas)
{
+ struct spi_controller *spi = dev_get_drvdata(mas->dev);
struct geni_se *se = &mas->se;
unsigned int max_bytes;
const u8 *tx_buf;
@@ -739,7 +742,7 @@ static bool geni_spi_handle_tx(struct
spi_geni_master *mas)
iowrite32_rep(se->base + SE_GENI_TX_FIFOn, &fifo_word, 1);
}
mas->tx_rem_bytes -= max_bytes;
- trace_geni_spi_tx_data(mas->dev, tx_buf, max_bytes,
mas->tx_rem_bytes);
+ trace_spi_tx_data(spi->cur_msg->spi, tx_buf, max_bytes,
mas->tx_rem_bytes);
if (!mas->tx_rem_bytes) {
writel(0, se->base + SE_GENI_TX_WATERMARK_REG);
return false;
@@ -749,6 +752,7 @@ static bool geni_spi_handle_tx(struct
spi_geni_master *mas)
static void geni_spi_handle_rx(struct spi_geni_master *mas)
{
+ struct spi_controller *spi = dev_get_drvdata(mas->dev);
struct geni_se *se = &mas->se;
u32 rx_fifo_status;
unsigned int rx_bytes;
@@ -790,7 +794,7 @@ static void geni_spi_handle_rx(struct
spi_geni_master *mas)
}
mas->rx_rem_bytes -= rx_bytes;
- trace_geni_spi_rx_data(mas->dev, rx_buf, rx_bytes,
mas->rx_rem_bytes);
+ trace_spi_rx_data(spi->cur_msg->spi, rx_buf, rx_bytes,
mas->rx_rem_bytes);
}
static int setup_se_xfer(struct spi_transfer *xfer,
diff --git a/include/trace/events/spi.h b/include/trace/events/spi.h
index e63d4a24d879..4907625e019d 100644
--- a/include/trace/events/spi.h
+++ b/include/trace/events/spi.h
@@ -233,6 +233,53 @@ DEFINE_EVENT(spi_transfer, spi_transfer_stop,
);
+DECLARE_EVENT_CLASS(spi_data,
+
+ TP_PROTO(struct spi_device *spi, const u8 *buf, unsigned int len,
+ unsigned int rem),
+
+ TP_ARGS(spi, buf, len, rem),
+
+ TP_STRUCT__entry(
+ __field( int, bus_num )
+ __field( int, chip_select )
+ __field( unsigned int, len )
+ __field( unsigned int, rem )
+ __dynamic_array(u8, data, len )
+ ),
+
+ TP_fast_assign(
+ __entry->bus_num = spi->controller->bus_num;
+ __entry->chip_select = spi_get_chipselect(spi, 0);
+ __entry->len = len;
+ __entry->rem = rem;
+ memcpy(__get_dynamic_array(data), buf, len);
+ ),
+
+ TP_printk("spi%d.%d len=%u rem=%u data=%s",
+ __entry->bus_num, __entry->chip_select,
+ __entry->len, __entry->rem,
+ __print_hex(__get_dynamic_array(data), __entry->len))
+);
+
+DEFINE_EVENT(spi_data, spi_tx_data,
+
+ TP_PROTO(struct spi_device *spi, const u8 *buf, unsigned int len,
+ unsigned int rem),
+
+ TP_ARGS(spi, buf, len, rem)
+
+);
+
+DEFINE_EVENT(spi_data, spi_rx_data,
+
+ TP_PROTO(struct spi_device *spi, const u8 *buf, unsigned int len,
+ unsigned int rem),
+
+ TP_ARGS(spi, buf, len, rem)
+
+);
+
#endif /* _TRACE_POWER_H */
Thanks,
Praveen Talari
^ permalink raw reply related
* Re: [PATCH RFC v5 10/53] KVM: guest_memfd: Add basic support for KVM_SET_MEMORY_ATTRIBUTES2
From: Ackerley Tng @ 2026-05-07 16:56 UTC (permalink / raw)
To: Liam R. Howlett
Cc: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
ira.weiny, jmattson, jthoughton, michael.roth, oupton,
pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
steven.price, tabba, willy, wyihan, yan.y.zhao, forkloop,
pratyush, suzuki.poulose, aneesh.kumar, Paolo Bonzini,
Sean Christopherson, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka,
kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
linux-mm, linux-coco
In-Reply-To: <n5ce32wumzeiqqyqutom4apy2kqfetbvusc6j4k2xarsska5mw@klp5bmy7qhfm>
"Liam R. Howlett" <liam@infradead.org> writes:
> On 26/04/28 04:25PM, Ackerley Tng via B4 Relay wrote:
>>
>> [...snip...]
>>
>> +/*
>> + * Preallocate memory for attributes to be stored on a maple tree, pointed to
>> + * by mas. Adjacent ranges with attributes identical to the new attributes
>> + * will be merged. Also sets mas's bounds up for storing attributes.
>> + *
>> + * This maintains the invariant that ranges with the same attributes will
>> + * always be merged.
>> + */
>> +static int kvm_gmem_mas_preallocate(struct ma_state *mas, u64 attributes,
>> + pgoff_t start, size_t nr_pages)
>> +{
>> + pgoff_t end = start + nr_pages;
>> + pgoff_t last = end - 1;
>> + void *entry;
>> +
>> + /* Try extending range. entry is NULL on overflow/wrap-around. */
>> + mas_set_range(mas, end, end);
>> + entry = mas_find(mas, end);
Thank you for your reviews!
>
> Please read the documentation as I believe you have a bug here. What
> happens if there is another range stored higher than end + 1?
>
The invariant in this maple tree is that contiguous ranges with the same
attribute are stored as a single range.
The goal of this first part is to get the entry at the index just after
the requested range, and see what the attribute there is. If that
attribute is what we're about to set, extend the requested range for
storing to the end of that range.
If there is another range higher than end + 1, with the invariant
maintained, that attribute has to be different than the attribute stored
at end. Hence, we only want to extend this requested range up till end.
> Do you have testing of these functions somewhere?
>
GMEM_CONVERSION_MULTIPAGE_TEST_INIT_SHARED(indexing, 4) tests setting
attributes in ranges. If test_page is 2,
1. [0, 4) starts off shared (4 is the number of pages in the guest_memfd)
2. [2, 3) is converted to private
=> so the ranges should now be [0, 2), [2, 3), [3, 4)
3. [2, 3) is converted back to shared
=> so the ranges should now be [0, 4)
I verified this by inserting some trace_printk()s and inspecting manually.
>> + if (entry && xa_to_value(entry) == attributes)
>> + last = mas->last;
>> +
>> + if (start > 0) {
>> + mas_set_range(mas, start - 1, start - 1);
>> + entry = mas_find(mas, start - 1);
>> + if (entry && xa_to_value(entry) == attributes)
>> + start = mas->index;
>> + }
>> +
>> + mas_set_range(mas, start, last);
>> + return mas_preallocate(mas, xa_mk_value(attributes), GFP_KERNEL);
>> +}
>> +
>>
>> [...snip...]
>>
^ permalink raw reply
* [PATCH] test_kprobes: clear kprobes between test runs
From: Martin Kaiser @ 2026-05-07 13:44 UTC (permalink / raw)
To: Naveen N Rao, Masami Hiramatsu
Cc: Steven Rostedt, linux-trace-kernel, linux-kernel, Martin Kaiser
Running the kprobes sanity tests twice makes all tests fail and
eventually crashes the kernel.
[root@martin-riscv-1 ~]# echo 1 > /sys/kernel/debug/kunit/kprobes_test/run
...
# Totals: pass:5 fail:0 skip:0 total:5
ok 1 kprobes_test
[root@martin-riscv-1 ~]# echo 1 > /sys/kernel/debug/kunit/kprobes_test/run
...
# test_kprobe: EXPECTATION FAILED at lib/tests/test_kprobes.c:64
Expected 0 == register_kprobe(&kp), but
register_kprobe(&kp) == -22 (0xffffffffffffffea)
...
Unable to handle kernel paging request ...
The testsuite defines several kprobes and kretprobes as static variables
that are preserved across test runs.
After register_kprobe and unregister_kprobe, a kprobe contains some
leftover data that must be cleared before the kprobe can be registered
again. The tests are setting symbol_name to define the probe location.
Address and flags must be cleared.
The existing code clears some of the probes between subsequent tests, but
not between two test runs. The leftover data from a previous test run
makes the registrations fail in the next run.
Move the cleanups for all kprobes into kprobes_test_init. This function
is called before each single test (including the first test of a test
run).
Signed-off-by: Martin Kaiser <martin@kaiser.cx>
---
lib/tests/test_kprobes.c | 29 ++++++++++++++++++-----------
1 file changed, 18 insertions(+), 11 deletions(-)
diff --git a/lib/tests/test_kprobes.c b/lib/tests/test_kprobes.c
index b7582010125c..06e729e4de05 100644
--- a/lib/tests/test_kprobes.c
+++ b/lib/tests/test_kprobes.c
@@ -12,6 +12,12 @@
#define div_factor 3
+#define KP_CLEAR(_kp) \
+do { \
+ (_kp).addr = NULL; \
+ (_kp).flags = 0; \
+} while (0)
+
static u32 rand1, preh_val, posth_val;
static u32 (*target)(u32 value);
static u32 (*recursed_target)(u32 value);
@@ -125,10 +131,6 @@ static void test_kprobes(struct kunit *test)
current_test = test;
- /* addr and flags should be cleard for reusing kprobe. */
- kp.addr = NULL;
- kp.flags = 0;
-
KUNIT_EXPECT_EQ(test, 0, register_kprobes(kps, 2));
preh_val = 0;
posth_val = 0;
@@ -226,9 +228,6 @@ static void test_kretprobes(struct kunit *test)
struct kretprobe *rps[2] = {&rp, &rp2};
current_test = test;
- /* addr and flags should be cleard for reusing kprobe. */
- rp.kp.addr = NULL;
- rp.kp.flags = 0;
KUNIT_EXPECT_EQ(test, 0, register_kretprobes(rps, 2));
krph_val = 0;
@@ -290,8 +289,6 @@ static void test_stacktrace_on_kretprobe(struct kunit *test)
unsigned long myretaddr = (unsigned long)__builtin_return_address(0);
current_test = test;
- rp3.kp.addr = NULL;
- rp3.kp.flags = 0;
/*
* Run the stacktrace_driver() to record correct return address in
@@ -352,8 +349,6 @@ static void test_stacktrace_on_nested_kretprobe(struct kunit *test)
struct kretprobe *rps[2] = {&rp3, &rp4};
current_test = test;
- rp3.kp.addr = NULL;
- rp3.kp.flags = 0;
//KUNIT_ASSERT_NE(test, myretaddr, stacktrace_driver());
@@ -367,6 +362,18 @@ static void test_stacktrace_on_nested_kretprobe(struct kunit *test)
static int kprobes_test_init(struct kunit *test)
{
+ KP_CLEAR(kp);
+ KP_CLEAR(kp2);
+ KP_CLEAR(kp_missed);
+#ifdef CONFIG_KRETPROBES
+ KP_CLEAR(rp.kp);
+ KP_CLEAR(rp2.kp);
+#ifdef CONFIG_ARCH_CORRECT_STACKTRACE_ON_KRETPROBE
+ KP_CLEAR(rp3.kp);
+ KP_CLEAR(rp4.kp);
+#endif
+#endif
+
target = kprobe_target;
target2 = kprobe_target2;
recursed_target = kprobe_recursed_target;
--
2.43.7
^ permalink raw reply related
* Re: [PATCH v1 1/2] spi: qcom-geni: trace: Add trace events for Qualcomm GENI SPI
From: Mark Brown @ 2026-05-07 8:13 UTC (permalink / raw)
To: Praveen Talari
Cc: Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers, linux-kernel,
linux-trace-kernel, linux-arm-msm, linux-spi,
MukeshKumarSavaliyamukesh.savaliya, AniketRandiveaniket.randive,
chandana.chiluveru, jyothi.seerapu
In-Reply-To: <e4651363-7c1c-4ae0-a97b-b64841424c83@oss.qualcomm.com>
[-- Attachment #1: Type: text/plain, Size: 456 bytes --]
On Thu, May 07, 2026 at 08:58:02AM +0530, Praveen Talari wrote:
> On 07-05-2026 06:32, Mark Brown wrote:
> > At least these feel like they really should be generic events, there
> > hopefully isn't anything driver specific about them.
> Initially implemented as a generic event; however, splitting into separate
> TX and RX events may be more appropriate.
> Which approach would you prefer?
By generic I mean this should not be driver specific at all.
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]
^ permalink raw reply
* [PATCH 1/4] eventfs/tracing: fix typo in a comment
From: Martin Kaiser @ 2026-05-07 8:09 UTC (permalink / raw)
To: Steven Rostedt, Masami Hiramatsu
Cc: linux-trace-kernel, linux-kernel, Martin Kaiser
In-Reply-To: <20260507081041.885781-1-martin@kaiser.cx>
Fix a typo ("eventfs files") in a comment.
Signed-off-by: Martin Kaiser <martin@kaiser.cx>
---
include/linux/tracefs.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/include/linux/tracefs.h b/include/linux/tracefs.h
index d03f74658716..bc354d340046 100644
--- a/include/linux/tracefs.h
+++ b/include/linux/tracefs.h
@@ -30,7 +30,7 @@ struct eventfs_file;
* @data: data to pass to the created file ops
* @fops: the file operations of the created file
*
- * The evetnfs files are dynamically created. The struct eventfs_entry array
+ * The eventfs files are dynamically created. The struct eventfs_entry array
* is passed to eventfs_create_dir() or eventfs_create_events_dir() that will
* be used to create the files within those directories. When a lookup
* or access to a file within the directory is made, the struct eventfs_entry
--
2.43.7
^ permalink raw reply related
* [PATCH 4/4] tracing: trace_fprobe: fix typo in function name
From: Martin Kaiser @ 2026-05-07 8:09 UTC (permalink / raw)
To: Steven Rostedt, Masami Hiramatsu
Cc: linux-trace-kernel, linux-kernel, Martin Kaiser
In-Reply-To: <20260507081041.885781-1-martin@kaiser.cx>
The function name should be __register_tracepoint_fprobe.
Signed-off-by: Martin Kaiser <martin@kaiser.cx>
---
kernel/trace/trace_fprobe.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c
index 9f5f08c0e7c2..4d1abbf66229 100644
--- a/kernel/trace/trace_fprobe.c
+++ b/kernel/trace/trace_fprobe.c
@@ -764,7 +764,7 @@ static int unregister_fprobe_event(struct trace_fprobe *tf)
return trace_probe_unregister_event_call(&tf->tp);
}
-static int __regsiter_tracepoint_fprobe(struct trace_fprobe *tf)
+static int __register_tracepoint_fprobe(struct trace_fprobe *tf)
{
struct tracepoint_user *tuser __free(tuser_put) = NULL;
struct module *mod __free(module_put) = NULL;
@@ -836,7 +836,7 @@ static int __register_trace_fprobe(struct trace_fprobe *tf)
tf->fp.flags &= ~FPROBE_FL_DISABLED;
if (trace_fprobe_is_tracepoint(tf))
- return __regsiter_tracepoint_fprobe(tf);
+ return __register_tracepoint_fprobe(tf);
/* TODO: handle filter, nofilter or symbol list */
return register_fprobe(&tf->fp, tf->symbol, NULL);
--
2.43.7
^ permalink raw reply related
* [PATCH 3/4] tracing: probes: fix typo in a log message
From: Martin Kaiser @ 2026-05-07 8:09 UTC (permalink / raw)
To: Steven Rostedt, Masami Hiramatsu
Cc: linux-trace-kernel, linux-kernel, Martin Kaiser
In-Reply-To: <20260507081041.885781-1-martin@kaiser.cx>
Fix a typo ("Invalid $-variable") in a log message.
Signed-off-by: Martin Kaiser <martin@kaiser.cx>
---
kernel/trace/trace_probe.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 262d8707a3df..df68d40de161 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -509,7 +509,7 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
C(NO_RETVAL, "This function returns 'void' type"), \
C(BAD_STACK_NUM, "Invalid stack number"), \
C(BAD_ARG_NUM, "Invalid argument number"), \
- C(BAD_VAR, "Invalid $-valiable specified"), \
+ C(BAD_VAR, "Invalid $-variable specified"), \
C(BAD_REG_NAME, "Invalid register name"), \
C(BAD_MEM_ADDR, "Invalid memory address"), \
C(BAD_IMM, "Invalid immediate value"), \
--
2.43.7
^ permalink raw reply related
* [PATCH 0/4] trace: trivial: fix some typos
From: Martin Kaiser @ 2026-05-07 8:09 UTC (permalink / raw)
To: Steven Rostedt, Masami Hiramatsu
Cc: linux-trace-kernel, linux-kernel, Martin Kaiser
Some trivial typo fixes for the tracing code.
Martin Kaiser (4):
eventfs/tracing: fix typo in a comment
perf/core: fix typo in a comment
tracing: probes: fix typo in a log message
tracing: trace_fprobe: fix typo in function name
include/linux/tracefs.h | 2 +-
kernel/events/core.c | 2 +-
kernel/trace/trace_fprobe.c | 4 ++--
kernel/trace/trace_probe.h | 2 +-
4 files changed, 5 insertions(+), 5 deletions(-)
--
2.43.7
^ permalink raw reply
* [PATCH 2/4] perf/core: fix typo in a comment
From: Martin Kaiser @ 2026-05-07 8:09 UTC (permalink / raw)
To: Steven Rostedt, Masami Hiramatsu
Cc: linux-trace-kernel, linux-kernel, Martin Kaiser
In-Reply-To: <20260507081041.885781-1-martin@kaiser.cx>
Fix a typo ("swevents") in a comment.
Signed-off-by: Martin Kaiser <martin@kaiser.cx>
---
kernel/events/core.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index bcaf175f3ded..2ca27547bb74 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -11444,7 +11444,7 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
/*
* Here use the same on-stack perf_sample_data,
* some members in data are event-specific and
- * need to be re-computed for different sweveents.
+ * need to be re-computed for different swevents.
* Re-initialize data->sample_flags safely to avoid
* the problem that next event skips preparing data
* because data->sample_flags is set.
--
2.43.7
^ permalink raw reply related
* [PATCH v2] fprobe: Fix unregister_fprobe() to wait for RCU grace period
From: Masami Hiramatsu (Google) @ 2026-05-07 7:46 UTC (permalink / raw)
To: Steven Rostedt, Masami Hiramatsu, Alexei Starovoitov,
Daniel Borkmann, Andrii Nakryiko, Jiri Olsa
Cc: Mathieu Desnoyers, linux-kernel, linux-trace-kernel, bpf
From: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Commit 4346ba1604093 ("fprobe: Rewrite fprobe on function-graph tracer")
changed fprobe to register struct fprobe to an rcu-hlist, but it forgot
to wait for RCU GP. Thus there can be a use-after-free if the fprobe is
released right after unregistering. This can happen with the fprobe
event and sample module code.
To fix this issue, add synchronize_rcu() in unregister_fprobe().
Note that BPF is OK because fprobe is used as a part of
bpf_kprobe_multi_link. This unregisters its fprobe in
bpf_kprobe_multi_link_release() and it is deallocated via
bpf_kprobe_multi_link_dealloc(), which is invoked from
bpf_link_defer_dealloc_rcu_gp() RCU callback.
For BPF, this also introduces unregister_fprobe_async() which does
NOT wait for an RCU grace period.
Fixes: 4346ba1604093 ("fprobe: Rewrite fprobe on function-graph tracer")
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
Changes from v1 [1]:
- Rewrite with async API.
- Apply async API only to BPF.
[1] https://lore.kernel.org/all/177729179863.401400.6063130067239479972.stgit@mhiramat.tok.corp.google.com/
---
include/linux/fprobe.h | 5 +++++
kernel/trace/bpf_trace.c | 3 ++-
kernel/trace/fprobe.c | 23 +++++++++++++++++++++--
3 files changed, 28 insertions(+), 3 deletions(-)
diff --git a/include/linux/fprobe.h b/include/linux/fprobe.h
index 0a3bcd1718f3..be1b38c981d4 100644
--- a/include/linux/fprobe.h
+++ b/include/linux/fprobe.h
@@ -94,6 +94,7 @@ int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter
int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num);
int register_fprobe_syms(struct fprobe *fp, const char **syms, int num);
int unregister_fprobe(struct fprobe *fp);
+int unregister_fprobe_async(struct fprobe *fp);
bool fprobe_is_registered(struct fprobe *fp);
int fprobe_count_ips_from_filter(const char *filter, const char *notfilter);
#else
@@ -113,6 +114,10 @@ static inline int unregister_fprobe(struct fprobe *fp)
{
return -EOPNOTSUPP;
}
+static inline int unregister_fprobe_async(struct fprobe *fp)
+{
+ return -EOPNOTSUPP;
+}
static inline bool fprobe_is_registered(struct fprobe *fp)
{
return false;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index af7079aa0f36..a02bd258677e 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -2384,7 +2384,8 @@ static void bpf_kprobe_multi_link_release(struct bpf_link *link)
struct bpf_kprobe_multi_link *kmulti_link;
kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link);
- unregister_fprobe(&kmulti_link->fp);
+ /* Don't wait for RCU GP here. */
+ unregister_fprobe_async(&kmulti_link->fp);
kprobe_multi_put_modules(kmulti_link->mods, kmulti_link->mods_cnt);
}
diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index cc49ebd2a773..f378613ad120 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -1093,14 +1093,15 @@ static int unregister_fprobe_nolock(struct fprobe *fp)
}
/**
- * unregister_fprobe() - Unregister fprobe.
+ * unregister_fprobe_async() - Unregister fprobe without RCU GP wait
* @fp: A fprobe data structure to be unregistered.
*
* Unregister fprobe (and remove ftrace hooks from the function entries).
+ * This function will NOT wait until the fprobe is no longer used.
*
* Return 0 if @fp is unregistered successfully, -errno if not.
*/
-int unregister_fprobe(struct fprobe *fp)
+int unregister_fprobe_async(struct fprobe *fp)
{
guard(mutex)(&fprobe_mutex);
if (!fp || !fprobe_registered(fp))
@@ -1108,6 +1109,24 @@ int unregister_fprobe(struct fprobe *fp)
return unregister_fprobe_nolock(fp);
}
+
+/**
+ * unregister_fprobe() - Unregister fprobe with RCU GP wait
+ * @fp: A fprobe data structure to be unregistered.
+ *
+ * Unregister fprobe (and remove ftrace hooks from the function entries).
+ * This function will block until the fprobe is no longer used.
+ *
+ * Return 0 if @fp is unregistered successfully, -errno if not.
+ */
+int unregister_fprobe(struct fprobe *fp)
+{
+ int ret = unregister_fprobe_async(fp);
+
+ if (!ret)
+ synchronize_rcu();
+ return ret;
+}
EXPORT_SYMBOL_GPL(unregister_fprobe);
static int __init fprobe_initcall(void)
^ permalink raw reply related
* Re: [PATCH v19 0/7] ring-buffer: Making persistent ring buffers robust
From: Masami Hiramatsu @ 2026-05-07 4:14 UTC (permalink / raw)
To: Steven Rostedt
Cc: Catalin Marinas, Will Deacon, Mathieu Desnoyers, linux-kernel,
linux-trace-kernel, Ian Rogers, linux-arm-kernel
In-Reply-To: <20260502181619.7f5003dc@robin>
On Sat, 2 May 2026 18:17:06 -0400
Steven Rostedt <rostedt@kernel.org> wrote:
> On Sat, 2 May 2026 15:23:04 -0400
> Steven Rostedt <rostedt@goodmis.org> wrote:
>
> > Hi Masami,
> >
> > I applied your patches and enabled your ptracingtest code. I noticed
> > that when there's dropped pages, the trace output is not in order:
> >
> > # trace-cmd start -B ptracingtest -e all -v -e '*lock*'
> > # taskset -c 5 echo c > /proc/sysrq-trigger
> >
> > On reboot, I ran:
> >
> > # trace-cmd show -B ptracingtest > /tmp/trace.out
> >
> > Then executed the attached perl program:
> >
> > # ./read-ts.pl < /tmp/trace.out
> >
> > And it errors out:
> >
> > 30.212495 < 30.213534
> > <...>-1048 [005] d.... 30.212495: irq_enable: caller=irqentry_exit+0xf5/0x710 parent=0x0
> >
> > That is, I think the zero timestamps may be messing with the order.
> >
>
> Ah, I think I found the problem. The iterator needs the same logic you
> added for the consuming read:
>
> diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
> index 7bfbed0ac90c..90a7fa772fe3 100644
> --- a/kernel/trace/ring_buffer.c
> +++ b/kernel/trace/ring_buffer.c
> @@ -6105,12 +6105,14 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
> struct ring_buffer_per_cpu *cpu_buffer;
> struct ring_buffer_event *event;
> int nr_loops = 0;
> + int max_loops;
>
> if (ts)
> *ts = 0;
>
> cpu_buffer = iter->cpu_buffer;
> buffer = cpu_buffer->buffer;
> + max_loops = cpu_buffer->ring_meta ? cpu_buffer->nr_pages : 3;
>
> /*
> * Check if someone performed a consuming read to the buffer
> @@ -6133,7 +6135,7 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
> * the ring buffer with an active write as the consumer is.
> * Do not warn if the three failures is reached.
> */
> - if (++nr_loops > 3)
> + if (++nr_loops > max_loops)
> return NULL;
>
> if (rb_per_cpu_empty(cpu_buffer))
>
>
> I'll test this some more, and make a proper patch.
Ah, indeed. Thanks for fixing!
BTW, shouldn't we unify the common logic of those functions?
Thank you,
>
> -- Steve
>
--
Masami Hiramatsu (Google) <mhiramat@kernel.org>
^ permalink raw reply
* Re: [PATCH RFC v5 10/53] KVM: guest_memfd: Add basic support for KVM_SET_MEMORY_ATTRIBUTES2
From: Liam R. Howlett @ 2026-05-07 3:46 UTC (permalink / raw)
To: ackerleytng
Cc: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
ira.weiny, jmattson, jthoughton, michael.roth, oupton,
pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
steven.price, tabba, willy, wyihan, yan.y.zhao, forkloop,
pratyush, suzuki.poulose, aneesh.kumar, Paolo Bonzini,
Sean Christopherson, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka,
kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
linux-mm, linux-coco
In-Reply-To: <20260428-gmem-inplace-conversion-v5-10-d8608ccfca22@google.com>
On 26/04/28 04:25PM, Ackerley Tng via B4 Relay wrote:
> From: Ackerley Tng <ackerleytng@google.com>
>
> Introduce basic support for KVM_SET_MEMORY_ATTRIBUTES2 in guest_memfd,
> which just updates attributes tracked by guest_memfd.
>
> Validate input fields in general. Guard usage of KVM_SET_MEMORY_ATTRIBUTES2
> by making sure requested attributes are supported for this instance of kvm.
>
> A new KVM_SET_MEMORY_ATTRIBUTES2 is defined to support writes (unlike
> KVM_SET_MEMORY_ATTRIBUTES) in addition to reads so it can provide error
> details to userspace. This will be used in a later patch.
>
> The two ioctls use their corresponding structs with no overlap, but
> backward compatibility is baked in for future support of
> KVM_SET_MEMORY_ATTRIBUTES2 and struct kvm_memory_attributes2 in the VM
> ioctl.
>
> The process of setting memory attributes is set up such that the later half
> will not fail due to allocation. Any necessary checks are performed before
> the point of no return.
>
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>
> Co-developed-by: Vishal Annapurve <vannapurve@google.com>
> Signed-off-by: Vishal Annapurve <vannapurve@google.com>
> Co-developed-by: Sean Christopherson <seanjc@google.com>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
> include/uapi/linux/kvm.h | 13 ++++++
> virt/kvm/Kconfig | 1 +
> virt/kvm/guest_memfd.c | 114 +++++++++++++++++++++++++++++++++++++++++++++++
> virt/kvm/kvm_main.c | 12 +++++
> 4 files changed, 140 insertions(+)
>
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 6c8afa2047bf3..e6bbf68a83813 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -1648,6 +1648,19 @@ struct kvm_memory_attributes {
> __u64 flags;
> };
>
> +#define KVM_SET_MEMORY_ATTRIBUTES2 _IOWR(KVMIO, 0xd2, struct kvm_memory_attributes2)
> +
> +struct kvm_memory_attributes2 {
> + union {
> + __u64 address;
> + __u64 offset;
> + };
> + __u64 size;
> + __u64 attributes;
> + __u64 flags;
> + __u64 reserved[12];
> +};
> +
> #define KVM_MEMORY_ATTRIBUTE_PRIVATE (1ULL << 3)
>
> #define KVM_CREATE_GUEST_MEMFD _IOWR(KVMIO, 0xd4, struct kvm_create_guest_memfd)
> diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
> index 3fea89c45cfb4..e371e079e2c50 100644
> --- a/virt/kvm/Kconfig
> +++ b/virt/kvm/Kconfig
> @@ -109,6 +109,7 @@ config KVM_VM_MEMORY_ATTRIBUTES
>
> config KVM_GUEST_MEMFD
> select XARRAY_MULTI
> + select KVM_MEMORY_ATTRIBUTES
> bool
>
> config HAVE_KVM_ARCH_GMEM_PREPARE
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index 506219e2359eb..9a26eca717047 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -552,11 +552,125 @@ unsigned long kvm_gmem_get_memory_attributes(struct kvm *kvm, gfn_t gfn)
> }
> EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_memory_attributes);
>
> +/*
> + * Preallocate memory for attributes to be stored on a maple tree, pointed to
> + * by mas. Adjacent ranges with attributes identical to the new attributes
> + * will be merged. Also sets mas's bounds up for storing attributes.
> + *
> + * This maintains the invariant that ranges with the same attributes will
> + * always be merged.
> + */
> +static int kvm_gmem_mas_preallocate(struct ma_state *mas, u64 attributes,
> + pgoff_t start, size_t nr_pages)
> +{
> + pgoff_t end = start + nr_pages;
> + pgoff_t last = end - 1;
> + void *entry;
> +
> + /* Try extending range. entry is NULL on overflow/wrap-around. */
> + mas_set_range(mas, end, end);
> + entry = mas_find(mas, end);
Please read the documentation as I believe you have a bug here. What
happens if there is another range stored higher than end + 1?
Do you have testing of these functions somewhere?
> + if (entry && xa_to_value(entry) == attributes)
> + last = mas->last;
> +
> + if (start > 0) {
> + mas_set_range(mas, start - 1, start - 1);
> + entry = mas_find(mas, start - 1);
> + if (entry && xa_to_value(entry) == attributes)
> + start = mas->index;
> + }
> +
> + mas_set_range(mas, start, last);
> + return mas_preallocate(mas, xa_mk_value(attributes), GFP_KERNEL);
> +}
> +
> +static int __kvm_gmem_set_attributes(struct inode *inode, pgoff_t start,
> + size_t nr_pages, uint64_t attrs)
> +{
> + struct address_space *mapping = inode->i_mapping;
> + struct gmem_inode *gi = GMEM_I(inode);
> + pgoff_t end = start + nr_pages;
> + struct maple_tree *mt;
> + struct ma_state mas;
> + int r;
> +
> + mt = &gi->attributes;
> +
> + filemap_invalidate_lock(mapping);
> +
> + mas_init(&mas, mt, start);
> + r = kvm_gmem_mas_preallocate(&mas, attrs, start, nr_pages);
> + if (r)
> + goto out;
> +
> + /*
> + * From this point on guest_memfd has performed necessary
> + * checks and can proceed to do guest-breaking changes.
> + */
> +
> + kvm_gmem_invalidate_begin(inode, start, end);
> + mas_store_prealloc(&mas, xa_mk_value(attrs));
> + kvm_gmem_invalidate_end(inode, start, end);
> +out:
> + filemap_invalidate_unlock(mapping);
> + return r;
> +}
> +
> +static long kvm_gmem_set_attributes(struct file *file, void __user *argp)
> +{
> + struct gmem_file *f = file->private_data;
> + struct inode *inode = file_inode(file);
> + struct kvm_memory_attributes2 attrs;
> + size_t nr_pages;
> + pgoff_t index;
> + int i;
> +
> + if (copy_from_user(&attrs, argp, sizeof(attrs)))
> + return -EFAULT;
> +
> + if (attrs.flags)
> + return -EINVAL;
> + for (i = 0; i < ARRAY_SIZE(attrs.reserved); i++) {
> + if (attrs.reserved[i])
> + return -EINVAL;
> + }
> + if (attrs.attributes & ~kvm_supported_mem_attributes(f->kvm))
> + return -EINVAL;
> + if (attrs.size == 0 || attrs.offset + attrs.size < attrs.offset)
> + return -EINVAL;
> + if (!PAGE_ALIGNED(attrs.offset) || !PAGE_ALIGNED(attrs.size))
> + return -EINVAL;
> +
> + if (attrs.offset >= i_size_read(inode) ||
> + attrs.offset + attrs.size > i_size_read(inode))
> + return -EINVAL;
> +
> + nr_pages = attrs.size >> PAGE_SHIFT;
> + index = attrs.offset >> PAGE_SHIFT;
> + return __kvm_gmem_set_attributes(inode, index, nr_pages,
> + attrs.attributes);
> +}
> +
> +static long kvm_gmem_ioctl(struct file *file, unsigned int ioctl,
> + unsigned long arg)
> +{
> + switch (ioctl) {
> + case KVM_SET_MEMORY_ATTRIBUTES2:
> + if (vm_memory_attributes)
> + return -ENOTTY;
> +
> + return kvm_gmem_set_attributes(file, (void __user *)arg);
> + default:
> + return -ENOTTY;
> + }
> +}
> +
> static struct file_operations kvm_gmem_fops = {
> .mmap = kvm_gmem_mmap,
> .open = generic_file_open,
> .release = kvm_gmem_release,
> .fallocate = kvm_gmem_fallocate,
> + .unlocked_ioctl = kvm_gmem_ioctl,
> };
>
> static int kvm_gmem_migrate_folio(struct address_space *mapping,
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index ff20e63143642..4d7bf52b7b717 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -110,6 +110,18 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(STATIC_CALL_KEY(__kvm_get_memory_attributes));
> EXPORT_SYMBOL_FOR_KVM_INTERNAL(STATIC_CALL_TRAMP(__kvm_get_memory_attributes));
> #endif
>
> +#define MEMORY_ATTRIBUTES_MATCH(one, two) \
> + static_assert(offsetof(struct kvm_memory_attributes, one) == \
> + offsetof(struct kvm_memory_attributes2, two)); \
> + static_assert(sizeof_field(struct kvm_memory_attributes, one) ==\
> + sizeof_field(struct kvm_memory_attributes2, two))
> +
> +/* Ensure the common parts of the two structs are identical. */
> +MEMORY_ATTRIBUTES_MATCH(address, address);
> +MEMORY_ATTRIBUTES_MATCH(size, size);
> +MEMORY_ATTRIBUTES_MATCH(attributes, attributes);
> +MEMORY_ATTRIBUTES_MATCH(flags, flags);
> +
> /*
> * Ordering of locks:
> *
>
> --
> 2.54.0.545.g6539524ca2-goog
>
>
>
^ permalink raw reply
* Re: [PATCH RFC v5 01/53] KVM: guest_memfd: Introduce per-gmem attributes, use to guard user mappings
From: Liam R. Howlett @ 2026-05-07 3:34 UTC (permalink / raw)
To: ackerleytng
Cc: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
ira.weiny, jmattson, jthoughton, michael.roth, oupton,
pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
steven.price, tabba, willy, wyihan, yan.y.zhao, forkloop,
pratyush, suzuki.poulose, aneesh.kumar, Paolo Bonzini,
Sean Christopherson, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka,
kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
linux-mm, linux-coco
In-Reply-To: <20260428-gmem-inplace-conversion-v5-1-d8608ccfca22@google.com>
On 26/04/28 04:24PM, Ackerley Tng via B4 Relay wrote:
> From: Sean Christopherson <seanjc@google.com>
>
> Start plumbing in guest_memfd support for in-place private<=>shared
> conversions by tracking attributes via a maple tree. KVM currently tracks
> private vs. shared attributes on a per-VM basis, which made sense when a
> guest_memfd _only_ supported private memory, but tracking per-VM simply
> can't work for in-place conversions as the shareability of a given page
> needs to be per-gmem_inode, not per-VM.
>
> Use the filemap invalidation lock to protect the maple tree, as taking the
> lock for read when faulting in memory (for userspace or the guest) isn't
> expected to result in meaningful contention, and using a separate lock
> would add significant complexity (avoiding deadlock is quite difficult).
>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> Co-developed-by: Ackerley Tng <ackerleytng@google.com>
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>
> Co-developed-by: Vishal Annapurve <vannapurve@google.com>
> Signed-off-by: Vishal Annapurve <vannapurve@google.com>
> Co-developed-by: Fuad Tabba <tabba@google.com>
> Signed-off-by: Fuad Tabba <tabba@google.com>
> ---
> virt/kvm/guest_memfd.c | 139 +++++++++++++++++++++++++++++++++++++++++++------
> 1 file changed, 123 insertions(+), 16 deletions(-)
>
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index 69c9d6d546b28..17e5a23fec0a1 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -4,6 +4,7 @@
> #include <linux/falloc.h>
> #include <linux/fs.h>
> #include <linux/kvm_host.h>
> +#include <linux/maple_tree.h>
> #include <linux/mempolicy.h>
> #include <linux/pseudo_fs.h>
> #include <linux/pagemap.h>
> @@ -33,6 +34,12 @@ struct gmem_inode {
> struct list_head gmem_file_list;
>
> u64 flags;
> + /*
> + * Every index in this inode, whether memory is populated or
> + * not, is tracked in attributes. There are no gaps in this
> + * maple tree.
> + */
> + struct maple_tree attributes;
> };
>
> static __always_inline struct gmem_inode *GMEM_I(struct inode *inode)
> @@ -60,6 +67,31 @@ static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn)
> return gfn - slot->base_gfn + slot->gmem.pgoff;
> }
>
> +static u64 kvm_gmem_get_attributes(struct inode *inode, pgoff_t index)
> +{
> + struct maple_tree *mt = &GMEM_I(inode)->attributes;
> + void *entry = mtree_load(mt, index);
> +
> + /*
> + * The lock _must_ be held for lookups, as some maple tree operations,
> + * e.g. append, are unsafe (return inaccurate information) with respect
> + * to concurrent RCU-protected lookups.
> + */
Can you please elaborate how you see inaccurate information and which
information is inaccurate?
Your comment is incorrect and misleading as append will not be used in
rcu mode. Note that you have not set this tree up in rcu mode.
> + lockdep_assert(mt_lock_is_held(mt));
> +
> + return WARN_ON_ONCE(!entry) ? 0 : xa_to_value(entry);
> +}
> +
> +static bool kvm_gmem_is_private_mem(struct inode *inode, pgoff_t index)
> +{
> + return kvm_gmem_get_attributes(inode, index) & KVM_MEMORY_ATTRIBUTE_PRIVATE;
> +}
> +
> +static bool kvm_gmem_is_shared_mem(struct inode *inode, pgoff_t index)
> +{
> + return !kvm_gmem_is_private_mem(inode, index);
> +}
> +
> static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
> pgoff_t index, struct folio *folio)
> {
> @@ -397,10 +429,13 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
> if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode))
> return VM_FAULT_SIGBUS;
>
> - if (!(GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED))
> - return VM_FAULT_SIGBUS;
> + filemap_invalidate_lock_shared(inode->i_mapping);
> + if (kvm_gmem_is_shared_mem(inode, vmf->pgoff))
> + folio = kvm_gmem_get_folio(inode, vmf->pgoff);
> + else
> + folio = ERR_PTR(-EACCES);
> + filemap_invalidate_unlock_shared(inode->i_mapping);
>
> - folio = kvm_gmem_get_folio(inode, vmf->pgoff);
> if (IS_ERR(folio)) {
> if (PTR_ERR(folio) == -EAGAIN)
> return VM_FAULT_RETRY;
> @@ -556,6 +591,51 @@ bool __weak kvm_arch_supports_gmem_init_shared(struct kvm *kvm)
> return true;
> }
>
> +static int kvm_gmem_init_inode(struct inode *inode, loff_t size, u64 flags)
> +{
> + struct gmem_inode *gi = GMEM_I(inode);
> + MA_STATE(mas, &gi->attributes, 0, (size >> PAGE_SHIFT) - 1);
> + u64 attrs;
> + int r;
> +
> + inode->i_op = &kvm_gmem_iops;
> + inode->i_mapping->a_ops = &kvm_gmem_aops;
> + inode->i_mode |= S_IFREG;
> + inode->i_size = size;
> + mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
> +
> + /*
> + * guest_memfd memory is neither migratable nor swappable: set
> + * inaccessible to gate off both.
> + */
> + mapping_set_inaccessible(inode->i_mapping);
> + WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
> +
> + gi->flags = flags;
> +
> + mt_set_external_lock(&gi->attributes,
> + &inode->i_mapping->invalidate_lock);
> +
> + /*
> + * Store default attributes for the entire gmem instance. Ensuring every
> + * index is represented in the maple tree at all times simplifies the
> + * conversion and merging logic.
> + */
> + attrs = gi->flags & GUEST_MEMFD_FLAG_INIT_SHARED ? 0 : KVM_MEMORY_ATTRIBUTE_PRIVATE;
> +
> + /*
> + * Acquire the invalidation lock purely to make lockdep happy. The
> + * maple tree library expects all stores to be protected via the lock,
> + * and the library can't know when the tree is reachable only by the
> + * caller, as is the case here.
> + */
> + filemap_invalidate_lock(inode->i_mapping);
> + r = mas_store_gfp(&mas, xa_mk_value(attrs), GFP_KERNEL);
> + filemap_invalidate_unlock(inode->i_mapping);
> +
> + return r;
> +}
> +
> static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
> {
> static const char *name = "[kvm-gmem]";
> @@ -586,16 +666,9 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
> goto err_fops;
> }
>
> - inode->i_op = &kvm_gmem_iops;
> - inode->i_mapping->a_ops = &kvm_gmem_aops;
> - inode->i_mode |= S_IFREG;
> - inode->i_size = size;
> - mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
> - mapping_set_inaccessible(inode->i_mapping);
> - /* Unmovable mappings are supposed to be marked unevictable as well. */
> - WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
> -
> - GMEM_I(inode)->flags = flags;
> + err = kvm_gmem_init_inode(inode, size, flags);
> + if (err)
> + goto err_inode;
>
> file = alloc_file_pseudo(inode, kvm_gmem_mnt, name, O_RDWR, &kvm_gmem_fops);
> if (IS_ERR(file)) {
> @@ -797,9 +870,13 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
> if (!file)
> return -EFAULT;
>
> + filemap_invalidate_lock_shared(file_inode(file)->i_mapping);
> +
> folio = __kvm_gmem_get_pfn(file, slot, index, pfn, max_order);
> - if (IS_ERR(folio))
> - return PTR_ERR(folio);
> + if (IS_ERR(folio)) {
> + r = PTR_ERR(folio);
> + goto out;
> + }
>
> if (!folio_test_uptodate(folio)) {
> clear_highpage(folio_page(folio, 0));
> @@ -815,6 +892,8 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
> else
> folio_put(folio);
>
> +out:
> + filemap_invalidate_unlock_shared(file_inode(file)->i_mapping);
> return r;
> }
> EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn);
> @@ -944,6 +1023,15 @@ static struct inode *kvm_gmem_alloc_inode(struct super_block *sb)
>
> mpol_shared_policy_init(&gi->policy, NULL);
>
> + /*
> + * Memory attributes are protected by the filemap invalidation lock, but
> + * the lock structure isn't available at this time. Immediately mark
> + * maple tree as using external locking so that accessing the tree
> + * before it's fully initialized results in NULL pointer dereferences
> + * and not more subtle bugs.
> + */
> + mt_init_flags(&gi->attributes, MT_FLAGS_LOCK_EXTERN);
> +
> gi->flags = 0;
> INIT_LIST_HEAD(&gi->gmem_file_list);
> return &gi->vfs_inode;
> @@ -951,7 +1039,26 @@ static struct inode *kvm_gmem_alloc_inode(struct super_block *sb)
>
> static void kvm_gmem_destroy_inode(struct inode *inode)
> {
> - mpol_free_shared_policy(&GMEM_I(inode)->policy);
> + struct gmem_inode *gi = GMEM_I(inode);
> +
> + mpol_free_shared_policy(&gi->policy);
> +
> + /*
> + * Note! Checking for an empty tree is functionally necessary
> + * to avoid explosions if the tree hasn't been fully
> + * initialized, i.e. if the inode is being destroyed before
> + * guest_memfd can set the external lock, lockdep would find
> + * that the tree's internal ma_lock was not held.
> + */
> + if (!mtree_empty(&gi->attributes)) {
> + /*
> + * Acquire the invalidation lock purely to make lockdep happy,
> + * the inode is unreachable at this point.
> + */
> + filemap_invalidate_lock(inode->i_mapping);
> + __mt_destroy(&gi->attributes);
> + filemap_invalidate_unlock(inode->i_mapping);
> + }
> }
>
> static void kvm_gmem_free_inode(struct inode *inode)
>
> --
> 2.54.0.545.g6539524ca2-goog
>
>
>
^ permalink raw reply
* Re: [PATCH v1 2/2] spi: qcom-geni: Add trace events for Qualcomm GENI SPI driver
From: Praveen Talari @ 2026-05-07 3:28 UTC (permalink / raw)
To: Mark Brown
Cc: Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers, linux-kernel,
linux-trace-kernel, linux-arm-msm, linux-spi,
MukeshKumarSavaliyamukesh.savaliya, AniketRandiveaniket.randive,
chandana.chiluveru, jyothi.seerapu
In-Reply-To: <afvltFJzKLaO9weP@sirena.co.uk>
Hi Mark,
On 07-05-2026 06:37, Mark Brown wrote:
> On Wed, May 06, 2026 at 10:59:43PM +0530, Praveen Talari wrote:
>
>> @@ -717,6 +726,7 @@ static bool geni_spi_handle_tx(struct spi_geni_master *mas)
>> max_bytes = mas->tx_rem_bytes;
>>
>> tx_buf = mas->cur_xfer->tx_buf + mas->cur_xfer->len - mas->tx_rem_bytes;
>> +
>> while (i < max_bytes) {
>> unsigned int j;
>> unsigned int bytes_to_write;
> Unrelated whitespace change.
Oh, that's my bad. I will fix it in the next patch set.
Thanks,
Praveen Talari
^ permalink raw reply
* Re: [PATCH v1 1/2] spi: qcom-geni: trace: Add trace events for Qualcomm GENI SPI
From: Praveen Talari @ 2026-05-07 3:28 UTC (permalink / raw)
To: Mark Brown
Cc: Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers, linux-kernel,
linux-trace-kernel, linux-arm-msm, linux-spi,
MukeshKumarSavaliyamukesh.savaliya, AniketRandiveaniket.randive,
chandana.chiluveru, jyothi.seerapu
In-Reply-To: <afvkiT50ZUEXZ-YO@sirena.co.uk>
Hi Mark,
On 07-05-2026 06:32, Mark Brown wrote:
> On Wed, May 06, 2026 at 10:59:42PM +0530, Praveen Talari wrote:
>> Add tracepoint support to the Qualcomm GENI SPI driver to provide
>> runtime visibility into driver behavior without requiring invasive debug
>> patches.
>> +TRACE_EVENT(geni_spi_tx_data,
>> +TRACE_EVENT(geni_spi_rx_data,
> At least these feel like they really should be generic events, there
> hopefully isn't anything driver specific about them.
I initially implemented this as a generic event; however, splitting it
into separate TX and RX events may be more appropriate.
Which approach would you prefer?
Thanks,
Praveen Talari
^ permalink raw reply
* Re: [PATCH] fprobe: Add unregister_fprobe_sync() for synchronous unregistration
From: Masami Hiramatsu @ 2026-05-07 2:02 UTC (permalink / raw)
To: Masami Hiramatsu (Google)
Cc: Steven Rostedt, Mathieu Desnoyers, Jonathan Corbet, linux-kernel,
linux-trace-kernel, linux-doc
In-Reply-To: <177729179863.401400.6063130067239479972.stgit@mhiramat.tok.corp.google.com>
Hi,
On Mon, 27 Apr 2026 21:09:58 +0900
"Masami Hiramatsu (Google)" <mhiramat@kernel.org> wrote:
> From: Masami Hiramatsu (Google) <mhiramat@kernel.org>
>
> Currently, unregister_fprobe() removes the ftrace hooks but does not
> wait for the RCU grace period to expire. This is efficient for batch
> unregistration of multiple fprobes (to avoid multiple RCU grace period
> latencies), but it leaves a window where probe handlers might still be
> running on other CPUs after the function returns.
> If a caller needs to free the fprobe structure or unload the module
> immediately after unregistration, they must manually call
> synchronize_rcu() to prevent use-after-free issues.
>
> To simplify this use case, introduce unregister_fprobe_sync(). This
> function unregisters the fprobe and waits for the RCU grace period to
> complete before returning.
BTW, would it be better to synchronize by default, the same as kprobes
does and as the current documentation says?
Considering the need for consistency in behavior with the kprobes
interfaces, it might be better to make the asynchronous API the separate one.
The first patch would fix the current issue (the behavior differs from
the documentation and other APIs), and a following patch would then introduce
an asynchronous version (the same as the current implementation) for internal use.
This can provide a mental model with more consistent behavior across *probes.
Thank you,
>
> Also, update the documentation of unregister_fprobe() to clarify its
> non-blocking behavior and suggest using unregister_fprobe_sync() for the
> last probe in a batch. Finally, update the fprobe sample module to use
> the synchronous version on exit to ensure safe module unloading.
> And add a fix to use synchronous version in the sample code and
> trace_fprobe (unexpected error case).
>
> Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
> ---
> Documentation/trace/fprobe.rst | 15 ++++++++++++---
> include/linux/fprobe.h | 5 +++++
> kernel/trace/fprobe.c | 30 ++++++++++++++++++++++++++++++
> kernel/trace/trace_fprobe.c | 9 +++++++--
> samples/fprobe/fprobe_example.c | 2 +-
> 5 files changed, 55 insertions(+), 6 deletions(-)
>
> diff --git a/Documentation/trace/fprobe.rst b/Documentation/trace/fprobe.rst
> index 95998b189ae3..eee4860ab29a 100644
> --- a/Documentation/trace/fprobe.rst
> +++ b/Documentation/trace/fprobe.rst
> @@ -65,6 +65,12 @@ To disable (remove from functions) this fprobe, call::
>
> unregister_fprobe(&fp);
>
> +Or if you need to wait for the RCU grace period to ensure no handlers
> +are running on any CPU (e.g., before freeing the `fprobe` structure),
> +use::
> +
> + unregister_fprobe_sync(&fp);
> +
> You can temporally (soft) disable the fprobe by::
>
> disable_fprobe(&fp);
> @@ -81,9 +87,12 @@ Same as ftrace, the registered callbacks will start being called some time
> after the register_fprobe() is called and before it returns. See
> Documentation/trace/ftrace.rst.
>
> -Also, the unregister_fprobe() will guarantee that both enter and exit
> -handlers are no longer being called by functions after unregister_fprobe()
> -returns as same as unregister_ftrace_function().
> +Also, the `unregister_fprobe_sync()` will guarantee that both enter and exit
> +handlers are no longer being called by functions after it returns.
> +On the other hand, `unregister_fprobe()` does not wait for the RCU grace period,
> +so handlers might still be running on other CPUs for a short time after it returns.
> +This is useful when you unregister multiple fprobes in a batch to avoid
> +waiting for the RCU grace period for each one.
>
> The fprobe entry/exit handler
> =============================
> diff --git a/include/linux/fprobe.h b/include/linux/fprobe.h
> index 0a3bcd1718f3..6ae452e250a1 100644
> --- a/include/linux/fprobe.h
> +++ b/include/linux/fprobe.h
> @@ -94,6 +94,7 @@ int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter
> int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num);
> int register_fprobe_syms(struct fprobe *fp, const char **syms, int num);
> int unregister_fprobe(struct fprobe *fp);
> +int unregister_fprobe_sync(struct fprobe *fp);
> bool fprobe_is_registered(struct fprobe *fp);
> int fprobe_count_ips_from_filter(const char *filter, const char *notfilter);
> #else
> @@ -113,6 +114,10 @@ static inline int unregister_fprobe(struct fprobe *fp)
> {
> return -EOPNOTSUPP;
> }
> +static inline int unregister_fprobe_sync(struct fprobe *fp)
> +{
> + return -EOPNOTSUPP;
> +}
> static inline bool fprobe_is_registered(struct fprobe *fp)
> {
> return false;
> diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
> index cc49ebd2a773..5f3e48385a47 100644
> --- a/kernel/trace/fprobe.c
> +++ b/kernel/trace/fprobe.c
> @@ -1097,6 +1097,9 @@ static int unregister_fprobe_nolock(struct fprobe *fp)
> * @fp: A fprobe data structure to be unregistered.
> *
> * Unregister fprobe (and remove ftrace hooks from the function entries).
> + * Note: This function does not wait for RCU grace period, since user
> + * may use several fprobes (and then unregister them one by one). In that
> + * case, it is recommended to use unregister_fprobe_sync() for the last fprobe.
> *
> * Return 0 if @fp is unregistered successfully, -errno if not.
> */
> @@ -1110,6 +1113,33 @@ int unregister_fprobe(struct fprobe *fp)
> }
> EXPORT_SYMBOL_GPL(unregister_fprobe);
>
> +/**
> + * unregister_fprobe_sync() - Unregister fprobe synchronously with RCU grace period.
> + * @fp: A fprobe data structure to be unregistered.
> + *
> + * Unregister fprobe (and remove ftrace hooks from the function entries) and
> + * wait for the RCU grace period to finish. This is useful for preventing
> + * the fprobe from being used after it is unregistered.
> + *
> + * Return 0 if @fp is unregistered successfully, -errno if not.
> + */
> +int unregister_fprobe_sync(struct fprobe *fp)
> +{
> + int ret;
> +
> + guard(mutex)(&fprobe_mutex);
> + if (!fp || !fprobe_registered(fp))
> + return -EINVAL;
> +
> + ret = unregister_fprobe_nolock(fp);
> + if (ret)
> + return ret;
> +
> + synchronize_rcu();
> + return 0;
> +}
> +EXPORT_SYMBOL_GPL(unregister_fprobe_sync);
> +
> static int __init fprobe_initcall(void)
> {
> rhltable_init(&fprobe_ip_table, &fprobe_rht_params);
> diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c
> index 9f5f08c0e7c2..fa5b41f7f306 100644
> --- a/kernel/trace/trace_fprobe.c
> +++ b/kernel/trace/trace_fprobe.c
> @@ -845,8 +845,13 @@ static int __register_trace_fprobe(struct trace_fprobe *tf)
> /* Internal unregister function - just handle fprobe and flags */
> static void __unregister_trace_fprobe(struct trace_fprobe *tf)
> {
> - if (trace_fprobe_is_registered(tf))
> - unregister_fprobe(&tf->fp);
> + /*
> + * Here, @tf must NOT be busy, so it MUST be unregistered already.
> + * But if it is unexpectedly registered, unregister it synchronously.
> + */
> + if (WARN_ON_ONCE(trace_fprobe_is_registered(tf)))
> + unregister_fprobe_sync(&tf->fp);
> +
> if (tf->tuser) {
> tracepoint_user_put(tf->tuser);
> tf->tuser = NULL;
> diff --git a/samples/fprobe/fprobe_example.c b/samples/fprobe/fprobe_example.c
> index bfe98ce826f3..382d2f67672a 100644
> --- a/samples/fprobe/fprobe_example.c
> +++ b/samples/fprobe/fprobe_example.c
> @@ -142,7 +142,7 @@ static int __init fprobe_init(void)
>
> static void __exit fprobe_exit(void)
> {
> - unregister_fprobe(&sample_probe);
> + unregister_fprobe_sync(&sample_probe);
>
> pr_info("fprobe at %s unregistered. %ld times hit, %ld times missed\n",
> symbol, nhit, sample_probe.nmissed);
>
--
Masami Hiramatsu (Google) <mhiramat@kernel.org>
^ permalink raw reply
* Re: [PATCH] fprobe: Add unregister_fprobe_sync() for synchronous unregistration
From: Masami Hiramatsu @ 2026-05-07 1:48 UTC (permalink / raw)
To: Steven Rostedt
Cc: Mathieu Desnoyers, Jonathan Corbet, linux-kernel,
linux-trace-kernel, linux-doc
In-Reply-To: <20260428142736.11f5211a@gandalf.local.home>
On Tue, 28 Apr 2026 14:27:36 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:
> On Mon, 27 Apr 2026 21:09:58 +0900
> "Masami Hiramatsu (Google)" <mhiramat@kernel.org> wrote:
>
> > +/**
> > + * unregister_fprobe_sync() - Unregister fprobe synchronously with RCU grace period.
> > + * @fp: A fprobe data structure to be unregistered.
> > + *
> > + * Unregister fprobe (and remove ftrace hooks from the function entries) and
> > + * wait for the RCU grace period to finish. This is useful for preventing
> > + * the fprobe from being used after it is unregistered.
> > + *
> > + * Return 0 if @fp is unregistered successfully, -errno if not.
> > + */
> > +int unregister_fprobe_sync(struct fprobe *fp)
> > +{
> > + int ret;
> > +
> > + guard(mutex)(&fprobe_mutex);
> > + if (!fp || !fprobe_registered(fp))
> > + return -EINVAL;
> > +
> > + ret = unregister_fprobe_nolock(fp);
> > + if (ret)
> > + return ret;
> > +
> > + synchronize_rcu();
>
> Hmm, do we really need to hold the fprobe_mutex when doing the
> synchronize_rcu()? This could cause other updates to have to wait longer
> too.
Good catch! Indeed, there is no need to hold the mutex.
OK, let me update it.
Thanks,
>
> -- Steve
>
>
> > + return 0;
> > +}
> > +EXPORT_SYMBOL_GPL(unregister_fprobe_sync);
>
--
Masami Hiramatsu (Google) <mhiramat@kernel.org>
^ permalink raw reply
* Re: [v2 PATCH] kprobes: skip non-symbol addresses in kprobe_add_ksym_blacklist()
From: Masami Hiramatsu @ 2026-05-07 1:46 UTC (permalink / raw)
To: Jianpeng Chang
Cc: naveen, davem, arnd, mark.rutland, catalin.marinas, linux-kernel,
linux-trace-kernel, linux-arch, stable
In-Reply-To: <20260506012706.2785785-1-jianpeng.chang.cn@windriver.com>
On Wed, 6 May 2026 09:27:06 +0800
Jianpeng Chang <jianpeng.chang.cn@windriver.com> wrote:
> When kprobe_add_area_blacklist() iterates through a section like
> .kprobes.text, the start address may not correspond to a named symbol.
> On ARM64 with CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS=y (introduced by
> commit baaf553d3bc3 ("arm64: Implement
> HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS")), the compiler flag
> -fpatchable-function-entry=4,2 inserts 2 NOPs before each function entry
> point for ftrace call_ops. These pre-function NOPs sit at the section base
> address, before the first named function symbol. The compiler emits a $x
> mapping symbol at offset 0x00 to mark the start of code, but
> find_kallsyms_symbol() ignores mapping symbols.
>
> Without CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS (e.g. defconfig), no
> pre-function NOPs are inserted, the first function starts at offset
> 0x00, and the bug does not trigger.
>
> This only affects modules that have a .kprobes.text section (i.e. those
> using the __kprobes annotation). Modules using NOKPROBE_SYMBOL() instead
> (like kretprobe_example.ko) blacklist exact function addresses via the
> _kprobe_blacklist section and are not affected.
>
> For kprobe_example.ko on ARM64 with -fpatchable-function-entry=4,2,
> the .kprobes.text section layout is:
>
> offset 0x00: $x + 2 NOPs (mapping symbol + ftrace preamble)
> offset 0x08: handler_post (64 bytes)
> offset 0x50: handler_pre (68 bytes)
>
> kprobe_add_area_blacklist() starts iterating from the section base
> address (offset 0x00), which only has the $x mapping symbol.
> kprobe_add_ksym_blacklist() then calls kallsyms_lookup_size_offset()
> for this address, which goes through:
>
> kallsyms_lookup_size_offset()
> -> module_address_lookup()
> -> find_kallsyms_symbol()
>
> find_kallsyms_symbol() scans all module symbols to find the closest
> preceding symbol.
>
> Since no named text symbol exists at offset 0x00,
> find_kallsyms_symbol() picks __UNIQUE_ID_vermagic (a .modinfo symbol
> whose address is in the temporary image) as the "best" match. The
> computed "size" = next_text_symbol - modinfo_symbol spans across
> these two unrelated memory regions, creating a blacklist entry with
> a bogus range of tens of terabytes.
>
> Whether this causes a visible failure depends on address randomization,
> here is what happens on Raspberry Pi 4/5:
>
> - On RPi5, the bogus size was ~35 TB. start + size stayed within
> 64-bit range, so the blacklist entry covered the entire kernel
> text. register_kprobe() in the module's own init function failed
> with -EINVAL.
>
> - On RPi4, the bogus size was ~75 TB. start + size overflowed
> 64 bits and wrapped to a small address near zero. The range
> check (addr >= start && addr < end) then failed because end
> wrapped around, so the bogus entry was accidentally harmless
> and kprobes worked by luck.
>
> The same bug exists on both machines, but randomization determines whether
> the integer overflow masks it or not.
>
> Fix this by adding notrace to the __kprobes macro. Functions in
> .kprobes.text are kprobe infrastructure handlers that should never be
> traced by ftrace. With notrace, the compiler stops inserting the
> pre-function NOPs and the non-symbol gap at the section start disappears entirely.
>
Thanks, this looks good to me!
> Fixes: baaf553d3bc3 ("arm64: Implement HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS")
> Signed-off-by: Jianpeng Chang <jianpeng.chang.cn@windriver.com>
> ---
> v2:
> - use notrace instead of skipping the nops
> v1: https://lore.kernel.org/all/20260427073545.3656835-1-jianpeng.chang.cn@windriver.com/
>
> include/asm-generic/kprobes.h | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/include/asm-generic/kprobes.h b/include/asm-generic/kprobes.h
> index 060eab094e5a..5290a2b2e15a 100644
> --- a/include/asm-generic/kprobes.h
> +++ b/include/asm-generic/kprobes.h
> @@ -14,7 +14,7 @@ static unsigned long __used \
> _kbl_addr_##fname = (unsigned long)fname;
> # define NOKPROBE_SYMBOL(fname) __NOKPROBE_SYMBOL(fname)
> /* Use this to forbid a kprobes attach on very low level functions */
> -# define __kprobes __section(".kprobes.text")
> +# define __kprobes notrace __section(".kprobes.text")
> # define nokprobe_inline __always_inline
> #else
> # define NOKPROBE_SYMBOL(fname)
> --
> 2.54.0
>
--
Masami Hiramatsu (Google) <mhiramat@kernel.org>
^ permalink raw reply
* Re: [PATCH] sparc64: uprobes: add missing break
From: Masami Hiramatsu @ 2026-05-07 1:41 UTC (permalink / raw)
To: Rosen Penev
Cc: linux-kernel, Masami Hiramatsu, Oleg Nesterov, Peter Zijlstra,
David S. Miller, Andreas Larsson, open list:UPROBES,
open list:SPARC + UltraSPARC (sparc/sparc64)
In-Reply-To: <20260506031815.779909-1-rosenp@gmail.com>
On Tue, 5 May 2026 20:18:15 -0700
Rosen Penev <rosenp@gmail.com> wrote:
> Missing fallthrough causes failure with newer compilers:
>
> arch/sparc/kernel/uprobes.c:284:2: error: unannotated fall-through between switch labels [-Werror,-Wimplicit-fallthrough]
> 284 | default:
> | ^
> arch/sparc/kernel/uprobes.c:284:2: note: insert 'break;' to avoid fall-through
> 284 | default:
> | ^
> | break;
>
> Signed-off-by: Rosen Penev <rosenp@gmail.com>
Looks good to me.
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
> ---
> arch/sparc/kernel/uprobes.c | 1 +
> 1 file changed, 1 insertion(+)
>
> diff --git a/arch/sparc/kernel/uprobes.c b/arch/sparc/kernel/uprobes.c
> index 305017bec164..c8cac64e9988 100644
> --- a/arch/sparc/kernel/uprobes.c
> +++ b/arch/sparc/kernel/uprobes.c
> @@ -280,6 +280,7 @@ int arch_uprobe_exception_notify(struct notifier_block *self,
> case DIE_SSTEP:
> if (uprobe_post_sstep_notifier(args->regs))
> ret = NOTIFY_STOP;
> + break;
>
> default:
> break;
> --
> 2.54.0
>
--
Masami Hiramatsu (Google) <mhiramat@kernel.org>
^ permalink raw reply
* Re: [PATCH v1 2/2] spi: qcom-geni: Add trace events for Qualcomm GENI SPI driver
From: Mark Brown @ 2026-05-07 1:07 UTC (permalink / raw)
To: Praveen Talari
Cc: Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers, linux-kernel,
linux-trace-kernel, linux-arm-msm, linux-spi,
MukeshKumarSavaliyamukesh.savaliya, AniketRandiveaniket.randive,
chandana.chiluveru, jyothi.seerapu
In-Reply-To: <20260506-add-tracepoints-for-qcom-geni-spi-v1-2-c957cfe712d1@oss.qualcomm.com>
[-- Attachment #1: Type: text/plain, Size: 393 bytes --]
On Wed, May 06, 2026 at 10:59:43PM +0530, Praveen Talari wrote:
> @@ -717,6 +726,7 @@ static bool geni_spi_handle_tx(struct spi_geni_master *mas)
> max_bytes = mas->tx_rem_bytes;
>
> tx_buf = mas->cur_xfer->tx_buf + mas->cur_xfer->len - mas->tx_rem_bytes;
> +
> while (i < max_bytes) {
> unsigned int j;
> unsigned int bytes_to_write;
Unrelated whitespace change.
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox