* [PATCH v7 06/42] KVM: guest_memfd: Update kvm_gmem_populate() to use gmem attributes
From: Ackerley Tng via B4 Relay @ 2026-05-23 0:17 UTC (permalink / raw)
To: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
ira.weiny, jmattson, jthoughton, michael.roth, oupton,
pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
steven.price, tabba, willy, wyihan, yan.y.zhao, forkloop,
pratyush, suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
Sean Christopherson, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka
Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
linux-mm, linux-coco, Ackerley Tng
In-Reply-To: <20260522-gmem-inplace-conversion-v7-0-2f0fae496530@google.com>
From: Ackerley Tng <ackerleytng@google.com>
Update the guest_memfd populate() flow to pull memory attributes from the
gmem instance instead of the VM when KVM is not configured to track
shared/private status in the VM.
Rename the per-VM API to make it clear that it retrieves per-VM
attributes, i.e. is not suitable for use outside of flows that are
specific to generic per-VM attributes.
Co-developed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Reviewed-by: Fuad Tabba <tabba@google.com>
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
arch/x86/kvm/mmu/mmu.c | 2 +-
include/linux/kvm_host.h | 14 +++++++++++++-
virt/kvm/guest_memfd.c | 24 +++++++++++++++++++++---
virt/kvm/kvm_main.c | 8 +++-----
4 files changed, 38 insertions(+), 10 deletions(-)
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index b53a0c4b4dfca..3f70859232b07 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -8060,7 +8060,7 @@ static bool hugepage_has_attrs(struct kvm *kvm, struct kvm_memory_slot *slot,
const unsigned long end = start + KVM_PAGES_PER_HPAGE(level);
if (level == PG_LEVEL_2M)
- return kvm_range_has_memory_attributes(kvm, start, end, ~0, attrs);
+ return kvm_range_has_vm_memory_attributes(kvm, start, end, ~0, attrs);
for (gfn = start; gfn < end; gfn += KVM_PAGES_PER_HPAGE(level - 1)) {
if (hugepage_test_mixed(slot, gfn, level - 1) ||
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 7de85474c75bd..3039b291e4b09 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2549,12 +2549,24 @@ static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
#endif
#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
-bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
+extern bool vm_memory_attributes;
+bool kvm_range_has_vm_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
unsigned long mask, unsigned long attrs);
bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
struct kvm_gfn_range *range);
bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
struct kvm_gfn_range *range);
+#else
+#define vm_memory_attributes false
+static inline bool kvm_range_has_vm_memory_attributes(struct kvm *kvm,
+ gfn_t start, gfn_t end,
+ unsigned long mask,
+ unsigned long attrs)
+{
+ WARN_ONCE(1, "Unexpected call to kvm_range_has_vm_memory_attributes()");
+
+ return false;
+}
#endif /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
unsigned long kvm_gmem_get_memory_attributes(struct kvm *kvm, gfn_t gfn);
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index c55879e033d96..78e5435967341 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -930,12 +930,31 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn);
#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_POPULATE
+static bool kvm_gmem_range_is_private(struct gmem_inode *gi, pgoff_t index,
+ size_t nr_pages, struct kvm *kvm, gfn_t gfn)
+{
+ pgoff_t end = index + nr_pages - 1;
+ void *entry;
+
+ if (vm_memory_attributes)
+ return kvm_range_has_vm_memory_attributes(kvm, gfn, gfn + nr_pages,
+ KVM_MEMORY_ATTRIBUTE_PRIVATE,
+ KVM_MEMORY_ATTRIBUTE_PRIVATE);
+
+ mt_for_each(&gi->attributes, entry, index, end) {
+ if (xa_to_value(entry) != KVM_MEMORY_ATTRIBUTE_PRIVATE)
+ return false;
+ }
+
+ return true;
+}
static long __kvm_gmem_populate(struct kvm *kvm, struct kvm_memory_slot *slot,
struct file *file, gfn_t gfn, struct page *src_page,
kvm_gmem_populate_cb post_populate, void *opaque)
{
pgoff_t index = kvm_gmem_get_index(slot, gfn);
+ struct gmem_inode *gi;
struct folio *folio;
kvm_pfn_t pfn;
int ret;
@@ -950,9 +969,8 @@ static long __kvm_gmem_populate(struct kvm *kvm, struct kvm_memory_slot *slot,
folio_unlock(folio);
- if (!kvm_range_has_memory_attributes(kvm, gfn, gfn + 1,
- KVM_MEMORY_ATTRIBUTE_PRIVATE,
- KVM_MEMORY_ATTRIBUTE_PRIVATE)) {
+ gi = GMEM_I(file_inode(file));
+ if (!kvm_gmem_range_is_private(gi, index, 1, kvm, gfn)) {
ret = -EINVAL;
goto out_put_folio;
}
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 4139e903f756a..0a4024948711a 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -103,9 +103,7 @@ module_param(allow_unsafe_mappings, bool, 0444);
#ifdef CONFIG_KVM_MEMORY_ATTRIBUTES
#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
-static bool vm_memory_attributes = true;
-#else
-#define vm_memory_attributes false
+bool vm_memory_attributes = true;
#endif
DEFINE_STATIC_CALL_RET0(__kvm_get_memory_attributes, kvm_get_memory_attributes_t);
EXPORT_SYMBOL_FOR_KVM_INTERNAL(STATIC_CALL_KEY(__kvm_get_memory_attributes));
@@ -2450,7 +2448,7 @@ static unsigned long kvm_get_vm_memory_attributes(struct kvm *kvm, gfn_t gfn)
* Returns true if _all_ gfns in the range [@start, @end) have attributes
* such that the bits in @mask match @attrs.
*/
-bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
+bool kvm_range_has_vm_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
unsigned long mask, unsigned long attrs)
{
XA_STATE(xas, &kvm->mem_attr_array, start);
@@ -2584,7 +2582,7 @@ static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
mutex_lock(&kvm->slots_lock);
/* Nothing to do if the entire range has the desired attributes. */
- if (kvm_range_has_memory_attributes(kvm, start, end, ~0, attributes))
+ if (kvm_range_has_vm_memory_attributes(kvm, start, end, ~0, attributes))
goto out_unlock;
/*
--
2.54.0.794.g4f17f83d09-goog
^ permalink raw reply related
* [PATCH v7 05/42] KVM: guest_memfd: Wire up kvm_get_memory_attributes() to per-gmem attributes
From: Ackerley Tng via B4 Relay @ 2026-05-23 0:17 UTC (permalink / raw)
To: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
ira.weiny, jmattson, jthoughton, michael.roth, oupton,
pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
steven.price, tabba, willy, wyihan, yan.y.zhao, forkloop,
pratyush, suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
Sean Christopherson, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka
Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
linux-mm, linux-coco, Ackerley Tng
In-Reply-To: <20260522-gmem-inplace-conversion-v7-0-2f0fae496530@google.com>
From: Sean Christopherson <seanjc@google.com>
Implement kvm_gmem_get_memory_attributes() for guest_memfd to allow the KVM
core and architecture code to query per-GFN memory attributes.
kvm_gmem_get_memory_attributes() finds the memory slot for a given GFN and
queries the guest_memfd file's attributes to determine if the page is marked
as private.
If vm_memory_attributes is not enabled, there is no shared/private tracking
at the VM level. Install the guest_memfd implementation as long as
guest_memfd is enabled to give guest_memfd a chance to respond on
attributes.
guest_memfd should look up attributes regardless of whether this memslot is
gmem-only since attributes are now tracked by gmem regardless of whether
mmap() is enabled.
Signed-off-by: Sean Christopherson <seanjc@google.com>
Co-developed-by: Ackerley Tng <ackerleytng@google.com>
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
include/linux/kvm_host.h | 2 ++
virt/kvm/guest_memfd.c | 31 +++++++++++++++++++++++++++++++
virt/kvm/kvm_main.c | 3 +++
3 files changed, 36 insertions(+)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 29694b348df40..7de85474c75bd 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2557,6 +2557,8 @@ bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
struct kvm_gfn_range *range);
#endif /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
+unsigned long kvm_gmem_get_memory_attributes(struct kvm *kvm, gfn_t gfn);
+
#ifdef CONFIG_KVM_GUEST_MEMFD
int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
gfn_t gfn, kvm_pfn_t *pfn, struct page **page,
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 117b726f670e8..c55879e033d96 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -509,6 +509,37 @@ static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
return 0;
}
+unsigned long kvm_gmem_get_memory_attributes(struct kvm *kvm, gfn_t gfn)
+{
+ struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
+ struct inode *inode;
+
+ /*
+ * If this gfn has no associated memslot, there's no chance of the gfn
+ * being backed by private memory, since guest_memfd must be used for
+ * private memory, and guest_memfd must be associated with some memslot.
+ */
+ if (!slot)
+ return 0;
+
+ CLASS(gmem_get_file, file)(slot);
+ if (!file)
+ return 0;
+
+ inode = file_inode(file);
+
+ /*
+ * Rely on the maple tree's internal RCU lock to ensure a
+ * stable result. This result can become stale as soon as the
+ * lock is dropped, so the caller _must_ still protect
+ * consumption of private vs. shared by checking
+ * mmu_invalidate_retry_gfn() under mmu_lock to serialize
+ * against ongoing attribute updates.
+ */
+ return kvm_gmem_get_attributes(inode, kvm_gmem_get_index(slot, gfn));
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_memory_attributes);
+
static struct file_operations kvm_gmem_fops = {
.mmap = kvm_gmem_mmap,
.open = generic_file_open,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index ee26f1d9b5fda..4139e903f756a 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2653,6 +2653,9 @@ static void kvm_init_memory_attributes(void)
if (vm_memory_attributes)
static_call_update(__kvm_get_memory_attributes,
kvm_get_vm_memory_attributes);
+ else if (IS_ENABLED(CONFIG_KVM_GUEST_MEMFD))
+ static_call_update(__kvm_get_memory_attributes,
+ kvm_gmem_get_memory_attributes);
else
static_call_update(__kvm_get_memory_attributes,
(void *)__static_call_return0);
--
2.54.0.794.g4f17f83d09-goog
^ permalink raw reply related
* [PATCH v7 04/42] KVM: Stub in ability to disable per-VM memory attribute tracking
From: Ackerley Tng via B4 Relay @ 2026-05-23 0:17 UTC (permalink / raw)
To: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
ira.weiny, jmattson, jthoughton, michael.roth, oupton,
pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
steven.price, tabba, willy, wyihan, yan.y.zhao, forkloop,
pratyush, suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
Sean Christopherson, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka
Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
linux-mm, linux-coco, Ackerley Tng
In-Reply-To: <20260522-gmem-inplace-conversion-v7-0-2f0fae496530@google.com>
From: Sean Christopherson <seanjc@google.com>
Introduce the basic infrastructure to allow per-VM memory attribute
tracking to be disabled. This will be built upon in a later patch, where a
module param can disable per-VM memory attribute tracking.
Split the Kconfig option into a base KVM_MEMORY_ATTRIBUTES and the
existing KVM_VM_MEMORY_ATTRIBUTES. The base option provides the core
plumbing, while the latter enables the full per-VM tracking via an xarray
and the associated ioctls.
kvm_get_memory_attributes() now performs a static call that either looks up
kvm->mem_attr_array when CONFIG_KVM_VM_MEMORY_ATTRIBUTES is enabled, or
just returns 0 otherwise. The static call can be patched depending on
whether per-VM tracking is enabled by the CONFIG.
No functional change intended.
Signed-off-by: Sean Christopherson <seanjc@google.com>
Reviewed-by: Fuad Tabba <tabba@google.com>
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
arch/x86/include/asm/kvm_host.h | 2 +-
include/linux/kvm_host.h | 23 ++++++++++++---------
virt/kvm/Kconfig | 4 ++++
virt/kvm/kvm_main.c | 44 ++++++++++++++++++++++++++++++++++++++++-
4 files changed, 62 insertions(+), 11 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 8bb7c25240e33..01125be81a131 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -2393,7 +2393,7 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
int tdp_max_root_level, int tdp_huge_page_level);
-#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_MEMORY_ATTRIBUTES
#define kvm_arch_has_private_mem(kvm) ((kvm)->arch.has_private_mem)
#endif
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 68142bc962953..29694b348df40 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2528,19 +2528,15 @@ static inline bool kvm_memslot_is_gmem_only(const struct kvm_memory_slot *slot)
return slot->flags & KVM_MEMSLOT_GMEM_ONLY;
}
-#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_MEMORY_ATTRIBUTES
+typedef unsigned long (kvm_get_memory_attributes_t)(struct kvm *kvm, gfn_t gfn);
+DECLARE_STATIC_CALL(__kvm_get_memory_attributes, kvm_get_memory_attributes_t);
+
static inline unsigned long kvm_get_memory_attributes(struct kvm *kvm, gfn_t gfn)
{
- return xa_to_value(xa_load(&kvm->mem_attr_array, gfn));
+ return static_call(__kvm_get_memory_attributes)(kvm, gfn);
}
-bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
- unsigned long mask, unsigned long attrs);
-bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
- struct kvm_gfn_range *range);
-bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
- struct kvm_gfn_range *range);
-
static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
{
return kvm_get_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE;
@@ -2550,6 +2546,15 @@ static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
{
return false;
}
+#endif
+
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
+bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
+ unsigned long mask, unsigned long attrs);
+bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
+ struct kvm_gfn_range *range);
+bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
+ struct kvm_gfn_range *range);
#endif /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
#ifdef CONFIG_KVM_GUEST_MEMFD
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 5119cb37145fc..3fea89c45cfb4 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -100,7 +100,11 @@ config KVM_ELIDE_TLB_FLUSH_IF_YOUNG
config KVM_MMU_LOCKLESS_AGING
bool
+config KVM_MEMORY_ATTRIBUTES
+ bool
+
config KVM_VM_MEMORY_ATTRIBUTES
+ select KVM_MEMORY_ATTRIBUTES
bool
config KVM_GUEST_MEMFD
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index abb9cfa3eb04d..ee26f1d9b5fda 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -101,6 +101,17 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(halt_poll_ns_shrink);
static bool __ro_after_init allow_unsafe_mappings;
module_param(allow_unsafe_mappings, bool, 0444);
+#ifdef CONFIG_KVM_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
+static bool vm_memory_attributes = true;
+#else
+#define vm_memory_attributes false
+#endif
+DEFINE_STATIC_CALL_RET0(__kvm_get_memory_attributes, kvm_get_memory_attributes_t);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(STATIC_CALL_KEY(__kvm_get_memory_attributes));
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(STATIC_CALL_TRAMP(__kvm_get_memory_attributes));
+#endif
+
/*
* Ordering of locks:
*
@@ -2418,7 +2429,7 @@ static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
}
#endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
-#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_MEMORY_ATTRIBUTES
static u64 kvm_supported_mem_attributes(struct kvm *kvm)
{
#ifdef kvm_arch_has_private_mem
@@ -2429,6 +2440,12 @@ static u64 kvm_supported_mem_attributes(struct kvm *kvm)
return 0;
}
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
+static unsigned long kvm_get_vm_memory_attributes(struct kvm *kvm, gfn_t gfn)
+{
+ return xa_to_value(xa_load(&kvm->mem_attr_array, gfn));
+}
+
/*
* Returns true if _all_ gfns in the range [@start, @end) have attributes
* such that the bits in @mask match @attrs.
@@ -2625,7 +2642,24 @@ static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
return kvm_vm_set_mem_attributes(kvm, start, end, attrs->attributes);
}
+#else /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
+static unsigned long kvm_get_vm_memory_attributes(struct kvm *kvm, gfn_t gfn)
+{
+ BUILD_BUG_ON(1);
+}
#endif /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
+static void kvm_init_memory_attributes(void)
+{
+ if (vm_memory_attributes)
+ static_call_update(__kvm_get_memory_attributes,
+ kvm_get_vm_memory_attributes);
+ else
+ static_call_update(__kvm_get_memory_attributes,
+ (void *)__static_call_return0);
+}
+#else /* CONFIG_KVM_MEMORY_ATTRIBUTES */
+static void kvm_init_memory_attributes(void) { }
+#endif /* CONFIG_KVM_MEMORY_ATTRIBUTES */
struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
{
@@ -4925,6 +4959,9 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
return 1;
#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
case KVM_CAP_MEMORY_ATTRIBUTES:
+ if (!vm_memory_attributes)
+ return 0;
+
return kvm_supported_mem_attributes(kvm);
#endif
#ifdef CONFIG_KVM_GUEST_MEMFD
@@ -5331,6 +5368,10 @@ static long kvm_vm_ioctl(struct file *filp,
case KVM_SET_MEMORY_ATTRIBUTES: {
struct kvm_memory_attributes attrs;
+ r = -ENOTTY;
+ if (!vm_memory_attributes)
+ goto out;
+
r = -EFAULT;
if (copy_from_user(&attrs, argp, sizeof(attrs)))
goto out;
@@ -6527,6 +6568,7 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
kvm_preempt_ops.sched_in = kvm_sched_in;
kvm_preempt_ops.sched_out = kvm_sched_out;
+ kvm_init_memory_attributes();
kvm_init_debug();
r = kvm_vfio_ops_init();
--
2.54.0.794.g4f17f83d09-goog
^ permalink raw reply related
* [PATCH v7 03/42] KVM: Enumerate support for PRIVATE memory iff kvm_arch_has_private_mem is defined
From: Ackerley Tng via B4 Relay @ 2026-05-23 0:17 UTC (permalink / raw)
To: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
ira.weiny, jmattson, jthoughton, michael.roth, oupton,
pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
steven.price, tabba, willy, wyihan, yan.y.zhao, forkloop,
pratyush, suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
Sean Christopherson, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka
Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
linux-mm, linux-coco, Ackerley Tng
In-Reply-To: <20260522-gmem-inplace-conversion-v7-0-2f0fae496530@google.com>
From: Sean Christopherson <seanjc@google.com>
Explicitly guard reporting support for KVM_MEMORY_ATTRIBUTE_PRIVATE based
on kvm_arch_has_private_mem being #defined in anticipation of decoupling
kvm_supported_mem_attributes() from CONFIG_KVM_VM_MEMORY_ATTRIBUTES.
guest_memfd support for memory attributes will be unconditional to avoid
yet more macros (all architectures that support guest_memfd are expected to
use per-gmem attributes at some point), at which point enumerating support
for KVM_MEMORY_ATTRIBUTE_PRIVATE based solely on memory attributes being
supported _somewhere_ would result in KVM over-reporting support on arm64.
Signed-off-by: Sean Christopherson <seanjc@google.com>
Reviewed-by: Fuad Tabba <tabba@google.com>
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
include/linux/kvm_host.h | 2 +-
virt/kvm/kvm_main.c | 2 ++
2 files changed, 3 insertions(+), 1 deletion(-)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 091f201251159..68142bc962953 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -722,7 +722,7 @@ static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu)
}
#endif
-#ifndef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
+#ifndef kvm_arch_has_private_mem
static inline bool kvm_arch_has_private_mem(struct kvm *kvm)
{
return false;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 306153abbafa5..abb9cfa3eb04d 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2421,8 +2421,10 @@ static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
static u64 kvm_supported_mem_attributes(struct kvm *kvm)
{
+#ifdef kvm_arch_has_private_mem
if (!kvm || kvm_arch_has_private_mem(kvm))
return KVM_MEMORY_ATTRIBUTE_PRIVATE;
+#endif
return 0;
}
--
2.54.0.794.g4f17f83d09-goog
^ permalink raw reply related
* [PATCH v7 00/42] guest_memfd: In-place conversion support
From: Ackerley Tng via B4 Relay @ 2026-05-23 0:17 UTC (permalink / raw)
To: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
ira.weiny, jmattson, jthoughton, michael.roth, oupton,
pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
steven.price, tabba, willy, wyihan, yan.y.zhao, forkloop,
pratyush, suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
Sean Christopherson, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka
Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
linux-mm, linux-coco, Ackerley Tng
This is v7 of guest_memfd in-place conversion support.
Up till now, guest_memfd supports the entire inode worth of memory being
used as all-shared, or all-private. CoCo VMs may request guest memory to be
converted between private and shared states, and the only way to support
that currently would be to have the userspace VMM provide two sources of
backing memory from completely different areas of physical memory.
pKVM has a use case for in-place sharing: the guest and host may be
cooperating on given data, and pKVM doesn't protect data through
encryption, so copying that given data between different areas of physical
memory as part of conversions would be unnecessary work.
This series also serves as a foundation for guest_memfd huge page
support. Now, guest_memfd only supports PAGE_SIZE pages, so if two sources
of backing memory are used, the userspace VMM could maintain a steady total
memory utilized by punching out the pages that are not used. When huge
pages are available in guest_memfd, even if the backing memory source
supports hole punching within a huge page, punching out pages to maintain
the total memory utilized by a VM would be introducing lots of
fragmentation.
In-place conversion avoids fragmentation by allowing the same physical
memory to be used for both shared and private memory, with guest_memfd
tracking the shared/private status of all the pages at a per-page
granularity.
The central principle, which guest_memfd continues to uphold, is that any
guest-private page will not be mappable to host userspace. All pages will
be mmap()-able in host userspace, but accesses to guest-private pages (as
tracked by guest_memfd) will result in a SIGBUS.
This series introduces a guest_memfd ioctl (not kvm, vm or vcpu, but
guest_memfd ioctl) that allows userspace to set memory
attributes (shared/private) directly through the guest_memfd. This is the
appropriate interface because shared/private-ness is a property of memory
and hence the request should be sent directly to the memory provider -
guest_memfd.
Tested with both CONFIG_KVM_VM_MEMORY_ATTRIBUTES enabled and disabled:
+ tools/testing/selftests/kvm/guest_memfd_test.c
+ tools/testing/selftests/kvm/pre_fault_memory_test.c
+ tools/testing/selftests/kvm/x86/guest_memfd_conversions_test.c
+ tools/testing/selftests/kvm/x86/private_mem_conversions_test.c
+ tools/testing/selftests/kvm/x86/private_mem_conversions_test.sh
+ tools/testing/selftests/kvm/x86/private_mem_kvm_exits_test.c
Updates for this revision:
+ Picked up Reviewed-bys from Fuad
+ Addressed Fuad, Sean and Sashiko's comments
Regarding the issue where guest_memfd_conversions_test, which uses the
kselftest framework, doesn't perform teardown on assertion failure: I think
we can have that fixed separately from this series? Please see proposal [9].
TODOs
+ Test with TDX selftests. We're in the process of rebasing TDX selftests
on this series and will post updates when that's tested.
This series is based on kvm/next, and here's the tree for your convenience:
https://github.com/googleprodkernel/linux-cc/commits/guest_memfd-inplace-conversion-v7
Older series:
+ RFCv6 is at [10]
+ RFCv5 is at [8]
+ RFCv4 is at [7]
+ RFCv3 is at [6]
+ RFCv2 is at [5]
+ RFCv1 is at [4]
+ Previous versions of this feature, part of other series, are available at
[1][2][3].
[1] https://lore.kernel.org/all/bd163de3118b626d1005aa88e71ef2fb72f0be0f.1726009989.git.ackerleytng@google.com/
[2]
[3] https://lore.kernel.org/all/b784326e9ccae6a08388f1bf39db70a2204bdc51.1747264138.git.ackerleytng@google.com/
[4] https://lore.kernel.org/all/cover.1760731772.git.ackerleytng@google.com/T/
[5] https://lore.kernel.org/all/cover.1770071243.git.ackerleytng@google.com/T/
[6] https://lore.kernel.org/r/20260313-gmem-inplace-conversion-v3-0-5fc12a70ec89@google.com/T/
[7] https://lore.kernel.org/all/20260326-gmem-inplace-conversion-v4-0-e202fe950ffd@google.com/T/
[8] https://lore.kernel.org/r/20260428-gmem-inplace-conversion-v5-0-d8608ccfca22@google.com
[9] https://lore.kernel.org/all/20260414-selftest-global-metadata-v1-0-fd223922bc57@google.com/T/
[10] https://lore.kernel.org/r/20260507-gmem-inplace-conversion-v6-0-91ab5a8b19a4@google.com
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
Ackerley Tng (24):
KVM: guest_memfd: Update kvm_gmem_populate() to use gmem attributes
KVM: guest_memfd: Only prepare folios for private pages
KVM: Move kvm_supported_mem_attributes() to kvm_host.h
KVM: guest_memfd: Add base support for KVM_SET_MEMORY_ATTRIBUTES2
KVM: guest_memfd: Ensure pages are not in use before conversion
KVM: guest_memfd: Call arch invalidate hooks on conversion
KVM: guest_memfd: Return early if range already has requested attributes
KVM: guest_memfd: Advertise KVM_SET_MEMORY_ATTRIBUTES2 ioctl
KVM: guest_memfd: Handle lru_add fbatch refcounts during conversion safety check
KVM: guest_memfd: Use actual size for invalidation in kvm_gmem_release()
KVM: guest_memfd: Determine invalidation filter from memory attributes
KVM: TDX: Make source page optional for KVM_TDX_INIT_MEM_REGION
KVM: selftests: Test basic single-page conversion flow
KVM: selftests: Test conversion flow when INIT_SHARED
KVM: selftests: Test conversion precision in guest_memfd
KVM: selftests: Test conversion before allocation
KVM: selftests: Convert with allocated folios in different layouts
KVM: selftests: Test that truncation does not change shared/private status
KVM: selftests: Test conversion with elevated page refcount
KVM: selftests: Reset shared memory after hole-punching
KVM: selftests: Provide function to look up guest_memfd details from gpa
KVM: selftests: Make TEST_EXPECT_SIGBUS thread-safe
KVM: selftests: Update private_mem_conversions_test to mmap() guest_memfd
KVM: selftests: Add script to exercise private_mem_conversions_test
Michael Roth (1):
KVM: SEV: Make 'uaddr' parameter optional for KVM_SEV_SNP_LAUNCH_UPDATE
Sean Christopherson (17):
KVM: guest_memfd: Introduce per-gmem attributes, use to guard user mappings
KVM: Rename KVM_GENERIC_MEMORY_ATTRIBUTES to KVM_VM_MEMORY_ATTRIBUTES
KVM: Enumerate support for PRIVATE memory iff kvm_arch_has_private_mem is defined
KVM: Stub in ability to disable per-VM memory attribute tracking
KVM: guest_memfd: Wire up kvm_get_memory_attributes() to per-gmem attributes
KVM: Move KVM_VM_MEMORY_ATTRIBUTES config definition to x86
KVM: Let userspace disable per-VM mem attributes, enable per-gmem attributes
KVM: guest_memfd: Enable INIT_SHARED on guest_memfd for x86 Coco VMs
KVM: selftests: Create gmem fd before "regular" fd when adding memslot
KVM: selftests: Rename guest_memfd{,_offset} to gmem_{fd,offset}
KVM: selftests: Add support for mmap() on guest_memfd in core library
KVM: selftests: Add selftests global for guest memory attributes capability
KVM: selftests: Add helpers for calling ioctls on guest_memfd
KVM: selftests: Test that shared/private status is consistent across processes
KVM: selftests: Provide common function to set memory attributes
KVM: selftests: Check fd/flags provided to mmap() when setting up memslot
KVM: selftests: Update private memory exits test to work with per-gmem attributes
Documentation/virt/kvm/api.rst | 78 +++-
.../virt/kvm/x86/amd-memory-encryption.rst | 15 +-
Documentation/virt/kvm/x86/intel-tdx.rst | 4 +
arch/x86/include/asm/kvm_host.h | 2 +-
arch/x86/kvm/Kconfig | 15 +-
arch/x86/kvm/mmu/mmu.c | 4 +-
arch/x86/kvm/svm/sev.c | 18 +-
arch/x86/kvm/vmx/tdx.c | 11 +-
arch/x86/kvm/x86.c | 13 +-
include/linux/kvm_host.h | 53 ++-
include/trace/events/kvm.h | 4 +-
include/uapi/linux/kvm.h | 16 +
mm/swap.c | 2 +
tools/testing/selftests/kvm/Makefile.kvm | 5 +
tools/testing/selftests/kvm/include/kvm_util.h | 136 +++++-
tools/testing/selftests/kvm/include/test_util.h | 34 +-
.../selftests/kvm/kvm_has_gmem_attributes.c | 17 +
tools/testing/selftests/kvm/lib/kvm_util.c | 141 +++---
tools/testing/selftests/kvm/lib/test_util.c | 7 -
.../kvm/x86/guest_memfd_conversions_test.c | 488 +++++++++++++++++++++
.../kvm/x86/private_mem_conversions_test.c | 53 ++-
.../kvm/x86/private_mem_conversions_test.sh | 128 ++++++
.../selftests/kvm/x86/private_mem_kvm_exits_test.c | 36 +-
virt/kvm/Kconfig | 3 +-
virt/kvm/guest_memfd.c | 460 +++++++++++++++++--
virt/kvm/kvm_main.c | 82 +++-
26 files changed, 1633 insertions(+), 192 deletions(-)
---
base-commit: b7fbe9a1bf9ee6c967ef77d366ca58c35fcf1887
change-id: 20260225-gmem-inplace-conversion-bd0dbd39753a
prerequisite-change-id: 20260522-fix-sev-gmem-post-populate-a36bef7f0698:v2
prerequisite-patch-id: 0d1feef8af7aa3471735869080aefa58b254ed0d
prerequisite-patch-id: f64ff55d6fe8d399e720a570fd83cc47bf12ac15
prerequisite-patch-id: 8c52920dd7f65859cbe804c787a9293b33266a3a
prerequisite-patch-id: 95018daf73833296a045c91cfb55cd9f53886dec
prerequisite-patch-id: bcfd440d79bb9f59f41e3244c4392da4c95cd932
Best regards,
--
Ackerley Tng <ackerleytng@google.com>
^ permalink raw reply
* [PATCH v7 01/42] KVM: guest_memfd: Introduce per-gmem attributes, use to guard user mappings
From: Ackerley Tng via B4 Relay @ 2026-05-23 0:17 UTC (permalink / raw)
To: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
ira.weiny, jmattson, jthoughton, michael.roth, oupton,
pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
steven.price, tabba, willy, wyihan, yan.y.zhao, forkloop,
pratyush, suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
Sean Christopherson, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka
Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
linux-mm, linux-coco, Ackerley Tng
In-Reply-To: <20260522-gmem-inplace-conversion-v7-0-2f0fae496530@google.com>
From: Sean Christopherson <seanjc@google.com>
Start plumbing in guest_memfd support for in-place private<=>shared
conversions by tracking attributes via a maple tree. KVM currently tracks
private vs. shared attributes on a per-VM basis, which made sense when a
guest_memfd _only_ supported private memory, but tracking per-VM simply
can't work for in-place conversions as the shareability of a given page
needs to be per-gmem_inode, not per-VM.
Use the filemap invalidation lock to protect the maple tree, as taking the
lock for read when faulting in memory (for userspace or the guest) isn't
expected to result in meaningful contention, and using a separate lock
would add significant complexity (avoiding deadlock is quite difficult).
Signed-off-by: Sean Christopherson <seanjc@google.com>
Co-developed-by: Ackerley Tng <ackerleytng@google.com>
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
Co-developed-by: Vishal Annapurve <vannapurve@google.com>
Signed-off-by: Vishal Annapurve <vannapurve@google.com>
Co-developed-by: Fuad Tabba <tabba@google.com>
Signed-off-by: Fuad Tabba <tabba@google.com>
---
virt/kvm/guest_memfd.c | 133 +++++++++++++++++++++++++++++++++++++++++++------
1 file changed, 117 insertions(+), 16 deletions(-)
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 5b4911ffa208a..117b726f670e8 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -4,6 +4,7 @@
#include <linux/falloc.h>
#include <linux/fs.h>
#include <linux/kvm_host.h>
+#include <linux/maple_tree.h>
#include <linux/mempolicy.h>
#include <linux/pseudo_fs.h>
#include <linux/pagemap.h>
@@ -33,6 +34,13 @@ struct gmem_inode {
struct list_head gmem_file_list;
u64 flags;
+ /*
+ * Every index in this inode, whether memory is populated or
+ * not, is tracked in attributes. The entire range of indices,
+ * corresponding to the size of this inode, is represented in
+ * this maple tree.
+ */
+ struct maple_tree attributes;
};
static __always_inline struct gmem_inode *GMEM_I(struct inode *inode)
@@ -60,6 +68,24 @@ static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn)
return gfn - slot->base_gfn + slot->gmem.pgoff;
}
+static u64 kvm_gmem_get_attributes(struct inode *inode, pgoff_t index)
+{
+ struct maple_tree *mt = &GMEM_I(inode)->attributes;
+ void *entry = mtree_load(mt, index);
+
+ return WARN_ON_ONCE(!entry) ? 0 : xa_to_value(entry);
+}
+
+static bool kvm_gmem_is_private_mem(struct inode *inode, pgoff_t index)
+{
+ return kvm_gmem_get_attributes(inode, index) & KVM_MEMORY_ATTRIBUTE_PRIVATE;
+}
+
+static bool kvm_gmem_is_shared_mem(struct inode *inode, pgoff_t index)
+{
+ return !kvm_gmem_is_private_mem(inode, index);
+}
+
static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
pgoff_t index, struct folio *folio)
{
@@ -397,10 +423,13 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode))
return VM_FAULT_SIGBUS;
- if (!(GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED))
- return VM_FAULT_SIGBUS;
+ filemap_invalidate_lock_shared(inode->i_mapping);
+ if (kvm_gmem_is_shared_mem(inode, vmf->pgoff))
+ folio = kvm_gmem_get_folio(inode, vmf->pgoff);
+ else
+ folio = ERR_PTR(-EACCES);
+ filemap_invalidate_unlock_shared(inode->i_mapping);
- folio = kvm_gmem_get_folio(inode, vmf->pgoff);
if (IS_ERR(folio)) {
if (PTR_ERR(folio) == -EAGAIN)
return VM_FAULT_RETRY;
@@ -556,6 +585,51 @@ bool __weak kvm_arch_supports_gmem_init_shared(struct kvm *kvm)
return true;
}
+static int kvm_gmem_init_inode(struct inode *inode, loff_t size, u64 flags)
+{
+ struct gmem_inode *gi = GMEM_I(inode);
+ MA_STATE(mas, &gi->attributes, 0, (size >> PAGE_SHIFT) - 1);
+ u64 attrs;
+ int r;
+
+ inode->i_op = &kvm_gmem_iops;
+ inode->i_mapping->a_ops = &kvm_gmem_aops;
+ inode->i_mode |= S_IFREG;
+ inode->i_size = size;
+ mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
+
+ /*
+ * guest_memfd memory is neither migratable nor swappable: set
+ * inaccessible to gate off both.
+ */
+ mapping_set_inaccessible(inode->i_mapping);
+ WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
+
+ gi->flags = flags;
+
+ mt_set_external_lock(&gi->attributes,
+ &inode->i_mapping->invalidate_lock);
+
+ /*
+ * Store default attributes for the entire gmem instance. Ensuring every
+ * index is represented in the maple tree at all times simplifies the
+ * conversion and merging logic.
+ */
+ attrs = gi->flags & GUEST_MEMFD_FLAG_INIT_SHARED ? 0 : KVM_MEMORY_ATTRIBUTE_PRIVATE;
+
+ /*
+ * Acquire the invalidation lock purely to make lockdep happy. The
+ * maple tree library expects all stores to be protected via the lock,
+ * and the library can't know when the tree is reachable only by the
+ * caller, as is the case here.
+ */
+ filemap_invalidate_lock(inode->i_mapping);
+ r = mas_store_gfp(&mas, xa_mk_value(attrs), GFP_KERNEL);
+ filemap_invalidate_unlock(inode->i_mapping);
+
+ return r;
+}
+
static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
{
static const char *name = "[kvm-gmem]";
@@ -586,16 +660,9 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
goto err_fops;
}
- inode->i_op = &kvm_gmem_iops;
- inode->i_mapping->a_ops = &kvm_gmem_aops;
- inode->i_mode |= S_IFREG;
- inode->i_size = size;
- mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
- mapping_set_inaccessible(inode->i_mapping);
- /* Unmovable mappings are supposed to be marked unevictable as well. */
- WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
-
- GMEM_I(inode)->flags = flags;
+ err = kvm_gmem_init_inode(inode, size, flags);
+ if (err)
+ goto err_inode;
file = alloc_file_pseudo(inode, kvm_gmem_mnt, name, O_RDWR, &kvm_gmem_fops);
if (IS_ERR(file)) {
@@ -803,9 +870,13 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
if (!file)
return -EFAULT;
+ filemap_invalidate_lock_shared(file_inode(file)->i_mapping);
+
folio = __kvm_gmem_get_pfn(file, slot, index, pfn, max_order);
- if (IS_ERR(folio))
- return PTR_ERR(folio);
+ if (IS_ERR(folio)) {
+ r = PTR_ERR(folio);
+ goto out;
+ }
if (!folio_test_uptodate(folio)) {
clear_highpage(folio_page(folio, 0));
@@ -821,6 +892,8 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
else
folio_put(folio);
+out:
+ filemap_invalidate_unlock_shared(file_inode(file)->i_mapping);
return r;
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn);
@@ -952,6 +1025,15 @@ static struct inode *kvm_gmem_alloc_inode(struct super_block *sb)
mpol_shared_policy_init(&gi->policy, NULL);
+ /*
+ * Memory attributes are protected by the filemap invalidation lock, but
+ * the lock structure isn't available at this time. Immediately mark
+ * maple tree as using external locking so that accessing the tree
+ * before it's fully initialized results in NULL pointer dereferences
+ * and not more subtle bugs.
+ */
+ mt_init_flags(&gi->attributes, MT_FLAGS_LOCK_EXTERN | MT_FLAGS_USE_RCU);
+
gi->flags = 0;
INIT_LIST_HEAD(&gi->gmem_file_list);
return &gi->vfs_inode;
@@ -959,7 +1041,26 @@ static struct inode *kvm_gmem_alloc_inode(struct super_block *sb)
static void kvm_gmem_destroy_inode(struct inode *inode)
{
- mpol_free_shared_policy(&GMEM_I(inode)->policy);
+ struct gmem_inode *gi = GMEM_I(inode);
+
+ mpol_free_shared_policy(&gi->policy);
+
+ /*
+ * Note! Checking for an empty tree is functionally necessary
+ * to avoid explosions if the tree hasn't been fully
+ * initialized, i.e. if the inode is being destroyed before
+ * guest_memfd can set the external lock, lockdep would find
+ * that the tree's internal ma_lock was not held.
+ */
+ if (!mtree_empty(&gi->attributes)) {
+ /*
+ * Acquire the invalidation lock purely to make lockdep happy,
+ * the inode is unreachable at this point.
+ */
+ filemap_invalidate_lock(inode->i_mapping);
+ __mt_destroy(&gi->attributes);
+ filemap_invalidate_unlock(inode->i_mapping);
+ }
}
static void kvm_gmem_free_inode(struct inode *inode)
--
2.54.0.794.g4f17f83d09-goog
^ permalink raw reply related
* [PATCH v7 02/42] KVM: Rename KVM_GENERIC_MEMORY_ATTRIBUTES to KVM_VM_MEMORY_ATTRIBUTES
From: Ackerley Tng via B4 Relay @ 2026-05-23 0:17 UTC (permalink / raw)
To: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
ira.weiny, jmattson, jthoughton, michael.roth, oupton,
pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
steven.price, tabba, willy, wyihan, yan.y.zhao, forkloop,
pratyush, suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
Sean Christopherson, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka
Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
linux-mm, linux-coco, Ackerley Tng
In-Reply-To: <20260522-gmem-inplace-conversion-v7-0-2f0fae496530@google.com>
From: Sean Christopherson <seanjc@google.com>
Rename the per-VM memory attributes Kconfig to make it explicitly about
per-VM attributes in anticipation of adding memory attributes support to
guest_memfd, at which point it will be possible (and desirable) to have
memory attributes without the per-VM support, even in x86.
No functional change intended.
Signed-off-by: Sean Christopherson <seanjc@google.com>
Reviewed-by: Fuad Tabba <tabba@google.com>
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
arch/x86/include/asm/kvm_host.h | 2 +-
arch/x86/kvm/Kconfig | 6 +++---
arch/x86/kvm/mmu/mmu.c | 2 +-
arch/x86/kvm/x86.c | 2 +-
include/linux/kvm_host.h | 8 ++++----
include/trace/events/kvm.h | 4 ++--
virt/kvm/Kconfig | 2 +-
virt/kvm/kvm_main.c | 14 +++++++-------
8 files changed, 20 insertions(+), 20 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 8a53ca6195701..8bb7c25240e33 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -2393,7 +2393,7 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
int tdp_max_root_level, int tdp_huge_page_level);
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
#define kvm_arch_has_private_mem(kvm) ((kvm)->arch.has_private_mem)
#endif
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 801bf9e520db3..26f6afd51bbdc 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -84,7 +84,7 @@ config KVM_SW_PROTECTED_VM
bool "Enable support for KVM software-protected VMs"
depends on EXPERT
depends on KVM_X86 && X86_64
- select KVM_GENERIC_MEMORY_ATTRIBUTES
+ select KVM_VM_MEMORY_ATTRIBUTES
help
Enable support for KVM software-protected VMs. Currently, software-
protected VMs are purely a development and testing vehicle for
@@ -135,7 +135,7 @@ config KVM_INTEL_TDX
bool "Intel Trust Domain Extensions (TDX) support"
default y
depends on INTEL_TDX_HOST
- select KVM_GENERIC_MEMORY_ATTRIBUTES
+ select KVM_VM_MEMORY_ATTRIBUTES
select HAVE_KVM_ARCH_GMEM_POPULATE
help
Provides support for launching Intel Trust Domain Extensions (TDX)
@@ -159,7 +159,7 @@ config KVM_AMD_SEV
depends on KVM_AMD && X86_64
depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m)
select ARCH_HAS_CC_PLATFORM
- select KVM_GENERIC_MEMORY_ATTRIBUTES
+ select KVM_VM_MEMORY_ATTRIBUTES
select HAVE_KVM_ARCH_GMEM_PREPARE
select HAVE_KVM_ARCH_GMEM_INVALIDATE
select HAVE_KVM_ARCH_GMEM_POPULATE
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index f8aa7eda661ee..b53a0c4b4dfca 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -7971,7 +7971,7 @@ void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
vhost_task_stop(kvm->arch.nx_huge_page_recovery_thread);
}
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
int level)
{
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 48f259015ce44..cb4f7432a073d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -13611,7 +13611,7 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm,
}
}
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
kvm_mmu_init_memslot_memory_attributes(kvm, slot);
#endif
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 2c5ad9a6d5ce8..091f201251159 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -722,7 +722,7 @@ static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu)
}
#endif
-#ifndef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifndef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
static inline bool kvm_arch_has_private_mem(struct kvm *kvm)
{
return false;
@@ -871,7 +871,7 @@ struct kvm {
#ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
struct notifier_block pm_notifier;
#endif
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
/* Protected by slots_lock (for writes) and RCU (for reads) */
struct xarray mem_attr_array;
#endif
@@ -2528,7 +2528,7 @@ static inline bool kvm_memslot_is_gmem_only(const struct kvm_memory_slot *slot)
return slot->flags & KVM_MEMSLOT_GMEM_ONLY;
}
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
static inline unsigned long kvm_get_memory_attributes(struct kvm *kvm, gfn_t gfn)
{
return xa_to_value(xa_load(&kvm->mem_attr_array, gfn));
@@ -2550,7 +2550,7 @@ static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
{
return false;
}
-#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
+#endif /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
#ifdef CONFIG_KVM_GUEST_MEMFD
int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index b282e3a867696..1ba72bd73ea2f 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -358,7 +358,7 @@ TRACE_EVENT(kvm_dirty_ring_exit,
TP_printk("vcpu %d", __entry->vcpu_id)
);
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
/*
* @start: Starting address of guest memory range
* @end: End address of guest memory range
@@ -383,7 +383,7 @@ TRACE_EVENT(kvm_vm_set_mem_attributes,
TP_printk("%#016llx -- %#016llx [0x%lx]",
__entry->start, __entry->end, __entry->attr)
);
-#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
+#endif /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
TRACE_EVENT(kvm_unmap_hva_range,
TP_PROTO(unsigned long start, unsigned long end),
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 794976b88c6f9..5119cb37145fc 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -100,7 +100,7 @@ config KVM_ELIDE_TLB_FLUSH_IF_YOUNG
config KVM_MMU_LOCKLESS_AGING
bool
-config KVM_GENERIC_MEMORY_ATTRIBUTES
+config KVM_VM_MEMORY_ATTRIBUTES
bool
config KVM_GUEST_MEMFD
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 89489996fbc1e..306153abbafa5 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1115,7 +1115,7 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
spin_lock_init(&kvm->mn_invalidate_lock);
rcuwait_init(&kvm->mn_memslots_update_rcuwait);
xa_init(&kvm->vcpu_array);
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
xa_init(&kvm->mem_attr_array);
#endif
@@ -1300,7 +1300,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
cleanup_srcu_struct(&kvm->irq_srcu);
srcu_barrier(&kvm->srcu);
cleanup_srcu_struct(&kvm->srcu);
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
xa_destroy(&kvm->mem_attr_array);
#endif
kvm_arch_free_vm(kvm);
@@ -2418,7 +2418,7 @@ static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
}
#endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
static u64 kvm_supported_mem_attributes(struct kvm *kvm)
{
if (!kvm || kvm_arch_has_private_mem(kvm))
@@ -2623,7 +2623,7 @@ static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
return kvm_vm_set_mem_attributes(kvm, start, end, attrs->attributes);
}
-#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
+#endif /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
{
@@ -4921,7 +4921,7 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
case KVM_CAP_SYSTEM_EVENT_DATA:
case KVM_CAP_DEVICE_CTRL:
return 1;
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
case KVM_CAP_MEMORY_ATTRIBUTES:
return kvm_supported_mem_attributes(kvm);
#endif
@@ -5325,7 +5325,7 @@ static long kvm_vm_ioctl(struct file *filp,
break;
}
#endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
-#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
case KVM_SET_MEMORY_ATTRIBUTES: {
struct kvm_memory_attributes attrs;
@@ -5336,7 +5336,7 @@ static long kvm_vm_ioctl(struct file *filp,
r = kvm_vm_ioctl_set_mem_attributes(kvm, &attrs);
break;
}
-#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
+#endif /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
case KVM_CREATE_DEVICE: {
struct kvm_create_device cd;
--
2.54.0.794.g4f17f83d09-goog
^ permalink raw reply related
* Re: [PATCH v4 27/30] KVM: x86: Add KVM_VCPU_TSC_EFFECTIVE_FREQ attribute
From: David Woodhouse @ 2026-05-22 23:30 UTC (permalink / raw)
To: Sean Christopherson
Cc: Paolo Bonzini, Jonathan Corbet, Shuah Khan, Thomas Gleixner,
Ingo Molnar, Borislav Petkov, Dave Hansen, x86, H. Peter Anvin,
Vitaly Kuznetsov, Juergen Gross, Boris Ostrovsky, Paul Durrant,
Jonathan Cameron, Sascha Bischoff, Marc Zyngier, Joey Gouly,
Jack Allister, Dongli Zhang, joe.jin, kvm, linux-doc,
linux-kernel, xen-devel, linux-kselftest
In-Reply-To: <ahDO58dKuPt-lj_J@google.com>
On 22 May 2026 22:47:19 BST, Sean Christopherson <seanjc@google.com> wrote:
>On Fri, May 22, 2026, David Woodhouse wrote:
>> On Fri, 2026-05-22 at 10:21 -0700, Sean Christopherson wrote:
>> >
>> > I'll send a standalone patch, along with a selftest tweak to verify the fix.
>> > It's technically a fix and won't generate any conflicts, no reason to delay it.
>>
>> Are you suggesting the other 30 should be delayed? :P
>
>LOL, just acknowledging that it'll take me a minute to page all of that code in.
Hey, it took me more than a year to page it back in :)
I am only teasing; no rush. And I will post v5 with some accumulated tweaks (from my kvmclock5 branch) soon.
^ permalink raw reply
* Re: [PATCH v6 25/43] KVM: selftests: Add support for mmap() on guest_memfd in core library
From: Ackerley Tng @ 2026-05-22 23:02 UTC (permalink / raw)
To: Ackerley Tng via B4 Relay, aik, andrew.jones, binbin.wu, brauner,
chao.p.peng, david, ira.weiny, jmattson, jthoughton, michael.roth,
oupton, pankaj.gupta, qperret, rick.p.edgecombe, rientjes,
shivankg, steven.price, tabba, willy, wyihan, yan.y.zhao,
forkloop, pratyush, suzuki.poulose, aneesh.kumar, liam,
Paolo Bonzini, Sean Christopherson, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka
Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
linux-mm, linux-coco
In-Reply-To: <20260507-gmem-inplace-conversion-v6-25-91ab5a8b19a4@google.com>
Ackerley Tng via B4 Relay <devnull+ackerleytng.google.com@kernel.org>
writes:
>
> [...snip...]
>
> @@ -1078,13 +1077,17 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type,
> }
>
> region->fd = -1;
> - if (backing_src_is_shared(src_type))
> + if (flags & KVM_MEM_GUEST_MEMFD && gmem_flags & GUEST_MEMFD_FLAG_MMAP) {
> + region->fd = kvm_dup(gmem_fd);
> + mmap_offset = gmem_offset;
> + } else if (backing_src_is_shared(src_type)) {
> region->fd = kvm_memfd_alloc(region->mmap_size,
> src_type == VM_MEM_SRC_SHARED_HUGETLB);
> + }
>
> - region->mmap_start = kvm_mmap(region->mmap_size, PROT_READ | PROT_WRITE,
> - vm_mem_backing_src_alias(src_type)->flag,
> - region->fd);
> + region->mmap_start = __kvm_mmap(region->mmap_size, PROT_READ | PROT_WRITE,
> + vm_mem_backing_src_alias(src_type)->flag,
> + region->fd, mmap_offset);
Sashiko pointed out these:
1. When mmap() is done for region->mmap_alias, it doesn't use
mmap_offset. I'll fix that in the next revision.
2. mmap() may map past the end of the guest_memfd if, due to alignment,
the mmap_size is increased. That is true, but I feel that that fix
should go with a bigger clean up for vm_mem_add().
3. vm_mem_backing_src_alias(src_type)->flag may contain incompatible
mmap flags. This is true. For now, when guest_memfd is used with
vm_mem_add, the src_type passed has to be VM_MEM_SRC_SHMEM. I think
this also falls in the category of doing a bigger clean up for
vm_mem_add().
>
> [...snip...]
>
^ permalink raw reply
* Re: (subset) [PATCH v4 1/1] leds: Introduce the multi_max_intensity sysfs attribute
From: Armin Wolf @ 2026-05-22 22:05 UTC (permalink / raw)
To: Lee Jones, pavel
Cc: linux-kernel, corbet, skhan, linux-leds, linux-doc, wse,
jacek.anaszewski, pobrn, m.tretter
In-Reply-To: <177928768282.2811520.3177179770372088870.b4-ty@b4>
Am 20.05.26 um 16:34 schrieb Lee Jones:
> On Sat, 09 May 2026 23:46:03 +0200, Armin Wolf wrote:
>> Some multicolor LEDs support global brightness control in hardware,
>> meaning that the maximum intensity of the color components is not
>> connected to the maximum global brightness. Such LEDs cannot be
>> described properly by the current multicolor LED class interface,
>> because it assumes that the maximum intensity of each color component
>> is described by the maximum global brightness of the LED.
>>
>> [...]
> Applied, thanks!
>
> [1/1] leds: Introduce the multi_max_intensity sysfs attribute
> commit: b1a9b7a904af2c793850f83a4801a013a718fc47
Thank you :)
I just noticed that I forgot to update the Date field inside the sysfs documentation, it should
have been:
Date: May 2026
KernelVersion: 7.2
Should I send a separate patch for this or can you edit the patch in place?
Thanks,
Armin Wolf
>
> --
> Lee Jones [李琼斯]
>
^ permalink raw reply
* Re: [PATCH mm-unstable v18 14/14] Documentation: mm: update the admin guide for mTHP collapse
From: David Hildenbrand (Arm) @ 2026-05-22 21:58 UTC (permalink / raw)
To: Nico Pache, linux-doc, linux-kernel, linux-mm, linux-trace-kernel
Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
byungchul, catalin.marinas, cl, corbet, dave.hansen, dev.jain,
gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
joshua.hahnjy, kas, lance.yang, liam, ljs, mathieu.desnoyers,
matthew.brost, mhiramat, mhocko, peterx, pfalcato, rakie.kim,
raquini, rdunlap, richard.weiyang, rientjes, rostedt, rppt,
ryan.roberts, shivankg, sunnanyong, surenb, thomas.hellstrom,
tiwai, usamaarif642, vbabka, vishal.moola, wangkefeng.wang, will,
willy, yang, ying.huang, ziy, zokeefe, Bagas Sanjaya
In-Reply-To: <20260522150009.121603-15-npache@redhat.com>
>
> process THP controls
> @@ -264,11 +265,6 @@ support the following arguments::
> Khugepaged controls
> -------------------
>
> -.. note::
> - khugepaged currently only searches for opportunities to collapse to
> - PMD-sized THP and no attempt is made to collapse to other THP
> - sizes.
Should we maybe leave this here and clarify that for file/shmem, it will still
only collapse to PMD-sized THPs?
--
Cheers,
David
^ permalink raw reply
* Re: [PATCH mm-unstable v18 07/14] mm/khugepaged: skip collapsing mTHP to smaller orders
From: David Hildenbrand (Arm) @ 2026-05-22 21:51 UTC (permalink / raw)
To: Nico Pache, linux-doc, linux-kernel, linux-mm, linux-trace-kernel
Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
byungchul, catalin.marinas, cl, corbet, dave.hansen, dev.jain,
gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
joshua.hahnjy, kas, lance.yang, liam, ljs, mathieu.desnoyers,
matthew.brost, mhiramat, mhocko, peterx, pfalcato, rakie.kim,
raquini, rdunlap, richard.weiyang, rientjes, rostedt, rppt,
ryan.roberts, shivankg, sunnanyong, surenb, thomas.hellstrom,
tiwai, usamaarif642, vbabka, vishal.moola, wangkefeng.wang, will,
willy, yang, ying.huang, ziy, zokeefe, Usama Arif
In-Reply-To: <20260522150009.121603-8-npache@redhat.com>
On 5/22/26 17:00, Nico Pache wrote:
> khugepaged may try to collapse a mTHP to a smaller mTHP, resulting in
> some pages being unmapped.
The "some pages being unmapped" part is unclear.
I assume what you mean is "possibly resulting in a partially mapped source
folio, which is undesired."
But there is also the problem that we could try collapsing a folio to a
same-sized folio, which doesn't make sense (assuming the folio is fully mapped).
Clarify all that, please.
Acked-by: David Hildenbrand (arm) <david@kernel.org>
--
Cheers,
David
^ permalink raw reply
* Re: [PATCH mm-unstable v18 06/14] mm/khugepaged: generalize collapse_huge_page for mTHP collapse
From: David Hildenbrand (Arm) @ 2026-05-22 21:47 UTC (permalink / raw)
To: Nico Pache, linux-doc, linux-kernel, linux-mm, linux-trace-kernel
Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
byungchul, catalin.marinas, cl, corbet, dave.hansen, dev.jain,
gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
joshua.hahnjy, kas, lance.yang, liam, ljs, mathieu.desnoyers,
matthew.brost, mhiramat, mhocko, peterx, pfalcato, rakie.kim,
raquini, rdunlap, richard.weiyang, rientjes, rostedt, rppt,
ryan.roberts, shivankg, sunnanyong, surenb, thomas.hellstrom,
tiwai, usamaarif642, vbabka, vishal.moola, wangkefeng.wang, will,
willy, yang, ying.huang, ziy, zokeefe, Usama Arif
In-Reply-To: <20260522150009.121603-7-npache@redhat.com>
On 5/22/26 17:00, Nico Pache wrote:
> Pass an order and offset to collapse_huge_page to support collapsing anon
> memory to arbitrary orders within a PMD. order indicates what mTHP size we
> are attempting to collapse to, and offset indicates where in the PMD to
> start the collapse attempt.
>
> For non-PMD collapse we must leave the anon VMA write locked until after
> we collapse the mTHP-- in the PMD case all the pages are isolated, but in
> the mTHP case this is not true, and we must keep the lock to prevent
> access/changes to the page tables. This can happen if the rmap walkers hit
> a pmd_none while the PMD entry is currently unavailable due to being
> temporarily removed during the collapse phase.
>
> Acked-by: Usama Arif <usama.arif@linux.dev>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
I guess we should add a comment here like:
/*
* Only notify about the PTE range we will actually modify. While we
* temporarily unmap the whole PTE table for mTHP collapse, we'll remap
* it later, leaving other PTEs effectively unmodified. The locks we hold
* prevent anybody from stumbling over such temporarily unmapped PTE tables.
*/
>
> - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
> - address + HPAGE_PMD_SIZE);
> + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, start_addr,
> + end_addr);
> mmu_notifier_invalidate_range_start(&range);
>
> pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
> @@ -1294,26 +1297,23 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
> * Parallel GUP-fast is fine since GUP-fast will back off when
> * it detects PMD is changed.
> */
> - _pmd = pmdp_collapse_flush(vma, address, pmd);
> + _pmd = pmdp_collapse_flush(vma, pmd_addr, pmd);
> spin_unlock(pmd_ptl);
> mmu_notifier_invalidate_range_end(&range);
> tlb_remove_table_sync_one();
>
> - pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
> + pte = pte_offset_map_lock(mm, &_pmd, start_addr, &pte_ptl);
> if (pte) {
> - result = __collapse_huge_page_isolate(vma, address, pte, cc,
> - HPAGE_PMD_ORDER,
> - &compound_pagelist);
> + result = __collapse_huge_page_isolate(vma, start_addr, pte, cc,
> + order, &compound_pagelist);
> spin_unlock(pte_ptl);
> } else {
> result = SCAN_NO_PTE_TABLE;
> }
>
> if (unlikely(result != SCAN_SUCCEED)) {
> - if (pte)
> - pte_unmap(pte);
> spin_lock(pmd_ptl);
> - BUG_ON(!pmd_none(*pmd));
> + WARN_ON_ONCE(!pmd_none(*pmd));
Likely VM_WARN_ON_ONCE is sufficient.
> /*
> * We can only use set_pmd_at when establishing
> * hugepmds and never for establishing regular pmds that
> @@ -1321,21 +1321,24 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
> */
> pmd_populate(mm, pmd, pmd_pgtable(_pmd));
> spin_unlock(pmd_ptl);
> - anon_vma_unlock_write(vma->anon_vma);
> goto out_up_write;
> }
>
> /*
> - * All pages are isolated and locked so anon_vma rmap
> - * can't run anymore.
> + * For PMD collapse all pages are isolated and locked so anon_vma
> + * rmap can't run anymore. For mTHP collapse the PMD entry has been
> + * removed and not all pages are isolated and locked, so we must hold
> + * the lock to prevent neighboring folios from attempting to access
> + * this PMD until it's reinstalled.
> */
That makes sense. I was wondering whether there was another reason for dropping
the anon_vma lock ... I guess it was just for latency purposes given that there
was no actual need for the lock anymore once all folios in the range were
isolated+locked.
With the two nits above addressed
Acked-by: David Hildenbrand (arm) <david@kernel.org>
--
Cheers,
David
^ permalink raw reply
* Re: [PATCH v4 27/30] KVM: x86: Add KVM_VCPU_TSC_EFFECTIVE_FREQ attribute
From: Sean Christopherson @ 2026-05-22 21:47 UTC (permalink / raw)
To: David Woodhouse
Cc: Paolo Bonzini, Jonathan Corbet, Shuah Khan, Thomas Gleixner,
Ingo Molnar, Borislav Petkov, Dave Hansen, x86, H. Peter Anvin,
Vitaly Kuznetsov, Juergen Gross, Boris Ostrovsky, Paul Durrant,
Jonathan Cameron, Sascha Bischoff, Marc Zyngier, Joey Gouly,
Jack Allister, Dongli Zhang, joe.jin, kvm, linux-doc,
linux-kernel, xen-devel, linux-kselftest
In-Reply-To: <99356a588677e9ff31c1747db1705d9250a2728d.camel@infradead.org>
On Fri, May 22, 2026, David Woodhouse wrote:
> On Fri, 2026-05-22 at 10:21 -0700, Sean Christopherson wrote:
> >
> > I'll send a standalone patch, along with a selftest tweak to verify the fix.
> > It's technically a fix and won't generate any conflicts, no reason to delay it.
>
> Are you suggesting the other 30 should be delayed? :P
LOL, just acknowledging that it'll take me a minute to page all of that code in.
^ permalink raw reply
* Re: [PATCH v6 01/43] KVM: guest_memfd: Introduce per-gmem attributes, use to guard user mappings
From: Ackerley Tng @ 2026-05-22 21:45 UTC (permalink / raw)
To: Ackerley Tng via B4 Relay, aik, andrew.jones, binbin.wu, brauner,
chao.p.peng, david, ira.weiny, jmattson, jthoughton, michael.roth,
oupton, pankaj.gupta, qperret, rick.p.edgecombe, rientjes,
shivankg, steven.price, tabba, willy, wyihan, yan.y.zhao,
forkloop, pratyush, suzuki.poulose, aneesh.kumar, liam,
Paolo Bonzini, Sean Christopherson, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka
Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
linux-mm, linux-coco
In-Reply-To: <20260507-gmem-inplace-conversion-v6-1-91ab5a8b19a4@google.com>
Ackerley Tng via B4 Relay <devnull+ackerleytng.google.com@kernel.org>
writes:
>
> [...snip...]
>
> +static int kvm_gmem_init_inode(struct inode *inode, loff_t size, u64 flags)
> +{
>
> [...snip...]
>
> + filemap_invalidate_lock(inode->i_mapping);
> + r = mas_store_gfp(&mas, xa_mk_value(attrs), GFP_KERNEL);
Sashiko says using GFP_KERNEL with this attributes maple_tree could
allow a process creating a very fragmented maple tree to consume lots of
memory not charged to some memcg and proposed using GFP_KERNEL_ACCOUNT.
The problem with using GFP_KERNEL_ACCOUNT is that the maple tree nodes
are allocated from a shared kmem_cache maple_node_cache. Allocating the
maple tree nodes using GFP_KERNEL_ACCOUNT would mean that the node could
be reused by other maple trees unrelated to this process, and so the
nodes might long outlive the process using this guest_memfd, keeping the
memcg alive far longer than the VM.
For now I think it's okay to stick with GFP_KERNEL? Does anyone else
have suggestions on how to solve this?
> + filemap_invalidate_unlock(inode->i_mapping);
> +
> + return r;
> +}
>
> [...snip...]
>
^ permalink raw reply
* Re: [PATCH mm-unstable v18 04/14] mm/khugepaged: generalize __collapse_huge_page_* for mTHP support
From: David Hildenbrand (Arm) @ 2026-05-22 21:24 UTC (permalink / raw)
To: Nico Pache, linux-doc, linux-kernel, linux-mm, linux-trace-kernel
Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
byungchul, catalin.marinas, cl, corbet, dave.hansen, dev.jain,
gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
joshua.hahnjy, kas, lance.yang, liam, ljs, mathieu.desnoyers,
matthew.brost, mhiramat, mhocko, peterx, pfalcato, rakie.kim,
raquini, rdunlap, richard.weiyang, rientjes, rostedt, rppt,
ryan.roberts, shivankg, sunnanyong, surenb, thomas.hellstrom,
tiwai, usamaarif642, vbabka, vishal.moola, wangkefeng.wang, will,
willy, yang, ying.huang, ziy, zokeefe
In-Reply-To: <20260522150009.121603-5-npache@redhat.com>
On 5/22/26 16:59, Nico Pache wrote:
> generalize the order of the __collapse_huge_page_* and collapse_max_*
> functions to support future mTHP collapse.
>
> The current mechanism for determining collapse with the
> khugepaged_max_ptes_none value is not designed with mTHP in mind. This
> raises a key design issue: if we support user defined max_pte_none values
> (even those scaled by order), a collapse of a lower order can introduces
> an feedback loop, or "creep", when max_ptes_none is set to a value greater
> than HPAGE_PMD_NR / 2. [1]
>
> With this configuration, a successful collapse to order N will populate
> enough pages to satisfy the collapse condition on order N+1 on the next
> scan. This leads to unnecessary work and memory churn.
>
> To fix this issue introduce a helper function that will limit mTHP
> collapse support to two max_ptes_none values, 0 and HPAGE_PMD_NR - 1.
> This effectively supports two modes: [2]
>
> - max_ptes_none=0: never collapses if it encounters an empty PTE or a PTE
> that maps the shared zeropage. Consequently, no memory bloat.
> - max_ptes_none=511 (on 4k pagesz): Always collapse to the highest
> available mTHP order.
>
> This removes the possibility of "creep", and a warning will be emitted if
> any non-supported max_ptes_none value is configured with mTHP enabled.
> Any intermediate value will default mTHP collapse to max_ptes_none=0.
>
> mTHP collapse will not honor the khugepaged_max_ptes_shared or
> khugepaged_max_ptes_swap parameters, and will fail if it encounters a
> shared or swapped entry.
>
> No functional changes in this patch; however it defines future behavior
> for mTHP collapse.
>
> [1] - https://lore.kernel.org/all/e46ab3ab-a3d7-4fb7-9970-d0704bd5d05a@arm.com
> [2] - https://lore.kernel.org/all/37375ace-5601-4d6c-9dac-d1c8268698e9@redhat.com
>
> Reviewed-by: Lance Yang <lance.yang@linux.dev>
> Co-developed-by: Dev Jain <dev.jain@arm.com>
> Signed-off-by: Dev Jain <dev.jain@arm.com>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
> mm/khugepaged.c | 121 +++++++++++++++++++++++++++++++++++-------------
> 1 file changed, 88 insertions(+), 33 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 116f39518948..e98ba5b15163 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -353,30 +353,52 @@ static bool pte_none_or_zero(pte_t pte)
> * the shared zeropage for the given collapse operation.
> * @cc: The collapse control struct
> * @vma: The vma to check for userfaultfd
> + * @order: The folio order being collapsed to
> *
> * Return: Maximum number of empty/shared zeropage PTEs for the collapse operation
> */
> static unsigned int collapse_max_ptes_none(struct collapse_control *cc,
> - struct vm_area_struct *vma)
> + struct vm_area_struct *vma, unsigned int order)
> {
> + unsigned int max_ptes_none = khugepaged_max_ptes_none;
Can be const, right?
> +
> if (vma && userfaultfd_armed(vma))
> return 0;
> /* for MADV_COLLAPSE, allow any empty/shared zeropage PTEs */
> if (!cc->is_khugepaged)
> return HPAGE_PMD_NR;
> - /* For all other cases respect the user defined maximum */
> - return khugepaged_max_ptes_none;
> + /* for PMD collapse, respect the user defined maximum */
> + if (is_pmd_order(order))
> + return max_ptes_none;
> + /*
> + * for mTHP collapse with the sysctl value set to KHUGEPAGED_MAX_PTES_LIMIT,
> + * scale the maximum number of PTEs to the order of the collapse.
> + */
> + if (max_ptes_none == KHUGEPAGED_MAX_PTES_LIMIT)
> + return (1 << order) - 1;
> + if (!max_ptes_none)
> + return 0;
> + /*
> + * For mTHP collapse of values other than 0 or KHUGEPAGED_MAX_PTES_LIMIT,
> + * emit a warning and return 0.
> + */
> + pr_warn_once("mTHP collapse does not support max_ptes_none values"
> + " other than 0 or %u, defaulting to 0.\n",
> + KHUGEPAGED_MAX_PTES_LIMIT);
> + return 0;
This might read slightly clearer as
/*
* For mTHP ...
*/
if (max_ptes_none)
pr_warn_once(...)
return 0;
IOW, have a single "return 0" label here and only special-case when to warn.
Acked-by: David Hildenbrand (arm) <david@kernel.org>
--
Cheers,
David
^ permalink raw reply
* Re: [PATCH] ARM: zte: clean up zx297520v3 doc. warnings
From: Randy Dunlap @ 2026-05-22 21:20 UTC (permalink / raw)
To: Stefan Dösinger, linux-kernel
Cc: Linus Walleij, Krzysztof Kozlowski, linux-arm-kernel,
Jonathan Corbet, Shuah Khan, linux-doc
In-Reply-To: <6270885.lOV4Wx5bFT@strix>
On 5/22/26 12:31 PM, Stefan Dösinger wrote:
> Hi Randy,
>
> Am Freitag, 22. Mai 2026, 20:44:24 Ostafrikanische Zeit schrieben Sie:
>> Does this mean that you will be merging this patch since you merged the
>> original patch?
>
> I am new to the kernel development process, so I don't know what's the
> preferred way. I guess for me it is easier if your patch gets merged as-is.
>
> I can certainly submit a pull request myself though since I made myself the
> maintainer for this thing. Does that go to linux-doc@vger.kernel.org or the
> soc list?
The same way that this commit was merged:
commit 220ae5d36dba
Author: Stefan Dösinger <stefandoesinger@gmail.com>
Date: Tue Jan 27 20:52:08 2026 +0300
ARM: zte: Add zx297520v3 platform support
I guess to the soc list.
--
~Randy
^ permalink raw reply
* Re: [PATCH mm-unstable v18 03/14] mm/khugepaged: rework max_ptes_* handling with helper functions
From: David Hildenbrand (Arm) @ 2026-05-22 21:16 UTC (permalink / raw)
To: Nico Pache, linux-doc, linux-kernel, linux-mm, linux-trace-kernel
Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
byungchul, catalin.marinas, cl, corbet, dave.hansen, dev.jain,
gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
joshua.hahnjy, kas, lance.yang, liam, ljs, mathieu.desnoyers,
matthew.brost, mhiramat, mhocko, peterx, pfalcato, rakie.kim,
raquini, rdunlap, richard.weiyang, rientjes, rostedt, rppt,
ryan.roberts, shivankg, sunnanyong, surenb, thomas.hellstrom,
tiwai, usamaarif642, vbabka, vishal.moola, wangkefeng.wang, will,
willy, yang, ying.huang, ziy, zokeefe, Usama Arif
In-Reply-To: <20260522150009.121603-4-npache@redhat.com>
> int hugepage_madvise(struct vm_area_struct *vma,
> vm_flags_t *vm_flags, int advice)
> {
> @@ -540,6 +598,8 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
> unsigned long start_addr, pte_t *pte, struct collapse_control *cc,
> struct list_head *compound_pagelist)
> {
> + const unsigned int max_ptes_none = collapse_max_ptes_none(cc, vma);
> + const unsigned int max_ptes_shared = collapse_max_ptes_shared(cc);
Yeah, it's good that these are all const now.
--
Cheers,
David
^ permalink raw reply
* Re: [PATCH mm-hotfixes-unstable v18 00/14] khugepaged: add mTHP collapse support
From: David Hildenbrand (Arm) @ 2026-05-22 21:13 UTC (permalink / raw)
To: Nico Pache, Vlastimil Babka (SUSE)
Cc: linux-doc, akpm, linux-kernel, linux-mm, linux-trace-kernel,
aarcange, anshuman.khandual, apopple, baohua, baolin.wang,
byungchul, catalin.marinas, cl, corbet, dave.hansen, dev.jain,
gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
joshua.hahnjy, kas, lance.yang, liam, ljs, mathieu.desnoyers,
matthew.brost, mhiramat, mhocko, peterx, pfalcato, rakie.kim,
raquini, rdunlap, richard.weiyang, rientjes, rostedt, rppt,
ryan.roberts, shivankg, sunnanyong, surenb, thomas.hellstrom,
tiwai, usamaarif642, vbabka, vishal.moola, wangkefeng.wang, will,
willy, yang, ying.huang, ziy, zokeefe
In-Reply-To: <CAA1CXcD373fFfo9YPWRj8mJ_rsnzyCrpn1uk3=k7kU=QuaLOgg@mail.gmail.com>
On 5/22/26 18:11, Nico Pache wrote:
> On Fri, May 22, 2026 at 9:13 AM Vlastimil Babka (SUSE)
> <vbabka@kernel.org> wrote:
>>
>> On 5/22/26 17:07, Nico Pache wrote:
>>>
>>> Whoops I manually changed the coverletter subject to reflect that this
>>> in on mm-hotfixes-unstable but never updated the others...
>>
>> But why? That branch is for hotfixes that would go to the current 7.1-rcX
>> series. mm-unstable would be the correct one for this, AFAICT.
>
> Sorry this was a misunderstanding. The goal here was to base this off
> the closest base commit behind where my v17 already lies in the tree.
Ah, I guess this is a problem of "v17 is already in mm-unstable, so against what
to base v18".
Yeah, we touched on that problem in the LSF/MM process discussion ...
--
Cheers,
David
^ permalink raw reply
* Re: [PATCH mm-hotfixes-unstable v18 00/14] khugepaged: add mTHP collapse support
From: Andrew Morton @ 2026-05-22 20:47 UTC (permalink / raw)
To: Nico Pache
Cc: linux-doc, linux-kernel, linux-mm, linux-trace-kernel, aarcange,
anshuman.khandual, apopple, baohua, baolin.wang, byungchul,
catalin.marinas, cl, corbet, dave.hansen, david, dev.jain, gourry,
hannes, hughd, jack, jackmanb, jannh, jglisse, joshua.hahnjy, kas,
lance.yang, liam, ljs, mathieu.desnoyers, matthew.brost, mhiramat,
mhocko, peterx, pfalcato, rakie.kim, raquini, rdunlap,
richard.weiyang, rientjes, rostedt, rppt, ryan.roberts, shivankg,
sunnanyong, surenb, thomas.hellstrom, tiwai, usamaarif642, vbabka,
vishal.moola, wangkefeng.wang, will, willy, yang, ying.huang, ziy,
zokeefe
In-Reply-To: <20260522150009.121603-1-npache@redhat.com>
On Fri, 22 May 2026 08:59:55 -0600 Nico Pache <npache@redhat.com> wrote:
> The following series provides khugepaged with the capability to collapse
> anonymous memory regions to mTHPs.
Thanks, I've updated mm.git's mm-unstable branch to this version.
It sounds like I might be dropping it soon, haven't started looking at
that yet. But let's at least eyeball the latest version at this time.
Sashiko was able to apply this, so the base-it-on-hotfixes thing worked
well, thanks. The AI checking made a few allegations:
https://sashiko.dev/#/patchset/20260522150009.121603-1-npache@redhat.com
> V18 Changes:
> - Added RBs/Acks
> - [patch 02] Guard count_memcg_folio_events with is_pmd_order() to keep
> THP_COLLAPSE_ALLOC PMD-only (Usama, Lance)
> - [patch 03] Convert C++ comments to C-style; fix "none-page" terminology
> to "empty PTEs or PTEs mapping the shared zeropage"; drop unnecessary
> userfaultfd comment; add const to local max_ptes_* variables; fix
> "repect" typo (Lance, David)
> - [patch 04] collapse_max_ptes_none() now returns 0 instead of -EINVAL for
> unsupported values; remove SCAN_INVALID_PTES_NONE; change return type
> from int to unsigned int and propagate to all callers; add comment above
> __collapse_huge_page_swapin explaining mTHP swap bail-out (David,
> Lorenzo, Lance, Wei Yang, Usama)
> - [patch 05] Rewrite collapse_huge_page lock comment to David's suggested
> wording (David)
> - [patch 11] Propagate unsigned int return type for max_ptes_none; remove
> the now-unnecessary negative return check (consequence of patch 04);
> Add optimization to the next_order goto that will prevent unnecessary
> iterations if there are no lower orders enabled (Vernon); update locking
> comment; pass VMA to mthp_collapse to improve uffd-armed detection, and
> prevent unnecessary work. (Wei)
> - [patch 14] Update documentation to reflect fallback-to-0 behavior
>
Below is how v18 altered mm.git.
Quite a lot of it seems to be replacement of "//"-style comments. It's
unfortunate that this work isn't separated from the substantive
changes. We could have done that with a few followup fixes rather than
a wholesale replacement of the series.
Documentation/admin-guide/mm/transhuge.rst | 5
include/trace/events/huge_memory.h | 3
mm/khugepaged.c | 121 +++++++++----------
3 files changed, 66 insertions(+), 63 deletions(-)
--- a/Documentation/admin-guide/mm/transhuge.rst~b
+++ a/Documentation/admin-guide/mm/transhuge.rst
@@ -312,8 +312,9 @@ when collapsing a group of small pages i
For PMD-sized THP collapse, this directly limits the number of empty pages
allowed in the 2MB region.
-For mTHP collapse, only 0 or (HPAGE_PMD_NR - 1) are supported. Any other value
-will emit a warning and no mTHP collapse will be attempted.
+For mTHP collapse, only 0 or (HPAGE_PMD_NR - 1) are supported. At
+HPAGE_PMD_NR - 1, we collapse to the highest possible order. Any intermediate
+value will emit a warning and mTHP collapse will default to max_ptes_none=0.
A higher value allows more empty pages, potentially leading to more memory
usage but better THP performance. A lower value is more conservative and
--- a/include/trace/events/huge_memory.h~b
+++ a/include/trace/events/huge_memory.h
@@ -39,8 +39,7 @@
EM( SCAN_STORE_FAILED, "store_failed") \
EM( SCAN_COPY_MC, "copy_poisoned_page") \
EM( SCAN_PAGE_FILLED, "page_filled") \
- EM(SCAN_PAGE_DIRTY_OR_WRITEBACK, "page_dirty_or_writeback") \
- EMe(SCAN_INVALID_PTES_NONE, "invalid_ptes_none")
+ EMe(SCAN_PAGE_DIRTY_OR_WRITEBACK, "page_dirty_or_writeback")
#undef EM
#undef EMe
--- a/mm/khugepaged.c~b
+++ a/mm/khugepaged.c
@@ -61,7 +61,6 @@ enum scan_result {
SCAN_COPY_MC,
SCAN_PAGE_FILLED,
SCAN_PAGE_DIRTY_OR_WRITEBACK,
- SCAN_INVALID_PTES_NONE,
};
#define CREATE_TRACE_POINTS
@@ -380,41 +379,43 @@ static bool pte_none_or_zero(pte_t pte)
}
/**
- * collapse_max_ptes_none - Calculate maximum allowed none-page or zero-page
- * PTEs for the given collapse operation.
+ * collapse_max_ptes_none - Calculate maximum allowed empty PTEs or PTEs mapping
+ * the shared zeropage for the given collapse operation.
* @cc: The collapse control struct
* @vma: The vma to check for userfaultfd
* @order: The folio order being collapsed to
*
- * Return: Maximum number of none-page or zero-page PTEs allowed for the
- * collapse operation.
+ * Return: Maximum number of empty/shared zeropage PTEs for the collapse operation
*/
-static int collapse_max_ptes_none(struct collapse_control *cc,
+static unsigned int collapse_max_ptes_none(struct collapse_control *cc,
struct vm_area_struct *vma, unsigned int order)
{
unsigned int max_ptes_none = khugepaged_max_ptes_none;
- // If the vma is userfaultfd-armed, allow no none-page or zero-page PTEs.
+
if (vma && userfaultfd_armed(vma))
return 0;
- // for MADV_COLLAPSE, allow any none-page or zero-page PTEs.
+ /* for MADV_COLLAPSE, allow any empty/shared zeropage PTEs */
if (!cc->is_khugepaged)
return HPAGE_PMD_NR;
- // for PMD collapse, respect the user defined maximum.
+ /* for PMD collapse, respect the user defined maximum */
if (is_pmd_order(order))
return max_ptes_none;
- /* Zero/non-present collapse disabled. */
- if (!max_ptes_none)
- return 0;
- // for mTHP collapse with the sysctl value set to KHUGEPAGED_MAX_PTES_LIMIT,
- // scale the maximum number of PTEs to the order of the collapse.
+ /*
+ * for mTHP collapse with the sysctl value set to KHUGEPAGED_MAX_PTES_LIMIT,
+ * scale the maximum number of PTEs to the order of the collapse.
+ */
if (max_ptes_none == KHUGEPAGED_MAX_PTES_LIMIT)
return (1 << order) - 1;
-
- // We currently only support max_ptes_none values of 0 or KHUGEPAGED_MAX_PTES_LIMIT.
- // Emit a warning and return -EINVAL.
- pr_warn_once("mTHP collapse only supports max_ptes_none values of 0 or %u\n",
- KHUGEPAGED_MAX_PTES_LIMIT);
- return -EINVAL;
+ if (!max_ptes_none)
+ return 0;
+ /*
+ * For mTHP collapse of values other than 0 or KHUGEPAGED_MAX_PTES_LIMIT,
+ * emit a warning and return 0.
+ */
+ pr_warn_once("mTHP collapse does not support max_ptes_none values"
+ " other than 0 or %u, defaulting to 0.\n",
+ KHUGEPAGED_MAX_PTES_LIMIT);
+ return 0;
}
/**
@@ -429,15 +430,19 @@ static int collapse_max_ptes_none(struct
static unsigned int collapse_max_ptes_shared(struct collapse_control *cc,
unsigned int order)
{
- // for MADV_COLLAPSE, do not restrict the number of PTEs that map shared
- // anonymous pages.
+ /*
+ * For MADV_COLLAPSE, do not restrict the number of PTEs that map shared
+ * anonymous pages.
+ */
if (!cc->is_khugepaged)
return HPAGE_PMD_NR;
- // for mTHP collapse do not allow collapsing anonymous memory pages that
- // are shared between processes.
+ /*
+ * for mTHP collapse do not allow collapsing anonymous memory pages that
+ * are shared between processes.
+ */
if (!is_pmd_order(order))
return 0;
- // for PMD collapse, respect the user defined maximum.
+ /* for PMD collapse, respect the user defined maximum */
return khugepaged_max_ptes_shared;
}
@@ -453,14 +458,16 @@ static unsigned int collapse_max_ptes_sh
static unsigned int collapse_max_ptes_swap(struct collapse_control *cc,
unsigned int order)
{
- // for MADV_COLLAPSE, do not restrict the number PTEs entries or
- // pagecache entries that are non-present.
+ /*
+ * For MADV_COLLAPSE, do not restrict the number PTEs entries or
+ * pagecache entries that are non-present.
+ */
if (!cc->is_khugepaged)
return HPAGE_PMD_NR;
- // for mTHP collapse do not allow any non-present PTEs or pagecache entries.
+ /* for mTHP collapse do not allow any non-present PTEs or pagecache entries */
if (!is_pmd_order(order))
return 0;
- // for PMD collapse, respect the user defined maximum.
+ /* for PMD collapse, respect the user defined maximum */
return khugepaged_max_ptes_swap;
}
@@ -593,9 +600,8 @@ static unsigned long collapse_allowable_
void khugepaged_enter_vma(struct vm_area_struct *vma,
vm_flags_t vm_flags)
{
- if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) &&
- collapse_allowable_orders(vma, vm_flags, TVA_KHUGEPAGED) &&
- hugepage_enabled())
+ if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) && hugepage_enabled()
+ && collapse_allowable_orders(vma, vm_flags, TVA_KHUGEPAGED))
__khugepaged_enter(vma->vm_mm);
}
@@ -670,6 +676,8 @@ static enum scan_result __collapse_huge_
unsigned long start_addr, pte_t *pte, struct collapse_control *cc,
unsigned int order, struct list_head *compound_pagelist)
{
+ const unsigned int max_ptes_none = collapse_max_ptes_none(cc, vma, order);
+ const unsigned int max_ptes_shared = collapse_max_ptes_shared(cc, order);
const unsigned long nr_pages = 1UL << order;
struct page *page = NULL;
struct folio *folio = NULL;
@@ -677,11 +685,6 @@ static enum scan_result __collapse_huge_
pte_t *_pte;
int none_or_zero = 0, shared = 0, referenced = 0;
enum scan_result result = SCAN_FAIL;
- int max_ptes_none = collapse_max_ptes_none(cc, vma, order);
- unsigned int max_ptes_shared = collapse_max_ptes_shared(cc, order);
-
- if (max_ptes_none < 0)
- return SCAN_INVALID_PTES_NONE;
for (_pte = pte; _pte < pte + nr_pages;
_pte++, addr += PAGE_SIZE) {
@@ -1136,6 +1139,10 @@ static enum scan_result check_pmd_still_
* Bring missing pages in from swap, to complete THP collapse.
* Only done if khugepaged_scan_pmd believes it is worthwhile.
*
+ * For mTHP orders the function bails on the first swap entry, because
+ * faulting pages back in during collapse could re-populate PTEs that
+ * push a later scan over the threshold for a higher-order collapse.
+ *
* Called and returns without pte mapped or spinlocks held.
* Returns result: if not SCAN_SUCCEED, mmap_lock has been released.
*/
@@ -1257,19 +1264,18 @@ static enum scan_result alloc_charge_fol
return SCAN_CGROUP_CHARGE_FAIL;
}
- count_memcg_folio_events(folio, THP_COLLAPSE_ALLOC, 1);
+ if (is_pmd_order(order))
+ count_memcg_folio_events(folio, THP_COLLAPSE_ALLOC, 1);
*foliop = folio;
return SCAN_SUCCEED;
}
/*
- * collapse_huge_page expects the mmap_read_lock to be dropped before
- * entering this function. The function will also always return with the lock
- * dropped. The function starts by allocation a folio, which can potentially
- * take a long time if it involves sync compaction, and we do not need to hold
- * the mmap_lock during that. We must recheck the vma after taking it again in
- * write mode.
+ * collapse_huge_page expects the mmap_lock to be unlocked before entering and
+ * will always return with the lock unlocked, to avoid holding the mmap_lock
+ * while allocating a THP, as that could trigger direct reclaim/compaction.
+ * Note that the VMA must be rechecked after grabbing the mmap_lock again.
*/
static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long start_addr,
int referenced, int unmapped, struct collapse_control *cc,
@@ -1500,12 +1506,12 @@ static unsigned int collapse_mthp_count_
* If a collapse is permitted, we attempt to collapse the PTE range into a
* mTHP.
*/
-static int mthp_collapse(struct mm_struct *mm, unsigned long address,
- int referenced, int unmapped, struct collapse_control *cc,
- unsigned long enabled_orders)
+static int mthp_collapse(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, int referenced, int unmapped,
+ struct collapse_control *cc, unsigned long enabled_orders)
{
- unsigned int nr_occupied_ptes, nr_ptes;
- int max_ptes_none, collapsed = 0, stack_size = 0;
+ unsigned int nr_occupied_ptes, nr_ptes, max_ptes_none;
+ int collapsed = 0, stack_size = 0;
unsigned long collapse_address;
struct mthp_range range;
u16 offset;
@@ -1522,10 +1528,7 @@ static int mthp_collapse(struct mm_struc
if (!test_bit(order, &enabled_orders))
goto next_order;
- max_ptes_none = collapse_max_ptes_none(cc, NULL, order);
-
- if (max_ptes_none < 0)
- return collapsed;
+ max_ptes_none = collapse_max_ptes_none(cc, vma, order);
nr_occupied_ptes = collapse_mthp_count_present(cc, offset,
nr_ptes);
@@ -1565,7 +1568,7 @@ static int mthp_collapse(struct mm_struc
}
next_order:
- if (order > KHUGEPAGED_MIN_MTHP_ORDER) {
+ if ((BIT(order) - 1) & enabled_orders) {
const u8 next_order = order - 1;
const u16 mid_offset = offset + (nr_ptes / 2);
@@ -1582,9 +1585,9 @@ static enum scan_result collapse_scan_pm
struct vm_area_struct *vma, unsigned long start_addr,
bool *lock_dropped, struct collapse_control *cc)
{
- int max_ptes_none = collapse_max_ptes_none(cc, vma, HPAGE_PMD_ORDER);
const unsigned int max_ptes_shared = collapse_max_ptes_shared(cc, HPAGE_PMD_ORDER);
const unsigned int max_ptes_swap = collapse_max_ptes_swap(cc, HPAGE_PMD_ORDER);
+ unsigned int max_ptes_none = collapse_max_ptes_none(cc, vma, HPAGE_PMD_ORDER);
enum tva_type tva_flags = cc->is_khugepaged ? TVA_KHUGEPAGED : TVA_FORCED_COLLAPSE;
pmd_t *pmd;
pte_t *pte, *_pte, pteval;
@@ -1772,9 +1775,9 @@ out_unmap:
if (result == SCAN_SUCCEED) {
/* collapse_huge_page expects the lock to be dropped before calling */
mmap_read_unlock(mm);
- nr_collapsed = mthp_collapse(mm, start_addr, referenced, unmapped,
- cc, enabled_orders);
- /* collapse_huge_page will return with the mmap_lock released */
+ nr_collapsed = mthp_collapse(mm, vma, start_addr, referenced,
+ unmapped, cc, enabled_orders);
+ /* mmap_lock was released above, set lock_dropped */
*lock_dropped = true;
result = nr_collapsed ? SCAN_SUCCEED : SCAN_FAIL;
}
@@ -2665,7 +2668,7 @@ static enum scan_result collapse_scan_fi
unsigned long addr, struct file *file, pgoff_t start,
struct collapse_control *cc)
{
- const int max_ptes_none = collapse_max_ptes_none(cc, NULL, HPAGE_PMD_ORDER);
+ const unsigned int max_ptes_none = collapse_max_ptes_none(cc, NULL, HPAGE_PMD_ORDER);
const unsigned int max_ptes_swap = collapse_max_ptes_swap(cc, HPAGE_PMD_ORDER);
struct folio *folio = NULL;
struct address_space *mapping = file->f_mapping;
_
^ permalink raw reply
* [PATCH v6 12/12] Documentation: PCI: Add documentation for Live Update
From: David Matlack @ 2026-05-22 20:24 UTC (permalink / raw)
To: kexec, linux-doc, linux-kernel, linux-mm, linux-pci
Cc: Adithya Jayachandran, Alexander Graf, Alex Williamson,
Bjorn Helgaas, Chris Li, David Matlack, David Rientjes, Jacob Pan,
Jason Gunthorpe, Jonathan Corbet, Josh Hilke, Leon Romanovsky,
Lukas Wunner, Mike Rapoport, Parav Pandit, Pasha Tatashin,
Pranjal Shrivastava, Pratyush Yadav, Saeed Mahameed,
Samiullah Khawaja, Shuah Khan, Vipin Sharma, William Tu, Yi Liu
In-Reply-To: <20260522202410.3104264-1-dmatlack@google.com>
Add documentation files for the PCI subsystem's participation in Live
Update.
These documentation files are generated from the kernel-doc comments
in the PCI Live Update source code. They describe the File-Lifecycle
Bound (FLB) API, the device tracking API, and the specific policies
applied to preserved devices (such as bus number inheritance and bus
mastering preservation).
Signed-off-by: David Matlack <dmatlack@google.com>
---
Documentation/PCI/index.rst | 1 +
Documentation/PCI/liveupdate.rst | 29 +++++++++++++++++++++++++++
Documentation/core-api/liveupdate.rst | 1 +
MAINTAINERS | 1 +
4 files changed, 32 insertions(+)
create mode 100644 Documentation/PCI/liveupdate.rst
diff --git a/Documentation/PCI/index.rst b/Documentation/PCI/index.rst
index 5d720d2a415e..23fb737ac969 100644
--- a/Documentation/PCI/index.rst
+++ b/Documentation/PCI/index.rst
@@ -20,3 +20,4 @@ PCI Bus Subsystem
controller/index
boot-interrupts
tph
+ liveupdate
diff --git a/Documentation/PCI/liveupdate.rst b/Documentation/PCI/liveupdate.rst
new file mode 100644
index 000000000000..eba55f8a92ae
--- /dev/null
+++ b/Documentation/PCI/liveupdate.rst
@@ -0,0 +1,29 @@
+.. SPDX-License-Identifier: GPL-2.0-or-later
+
+===========================
+PCI Support for Live Update
+===========================
+
+.. kernel-doc:: drivers/pci/liveupdate.c
+ :doc: PCI Live Update
+
+Driver API
+==========
+
+.. kernel-doc:: drivers/pci/liveupdate.c
+ :export:
+
+Live Update ABI
+===============
+
+.. kernel-doc:: include/linux/kho/abi/pci.h
+ :doc: PCI File-Lifecycle Bound (FLB) Live Update ABI
+
+.. kernel-doc:: include/linux/kho/abi/pci.h
+ :internal:
+
+See Also
+========
+
+ * :doc:`/core-api/liveupdate`
+ * :doc:`/core-api/kho/index`
diff --git a/Documentation/core-api/liveupdate.rst b/Documentation/core-api/liveupdate.rst
index 5a292d0f3706..d56a7760978a 100644
--- a/Documentation/core-api/liveupdate.rst
+++ b/Documentation/core-api/liveupdate.rst
@@ -70,3 +70,4 @@ See Also
- :doc:`Live Update uAPI </userspace-api/liveupdate>`
- :doc:`/core-api/kho/index`
+- :doc:`PCI </PCI/liveupdate>`
diff --git a/MAINTAINERS b/MAINTAINERS
index 0e262c0ceb43..6f0b0ebf67cd 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -20536,6 +20536,7 @@ L: kexec@lists.infradead.org
L: linux-pci@vger.kernel.org
S: Maintained
T: git git://git.kernel.org/pub/scm/linux/kernel/git/liveupdate/linux.git
+F: Documentation/PCI/liveupdate.rst
F: drivers/pci/liveupdate.c
F: drivers/pci/liveupdate.h
F: include/linux/kho/abi/pci.h
--
2.54.0.746.g67dd491aae-goog
^ permalink raw reply related
* [PATCH v6 11/12] PCI: liveupdate: Do not disable bus mastering on preserved devices during kexec
From: David Matlack @ 2026-05-22 20:24 UTC (permalink / raw)
To: kexec, linux-doc, linux-kernel, linux-mm, linux-pci
Cc: Adithya Jayachandran, Alexander Graf, Alex Williamson,
Bjorn Helgaas, Chris Li, David Matlack, David Rientjes, Jacob Pan,
Jason Gunthorpe, Jonathan Corbet, Josh Hilke, Leon Romanovsky,
Lukas Wunner, Mike Rapoport, Parav Pandit, Pasha Tatashin,
Pranjal Shrivastava, Pratyush Yadav, Saeed Mahameed,
Samiullah Khawaja, Shuah Khan, Vipin Sharma, William Tu, Yi Liu
In-Reply-To: <20260522202410.3104264-1-dmatlack@google.com>
Do not disable bus mastering on outgoing preserved devices during
pci_device_shutdown() for kexec.
Preserved devices must be allowed to perform memory transactions during
a Live Update to ensure continuous operation. Clearing the bus
mastering bit would prevent these devices from issuing any memory
requests while the new kernel boots.
Because bridges upstream of preserved endpoint devices are also
automatically preserved, this change also avoids clearing bus mastering
on them. This is critical because clearing bus mastering on an upstream
bridge prevents the bridge from forwarding memory requests upstream (i.e.
it would prevent the endpoint device from accessing system RAM and doing
peer-to-peer transactions with devices not downstream of the bridge).
Signed-off-by: David Matlack <dmatlack@google.com>
---
drivers/pci/liveupdate.c | 11 +++++++++++
drivers/pci/liveupdate.h | 6 ++++++
drivers/pci/pci-driver.c | 7 +++++--
3 files changed, 22 insertions(+), 2 deletions(-)
diff --git a/drivers/pci/liveupdate.c b/drivers/pci/liveupdate.c
index d404e64a4e55..a6f2790bc1bf 100644
--- a/drivers/pci/liveupdate.c
+++ b/drivers/pci/liveupdate.c
@@ -132,6 +132,10 @@
* * The PCI core inherits ARI Forwarding Enable on all bridges with downstream
* preserved devices to ensure that all preserved devices on the bridge's
* secondary bus are addressable after the Live Update.
+ *
+ * * The PCI core does not disable bus mastering on outgoing preserved devices
+ * during kexec. This allows preserved devices to issue memory transactions
+ * throughout the Live Update.
*/
#define pr_fmt(fmt) "PCI: liveupdate: " fmt
@@ -790,6 +794,13 @@ int pci_liveupdate_configure_ari(struct pci_dev *dev)
return 0;
}
+bool pci_liveupdate_is_outgoing(struct pci_dev *dev)
+{
+ guard(rwsem_read)(&pci_liveupdate.rwsem);
+ pci_WARN_ONCE(dev, !dev->liveupdate.frozen, "Preservation status is unstable!\n");
+ return dev->liveupdate.outgoing;
+}
+
/**
* pci_liveupdate_is_incoming() - Check if a device is incoming-preserved
* @dev: The PCI device to check
diff --git a/drivers/pci/liveupdate.h b/drivers/pci/liveupdate.h
index bcb0bc73d684..b266406aaac8 100644
--- a/drivers/pci/liveupdate.h
+++ b/drivers/pci/liveupdate.h
@@ -20,6 +20,7 @@ void pci_liveupdate_scan_bridge_end(struct pci_dev *dev, int pass);
void pci_liveupdate_init_acs(struct pci_dev *dev);
int pci_liveupdate_enable_acs(struct pci_dev *dev);
int pci_liveupdate_configure_ari(struct pci_dev *dev);
+bool pci_liveupdate_is_outgoing(struct pci_dev *dev);
#else
static inline void pci_liveupdate_setup_device(struct pci_dev *dev)
{
@@ -57,6 +58,11 @@ static inline int pci_liveupdate_configure_ari(struct pci_dev *dev)
{
return -EINVAL;
}
+
+static inline bool pci_liveupdate_is_outgoing(struct pci_dev *dev)
+{
+ return false;
+}
#endif
#endif /* DRIVERS_PCI_LIVEUPDATE_H */
diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index f7a5e65a7c75..0b1f8d01d7a5 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -546,11 +546,14 @@ static void pci_device_shutdown(struct device *dev)
/*
* If this is a kexec reboot, turn off Bus Master bit on the
* device to tell it to not continue to do DMA. Don't touch
- * devices in D3cold or unknown states.
+ * devices being preserved for Live Update or in D3cold or
+ * unknown states.
+ *
* If it is not a kexec reboot, firmware will hit the PCI
* devices with big hammer and stop their DMA any way.
*/
- if (kexec_in_progress && (pci_dev->current_state <= PCI_D3hot))
+ if (kexec_in_progress && !pci_liveupdate_is_outgoing(pci_dev) &&
+ pci_dev->current_state <= PCI_D3hot)
pci_clear_master(pci_dev);
}
--
2.54.0.746.g67dd491aae-goog
^ permalink raw reply related
* [PATCH v6 10/12] PCI: liveupdate: Freeze preservation status during shutdown
From: David Matlack @ 2026-05-22 20:24 UTC (permalink / raw)
To: kexec, linux-doc, linux-kernel, linux-mm, linux-pci
Cc: Adithya Jayachandran, Alexander Graf, Alex Williamson,
Bjorn Helgaas, Chris Li, David Matlack, David Rientjes, Jacob Pan,
Jason Gunthorpe, Jonathan Corbet, Josh Hilke, Leon Romanovsky,
Lukas Wunner, Mike Rapoport, Parav Pandit, Pasha Tatashin,
Pranjal Shrivastava, Pratyush Yadav, Saeed Mahameed,
Samiullah Khawaja, Shuah Khan, Vipin Sharma, William Tu, Yi Liu
In-Reply-To: <20260522202410.3104264-1-dmatlack@google.com>
Freeze a device's outgoing preservation status (preserved or not
preserved) during shutdown. This enables the PCI core and drivers to
safely make decisions based on the device's preservation status during
shutdown.
Note that pci_liveupdate_freeze() is triggered by the PCI core rather
than from drivers participating in Live Update so that all devices can
have their status frozen (i.e. prevent non-preserved devices from
getting preserved late).
Signed-off-by: David Matlack <dmatlack@google.com>
---
drivers/pci/liveupdate.c | 16 ++++++++++++++++
drivers/pci/liveupdate.h | 5 +++++
drivers/pci/pci-driver.c | 2 ++
include/linux/pci_liveupdate.h | 3 +++
4 files changed, 26 insertions(+)
diff --git a/drivers/pci/liveupdate.c b/drivers/pci/liveupdate.c
index 701276ef6cfb..d404e64a4e55 100644
--- a/drivers/pci/liveupdate.c
+++ b/drivers/pci/liveupdate.c
@@ -297,6 +297,11 @@ static int pci_liveupdate_unpreserve_device(struct pci_ser *ser, struct pci_dev
{
struct pci_dev_ser *dev_ser = dev->liveupdate.outgoing;
+ if (dev->liveupdate.frozen) {
+ pci_warn(dev, "Cannot unpreserve device after it is frozen!\n");
+ return -EINVAL;
+ }
+
if (!dev_ser) {
pci_warn(dev, "Cannot unpreserve device that is not preserved\n");
return -EINVAL;
@@ -380,6 +385,11 @@ static int __pci_liveupdate_preserve_device(struct pci_ser *ser, struct pci_dev
static int pci_liveupdate_preserve_device(struct pci_ser *ser, struct pci_dev *dev)
{
+ if (dev->liveupdate.frozen) {
+ pci_warn(dev, "Cannot preserve device after it is frozen!\n");
+ return -EINVAL;
+ }
+
if (dev->liveupdate.outgoing)
return pci_liveupdate_preserve_device_again(dev);
@@ -653,6 +663,12 @@ void pci_liveupdate_cleanup_device(struct pci_dev *dev)
}
}
+void pci_liveupdate_freeze(struct pci_dev *dev)
+{
+ guard(rwsem_write)(&pci_liveupdate.rwsem);
+ dev->liveupdate.frozen = 1;
+}
+
static int pci_liveupdate_finish_device(struct pci_ser *ser, struct pci_dev *dev)
{
if (!dev->liveupdate.incoming) {
diff --git a/drivers/pci/liveupdate.h b/drivers/pci/liveupdate.h
index 6f21ec50927b..bcb0bc73d684 100644
--- a/drivers/pci/liveupdate.h
+++ b/drivers/pci/liveupdate.h
@@ -13,6 +13,7 @@
#ifdef CONFIG_PCI_LIVEUPDATE
void pci_liveupdate_setup_device(struct pci_dev *dev);
void pci_liveupdate_cleanup_device(struct pci_dev *dev);
+void pci_liveupdate_freeze(struct pci_dev *dev);
bool pci_liveupdate_scan_bridge_begin(struct pci_bus *bus, struct pci_dev *dev,
int pass);
void pci_liveupdate_scan_bridge_end(struct pci_dev *dev, int pass);
@@ -28,6 +29,10 @@ static inline void pci_liveupdate_cleanup_device(struct pci_dev *dev)
{
}
+static inline void pci_liveupdate_freeze(struct pci_dev *dev)
+{
+}
+
static inline bool pci_liveupdate_scan_bridge_begin(struct pci_bus *bus,
struct pci_dev *dev,
int pass)
diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index d10ece0889f0..f7a5e65a7c75 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -21,6 +21,7 @@
#include <linux/acpi.h>
#include <linux/dma-map-ops.h>
#include <linux/iommu.h>
+#include "liveupdate.h"
#include "pci.h"
#include "pcie/portdrv.h"
@@ -536,6 +537,7 @@ static void pci_device_shutdown(struct device *dev)
struct pci_dev *pci_dev = to_pci_dev(dev);
struct pci_driver *drv = pci_dev->driver;
+ pci_liveupdate_freeze(pci_dev);
pm_runtime_resume(dev);
if (drv && drv->shutdown)
diff --git a/include/linux/pci_liveupdate.h b/include/linux/pci_liveupdate.h
index 2446c6d237ca..150993405754 100644
--- a/include/linux/pci_liveupdate.h
+++ b/include/linux/pci_liveupdate.h
@@ -24,6 +24,8 @@
* @was_preserved: True if this struct pci_dev was preserved by the previous
* kernel. Unlike @incoming, this field is not cleared after
* the device is finished participating in Live Update.
+ * @frozen: True if the outgoing preservation status of this device is frozen
+ * and thus cannot be changed.
*/
struct pci_liveupdate {
struct pci_dev_ser *outgoing;
@@ -31,6 +33,7 @@ struct pci_liveupdate {
u16 acs_ctrl;
bool inherit_buses;
bool was_preserved;
+ bool frozen;
};
struct pci_dev;
--
2.54.0.746.g67dd491aae-goog
^ permalink raw reply related
* [PATCH v6 09/12] PCI: liveupdate: Inherit ARI Forwarding Enable on preserved bridges
From: David Matlack @ 2026-05-22 20:24 UTC (permalink / raw)
To: kexec, linux-doc, linux-kernel, linux-mm, linux-pci
Cc: Adithya Jayachandran, Alexander Graf, Alex Williamson,
Bjorn Helgaas, Chris Li, David Matlack, David Rientjes, Jacob Pan,
Jason Gunthorpe, Jonathan Corbet, Josh Hilke, Leon Romanovsky,
Lukas Wunner, Mike Rapoport, Parav Pandit, Pasha Tatashin,
Pranjal Shrivastava, Pratyush Yadav, Saeed Mahameed,
Samiullah Khawaja, Shuah Khan, Vipin Sharma, William Tu, Yi Liu
In-Reply-To: <20260522202410.3104264-1-dmatlack@google.com>
Inherit the ARI Forwarding Enable bit on preserved bridges and update
pci_dev->ari_enabled accordingly during a Live Update. This ensures that
the preserved devices on the bridge's secondary bus can be identified
with the same expanded 8-bit function number after a Live Update.
Signed-off-by: David Matlack <dmatlack@google.com>
---
drivers/pci/liveupdate.c | 18 ++++++++++++++++++
drivers/pci/liveupdate.h | 6 ++++++
drivers/pci/pci.c | 8 +++++++-
3 files changed, 31 insertions(+), 1 deletion(-)
diff --git a/drivers/pci/liveupdate.c b/drivers/pci/liveupdate.c
index a93b7ef065f2..701276ef6cfb 100644
--- a/drivers/pci/liveupdate.c
+++ b/drivers/pci/liveupdate.c
@@ -128,6 +128,10 @@
* way after Live Update and ensures that IOMMU groups do not change. Note
* that a device will use its inherited ACS flags for the lifetime of its
* struct pci_dev (i.e. even after pci_liveupdate_finish()).
+ *
+ * * The PCI core inherits ARI Forwarding Enable on all bridges with downstream
+ * preserved devices to ensure that all preserved devices on the bridge's
+ * secondary bus are addressable after the Live Update.
*/
#define pr_fmt(fmt) "PCI: liveupdate: " fmt
@@ -756,6 +760,20 @@ int pci_liveupdate_enable_acs(struct pci_dev *dev)
return 0;
}
+int pci_liveupdate_configure_ari(struct pci_dev *dev)
+{
+ u16 val;
+
+ guard(rwsem_read)(&pci_liveupdate.rwsem);
+
+ if (!dev->liveupdate.incoming)
+ return -EINVAL;
+
+ pcie_capability_read_word(dev, PCI_EXP_DEVCTL2, &val);
+ dev->ari_enabled = !!(val & PCI_EXP_DEVCTL2_ARI);
+ return 0;
+}
+
/**
* pci_liveupdate_is_incoming() - Check if a device is incoming-preserved
* @dev: The PCI device to check
diff --git a/drivers/pci/liveupdate.h b/drivers/pci/liveupdate.h
index 4e8a01bcb4bb..6f21ec50927b 100644
--- a/drivers/pci/liveupdate.h
+++ b/drivers/pci/liveupdate.h
@@ -18,6 +18,7 @@ bool pci_liveupdate_scan_bridge_begin(struct pci_bus *bus, struct pci_dev *dev,
void pci_liveupdate_scan_bridge_end(struct pci_dev *dev, int pass);
void pci_liveupdate_init_acs(struct pci_dev *dev);
int pci_liveupdate_enable_acs(struct pci_dev *dev);
+int pci_liveupdate_configure_ari(struct pci_dev *dev);
#else
static inline void pci_liveupdate_setup_device(struct pci_dev *dev)
{
@@ -46,6 +47,11 @@ static inline int pci_liveupdate_enable_acs(struct pci_dev *dev)
{
return -EINVAL;
}
+
+static inline int pci_liveupdate_configure_ari(struct pci_dev *dev)
+{
+ return -EINVAL;
+}
#endif
#endif /* DRIVERS_PCI_LIVEUPDATE_H */
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 211df4618164..271da55af270 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -3495,7 +3495,7 @@ void pci_configure_ari(struct pci_dev *dev)
u32 cap;
struct pci_dev *bridge;
- if (pcie_ari_disabled || !pci_is_pcie(dev) || dev->devfn)
+ if (!pci_is_pcie(dev) || dev->devfn)
return;
bridge = dev->bus->self;
@@ -3506,6 +3506,12 @@ void pci_configure_ari(struct pci_dev *dev)
if (!(cap & PCI_EXP_DEVCAP2_ARI))
return;
+ if (!pci_liveupdate_configure_ari(bridge))
+ return;
+
+ if (pcie_ari_disabled)
+ return;
+
if (pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ARI)) {
pcie_capability_set_word(bridge, PCI_EXP_DEVCTL2,
PCI_EXP_DEVCTL2_ARI);
--
2.54.0.746.g67dd491aae-goog
^ permalink raw reply related
* [PATCH v6 08/12] PCI: liveupdate: Inherit ACS flags in incoming preserved devices
From: David Matlack @ 2026-05-22 20:24 UTC (permalink / raw)
To: kexec, linux-doc, linux-kernel, linux-mm, linux-pci
Cc: Adithya Jayachandran, Alexander Graf, Alex Williamson,
Bjorn Helgaas, Chris Li, David Matlack, David Rientjes, Jacob Pan,
Jason Gunthorpe, Jonathan Corbet, Josh Hilke, Leon Romanovsky,
Lukas Wunner, Mike Rapoport, Parav Pandit, Pasha Tatashin,
Pranjal Shrivastava, Pratyush Yadav, Saeed Mahameed,
Samiullah Khawaja, Shuah Khan, Vipin Sharma, William Tu, Yi Liu
In-Reply-To: <20260522202410.3104264-1-dmatlack@google.com>
Inherit Access Control Services (ACS) flags on all incoming preserved
devices (endpoints and upstream bridges) during a Live Update.
Inheriting ACS flags avoids changing routing rules while memory
transactions are in flight from preserved devices. This is also strictly
necessary to ensure that IOMMU group assignments do not change across
a Live Update for preserved devices, as changing ACS configurations can
split or merge IOMMU groups.
Cache the inherited ACS controls established by the previous kernel in
struct pci_dev so that ACS controls do not change after a reset
(pci_restore_state() calls pci_enable_acs()).
To simplify ACS inheritance, reject preserving any devices that require
quirks to enable ACS as those quirks would also have to take Live Update
into account.
Signed-off-by: David Matlack <dmatlack@google.com>
---
drivers/pci/liveupdate.c | 68 ++++++++++++++++++++++++++++++++++
drivers/pci/liveupdate.h | 11 ++++++
drivers/pci/pci.c | 5 +++
drivers/pci/pci.h | 5 +++
drivers/pci/quirks.c | 7 ++++
include/linux/pci_liveupdate.h | 6 +++
6 files changed, 102 insertions(+)
diff --git a/drivers/pci/liveupdate.c b/drivers/pci/liveupdate.c
index 4c79e19b7f98..a93b7ef065f2 100644
--- a/drivers/pci/liveupdate.c
+++ b/drivers/pci/liveupdate.c
@@ -71,6 +71,9 @@
*
* * The device cannot be a Virtual Function (VF).
*
+ * * The device cannot require device-specific quirks to enable Access
+ * Control Services (ACS).
+ *
* Driver Binding
* ==============
*
@@ -113,6 +116,18 @@
* This enables the PCI core and any drivers bound to the bridge to participate
* in the Live Update so that preserved endpoints can continue issuing memory
* transactions during the Live Update.
+ *
+ * Handling Preserved Devices
+ * ==========================
+ *
+ * The PCI core treats preserved devices differently than non-preserved devices.
+ * This section enumerates those differences.
+ *
+ * * The PCI core inherits all ACS flags enabled on incoming preserved devices
+ * rather than assigning new ones. This ensures that TLPs are routed the same
+ * way after Live Update and ensures that IOMMU groups do not change. Note
+ * that a device will use its inherited ACS flags for the lifetime of its
+ * struct pci_dev (i.e. even after pci_liveupdate_finish()).
*/
#define pr_fmt(fmt) "PCI: liveupdate: " fmt
@@ -126,6 +141,7 @@
#include <linux/pci.h>
#include "liveupdate.h"
+#include "pci.h"
/**
* struct pci_liveupdate_global - Global state for PCI Live Update support
@@ -319,6 +335,16 @@ static int __pci_liveupdate_preserve_device(struct pci_ser *ser, struct pci_dev
{
int i;
+ /*
+ * Do not preserve devices that rely on device-specific ACS equivalents
+ * (for now) since that would complicate keeping ACS constant across
+ * Live Update.
+ */
+ if (pci_need_dev_specific_enable_acs(dev)) {
+ pci_warn(dev, "Refusing to preserve device that relies on ACS quirks\n");
+ return -EINVAL;
+ }
+
if (ser->nr_devices == ser->max_nr_devices)
return -ENOSPC;
@@ -598,6 +624,7 @@ void pci_liveupdate_setup_device(struct pci_dev *dev)
pci_info(dev, "Device was preserved by previous kernel across Live Update\n");
dev->liveupdate.incoming = dev_ser;
+ dev->liveupdate.was_preserved = true;
/*
* Hold the ref on the incoming FLB until pci_liveupdate_finish() so
@@ -688,6 +715,47 @@ void pci_liveupdate_finish(struct pci_dev *dev)
}
EXPORT_SYMBOL_GPL(pci_liveupdate_finish);
+void pci_liveupdate_init_acs(struct pci_dev *dev)
+{
+ guard(rwsem_read)(&pci_liveupdate.rwsem);
+
+ if (!dev->acs_cap || !dev->liveupdate.incoming)
+ return;
+
+ pci_read_config_word(dev, dev->acs_cap + PCI_ACS_CTRL, &dev->liveupdate.acs_ctrl);
+}
+
+int pci_liveupdate_enable_acs(struct pci_dev *dev)
+{
+ u16 acs_ctrl = dev->liveupdate.acs_ctrl;
+ u16 acs_cap = dev->acs_cap;
+
+ /*
+ * Use liveupdate.was_preserved instead of liveupdate.incoming since the
+ * device's ACS controls should not change even after the device is
+ * finished participating in the Live Update.
+ */
+ if (!dev->liveupdate.was_preserved)
+ return -EINVAL;
+
+ /*
+ * The previous kernel should not have preserved any devices that
+ * require device-specific quirks to enable ACS, but if such a device is
+ * detected, log a big warning and fall back to the normal enable ACS
+ * path.
+ */
+ if (pci_need_dev_specific_enable_acs(dev)) {
+ pci_warn(dev, "Device-specific quirk required to enable ACS!\n");
+ WARN_ON_ONCE(true);
+ return -EINVAL;
+ }
+
+ if (acs_cap)
+ pci_write_config_word(dev, acs_cap + PCI_ACS_CTRL, acs_ctrl);
+
+ return 0;
+}
+
/**
* pci_liveupdate_is_incoming() - Check if a device is incoming-preserved
* @dev: The PCI device to check
diff --git a/drivers/pci/liveupdate.h b/drivers/pci/liveupdate.h
index c763255a8de4..4e8a01bcb4bb 100644
--- a/drivers/pci/liveupdate.h
+++ b/drivers/pci/liveupdate.h
@@ -16,6 +16,8 @@ void pci_liveupdate_cleanup_device(struct pci_dev *dev);
bool pci_liveupdate_scan_bridge_begin(struct pci_bus *bus, struct pci_dev *dev,
int pass);
void pci_liveupdate_scan_bridge_end(struct pci_dev *dev, int pass);
+void pci_liveupdate_init_acs(struct pci_dev *dev);
+int pci_liveupdate_enable_acs(struct pci_dev *dev);
#else
static inline void pci_liveupdate_setup_device(struct pci_dev *dev)
{
@@ -35,6 +37,15 @@ static inline bool pci_liveupdate_scan_bridge_begin(struct pci_bus *bus,
static inline void pci_liveupdate_scan_bridge_end(struct pci_dev *dev, int pass)
{
}
+
+static inline void pci_liveupdate_init_acs(struct pci_dev *dev)
+{
+}
+
+static inline int pci_liveupdate_enable_acs(struct pci_dev *dev)
+{
+ return -EINVAL;
+}
#endif
#endif /* DRIVERS_PCI_LIVEUPDATE_H */
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 8f7cfcc00090..211df4618164 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -33,6 +33,7 @@
#include <asm/dma.h>
#include <linux/aer.h>
#include <linux/bitfield.h>
+#include "liveupdate.h"
#include "pci.h"
DEFINE_MUTEX(pci_slot_mutex);
@@ -1017,6 +1018,9 @@ void pci_enable_acs(struct pci_dev *dev)
bool enable_acs = false;
int pos;
+ if (!pci_liveupdate_enable_acs(dev))
+ return;
+
/* If an iommu is present we start with kernel default caps */
if (pci_acs_enable) {
if (pci_dev_specific_enable_acs(dev))
@@ -3657,6 +3661,7 @@ void pci_acs_init(struct pci_dev *dev)
pci_read_config_word(dev, pos + PCI_ACS_CAP, &dev->acs_capabilities);
pci_disable_broken_acs_cap(dev);
+ pci_liveupdate_init_acs(dev);
}
/**
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 4a14f88e543a..b55f3deddd57 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -1062,6 +1062,7 @@ void pci_acs_init(struct pci_dev *dev);
void pci_enable_acs(struct pci_dev *dev);
#ifdef CONFIG_PCI_QUIRKS
int pci_dev_specific_acs_enabled(struct pci_dev *dev, u16 acs_flags);
+bool pci_need_dev_specific_enable_acs(struct pci_dev *dev);
int pci_dev_specific_enable_acs(struct pci_dev *dev);
int pci_dev_specific_disable_acs_redir(struct pci_dev *dev);
void pci_disable_broken_acs_cap(struct pci_dev *pdev);
@@ -1072,6 +1073,10 @@ static inline int pci_dev_specific_acs_enabled(struct pci_dev *dev,
{
return -ENOTTY;
}
+static inline bool pci_need_dev_specific_enable_acs(struct pci_dev *dev)
+{
+ return false;
+}
static inline int pci_dev_specific_enable_acs(struct pci_dev *dev)
{
return -ENOTTY;
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index 171caec2bc47..59b0b19c3783 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -5482,6 +5482,13 @@ static const struct pci_dev_acs_ops *pci_dev_acs_ops_get(struct pci_dev *dev)
return NULL;
}
+bool pci_need_dev_specific_enable_acs(struct pci_dev *dev)
+{
+ const struct pci_dev_acs_ops *p = pci_dev_acs_ops_get(dev);
+
+ return p && p->enable_acs;
+}
+
int pci_dev_specific_enable_acs(struct pci_dev *dev)
{
const struct pci_dev_acs_ops *p = pci_dev_acs_ops_get(dev);
diff --git a/include/linux/pci_liveupdate.h b/include/linux/pci_liveupdate.h
index 2be98819e313..2446c6d237ca 100644
--- a/include/linux/pci_liveupdate.h
+++ b/include/linux/pci_liveupdate.h
@@ -17,14 +17,20 @@
* struct pci_liveupdate - PCI Live Update state for a struct pci_dev
* @outgoing: State preserved for the next kernel.
* @incoming: State preserved by the previous kernel.
+ * @acs_ctrl: ACS features established by the previous kernel.
* @inherit_buses: True if the PCI core should inherit the secondary and
* subordinate bus numbers assigned to this device due to
* an ongoing Live Update.
+ * @was_preserved: True if this struct pci_dev was preserved by the previous
+ * kernel. Unlike @incoming, this field is not cleared after
+ * the device is finished participating in Live Update.
*/
struct pci_liveupdate {
struct pci_dev_ser *outgoing;
struct pci_dev_ser *incoming;
+ u16 acs_ctrl;
bool inherit_buses;
+ bool was_preserved;
};
struct pci_dev;
--
2.54.0.746.g67dd491aae-goog
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox