Linux Documentation

Linux Documentation
 help / color / mirror / Atom feed

* Re: [PATCH v6 02/43] KVM: Rename KVM_GENERIC_MEMORY_ATTRIBUTES to KVM_VM_MEMORY_ATTRIBUTES
From: Fuad Tabba @ 2026-05-20 12:08 UTC (permalink / raw)
  To: ackerleytng
  Cc: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	ira.weiny, jmattson, jthoughton, michael.roth, oupton,
	pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
	steven.price, willy, wyihan, yan.y.zhao, forkloop, pratyush,
	suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
	Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
	Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
	Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka,
	kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <20260507-gmem-inplace-conversion-v6-2-91ab5a8b19a4@google.com>

On Thu, 7 May 2026 at 21:22, Ackerley Tng via B4 Relay
<devnull+ackerleytng.google.com@kernel.org> wrote:
>
> From: Sean Christopherson <seanjc@google.com>
>
> Rename the per-VM memory attributes Kconfig to make it explicitly about
> per-VM attributes in anticipation of adding memory attributes support to
> guest_memfd, at which point it will be possible (and desirable) to have
> memory attributes without the per-VM support, even in x86.
>
> No functional change intended.
>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>

Reviewed-by: Fuad Tabba <tabba@google.com>

Cheers,
/fuad


> ---
>  arch/x86/include/asm/kvm_host.h |  2 +-
>  arch/x86/kvm/Kconfig            |  6 +++---
>  arch/x86/kvm/mmu/mmu.c          |  2 +-
>  arch/x86/kvm/x86.c              |  2 +-
>  include/linux/kvm_host.h        |  8 ++++----
>  include/trace/events/kvm.h      |  4 ++--
>  virt/kvm/Kconfig                |  2 +-
>  virt/kvm/kvm_main.c             | 14 +++++++-------
>  8 files changed, 20 insertions(+), 20 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index c470e40a00aa4..60b997764beef 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -2369,7 +2369,7 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
>                        int tdp_max_root_level, int tdp_huge_page_level);
>
>
> -#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
> +#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
>  #define kvm_arch_has_private_mem(kvm) ((kvm)->arch.has_private_mem)
>  #endif
>
> diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
> index 801bf9e520db3..26f6afd51bbdc 100644
> --- a/arch/x86/kvm/Kconfig
> +++ b/arch/x86/kvm/Kconfig
> @@ -84,7 +84,7 @@ config KVM_SW_PROTECTED_VM
>         bool "Enable support for KVM software-protected VMs"
>         depends on EXPERT
>         depends on KVM_X86 && X86_64
> -       select KVM_GENERIC_MEMORY_ATTRIBUTES
> +       select KVM_VM_MEMORY_ATTRIBUTES
>         help
>           Enable support for KVM software-protected VMs.  Currently, software-
>           protected VMs are purely a development and testing vehicle for
> @@ -135,7 +135,7 @@ config KVM_INTEL_TDX
>         bool "Intel Trust Domain Extensions (TDX) support"
>         default y
>         depends on INTEL_TDX_HOST
> -       select KVM_GENERIC_MEMORY_ATTRIBUTES
> +       select KVM_VM_MEMORY_ATTRIBUTES
>         select HAVE_KVM_ARCH_GMEM_POPULATE
>         help
>           Provides support for launching Intel Trust Domain Extensions (TDX)
> @@ -159,7 +159,7 @@ config KVM_AMD_SEV
>         depends on KVM_AMD && X86_64
>         depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m)
>         select ARCH_HAS_CC_PLATFORM
> -       select KVM_GENERIC_MEMORY_ATTRIBUTES
> +       select KVM_VM_MEMORY_ATTRIBUTES
>         select HAVE_KVM_ARCH_GMEM_PREPARE
>         select HAVE_KVM_ARCH_GMEM_INVALIDATE
>         select HAVE_KVM_ARCH_GMEM_POPULATE
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 892246204435c..a80a876ab4ad6 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -7899,7 +7899,7 @@ void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
>                 vhost_task_stop(kvm->arch.nx_huge_page_recovery_thread);
>  }
>
> -#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
> +#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
>  static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
>                                 int level)
>  {
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 0a1b63c63d1a9..1560de1e95be0 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -13625,7 +13625,7 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm,
>                 }
>         }
>
> -#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
> +#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
>         kvm_mmu_init_memslot_memory_attributes(kvm, slot);
>  #endif
>
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 4c14aee1fb063..7b9faa3545300 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -722,7 +722,7 @@ static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu)
>  }
>  #endif
>
> -#ifndef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
> +#ifndef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
>  static inline bool kvm_arch_has_private_mem(struct kvm *kvm)
>  {
>         return false;
> @@ -871,7 +871,7 @@ struct kvm {
>  #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
>         struct notifier_block pm_notifier;
>  #endif
> -#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
> +#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
>         /* Protected by slots_lock (for writes) and RCU (for reads) */
>         struct xarray mem_attr_array;
>  #endif
> @@ -2528,7 +2528,7 @@ static inline bool kvm_memslot_is_gmem_only(const struct kvm_memory_slot *slot)
>         return slot->flags & KVM_MEMSLOT_GMEM_ONLY;
>  }
>
> -#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
> +#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
>  static inline unsigned long kvm_get_memory_attributes(struct kvm *kvm, gfn_t gfn)
>  {
>         return xa_to_value(xa_load(&kvm->mem_attr_array, gfn));
> @@ -2550,7 +2550,7 @@ static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
>  {
>         return false;
>  }
> -#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
> +#endif /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
>
>  #ifdef CONFIG_KVM_GUEST_MEMFD
>  int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
> diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
> index b282e3a867696..1ba72bd73ea2f 100644
> --- a/include/trace/events/kvm.h
> +++ b/include/trace/events/kvm.h
> @@ -358,7 +358,7 @@ TRACE_EVENT(kvm_dirty_ring_exit,
>         TP_printk("vcpu %d", __entry->vcpu_id)
>  );
>
> -#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
> +#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
>  /*
>   * @start:     Starting address of guest memory range
>   * @end:       End address of guest memory range
> @@ -383,7 +383,7 @@ TRACE_EVENT(kvm_vm_set_mem_attributes,
>         TP_printk("%#016llx -- %#016llx [0x%lx]",
>                   __entry->start, __entry->end, __entry->attr)
>  );
> -#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
> +#endif /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
>
>  TRACE_EVENT(kvm_unmap_hva_range,
>         TP_PROTO(unsigned long start, unsigned long end),
> diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
> index 794976b88c6f9..5119cb37145fc 100644
> --- a/virt/kvm/Kconfig
> +++ b/virt/kvm/Kconfig
> @@ -100,7 +100,7 @@ config KVM_ELIDE_TLB_FLUSH_IF_YOUNG
>  config KVM_MMU_LOCKLESS_AGING
>         bool
>
> -config KVM_GENERIC_MEMORY_ATTRIBUTES
> +config KVM_VM_MEMORY_ATTRIBUTES
>         bool
>
>  config KVM_GUEST_MEMFD
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 89489996fbc1e..306153abbafa5 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -1115,7 +1115,7 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
>         spin_lock_init(&kvm->mn_invalidate_lock);
>         rcuwait_init(&kvm->mn_memslots_update_rcuwait);
>         xa_init(&kvm->vcpu_array);
> -#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
> +#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
>         xa_init(&kvm->mem_attr_array);
>  #endif
>
> @@ -1300,7 +1300,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
>         cleanup_srcu_struct(&kvm->irq_srcu);
>         srcu_barrier(&kvm->srcu);
>         cleanup_srcu_struct(&kvm->srcu);
> -#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
> +#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
>         xa_destroy(&kvm->mem_attr_array);
>  #endif
>         kvm_arch_free_vm(kvm);
> @@ -2418,7 +2418,7 @@ static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
>  }
>  #endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
>
> -#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
> +#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
>  static u64 kvm_supported_mem_attributes(struct kvm *kvm)
>  {
>         if (!kvm || kvm_arch_has_private_mem(kvm))
> @@ -2623,7 +2623,7 @@ static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
>
>         return kvm_vm_set_mem_attributes(kvm, start, end, attrs->attributes);
>  }
> -#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
> +#endif /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
>
>  struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
>  {
> @@ -4921,7 +4921,7 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
>         case KVM_CAP_SYSTEM_EVENT_DATA:
>         case KVM_CAP_DEVICE_CTRL:
>                 return 1;
> -#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
> +#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
>         case KVM_CAP_MEMORY_ATTRIBUTES:
>                 return kvm_supported_mem_attributes(kvm);
>  #endif
> @@ -5325,7 +5325,7 @@ static long kvm_vm_ioctl(struct file *filp,
>                 break;
>         }
>  #endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
> -#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
> +#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
>         case KVM_SET_MEMORY_ATTRIBUTES: {
>                 struct kvm_memory_attributes attrs;
>
> @@ -5336,7 +5336,7 @@ static long kvm_vm_ioctl(struct file *filp,
>                 r = kvm_vm_ioctl_set_mem_attributes(kvm, &attrs);
>                 break;
>         }
> -#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
> +#endif /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
>         case KVM_CREATE_DEVICE: {
>                 struct kvm_create_device cd;
>
>
> --
> 2.54.0.563.g4f69b47b94-goog
>
>

^ permalink raw reply

* Re: [PATCH v6 03/43] KVM: Enumerate support for PRIVATE memory iff kvm_arch_has_private_mem is defined
From: Fuad Tabba @ 2026-05-20 12:08 UTC (permalink / raw)
  To: ackerleytng
  Cc: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	ira.weiny, jmattson, jthoughton, michael.roth, oupton,
	pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
	steven.price, willy, wyihan, yan.y.zhao, forkloop, pratyush,
	suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
	Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
	Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
	Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka,
	kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <20260507-gmem-inplace-conversion-v6-3-91ab5a8b19a4@google.com>

On Thu, 7 May 2026 at 21:22, Ackerley Tng via B4 Relay
<devnull+ackerleytng.google.com@kernel.org> wrote:
>
> From: Sean Christopherson <seanjc@google.com>
>
> Explicitly guard reporting support for KVM_MEMORY_ATTRIBUTE_PRIVATE based
> on kvm_arch_has_private_mem being #defined in anticipation of decoupling
> kvm_supported_mem_attributes() from CONFIG_KVM_VM_MEMORY_ATTRIBUTES.
> guest_memfd support for memory attributes will be unconditional to avoid
> yet more macros (all architectures that support guest_memfd are expected to
> use per-gmem attributes at some point), at which point enumerating support
> KVM_MEMORY_ATTRIBUTE_PRIVATE based solely on memory attributes being
> supported _somewhere_ would result in KVM over-reporting support on arm64.
>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>

Reviewed-by: Fuad Tabba <tabba@google.com>

Cheers,
/fuad

> ---
>  include/linux/kvm_host.h | 2 +-
>  virt/kvm/kvm_main.c      | 2 ++
>  2 files changed, 3 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 7b9faa3545300..7d079f9701346 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -722,7 +722,7 @@ static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu)
>  }
>  #endif
>
> -#ifndef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
> +#ifndef kvm_arch_has_private_mem
>  static inline bool kvm_arch_has_private_mem(struct kvm *kvm)
>  {
>         return false;
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 306153abbafa5..abb9cfa3eb04d 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -2421,8 +2421,10 @@ static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
>  #ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
>  static u64 kvm_supported_mem_attributes(struct kvm *kvm)
>  {
> +#ifdef kvm_arch_has_private_mem
>         if (!kvm || kvm_arch_has_private_mem(kvm))
>                 return KVM_MEMORY_ATTRIBUTE_PRIVATE;
> +#endif
>
>         return 0;
>  }
>
> --
> 2.54.0.563.g4f69b47b94-goog
>
>

^ permalink raw reply

* Re: [PATCH v6 04/43] KVM: Stub in ability to disable per-VM memory attribute tracking
From: Fuad Tabba @ 2026-05-20 12:08 UTC (permalink / raw)
  To: ackerleytng
  Cc: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	ira.weiny, jmattson, jthoughton, michael.roth, oupton,
	pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
	steven.price, willy, wyihan, yan.y.zhao, forkloop, pratyush,
	suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
	Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
	Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
	Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka,
	kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <20260507-gmem-inplace-conversion-v6-4-91ab5a8b19a4@google.com>

On Thu, 7 May 2026 at 21:22, Ackerley Tng via B4 Relay
<devnull+ackerleytng.google.com@kernel.org> wrote:
>
> From: Sean Christopherson <seanjc@google.com>
>
> Introduce the basic infrastructure to allow per-VM memory attribute
> tracking to be disabled. This will be built-upon in a later patch, where a
> module param can disable per-VM memory attribute tracking.
>
> Split the Kconfig option into a base KVM_MEMORY_ATTRIBUTES and the
> existing KVM_VM_MEMORY_ATTRIBUTES. The base option provides the core
> plumbing, while the latter enables the full per-VM tracking via an xarray
> and the associated ioctls.
>
> kvm_get_memory_attributes() now performs a static call that either looks up
> kvm->mem_attr_array with CONFIG_KVM_VM_MEMORY_ATTRIBUTES is enabled, or
> just returns 0 otherwise. The static call can be patched depending on
> whether per-VM tracking is enabled by the CONFIG.
>
> No functional change intended.
>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>

Reviewed-by: Fuad Tabba <tabba@google.com>

Cheers,
/fuad

> ---
>  arch/x86/include/asm/kvm_host.h |  2 +-
>  include/linux/kvm_host.h        | 23 ++++++++++++---------
>  virt/kvm/Kconfig                |  4 ++++
>  virt/kvm/kvm_main.c             | 44 ++++++++++++++++++++++++++++++++++++++++-
>  4 files changed, 62 insertions(+), 11 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 60b997764beef..c9aa50bcdac2d 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -2369,7 +2369,7 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
>                        int tdp_max_root_level, int tdp_huge_page_level);
>
>
> -#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
> +#ifdef CONFIG_KVM_MEMORY_ATTRIBUTES
>  #define kvm_arch_has_private_mem(kvm) ((kvm)->arch.has_private_mem)
>  #endif
>
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 7d079f9701346..c5ba2cb34e45c 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -2528,19 +2528,15 @@ static inline bool kvm_memslot_is_gmem_only(const struct kvm_memory_slot *slot)
>         return slot->flags & KVM_MEMSLOT_GMEM_ONLY;
>  }
>
> -#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
> +#ifdef CONFIG_KVM_MEMORY_ATTRIBUTES
> +typedef unsigned long (kvm_get_memory_attributes_t)(struct kvm *kvm, gfn_t gfn);
> +DECLARE_STATIC_CALL(__kvm_get_memory_attributes, kvm_get_memory_attributes_t);
> +
>  static inline unsigned long kvm_get_memory_attributes(struct kvm *kvm, gfn_t gfn)
>  {
> -       return xa_to_value(xa_load(&kvm->mem_attr_array, gfn));
> +       return static_call(__kvm_get_memory_attributes)(kvm, gfn);
>  }
>
> -bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
> -                                    unsigned long mask, unsigned long attrs);
> -bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
> -                                       struct kvm_gfn_range *range);
> -bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
> -                                        struct kvm_gfn_range *range);
> -
>  static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
>  {
>         return kvm_get_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE;
> @@ -2550,6 +2546,15 @@ static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
>  {
>         return false;
>  }
> +#endif
> +
> +#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
> +bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
> +                                    unsigned long mask, unsigned long attrs);
> +bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
> +                                       struct kvm_gfn_range *range);
> +bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
> +                                        struct kvm_gfn_range *range);
>  #endif /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
>
>  #ifdef CONFIG_KVM_GUEST_MEMFD
> diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
> index 5119cb37145fc..3fea89c45cfb4 100644
> --- a/virt/kvm/Kconfig
> +++ b/virt/kvm/Kconfig
> @@ -100,7 +100,11 @@ config KVM_ELIDE_TLB_FLUSH_IF_YOUNG
>  config KVM_MMU_LOCKLESS_AGING
>         bool
>
> +config KVM_MEMORY_ATTRIBUTES
> +       bool
> +
>  config KVM_VM_MEMORY_ATTRIBUTES
> +       select KVM_MEMORY_ATTRIBUTES
>         bool
>
>  config KVM_GUEST_MEMFD
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index abb9cfa3eb04d..ee26f1d9b5fda 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -101,6 +101,17 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(halt_poll_ns_shrink);
>  static bool __ro_after_init allow_unsafe_mappings;
>  module_param(allow_unsafe_mappings, bool, 0444);
>
> +#ifdef CONFIG_KVM_MEMORY_ATTRIBUTES
> +#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
> +static bool vm_memory_attributes = true;
> +#else
> +#define vm_memory_attributes false
> +#endif
> +DEFINE_STATIC_CALL_RET0(__kvm_get_memory_attributes, kvm_get_memory_attributes_t);
> +EXPORT_SYMBOL_FOR_KVM_INTERNAL(STATIC_CALL_KEY(__kvm_get_memory_attributes));
> +EXPORT_SYMBOL_FOR_KVM_INTERNAL(STATIC_CALL_TRAMP(__kvm_get_memory_attributes));
> +#endif
> +
>  /*
>   * Ordering of locks:
>   *
> @@ -2418,7 +2429,7 @@ static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
>  }
>  #endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
>
> -#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
> +#ifdef CONFIG_KVM_MEMORY_ATTRIBUTES
>  static u64 kvm_supported_mem_attributes(struct kvm *kvm)
>  {
>  #ifdef kvm_arch_has_private_mem
> @@ -2429,6 +2440,12 @@ static u64 kvm_supported_mem_attributes(struct kvm *kvm)
>         return 0;
>  }
>
> +#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
> +static unsigned long kvm_get_vm_memory_attributes(struct kvm *kvm, gfn_t gfn)
> +{
> +       return xa_to_value(xa_load(&kvm->mem_attr_array, gfn));
> +}
> +
>  /*
>   * Returns true if _all_ gfns in the range [@start, @end) have attributes
>   * such that the bits in @mask match @attrs.
> @@ -2625,7 +2642,24 @@ static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
>
>         return kvm_vm_set_mem_attributes(kvm, start, end, attrs->attributes);
>  }
> +#else  /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
> +static unsigned long kvm_get_vm_memory_attributes(struct kvm *kvm, gfn_t gfn)
> +{
> +       BUILD_BUG_ON(1);
> +}
>  #endif /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
> +static void kvm_init_memory_attributes(void)
> +{
> +       if (vm_memory_attributes)
> +               static_call_update(__kvm_get_memory_attributes,
> +                                  kvm_get_vm_memory_attributes);
> +       else
> +               static_call_update(__kvm_get_memory_attributes,
> +                                  (void *)__static_call_return0);
> +}
> +#else  /* CONFIG_KVM_MEMORY_ATTRIBUTES */
> +static void kvm_init_memory_attributes(void) { }
> +#endif /* CONFIG_KVM_MEMORY_ATTRIBUTES */
>
>  struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
>  {
> @@ -4925,6 +4959,9 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
>                 return 1;
>  #ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
>         case KVM_CAP_MEMORY_ATTRIBUTES:
> +               if (!vm_memory_attributes)
> +                       return 0;
> +
>                 return kvm_supported_mem_attributes(kvm);
>  #endif
>  #ifdef CONFIG_KVM_GUEST_MEMFD
> @@ -5331,6 +5368,10 @@ static long kvm_vm_ioctl(struct file *filp,
>         case KVM_SET_MEMORY_ATTRIBUTES: {
>                 struct kvm_memory_attributes attrs;
>
> +               r = -ENOTTY;
> +               if (!vm_memory_attributes)
> +                       goto out;
> +
>                 r = -EFAULT;
>                 if (copy_from_user(&attrs, argp, sizeof(attrs)))
>                         goto out;
> @@ -6527,6 +6568,7 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
>         kvm_preempt_ops.sched_in = kvm_sched_in;
>         kvm_preempt_ops.sched_out = kvm_sched_out;
>
> +       kvm_init_memory_attributes();
>         kvm_init_debug();
>
>         r = kvm_vfio_ops_init();
>
> --
> 2.54.0.563.g4f69b47b94-goog
>
>

^ permalink raw reply

* Re: [PATCH v6 05/43] KVM: guest_memfd: Wire up kvm_get_memory_attributes() to per-gmem attributes
From: Fuad Tabba @ 2026-05-20 12:08 UTC (permalink / raw)
  To: ackerleytng
  Cc: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	ira.weiny, jmattson, jthoughton, michael.roth, oupton,
	pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
	steven.price, willy, wyihan, yan.y.zhao, forkloop, pratyush,
	suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
	Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
	Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
	Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka,
	kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <20260507-gmem-inplace-conversion-v6-5-91ab5a8b19a4@google.com>

On Thu, 7 May 2026 at 21:22, Ackerley Tng via B4 Relay
<devnull+ackerleytng.google.com@kernel.org> wrote:
>
> From: Sean Christopherson <seanjc@google.com>
>
> Implement kvm_gmem_get_memory_attributes() for guest_memfd to allow the KVM
> core and architecture code to query per-GFN memory attributes.
>
> kvm_gmem_get_memory_attributes() finds the memory slot for a given GFN and
> queries the guest_memfd file's to determine if the page is marked as
> private.
>
> If vm_memory_attributes is not enabled, there is no shared/private tracking
> at the VM level. Install the guest_memfd implementation as long as
> guest_memfd is enabled to give guest_memfd a chance to respond on
> attributes.
>
> guest_memfd should look up attributes regardless of whether this memslot is
> gmem-only since attributes are now tracked by gmem regardless of whether
> mmap() is enabled.
>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> Co-developed-by: Ackerley Tng <ackerleytng@google.com>
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>
> ---
>  include/linux/kvm_host.h |  2 ++
>  virt/kvm/guest_memfd.c   | 31 +++++++++++++++++++++++++++++++
>  virt/kvm/kvm_main.c      |  3 +++
>  3 files changed, 36 insertions(+)
>
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index c5ba2cb34e45c..28a54298d27db 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -2557,6 +2557,8 @@ bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
>                                          struct kvm_gfn_range *range);
>  #endif /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
>
> +unsigned long kvm_gmem_get_memory_attributes(struct kvm *kvm, gfn_t gfn);
> +
>  #ifdef CONFIG_KVM_GUEST_MEMFD
>  int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
>                      gfn_t gfn, kvm_pfn_t *pfn, struct page **page,
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index 5011d38820d0d..f055e058a3f28 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -509,6 +509,37 @@ static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
>         return 0;
>  }
>
> +unsigned long kvm_gmem_get_memory_attributes(struct kvm *kvm, gfn_t gfn)
> +{
> +       struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
> +       struct inode *inode;
> +
> +       /*
> +        * If this gfn has no associated memslot, there's no chance of the gfn
> +        * being backed by private memory, since guest_memfd must be used for
> +        * private memory, and guest_memfd must be associated with some memslot.
> +        */
> +       if (!slot)
> +               return 0;
> +
> +       CLASS(gmem_get_file, file)(slot);
> +       if (!file)
> +               return 0;
> +
> +       inode = file_inode(file);
> +
> +       /*
> +        * Rely on the maple tree's internal RCU lock to ensure a
> +        * stable result. This result can become stale as soon as the
> +        * lock is dropped, so the caller _must_ still protect
> +        * consumption of private vs. shared by checking
> +        * mmu_invalidate_retry_gfn() under mmu_lock to serialize
> +        * against ongoing attribute updates.
> +        */
> +       return kvm_gmem_get_attributes(inode, kvm_gmem_get_index(slot, gfn));
> +}

Doesn't this imply that all consumers of kvm_mem_is_private() should
validate the result using mmu_lock and the invalidation sequence?
sev_handle_rmp_fault() calls kvm_mem_is_private() without holding
mmu_lock and without any retry mechanism. Is that a problem?

Cheers,
/fuad


> +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_memory_attributes);
> +
>  static struct file_operations kvm_gmem_fops = {
>         .mmap           = kvm_gmem_mmap,
>         .open           = generic_file_open,
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index ee26f1d9b5fda..4139e903f756a 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -2653,6 +2653,9 @@ static void kvm_init_memory_attributes(void)
>         if (vm_memory_attributes)
>                 static_call_update(__kvm_get_memory_attributes,
>                                    kvm_get_vm_memory_attributes);
> +       else if (IS_ENABLED(CONFIG_KVM_GUEST_MEMFD))
> +               static_call_update(__kvm_get_memory_attributes,
> +                                  kvm_gmem_get_memory_attributes);
>         else
>                 static_call_update(__kvm_get_memory_attributes,
>                                    (void *)__static_call_return0);
>
> --
> 2.54.0.563.g4f69b47b94-goog
>
>

^ permalink raw reply

* Re: [PATCH net-next v2 2/2] net: ti: icssg: Add HSR and LRE PA statistics
From: Felix Maurer @ 2026-05-20 12:31 UTC (permalink / raw)
  To: Luka Gejak
  Cc: Jakub Kicinski, MD Danish Anwar, David S. Miller, Eric Dumazet,
	Paolo Abeni, Simon Horman, Jonathan Corbet, Shuah Khan,
	Roger Quadros, Andrew Lunn, Meghana Malladi, Jacob Keller,
	David Carlier, Vadim Fedorenko, Kevin Hao, netdev, linux-doc,
	linux-kernel, linux-arm-kernel, Vladimir Oltean
In-Reply-To: <E30AAC96-01D2-4A23-B562-126087DEB7FA@linux.dev>

Hi everyone,

On Tue, May 19, 2026 at 07:55:55AM +0200, Luka Gejak wrote:
> On May 19, 2026 3:45:06 AM GMT+02:00, Jakub Kicinski <kuba@kernel.org> wrote:
> >On Thu, 14 May 2026 13:26:05 +0530 MD Danish Anwar wrote:
> >> Add new firmware PA statistics counters for HSR and LRE to the ethtool
> >> statistics exposed by the ICSSG driver.
> >>
> >> New statistics added:
> >>  - FW_HSR_FWD_CHECK_FAIL_DROP: Packets dropped on the HSR forwarding path
> >>  - FW_HSR_HE_CHECK_FAIL_DROP: Packets dropped on the HSR host egress path
> >>  - FW_HSR_SKIP_HOST_DUP_DISCARD_FRAMES: Frames with duplicate discard
> >>    skipped
> >>  - FW_LRE_CNT_UNIQUE/DUPLICATE/MULTIPLE_RX: LRE duplicate detection
> >>    counters
> >>  - FW_LRE_CNT_RX/TX: LRE per-port frame counters
> >>  - FW_LRE_CNT_OWN_RX: Own HSR tagged frames received
> >>  - FW_LRE_CNT_ERRWRONGLAN: Frames with wrong LAN identifier (PRP)
> >>
> >> Document the new HSR/LRE statistics in icssg_prueth.rst.
> >
> >To an untrained eye these stats look like stuff that could
> >be standardized across drivers.
> >
> >Luka, Felix, others on CC, do you think we should expose these
> >from HSR over netlink as "standard" offload stats different drivers
> >can plug into or not worth it?
>
> Hi Jakub,
> I think there is a case for standardizing part of this, but I would
> not standardize the whole set as-is.
>
> The LRE counters look generic enough to me, especially:
>  - unique rx
>  - duplicate rx
>  - multiple rx
>  - rx / tx
>  - own rx
>  - wrong LAN, PRP only

I'm very much in favor of having standardized stats for hsr hardware
offloads that the drivers can supply. The list above looks about right,
I'd add "frames with errors" and "(proxy) node table entry count" as
well and that "own rx" is HSR only.

In general, I don't think we need to standardize this ourselves but can
adapt to the counters that the SNMP MIB for IEC 62439-3 [1] already has.
It's part of the standard and IMHO we should gather these counters from
offloads (and later supply the same set from our sw implementation, but
the current netlink interface for hsr is quite messy). For reference,
the list in the MIB is (no need to fully adopt this naming):
- lreCntTx{A,B,C}: Sent frames per-port (for A,B only tagged frames)
- lreCntRx{A,B,C}: Received frames per-port (for A,B only tagged frames)
- lreCntErrWrongLan{A,B}: Received frames per-port with wrong LAN ID
  (only for PRP)
- lreCntErrWrongLanC: Received frames on interlink port of HSR-PRP
  RedBox with wrong LAN ID
- lreCntErrors{A,B,C}: Received frames with errors per-port
- lreCnt{,Proxy}Nodes: Nodes in the (proxy) node table
- lreCntUnique{A,B,C}: Frames only received once, per-port
- lreCntDuplicate{A,B,C}: Frames received with exactly one duplicate,
  per-port
- lreCntMulti{A,B,C}: Frames received with more than one duplicate,
  per-port
- lreCntOwnRx{A,B}: Frames received per-port (A,B) that originated from
  this node, only for HSR rings

Note that we can not currently completely distinguish
Unique/Duplicate/Multi in the kernel implementation and their meaning is
not entirely clear to me from the MIB.

The explanations in the MIB in [1] are otherwise quite explicit for each
of the counters but we may want to adapt the meaning of "port C" to the
counters. For example, there is lreCntRx{A,B,C} for received HSR/PRP
tagged frames (by the LRE). Port A and B are clear, but for port C the
meaning is "number of frames received from the application interface of
a DANP or DANH or the number of number of frames received on the
interlink of a RedBox". IMHO, we should consider separating "application
interface" (what the kernel calls master) and the interlink port because
these two are not mutually exclusive in the kernel (nor in the NICs that
support hardware offload).

Thanks,
   Felix


[1]: you can find it for example here: https://mibbrowser.online/mibdb_search.php?mib=IEC-62439-3-MIB


^ permalink raw reply

* Re: [PATCH 00/12] misc/syncobj: add /dev/syncobj device
From: Xaver Hugl @ 2026-05-20 12:33 UTC (permalink / raw)
  To: Christian König
  Cc: Julian Orth, Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann,
	David Airlie, Simona Vetter, Sumit Semwal, Jonathan Corbet,
	Shuah Khan, Arnd Bergmann, Greg Kroah-Hartman, dri-devel,
	linux-kernel, linux-media, linaro-mm-sig, linux-doc,
	wayland-devel, Michel Dänzer
In-Reply-To: <53edf0b5-e733-4b96-87d7-3307275500c0@amd.com>

Am Mi., 20. Mai 2026 um 10:08 Uhr schrieb Christian König
<christian.koenig@amd.com>:
> Well I would say the other way around is a pretty common use case.
>
> In other words the compositors uses the internal GPU for composing and displaying the picture. And the client uses the external GPU for fast rendering.
Sure, but that's not what I'm talking about.

> > - the buffers from the client stay valid
>
> Buffers from the hot plugged GPU don't stay valid. Accessing CPU mappings either result in a SIGBUS or are redirected to a dummy page.
Again, not what I wrote about. The buffers are on the integrated GPU.

> > - the syncobj stays valid on the client side
> > - the syncobj becomes invalid on the compositor side
>
> Nope that's not correct. The syncobj itself stays valid even if you completely hot plug the device.
>
> It can just be that the fences inside the syncobj are terminated with an error.
What about eventfd created for a point on the syncobj?

Another (future) problem with hotplugs will be if the sync file hasn't
materialized for the timeline point when the device is hotunplugged,
since there can't be an error on the fence if there isn't one. Or
could userspace somehow set an 'artificial' fence with an error in
that case?

> > "invalid" there means either
> > - the acquire point of the client is marked as signaled, before
> > rendering on the client side is completed
> > - the acquire point of the client is never signaled. Since the
> > compositor waits for the acquire point, the Wayland surface is stuck
> > forever
>
> Both of those would be a *massive* violation of documented kernel rules for hot-plugging which could lead to random data corruption and/or deadlocks.
>
> If you see any HW driver showing behavior like that please open up a bug report and ping the relevant maintainers immediately.
If there are no error codes with syncobj yet, then to userspace, the
latter behavior is exactly what we get, isn't it?

> When a hotplug happens all operations of the device should return an -ENODEV error, even when exposed to other devices/application through syncobj or syncfile.
Okay, that at least gives us a way to fail imports somewhat
gracefully. Normally, failing to import a syncobj is a fatal error in
the Wayland protocol.

> One problem is that only syncfile allows for querying such error codes at the moment, we have patches pending to add that to syncobj as well but we lack a compositor with support for that as userspace client.
As long as the error case can be detected with an eventfd,
implementing that in KWin shouldn't be a challenge.

> Well the question here is if the device the compositor is using or the client is using is gone?
>
> If the client device is hot removed the compositor should be perfectly capable to import the syncobj.
>
> If the compositor device is gone then you don't have a device to display anything any more, so generating the next frame doesn't seem to make sense either.
>
> What could be is that you want the compositor to be kept alive even when the display device is gone to switch over to vkms or whatever so that a VNC session or other remote desktop still works.
There are two GPUs in the example I gave. The compositor can use both
for rendering (in cosmic-comp's case) or switch between them (what I'm
trying to do with KWin), or use one device for rendering, and another
for importing the syncobj.

> >>>>> 3. It removes the need to translate between syncobjs fds and handles.
> >>>>
> >>>> That's a pretty big no-go as well. The differentiation between FDs and handles is completely intentional.
> >>> Could you expand on why it's needed? For compositors, the handle is
> >>> just an intermediary thing when translating between file descriptors.
> >>
> >> Well what we could do is to add an IOCTL to directly attach an syncobj file descriptor to an eventfd.
> > That would be nice.
>
> Take a look at drm_syncobj_file_fops and how drm_syncobj_add_eventfd() is used. Adding that functionality shouldn't be more than a typing exercise.
Yeah, this patchset already adds that functionality (on the new device).

> Do I see it right that this would already solve most problems in the compositor side?
Skipping the syncobj handle step would only reduce the amounts of
ioctls the compositor does, but afaict it wouldn't solve any
compositor problems. At least not as long as it's still tied to a drm
device.
For device hotplugs, the only new thing we need for correctly handling
syncobj is a way to receive errors on the eventfd.

A device-independent way to create and use syncobj would still be
useful to us though, both to simplify the compositor and to improve
the software rendering use cases.

- Xaver

^ permalink raw reply

* Re: [PATCH v2 1/2] tools/mm: add a standalone GUP microbenchmark
From: Mike Rapoport @ 2026-05-20 12:58 UTC (permalink / raw)
  To: Mark Brown
  Cc: Sarthak Sharma, Andrew Morton, David Hildenbrand, Jonathan Corbet,
	Jason Gunthorpe, John Hubbard, Peter Xu, Lorenzo Stoakes,
	Liam R . Howlett, Vlastimil Babka, Suren Baghdasaryan,
	Michal Hocko, Shuah Khan, linux-mm, linux-kselftest, linux-kernel,
	linux-doc
In-Reply-To: <67e9ecff-e532-4659-b4de-7019474af608@sirena.org.uk>

On Wed, May 20, 2026 at 12:58:32PM +0100, Mark Brown wrote:
> On Wed, May 20, 2026 at 03:45:53PM +0530, Sarthak Sharma wrote:
> > On 5/20/26 2:25 PM, Mike Rapoport wrote:
> 
> > > It seems that we need to better share the common code in
> > > tools/testing/selftest.
> 
> > > And adding another copy of the hugetlb detection and setup code does not
> > > seem like a great idea.
> 
> > Agreed, but that was the least disruptive approach I could think of.
> 
> > I am thinking of doing this now: should I move the
> > hugepage_settings.[ch] to tools/lib/ and move the read_num(),
> > write_num(), read_file() and write_file() helpers to a separate file in

these might need some adjustments because they use ksft_(), but in general
it makes sense to me.

> > tools/lib/ itself without any ksft dependency? Then both
> > tools/testing/selftests/* and tools/mm/ could share the same code.
>
> Using tools/lib sounds sensible to me - as well as the sharing it makes
> it clear that it's a library used by multiple things so avoids the
> issues we sometimes have with selftest directories referencing each
> other.

I'd make it tools/lib/mm as most of the files tools/lib/*.c are stubs for
the kernel functions.

-- 
Sincerely yours,
Mike.

^ permalink raw reply

* [PATCH v4 0/3] Let userspace know about swapped out panthor GEM objects
From: Nicolas Frattaroli @ 2026-05-20 13:04 UTC (permalink / raw)
  To: Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann, David Airlie,
	Simona Vetter, Boris Brezillon, Steven Price, Liviu Dudau,
	Jonathan Corbet, Shuah Khan, Tvrtko Ursulin
  Cc: dri-devel, linux-kernel, kernel, linux-doc, Nicolas Frattaroli

Panthor has recently gained a GEM shrinker. It allows evicting memory
that backs unused GEM objects to swap.

In this series, both fdinfo and Panthor's gems debugfs are extended so
that information on evicted pages can be gathered by users through these
two methods.

---
Changes in v4:
- Change "evicted" memory type documentation to no longer explicitly
  mention swap
- Link to v3: https://patch.msgid.link/20260423-panthor-bo-reclaim-observability-v3-0-60af32164a4f@collabora.com

Changes in v3:
- Add documentation for new "evicted" memory type in fdinfo
- Link to v2: https://patch.msgid.link/20260421-panthor-bo-reclaim-observability-v2-0-c9135eedfb6f@collabora.com

Changes in v2:
- Change reclaimed_count to saturate at INT_MAX
- Add "evictions" column to panthor gems debugfs which prints
  reclaimed_count
- Add a patch to reduce the padding of one panthor gems debugfs column a
  bit
- Link to v1: https://patch.msgid.link/20260420-panthor-bo-reclaim-observability-v1-0-a4d1a36ee84f@collabora.com

To: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
To: Maxime Ripard <mripard@kernel.org>
To: Thomas Zimmermann <tzimmermann@suse.de>
To: David Airlie <airlied@gmail.com>
To: Simona Vetter <simona@ffwll.ch>
To: Boris Brezillon <boris.brezillon@collabora.com>
To: Steven Price <steven.price@arm.com>
To: Liviu Dudau <liviu.dudau@arm.com>
To: Jonathan Corbet <corbet@lwn.net>
To: Shuah Khan <skhan@linuxfoundation.org>
To: Tvrtko Ursulin <tursulin@ursulin.net>
Cc: dri-devel@lists.freedesktop.org
Cc: linux-kernel@vger.kernel.org
Cc: kernel@collabora.com
Cc: linux-doc@vger.kernel.org
Signed-off-by: Nicolas Frattaroli <nicolas.frattaroli@collabora.com>

---
Nicolas Frattaroli (3):
      drm/fdinfo: Add "evicted" memory accounting
      drm/panthor: Implement evicted status for GEM objects
      drm/panthor: Reduce padding in gems debugfs for refcount

 Documentation/gpu/drm-usage-stats.rst |  6 ++++++
 drivers/gpu/drm/drm_file.c            |  8 ++++++++
 drivers/gpu/drm/panthor/panthor_gem.c | 18 ++++++++++++++----
 drivers/gpu/drm/panthor/panthor_gem.h | 10 ++++++++++
 include/drm/drm_file.h                |  2 ++
 include/drm/drm_gem.h                 |  2 ++
 6 files changed, 42 insertions(+), 4 deletions(-)
---
base-commit: 69c95e4c529297c25503e60acba757fba24fdc95
change-id: 20260420-panthor-bo-reclaim-observability-970679c9533c

Best regards,
--  
Nicolas Frattaroli <nicolas.frattaroli@collabora.com>


^ permalink raw reply

* [PATCH v4 1/3] drm/fdinfo: Add "evicted" memory accounting
From: Nicolas Frattaroli @ 2026-05-20 13:04 UTC (permalink / raw)
  To: Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann, David Airlie,
	Simona Vetter, Boris Brezillon, Steven Price, Liviu Dudau,
	Jonathan Corbet, Shuah Khan, Tvrtko Ursulin
  Cc: dri-devel, linux-kernel, kernel, linux-doc, Nicolas Frattaroli
In-Reply-To: <20260520-panthor-bo-reclaim-observability-v4-0-a47ab61cb80d@collabora.com>

Currently, there's no way to know for certain how much GPU memory was
swapped out. The difference between total and resident memory would
include newly allocated pages, which are not resident, but also aren't
swapped out.

Add a new drm_gem_object_status so drivers can signal when an object has
been evicted to swap, and add a new "evicted" counter to
drm_memory_stats.

Due to how the supported_flags bitmask is determined, the "evicted"
count won't be printed to fdinfo if there's no swapped out pages.

Reviewed-by: Steven Price <steven.price@arm.com>
Signed-off-by: Nicolas Frattaroli <nicolas.frattaroli@collabora.com>
---
 Documentation/gpu/drm-usage-stats.rst | 6 ++++++
 drivers/gpu/drm/drm_file.c            | 8 ++++++++
 include/drm/drm_file.h                | 2 ++
 include/drm/drm_gem.h                 | 2 ++
 4 files changed, 18 insertions(+)

diff --git a/Documentation/gpu/drm-usage-stats.rst b/Documentation/gpu/drm-usage-stats.rst
index 70b7cfcc194f..ac1dbf52d96d 100644
--- a/Documentation/gpu/drm-usage-stats.rst
+++ b/Documentation/gpu/drm-usage-stats.rst
@@ -202,6 +202,12 @@ One practical example of this could be the presence of unsignaled fences in a
 GEM buffer reservation object. Therefore, the active category is a subset of the
 resident category.
 
+- drm-evicted-<region>: <uint> [KiB|MiB]
+
+The total size of buffers that have been evicted and are no longer pinned by the
+device. Only present if there are buffers that are currently evicted, and if the
+driver implements reporting of this type of memory.
+
 Implementation Details
 ======================
 
diff --git a/drivers/gpu/drm/drm_file.c b/drivers/gpu/drm/drm_file.c
index ec820686b302..5078172976c0 100644
--- a/drivers/gpu/drm/drm_file.c
+++ b/drivers/gpu/drm/drm_file.c
@@ -868,6 +868,7 @@ int drm_memory_stats_is_zero(const struct drm_memory_stats *stats)
 		stats->private == 0 &&
 		stats->resident == 0 &&
 		stats->purgeable == 0 &&
+		stats->evicted == 0 &&
 		stats->active == 0);
 }
 EXPORT_SYMBOL(drm_memory_stats_is_zero);
@@ -901,6 +902,10 @@ void drm_print_memory_stats(struct drm_printer *p,
 	if (supported_status & DRM_GEM_OBJECT_PURGEABLE)
 		drm_fdinfo_print_size(p, prefix, "purgeable", region,
 				      stats->purgeable);
+
+	if (supported_status & DRM_GEM_OBJECT_EVICTED)
+		drm_fdinfo_print_size(p, prefix, "evicted", region,
+				      stats->evicted);
 }
 EXPORT_SYMBOL(drm_print_memory_stats);
 
@@ -954,6 +959,9 @@ void drm_show_memory_stats(struct drm_printer *p, struct drm_file *file)
 
 		if (s & DRM_GEM_OBJECT_PURGEABLE)
 			status.purgeable += add_size;
+
+		if (s & DRM_GEM_OBJECT_EVICTED)
+			status.evicted += add_size;
 	}
 	spin_unlock(&file->table_lock);
 
diff --git a/include/drm/drm_file.h b/include/drm/drm_file.h
index 6ee70ad65e1f..7e4cb45a52c3 100644
--- a/include/drm/drm_file.h
+++ b/include/drm/drm_file.h
@@ -500,6 +500,7 @@ void drm_send_event_timestamp_locked(struct drm_device *dev,
  * @resident: Total size of GEM objects backing pages
  * @purgeable: Total size of GEM objects that can be purged (resident and not active)
  * @active: Total size of GEM objects active on one or more engines
+ * @evicted: Total size of GEM objects that have been evicted
  *
  * Used by drm_print_memory_stats()
  */
@@ -509,6 +510,7 @@ struct drm_memory_stats {
 	u64 resident;
 	u64 purgeable;
 	u64 active;
+	u64 evicted;
 };
 
 enum drm_gem_object_status;
diff --git a/include/drm/drm_gem.h b/include/drm/drm_gem.h
index 86f5846154f7..799588a2762a 100644
--- a/include/drm/drm_gem.h
+++ b/include/drm/drm_gem.h
@@ -53,6 +53,7 @@ struct drm_gem_object;
  * @DRM_GEM_OBJECT_RESIDENT: object is resident in memory (ie. not unpinned)
  * @DRM_GEM_OBJECT_PURGEABLE: object marked as purgeable by userspace
  * @DRM_GEM_OBJECT_ACTIVE: object is currently used by an active submission
+ * @DRM_GEM_OBJECT_EVICTED: object is evicted and no longer pinned by driver
  *
  * Bitmask of status used for fdinfo memory stats, see &drm_gem_object_funcs.status
  * and drm_show_fdinfo().  Note that an object can report DRM_GEM_OBJECT_PURGEABLE
@@ -67,6 +68,7 @@ enum drm_gem_object_status {
 	DRM_GEM_OBJECT_RESIDENT  = BIT(0),
 	DRM_GEM_OBJECT_PURGEABLE = BIT(1),
 	DRM_GEM_OBJECT_ACTIVE    = BIT(2),
+	DRM_GEM_OBJECT_EVICTED   = BIT(3),
 };
 
 /**

-- 
2.54.0


^ permalink raw reply related

* [PATCH v4 2/3] drm/panthor: Implement evicted status for GEM objects
From: Nicolas Frattaroli @ 2026-05-20 13:04 UTC (permalink / raw)
  To: Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann, David Airlie,
	Simona Vetter, Boris Brezillon, Steven Price, Liviu Dudau,
	Jonathan Corbet, Shuah Khan, Tvrtko Ursulin
  Cc: dri-devel, linux-kernel, kernel, linux-doc, Nicolas Frattaroli
In-Reply-To: <20260520-panthor-bo-reclaim-observability-v4-0-a47ab61cb80d@collabora.com>

For fdinfo to be able to fill its evicted counter with data, panthor
needs to keep track of whether a GEM object has ever been reclaimed.
Just checking whether the pages are resident isn't enough, as newly
allocated objects also won't be resident.

Do this with a new atomic_t member on panthor_gem_object. It's increased
when an object gets evicted by the shrinker, and saturates at INT_MAX.
This means that once an object has been evicted at least once, its
reclaim counter will never return to 0.

Due to this, it's possible to distinguish evicted non-resident pages
from newly allocated non-resident pages by checking whether
reclaimed_count is != 0

Use this new member to then set the appropriate DRM_GEM_OBJECT_EVICTED
status flag for fdinfo.

Also add a new column and status flag to the panthor gems debugfs: the
column is the number of times an object has been evicted, whereas the
flag indicates whether it currently is evicted.

Reviewed-by: Steven Price <steven.price@arm.com>
Signed-off-by: Nicolas Frattaroli <nicolas.frattaroli@collabora.com>
---
 drivers/gpu/drm/panthor/panthor_gem.c | 18 ++++++++++++++----
 drivers/gpu/drm/panthor/panthor_gem.h | 10 ++++++++++
 2 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/panthor/panthor_gem.c b/drivers/gpu/drm/panthor/panthor_gem.c
index 13295d7a593d..068aa935c8fc 100644
--- a/drivers/gpu/drm/panthor/panthor_gem.c
+++ b/drivers/gpu/drm/panthor/panthor_gem.c
@@ -687,6 +687,8 @@ static void panthor_gem_evict_locked(struct panthor_gem_object *bo)
 	if (drm_WARN_ON_ONCE(bo->base.dev, !bo->backing.pages))
 		return;
 
+	atomic_add_unless(&bo->reclaimed_count, 1, INT_MAX);
+
 	panthor_gem_dev_map_cleanup_locked(bo);
 	panthor_gem_backing_cleanup_locked(bo);
 	panthor_gem_update_reclaim_state_locked(bo, NULL);
@@ -788,6 +790,8 @@ static enum drm_gem_object_status panthor_gem_status(struct drm_gem_object *obj)
 
 	if (drm_gem_is_imported(&bo->base) || bo->backing.pages)
 		res |= DRM_GEM_OBJECT_RESIDENT;
+	else if (atomic_read(&bo->reclaimed_count))
+		res |= DRM_GEM_OBJECT_EVICTED;
 
 	return res;
 }
@@ -1595,6 +1599,7 @@ static void panthor_gem_debugfs_print_flag_names(struct seq_file *m)
 	static const char * const gem_state_flags_names[] = {
 		[PANTHOR_DEBUGFS_GEM_STATE_IMPORTED_BIT] = "imported",
 		[PANTHOR_DEBUGFS_GEM_STATE_EXPORTED_BIT] = "exported",
+		[PANTHOR_DEBUGFS_GEM_STATE_EVICTED_BIT] = "evicted",
 	};
 
 	static const char * const gem_usage_flags_names[] = {
@@ -1625,6 +1630,7 @@ static void panthor_gem_debugfs_bo_print(struct panthor_gem_object *bo,
 {
 	enum panthor_gem_reclaim_state reclaim_state = bo->reclaim_state;
 	unsigned int refcount = kref_read(&bo->base.refcount);
+	int reclaimed_count = atomic_read(&bo->reclaimed_count);
 	char creator_info[32] = {};
 	size_t resident_size;
 	u32 gem_usage_flags = bo->debugfs.flags;
@@ -1638,16 +1644,20 @@ static void panthor_gem_debugfs_bo_print(struct panthor_gem_object *bo,
 
 	snprintf(creator_info, sizeof(creator_info),
 		 "%s/%d", bo->debugfs.creator.process_name, bo->debugfs.creator.tgid);
-	seq_printf(m, "%-32s%-16d%-16d%-16zd%-16zd0x%-16lx",
+	seq_printf(m, "%-32s%-16d%-16d%-11d%-16zd%-16zd0x%-16lx",
 		   creator_info,
 		   bo->base.name,
 		   refcount,
+		   reclaimed_count,
 		   bo->base.size,
 		   resident_size,
 		   drm_vma_node_start(&bo->base.vma_node));
 
 	if (drm_gem_is_imported(&bo->base))
 		gem_state_flags |= PANTHOR_DEBUGFS_GEM_STATE_FLAG_IMPORTED;
+	else if (!resident_size && reclaimed_count)
+		gem_state_flags |= PANTHOR_DEBUGFS_GEM_STATE_FLAG_EVICTED;
+
 	if (bo->base.dma_buf)
 		gem_state_flags |= PANTHOR_DEBUGFS_GEM_STATE_FLAG_EXPORTED;
 
@@ -1671,8 +1681,8 @@ static void panthor_gem_debugfs_print_bos(struct panthor_device *ptdev,
 
 	panthor_gem_debugfs_print_flag_names(m);
 
-	seq_puts(m, "created-by                      global-name     refcount        size            resident-size   file-offset       state      usage       label\n");
-	seq_puts(m, "----------------------------------------------------------------------------------------------------------------------------------------------\n");
+	seq_puts(m, "created-by                      global-name     refcount        evictions  size            resident-size   file-offset       state      usage       label\n");
+	seq_puts(m, "---------------------------------------------------------------------------------------------------------------------------------------------------------\n");
 
 	scoped_guard(mutex, &ptdev->gems.lock) {
 		list_for_each_entry(bo, &ptdev->gems.node, debugfs.node) {
@@ -1680,7 +1690,7 @@ static void panthor_gem_debugfs_print_bos(struct panthor_device *ptdev,
 		}
 	}
 
-	seq_puts(m, "==============================================================================================================================================\n");
+	seq_puts(m, "=========================================================================================================================================================\n");
 	seq_printf(m, "Total size: %zd, Total resident: %zd, Total reclaimable: %zd\n",
 		   totals.size, totals.resident, totals.reclaimable);
 }
diff --git a/drivers/gpu/drm/panthor/panthor_gem.h b/drivers/gpu/drm/panthor/panthor_gem.h
index ae0491d0b121..56d63137b4eb 100644
--- a/drivers/gpu/drm/panthor/panthor_gem.h
+++ b/drivers/gpu/drm/panthor/panthor_gem.h
@@ -19,12 +19,16 @@ struct panthor_vm;
 enum panthor_debugfs_gem_state_flags {
 	PANTHOR_DEBUGFS_GEM_STATE_IMPORTED_BIT = 0,
 	PANTHOR_DEBUGFS_GEM_STATE_EXPORTED_BIT = 1,
+	PANTHOR_DEBUGFS_GEM_STATE_EVICTED_BIT = 2,
 
 	/** @PANTHOR_DEBUGFS_GEM_STATE_FLAG_IMPORTED: GEM BO is PRIME imported. */
 	PANTHOR_DEBUGFS_GEM_STATE_FLAG_IMPORTED = BIT(PANTHOR_DEBUGFS_GEM_STATE_IMPORTED_BIT),
 
 	/** @PANTHOR_DEBUGFS_GEM_STATE_FLAG_EXPORTED: GEM BO is PRIME exported. */
 	PANTHOR_DEBUGFS_GEM_STATE_FLAG_EXPORTED = BIT(PANTHOR_DEBUGFS_GEM_STATE_EXPORTED_BIT),
+
+	/** @PANTHOR_DEBUGFS_GEM_STATE_FLAG_EVICTED: GEM BO is evicted to swap. */
+	PANTHOR_DEBUGFS_GEM_STATE_FLAG_EVICTED = BIT(PANTHOR_DEBUGFS_GEM_STATE_EVICTED_BIT),
 };
 
 enum panthor_debugfs_gem_usage_flags {
@@ -172,6 +176,12 @@ struct panthor_gem_object {
 	/** @reclaim_state: Cached reclaim state */
 	enum panthor_gem_reclaim_state reclaim_state;
 
+	/**
+	 * @reclaimed_count: How many times object has been evicted to swap.
+	 * The count saturates at %INT_MAX and will never wrap around to 0.
+	 */
+	atomic_t reclaimed_count;
+
 	/**
 	 * @exclusive_vm_root_gem: Root GEM of the exclusive VM this GEM object
 	 * is attached to.

-- 
2.54.0


^ permalink raw reply related

* Re: [PATCH v2 1/2] tools/mm: add a standalone GUP microbenchmark
From: Sarthak Sharma @ 2026-05-20 13:06 UTC (permalink / raw)
  To: Mike Rapoport, Mark Brown
  Cc: Andrew Morton, David Hildenbrand, Jonathan Corbet,
	Jason Gunthorpe, John Hubbard, Peter Xu, Lorenzo Stoakes,
	Liam R . Howlett, Vlastimil Babka, Suren Baghdasaryan,
	Michal Hocko, Shuah Khan, linux-mm, linux-kselftest, linux-kernel,
	linux-doc
In-Reply-To: <ag2v6KW8E94kl4M_@kernel.org>



On 5/20/26 6:28 PM, Mike Rapoport wrote:
> On Wed, May 20, 2026 at 12:58:32PM +0100, Mark Brown wrote:
>> On Wed, May 20, 2026 at 03:45:53PM +0530, Sarthak Sharma wrote:
>>> On 5/20/26 2:25 PM, Mike Rapoport wrote:
>>
>>>> It seems that we need to better share the common code in
>>>> tools/testing/selftest.
>>
>>>> And adding another copy of the hugetlb detection and setup code does not
>>>> seem like a great idea.
>>
>>> Agreed, but that was the least disruptive approach I could think of.
>>
>>> I am thinking of doing this now: should I move the
>>> hugepage_settings.[ch] to tools/lib/ and move the read_num(),
>>> write_num(), read_file() and write_file() helpers to a separate file in
> 
> these might need some adjustments because they use ksft_(), but in general
> it makes sense to me.
> 
>>> tools/lib/ itself without any ksft dependency? Then both
>>> tools/testing/selftests/* and tools/mm/ could share the same code.
>>
>> Using tools/lib sounds sensible to me - as well as the sharing it makes
>> it clear that it's a library used by multiple things so avoids the
>> issues we sometimes have with selftest directories referencing each
>> other.
> 
> I'd make it tools/lib/mm as most of the files tools/lib/*.c are stubs for
> the kernel functions.
> 

Thanks Mark and Mike, I'll include these changes in v3.


^ permalink raw reply

* Re: [PATCH AUTOSEL 7.0] Documentation: security-bugs: do not systematically Cc the security team
From: Jonathan Corbet @ 2026-05-20 13:07 UTC (permalink / raw)
  To: Sasha Levin, patches, stable
  Cc: Willy Tarreau, Greg KH, Leon Romanovsky, Sasha Levin, security,
	workflows, linux-doc, linux-kernel
In-Reply-To: <20260520111944.3424570-63-sashal@kernel.org>

Sasha Levin <sashal@kernel.org> writes:

> From: Willy Tarreau <w@1wt.eu>
>
> [ Upstream commit aed3c3346765e4317bb2ec6ff872e1c952e128ab ]
>
> With the increase of automated reports, the security team is dealing
> with way more messages than really needed. The reporting process works
> well with most teams so there is no need to systematically involve the
> security team in reports.

You'll want, at a minimum, f2e65e4e5b4b4b9ecf43f03c3fdbe8c9a8a43a9e to
go with this, or you'll add a broken reference with this commit.

Thanks,

jon

^ permalink raw reply

* Re: [PATCH v4 0/4] Introduce Per-CPU Work helpers (was QPW)
From: Sebastian Andrzej Siewior @ 2026-05-20 13:09 UTC (permalink / raw)
  To: Leonardo Bras
  Cc: Jonathan Corbet, Shuah Khan, Peter Zijlstra, Ingo Molnar,
	Will Deacon, Boqun Feng, Waiman Long, Andrew Morton,
	David Hildenbrand, Lorenzo Stoakes, Liam R. Howlett,
	Vlastimil Babka, Mike Rapoport, Suren Baghdasaryan, Michal Hocko,
	Jann Horn, Pedro Falcato, Brendan Jackman, Johannes Weiner,
	Zi Yan, Harry Yoo, Hao Li, Christoph Lameter, David Rientjes,
	Roman Gushchin, Chris Li, Kairui Song, Kemeng Shi, Nhat Pham,
	Baoquan He, Barry Song, Youngjun Park, Qi Zheng, Shakeel Butt,
	Axel Rasmussen, Yuanchu Xie, Wei Xu, Borislav Petkov (AMD),
	Randy Dunlap, Thomas Gleixner, Feng Tang, Dapeng Mi, Kees Cook,
	Marco Elver, Jakub Kicinski, Li RongQing, Eric Biggers,
	Paul E. McKenney, Nathan Chancellor, Miguel Ojeda, Nicolas Schier,
	Thomas Weißschuh, Douglas Anderson, Gary Guo,
	Christian Brauner, Pasha Tatashin, Masahiro Yamada, Coiby Xu,
	Frederic Weisbecker, linux-doc, linux-kernel, linux-mm,
	linux-rt-devel
In-Reply-To: <20260519012754.240804-1-leobras.c@gmail.com>

On 2026-05-18 22:27:46 [-0300], Leonardo Bras wrote:
> The problem:
> Some places in the kernel implement a parallel programming strategy
> consisting on local_locks() for most of the work, and some rare remote
> operations are scheduled on target cpu. This keeps cache bouncing low since
> cacheline tends to be mostly local, and avoids the cost of locks in non-RT
> kernels, even though the very few remote operations will be expensive due
> to scheduling overhead.
> 
> On the other hand, for RT workloads this can represent a problem: getting
> an important workload scheduled out to deal with remote requests is
> sure to introduce unexpected deadline misses.
> 
> The idea:
> Currently with PREEMPT_RT=y, local_locks() become per-cpu spinlocks.
It does not become a _spin_lock because it does not spin. It sleeps.

> In this case, instead of scheduling work on a remote cpu, it should
> be safe to grab that remote cpu's per-cpu spinlock and run the required
> work locally. That major cost, which is un/locking in every local function,
> already happens in PREEMPT_RT.

We did have this before but only in the RT tree. It was a bit messy from
the naming because it started with local_ but then it was a remote CPU.
The main issue was the different code path which led to a few deadlocks
back then.
By the time local_lock_t went upstream, the cross-CPU locking was
removed. As far as I remember, the cross-CPU user which did schedule
work on a remote CPU and annoyed NOHZ folks were replaced.

> Also, there is no need to worry about extra cache bouncing:
> The cacheline invalidation already happens due to schedule_work_on().
> 
> This will avoid schedule_work_on(), and thus avoid scheduling-out an
> RT workload.
> 
> Proposed solution:
> A new interface called PerCPU Work (PW), which should replace
> Work Queue in the above mentioned use case.
> 
> If CONFIG_PWLOCKS=n this interfaces just wraps the current
> local_locks + WorkQueue behavior, so no expected change in runtime.
> 
> If CONFIG_PWLOCKS=y, and kernel boot option pwlocks=1,
> pw_queue_on(cpu,...) will lock that cpu's per-cpu structure
> and perform work on it locally. 
> 
Sebastian

^ permalink raw reply

* [PATCH v4 3/3] drm/panthor: Reduce padding in gems debugfs for refcount
From: Nicolas Frattaroli @ 2026-05-20 13:04 UTC (permalink / raw)
  To: Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann, David Airlie,
	Simona Vetter, Boris Brezillon, Steven Price, Liviu Dudau,
	Jonathan Corbet, Shuah Khan, Tvrtko Ursulin
  Cc: dri-devel, linux-kernel, kernel, linux-doc, Nicolas Frattaroli
In-Reply-To: <20260520-panthor-bo-reclaim-observability-v4-0-a47ab61cb80d@collabora.com>

The "gems" debugfs file is getting a little too wide for comfort. While
a lot of this is unavoidable due to the theoretical upper limits of
numbers here (e.g. size needs to be 16 chars because 2**48-1 in decimal
is 15 digits, plus one space for separation), the refcount column has a
decent 5 characters to be saved, as it can only ever contain a 10-digit
decimal number.

Reduce the refcount column's width to 11, which fulfils this requirement
with an additional space for separation.

Reviewed-by: Steven Price <steven.price@arm.com>
Signed-off-by: Nicolas Frattaroli <nicolas.frattaroli@collabora.com>
---
 drivers/gpu/drm/panthor/panthor_gem.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/panthor/panthor_gem.c b/drivers/gpu/drm/panthor/panthor_gem.c
index 068aa935c8fc..dfdcda3d0189 100644
--- a/drivers/gpu/drm/panthor/panthor_gem.c
+++ b/drivers/gpu/drm/panthor/panthor_gem.c
@@ -1644,7 +1644,7 @@ static void panthor_gem_debugfs_bo_print(struct panthor_gem_object *bo,
 
 	snprintf(creator_info, sizeof(creator_info),
 		 "%s/%d", bo->debugfs.creator.process_name, bo->debugfs.creator.tgid);
-	seq_printf(m, "%-32s%-16d%-16d%-11d%-16zd%-16zd0x%-16lx",
+	seq_printf(m, "%-32s%-16d%-11d%-11d%-16zd%-16zd0x%-16lx",
 		   creator_info,
 		   bo->base.name,
 		   refcount,
@@ -1681,8 +1681,8 @@ static void panthor_gem_debugfs_print_bos(struct panthor_device *ptdev,
 
 	panthor_gem_debugfs_print_flag_names(m);
 
-	seq_puts(m, "created-by                      global-name     refcount        evictions  size            resident-size   file-offset       state      usage       label\n");
-	seq_puts(m, "---------------------------------------------------------------------------------------------------------------------------------------------------------\n");
+	seq_puts(m, "created-by                      global-name     refcount   evictions  size            resident-size   file-offset       state      usage       label\n");
+	seq_puts(m, "----------------------------------------------------------------------------------------------------------------------------------------------------\n");
 
 	scoped_guard(mutex, &ptdev->gems.lock) {
 		list_for_each_entry(bo, &ptdev->gems.node, debugfs.node) {
@@ -1690,7 +1690,7 @@ static void panthor_gem_debugfs_print_bos(struct panthor_device *ptdev,
 		}
 	}
 
-	seq_puts(m, "=========================================================================================================================================================\n");
+	seq_puts(m, "====================================================================================================================================================\n");
 	seq_printf(m, "Total size: %zd, Total resident: %zd, Total reclaimable: %zd\n",
 		   totals.size, totals.resident, totals.reclaimable);
 }

-- 
2.54.0


^ permalink raw reply related

* Re: [PATCH v4 2/3] drm/panthor: Implement evicted status for GEM objects
From: Boris Brezillon @ 2026-05-20 13:24 UTC (permalink / raw)
  To: Nicolas Frattaroli
  Cc: Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann, David Airlie,
	Simona Vetter, Steven Price, Liviu Dudau, Jonathan Corbet,
	Shuah Khan, Tvrtko Ursulin, dri-devel, linux-kernel, kernel,
	linux-doc
In-Reply-To: <20260520-panthor-bo-reclaim-observability-v4-2-a47ab61cb80d@collabora.com>

On Wed, 20 May 2026 15:04:49 +0200
Nicolas Frattaroli <nicolas.frattaroli@collabora.com> wrote:

> For fdinfo to be able to fill its evicted counter with data, panthor
> needs to keep track of whether a GEM object has ever been reclaimed.
> Just checking whether the pages are resident isn't enough, as newly
> allocated objects also won't be resident.
> 
> Do this with a new atomic_t member on panthor_gem_object. It's increased
> when an object gets evicted by the shrinker, and saturates at INT_MAX.
> This means that once an object has been evicted at least once, its
> reclaim counter will never return to 0.
> 
> Due to this, it's possible to distinguish evicted non-resident pages
> from newly allocated non-resident pages by checking whether
> reclaimed_count is != 0
> 
> Use this new member to then set the appropriate DRM_GEM_OBJECT_EVICTED
> status flag for fdinfo.
> 
> Also add a new column and status flag to the panthor gems debugfs: the
> column is the number of times an object has been evicted, whereas the
> flag indicates whether it currently is evicted.
> 
> Reviewed-by: Steven Price <steven.price@arm.com>
> Signed-off-by: Nicolas Frattaroli <nicolas.frattaroli@collabora.com>

Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com>

> ---
>  drivers/gpu/drm/panthor/panthor_gem.c | 18 ++++++++++++++----
>  drivers/gpu/drm/panthor/panthor_gem.h | 10 ++++++++++
>  2 files changed, 24 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/panthor/panthor_gem.c b/drivers/gpu/drm/panthor/panthor_gem.c
> index 13295d7a593d..068aa935c8fc 100644
> --- a/drivers/gpu/drm/panthor/panthor_gem.c
> +++ b/drivers/gpu/drm/panthor/panthor_gem.c
> @@ -687,6 +687,8 @@ static void panthor_gem_evict_locked(struct panthor_gem_object *bo)
>  	if (drm_WARN_ON_ONCE(bo->base.dev, !bo->backing.pages))
>  		return;
>  
> +	atomic_add_unless(&bo->reclaimed_count, 1, INT_MAX);
> +
>  	panthor_gem_dev_map_cleanup_locked(bo);
>  	panthor_gem_backing_cleanup_locked(bo);
>  	panthor_gem_update_reclaim_state_locked(bo, NULL);
> @@ -788,6 +790,8 @@ static enum drm_gem_object_status panthor_gem_status(struct drm_gem_object *obj)
>  
>  	if (drm_gem_is_imported(&bo->base) || bo->backing.pages)
>  		res |= DRM_GEM_OBJECT_RESIDENT;
> +	else if (atomic_read(&bo->reclaimed_count))
> +		res |= DRM_GEM_OBJECT_EVICTED;
>  
>  	return res;
>  }
> @@ -1595,6 +1599,7 @@ static void panthor_gem_debugfs_print_flag_names(struct seq_file *m)
>  	static const char * const gem_state_flags_names[] = {
>  		[PANTHOR_DEBUGFS_GEM_STATE_IMPORTED_BIT] = "imported",
>  		[PANTHOR_DEBUGFS_GEM_STATE_EXPORTED_BIT] = "exported",
> +		[PANTHOR_DEBUGFS_GEM_STATE_EVICTED_BIT] = "evicted",
>  	};
>  
>  	static const char * const gem_usage_flags_names[] = {
> @@ -1625,6 +1630,7 @@ static void panthor_gem_debugfs_bo_print(struct panthor_gem_object *bo,
>  {
>  	enum panthor_gem_reclaim_state reclaim_state = bo->reclaim_state;
>  	unsigned int refcount = kref_read(&bo->base.refcount);
> +	int reclaimed_count = atomic_read(&bo->reclaimed_count);
>  	char creator_info[32] = {};
>  	size_t resident_size;
>  	u32 gem_usage_flags = bo->debugfs.flags;
> @@ -1638,16 +1644,20 @@ static void panthor_gem_debugfs_bo_print(struct panthor_gem_object *bo,
>  
>  	snprintf(creator_info, sizeof(creator_info),
>  		 "%s/%d", bo->debugfs.creator.process_name, bo->debugfs.creator.tgid);
> -	seq_printf(m, "%-32s%-16d%-16d%-16zd%-16zd0x%-16lx",
> +	seq_printf(m, "%-32s%-16d%-16d%-11d%-16zd%-16zd0x%-16lx",
>  		   creator_info,
>  		   bo->base.name,
>  		   refcount,
> +		   reclaimed_count,
>  		   bo->base.size,
>  		   resident_size,
>  		   drm_vma_node_start(&bo->base.vma_node));
>  
>  	if (drm_gem_is_imported(&bo->base))
>  		gem_state_flags |= PANTHOR_DEBUGFS_GEM_STATE_FLAG_IMPORTED;
> +	else if (!resident_size && reclaimed_count)
> +		gem_state_flags |= PANTHOR_DEBUGFS_GEM_STATE_FLAG_EVICTED;
> +
>  	if (bo->base.dma_buf)
>  		gem_state_flags |= PANTHOR_DEBUGFS_GEM_STATE_FLAG_EXPORTED;
>  
> @@ -1671,8 +1681,8 @@ static void panthor_gem_debugfs_print_bos(struct panthor_device *ptdev,
>  
>  	panthor_gem_debugfs_print_flag_names(m);
>  
> -	seq_puts(m, "created-by                      global-name     refcount        size            resident-size   file-offset       state      usage       label\n");
> -	seq_puts(m, "----------------------------------------------------------------------------------------------------------------------------------------------\n");
> +	seq_puts(m, "created-by                      global-name     refcount        evictions  size            resident-size   file-offset       state      usage       label\n");
> +	seq_puts(m, "---------------------------------------------------------------------------------------------------------------------------------------------------------\n");
>  
>  	scoped_guard(mutex, &ptdev->gems.lock) {
>  		list_for_each_entry(bo, &ptdev->gems.node, debugfs.node) {
> @@ -1680,7 +1690,7 @@ static void panthor_gem_debugfs_print_bos(struct panthor_device *ptdev,
>  		}
>  	}
>  
> -	seq_puts(m, "==============================================================================================================================================\n");
> +	seq_puts(m, "=========================================================================================================================================================\n");
>  	seq_printf(m, "Total size: %zd, Total resident: %zd, Total reclaimable: %zd\n",
>  		   totals.size, totals.resident, totals.reclaimable);
>  }
> diff --git a/drivers/gpu/drm/panthor/panthor_gem.h b/drivers/gpu/drm/panthor/panthor_gem.h
> index ae0491d0b121..56d63137b4eb 100644
> --- a/drivers/gpu/drm/panthor/panthor_gem.h
> +++ b/drivers/gpu/drm/panthor/panthor_gem.h
> @@ -19,12 +19,16 @@ struct panthor_vm;
>  enum panthor_debugfs_gem_state_flags {
>  	PANTHOR_DEBUGFS_GEM_STATE_IMPORTED_BIT = 0,
>  	PANTHOR_DEBUGFS_GEM_STATE_EXPORTED_BIT = 1,
> +	PANTHOR_DEBUGFS_GEM_STATE_EVICTED_BIT = 2,
>  
>  	/** @PANTHOR_DEBUGFS_GEM_STATE_FLAG_IMPORTED: GEM BO is PRIME imported. */
>  	PANTHOR_DEBUGFS_GEM_STATE_FLAG_IMPORTED = BIT(PANTHOR_DEBUGFS_GEM_STATE_IMPORTED_BIT),
>  
>  	/** @PANTHOR_DEBUGFS_GEM_STATE_FLAG_EXPORTED: GEM BO is PRIME exported. */
>  	PANTHOR_DEBUGFS_GEM_STATE_FLAG_EXPORTED = BIT(PANTHOR_DEBUGFS_GEM_STATE_EXPORTED_BIT),
> +
> +	/** @PANTHOR_DEBUGFS_GEM_STATE_FLAG_EVICTED: GEM BO is evicted to swap. */
> +	PANTHOR_DEBUGFS_GEM_STATE_FLAG_EVICTED = BIT(PANTHOR_DEBUGFS_GEM_STATE_EVICTED_BIT),
>  };
>  
>  enum panthor_debugfs_gem_usage_flags {
> @@ -172,6 +176,12 @@ struct panthor_gem_object {
>  	/** @reclaim_state: Cached reclaim state */
>  	enum panthor_gem_reclaim_state reclaim_state;
>  
> +	/**
> +	 * @reclaimed_count: How many times object has been evicted to swap.
> +	 * The count saturates at %INT_MAX and will never wrap around to 0.
> +	 */
> +	atomic_t reclaimed_count;
> +
>  	/**
>  	 * @exclusive_vm_root_gem: Root GEM of the exclusive VM this GEM object
>  	 * is attached to.
> 


^ permalink raw reply

* Re: [PATCH v6 06/43] KVM: x86/mmu: Bug the VM if gmem attributes are queried to determine max mapping level
From: Fuad Tabba @ 2026-05-20 13:33 UTC (permalink / raw)
  To: ackerleytng
  Cc: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	ira.weiny, jmattson, jthoughton, michael.roth, oupton,
	pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
	steven.price, willy, wyihan, yan.y.zhao, forkloop, pratyush,
	suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
	Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
	Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
	Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka,
	kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <20260507-gmem-inplace-conversion-v6-6-91ab5a8b19a4@google.com>

On Thu, 7 May 2026 at 21:22, Ackerley Tng via B4 Relay
<devnull+ackerleytng.google.com@kernel.org> wrote:
>
> From: Ackerley Tng <ackerleytng@google.com>
>
> When the maximum mapping level is queried, KVM's MMU lock is held, and
> while the MMU lock is held, guest_memfd cannot take the
> filemap_invalidate_lock() to look up the current shared/private state of
> the gfn, for these reasons:
>
> + The MMU lock is a spinlock or rwlock and cannot be held while taking a
>   lock that can sleep.
> + In guest_memfd's code paths (such as truncate), the
>   filemap_invalidate_lock() is held while taking the MMU lock, and taking
>   the locks in reverse order would introduce a AB-BA deadlock.
>
> Currently, the maximum mapping level is only queried from guest_memfd in
> the process of recovering huge pages, if dirty logging is disabled on a
> memslot. Dirty logging is not currently supported for guest_memfd, and
> guest_memfd memslots also cannot be updated.
>
> For now, bug the VM if guest_memfd needs to be queried to determine the
> maximum mapping level. This guard can be removed if/when support is added.
>
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>
> ---
>  arch/x86/kvm/mmu/mmu.c | 9 +++++++++
>  1 file changed, 9 insertions(+)
>
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index a80a876ab4ad6..153bcc5369985 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -3357,6 +3357,15 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_page_fault *fault,
>                 max_level = fault->max_level;
>                 is_private = fault->is_private;
>         } else {
> +               /*
> +                * Memory attributes cannot be obtained from guest_memfd while
> +                * the MMU lock is held.
> +                */
> +               if (KVM_BUG_ON(static_call_query(__kvm_get_memory_attributes) ==
> +                              kvm_gmem_get_memory_attributes, kvm)) {
> +                       return 0;
> +               }
> +

This directly takes the address of kvm_gmem_get_memory_attributes,
which is only compiled if CONFIG_KVM_GUEST_MEMFD=y. This breaks
ARCH=i386.

Cheers,
/fuad

>                 max_level = PG_LEVEL_NUM;
>                 is_private = kvm_mem_is_private(kvm, gfn);
>         }
>
> --
> 2.54.0.563.g4f69b47b94-goog
>
>

^ permalink raw reply

* Re: [PATCH v3 1/3] soc: bcm2835: raspberrypi-firmware: Add voltage domain IDs
From: Guenter Roeck @ 2026-05-20 13:35 UTC (permalink / raw)
  To: Shubham Chakraborty
  Cc: Florian Fainelli, Jonathan Corbet, Shuah Khan,
	Broadcom internal kernel review list, Ray Jui, Scott Branden,
	linux-hwmon, linux-doc, linux-rpi-kernel, linux-arm-kernel,
	linux-kernel
In-Reply-To: <20260517080445.103962-2-chakrabortyshubham66@gmail.com>

On Sun, May 17, 2026 at 01:34:43PM +0530, Shubham Chakraborty wrote:
> Add Raspberry Pi firmware voltage domain identifiers for the mailbox
> property interface.
> 
> Also add the voltage request structure used with
> RPI_FIRMWARE_GET_VOLTAGE so firmware clients can share the common API
> definition from the firmware header.
> 
> Signed-off-by: Shubham Chakraborty <chakrabortyshubham66@gmail.com>
> Acked-by: Florian Fainelli <florian.fainelli@broadcom.com>

Applied.

Thanks,
Guenter

^ permalink raw reply

* Re: [PATCH v3 2/3] hwmon: raspberrypi: Add voltage input support
From: Guenter Roeck @ 2026-05-20 13:37 UTC (permalink / raw)
  To: Shubham Chakraborty
  Cc: Florian Fainelli, Jonathan Corbet, Shuah Khan,
	Broadcom internal kernel review list, Ray Jui, Scott Branden,
	linux-hwmon, linux-doc, linux-rpi-kernel, linux-arm-kernel,
	linux-kernel
In-Reply-To: <20260517080445.103962-3-chakrabortyshubham66@gmail.com>

On Sun, May 17, 2026 at 01:34:44PM +0530, Shubham Chakraborty wrote:
> Extend the raspberrypi-hwmon driver to expose firmware-provided
> voltage measurements through the hwmon subsystem.
> 
> The driver now exports the following voltage inputs:
> 
>   - in0_input (core)
>   - in1_input (sdram_c)
>   - in2_input (sdram_i)
>   - in3_input (sdram_p)
> 
> Voltage values returned by firmware are converted from microvolts
> to millivolts as expected by the hwmon subsystem.
> 
> Update the documentation related to it.
> 
> The existing undervoltage sticky alarm handling is preserved and
> associated with the first voltage channel.
> 
> Tested in -
> - Raspberry Pi 3b+ (Linux raspberrypi 6.12.75+rpt-rpi-v8 #1 SMP PREEMPT
>   Debian 1:6.12.75-1+rpt1 (2026-03-11) aarch64 GNU/Linux)
> 
> Signed-off-by: Shubham Chakraborty <chakrabortyshubham66@gmail.com>
> Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>

Applied, after fixing:

WARNING: Missing a blank line after declarations
#207: FILE: drivers/hwmon/raspberrypi-hwmon.c:74:
+	int ret;
+	ret = rpi_firmware_property(data->fw, RPI_FIRMWARE_GET_VOLTAGE,

Please run checkpatch on your patches.

Thanks,
Guenter

^ permalink raw reply

* Re: [PATCH v3 3/3] hwmon: raspberrypi: Fix delayed-work teardown race
From: Guenter Roeck @ 2026-05-20 13:39 UTC (permalink / raw)
  To: Shubham Chakraborty
  Cc: Florian Fainelli, Jonathan Corbet, Shuah Khan,
	Broadcom internal kernel review list, Ray Jui, Scott Branden,
	linux-hwmon, linux-doc, linux-rpi-kernel, linux-arm-kernel,
	linux-kernel
In-Reply-To: <20260517080445.103962-4-chakrabortyshubham66@gmail.com>

On Sun, May 17, 2026 at 01:34:45PM +0530, Shubham Chakraborty wrote:
> The delayed polling work rearms itself from the work function, so use
> explicit delayed-work setup and cleanup instead of
> devm_delayed_work_autocancel().
> 
> Initialize the delayed work with INIT_DELAYED_WORK() and register a
> devres cleanup action that calls disable_delayed_work_sync() during
> teardown.
> 
> This addresses the concern raised during review about the polling work
> being able to requeue itself while the driver is being removed.
> 
> Signed-off-by: Shubham Chakraborty <chakrabortyshubham66@gmail.com>

Applied.

Thanks,
Guenter

^ permalink raw reply

* [PATCH v10 00/25] Runtime TDX module update support
From: Chao Gao @ 2026-05-20 13:38 UTC (permalink / raw)
  To: kvm, linux-coco, x86, linux-kernel, linux-rt-devel, linux-doc
  Cc: binbin.wu, dave.hansen, djbw, ira.weiny, kai.huang, kas,
	nik.borisov, paulmck, pbonzini, reinette.chatre, rick.p.edgecombe,
	sagis, seanjc, tony.lindgren, vannapurve, vishal.l.verma,
	yilun.xu, xiaoyao.li, yan.y.zhao, Chao Gao, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, H. Peter Anvin,
	Sebastian Andrzej Siewior, Clark Williams, Steven Rostedt,
	Jonathan Corbet, Shuah Khan

Hi Dave & Rick,

Thanks for your thorough review of v9. This v10 addresses the issues you
pointed out. The main changes in this version are polishing changelogs
and variable renames to improve readability. Specifically:

   - Patches 1-2 (new): Split the original "Consolidate TDX global
     initialization states" into two steps — first move the statics to
     file scope, then clarify the result-caching logic in
     try_init_module_global().
   - Patch 6: Removed user-facing Kconfig help text for TDX_HOST_SERVICES
     (now a silent tristate auto-selected by INTEL_TDX_HOST).
   - Patch 13: Renamed "size" to "data_len" in seamldr_install_module()
     and init_seamldr_params(); renamed "HEADER_SIZE" to
     "TDX_IMAGE_HEADER_SIZE"; renamed "primary" to "is_lead_cpu" in the
     update state machine.
   - Patch 13: Added early data_len validation and explicit bounds checks
     on sigstruct_nr_pages/module_nr_pages against SEAMLDR_MAX_NR_*
     limits, removing the implicit clamping in populate_pa_list().
   - Patch 22: Fixed BIT(16) -> BIT_ULL(16) for
     TDX_SYS_SHUTDOWN_AVOID_COMPAT_SENSITIVE.
   - Patch 22: Removed unused TDX_FEATURES0_UPDATE_COMPAT definition.
   - Various patches: Shortened sysfs ABI descriptions, tightened
     comments across seamldr.h and seamldr.c, and minor style fixes
     (return 0 -> return false, unfolded conditionals)

Please take a look at this new version. I hope it can still be merged
for 7.2.
---

(For transparency, note that I used AI tools to help proofread this
cover-letter and commit messages)

This series adds support for runtime TDX module updates that preserve
running TDX guests. It is also available at:

  https://github.com/gaochaointel/linux-dev/commits/tdx-module-updates-v10/

== Background ==

Intel TDX isolates Trusted Domains (TDs), or confidential guests, from the
host. A key component of Intel TDX is the TDX module, which enforces
security policies to protect the memory and CPU states of TDs from the
host. However, the TDX module is software that requires updates.

== Problems ==

Currently, the TDX module is loaded by the BIOS at boot time, and the only
way to update it is through a reboot, which results in significant system
downtime. Users expect the TDX module to be updatable at runtime without
disrupting TDX guests.

== Solution ==

On TDX platforms, P-SEAMLDR[1] is a component within the protected SEAM
range. It is loaded by the BIOS and provides the host with functions to
install a TDX module at runtime.

This series implements runtime TDX module updates through the fw_upload
mechanism. That interface is a good fit because TDX module selection is not
a simple "load a known file from disk" problem. The update image to load
depends on module versioning, compatibility rules. fw_upload lets userspace
choose the module explicitly while the kernel provides the update
mechanism.

This design intentionally keeps most update validation/policy in userspace.
The kernel exposes the information userspace needs, such as TDX module
version and P-SEAMLDR information, but userspace is responsible for
understanding TDX module's versioning and compatibility rules and for
choosing an appropriate update image (see "TDX module versioning" below).

The kernel still enforces the pieces that must be handled in-kernel:

1. Validate the tdx_blob header fields that are not passed through tothe
TDX module. Just the standard overflow and reserved bits defensive ABI stuff.

2. Make sure no non-update SEAMCALLs are called during the update.

3. Make sure SEAMCALLs are on the right CPU, for any the user has made
available to the kernel.

4. Handle the race between updates and concurrent TD builds by
returning -EBUSY to userspace.

Everything else remains a userspace responsibility.

In the unlikely event the update fails, for example userspace picks an
incompatible update image, or the image is otherwise corrupted, all TDs
will experience SEAMCALL failures and be killed. The recovery of TD
operation from that event requires a reboot.

Given there is no mechanism to quiesce SEAMCALLs, the TDs themselves must
pause execution over an update. The most straightforward way to meet the
'pause TDs while update executes' constraint is to run the update in
stop_machine() context. All other evaluated solutions export more
complexity to KVM, or exports more fragility to userspace.

== How to test this series ==

NOTE: This v10 uses a new tdx_blob format. The scripts and module blobs in
https://github.com/intel/tdx-module-binaries have not yet been updated
to match this version. Those updates will be done separately later.

== Other information relevant to Runtime TDX module updates ==

=== TDX module versioning ===

Each TDX module is assigned a version number x.y.z, where x represents the
"major" version, y the "minor" version, and z the "update" version.

Runtime TDX module updates are restricted to Z-stream releases.

Note that Z-stream releases do not necessarily guarantee compatibility. A
new release may not be compatible with all previous versions. To address this,
Intel provides a separate file containing compatibility information, which
specifies the minimum module version required for a particular update. This
information is referenced by the tool to determine if two modules are
compatible.

=== TCB Stability ===

Updates change the TCB as viewed by attestation reports. In TDX there is
a distinction between "launch-time" version and "current" version where
runtime TDX module updates cause that "current" version number to change,
subject to Z-stream constraints.

The concern that a malicious host may attack confidential VMs by loading
insecure updates was addressed by Alex in [3]. Similarly, the scenario
where some "theoretical paranoid tenant" in the cloud wants to audit
updates and stop trusting the host after updates until audit completion
was also addressed in [4]. Users not in the cloud control the host machine
and can manage updates themselves, so they don't have these concerns.

See more about the implications of current TCB version changes in
attestation as summarized by Dave in [5].

=== TDX module Distribution Model ===

At a high level, Intel publishes all TDX modules on the github [2], along
with a mapping_file.json which documents the compatibility information
about each TDX module and a userspace tool to install the TDX module. OS
vendors can package these modules and distribute them. Administrators
install the package and use the tool to select the appropriate TDX module
and install it via the interfaces exposed by this series.

[1]: https://cdrdv2.intel.com/v1/dl/getContent/733584
[2]: https://github.com/intel/tdx-module-binaries
[3]: https://lore.kernel.org/all/665c5ae0-4b7c-4852-8995-255adf7b3a2f@amazon.com/
[4]: https://lore.kernel.org/all/5d1da767-491b-4077-b472-2cc3d73246d6@amazon.com/
[5]: https://lore.kernel.org/all/94d6047e-3b7c-4bc1-819c-85c16ff85abf@intel.com/

Chao Gao (24):
  x86/virt/tdx: Clarify try_init_module_global() result caching
  x86/virt/tdx: Move TDX global initialization states to file scope
  x86/virt/tdx: Consolidate TDX global initialization states
  x86/virt/tdx: Move TDX_FEATURES0 bits to asm/tdx.h
  coco/tdx-host: Introduce a "tdx_host" device
  coco/tdx-host: Expose TDX module version
  x86/virt/seamldr: Introduce a wrapper for P-SEAMLDR SEAMCALLs
  x86/virt/seamldr: Add a helper to retrieve P-SEAMLDR information
  coco/tdx-host: Expose P-SEAMLDR information via sysfs
  coco/tdx-host: Don't expose P-SEAMLDR information on CPUs with erratum
  coco/tdx-host: Implement firmware upload sysfs ABI for TDX module
    updates
  x86/virt/seamldr: Allocate and populate a module update request
  x86/virt/seamldr: Introduce skeleton for TDX module updates
  x86/virt/seamldr: Abort updates after a failed step
  x86/virt/seamldr: Shut down the current TDX module
  x86/virt/tdx: Reset software states during TDX module shutdown
  x86/virt/seamldr: Install a new TDX module
  x86/virt/seamldr: Do TDX global and per-CPU init after module
    installation
  x86/virt/tdx: Restore TDX module state
  x86/virt/tdx: Refresh TDX module version after update
  x86/virt/tdx: Reject updates during compatibility-sensitive operations
  x86/virt/tdx: Enable TDX module runtime updates
  coco/tdx-host: Document TDX module update compatibility criteria
  x86/virt/tdx: Document TDX module update

Kai Huang (1):
  x86/virt/tdx: Move low level SEAMCALL helpers out of <asm/tdx.h>

 .../ABI/testing/sysfs-devices-faux-tdx-host   |  66 ++++
 Documentation/arch/x86/tdx.rst                |  34 ++
 arch/x86/include/asm/cpufeatures.h            |   1 +
 arch/x86/include/asm/seamldr.h                |  36 ++
 arch/x86/include/asm/tdx.h                    |  66 +---
 arch/x86/include/asm/tdx_global_metadata.h    |   4 +
 arch/x86/include/asm/vmx.h                    |   1 +
 arch/x86/virt/vmx/tdx/Makefile                |   2 +-
 arch/x86/virt/vmx/tdx/seamcall_internal.h     | 109 ++++++
 arch/x86/virt/vmx/tdx/seamldr.c               | 324 ++++++++++++++++++
 arch/x86/virt/vmx/tdx/tdx.c                   | 169 +++++----
 arch/x86/virt/vmx/tdx/tdx.h                   |   8 +-
 arch/x86/virt/vmx/tdx/tdx_global_metadata.c   |  17 +-
 drivers/virt/coco/Kconfig                     |   2 +
 drivers/virt/coco/Makefile                    |   1 +
 drivers/virt/coco/tdx-host/Kconfig            |   6 +
 drivers/virt/coco/tdx-host/Makefile           |   1 +
 drivers/virt/coco/tdx-host/tdx-host.c         | 231 +++++++++++++
 18 files changed, 965 insertions(+), 113 deletions(-)
 create mode 100644 Documentation/ABI/testing/sysfs-devices-faux-tdx-host
 create mode 100644 arch/x86/include/asm/seamldr.h
 create mode 100644 arch/x86/virt/vmx/tdx/seamcall_internal.h
 create mode 100644 arch/x86/virt/vmx/tdx/seamldr.c
 create mode 100644 drivers/virt/coco/tdx-host/Kconfig
 create mode 100644 drivers/virt/coco/tdx-host/Makefile
 create mode 100644 drivers/virt/coco/tdx-host/tdx-host.c

base-commit: 5209e5bfe5cab593476c3e7754e42c5e47ce36de
-- 
2.52.0

^ permalink raw reply

* [PATCH v10 25/25] x86/virt/tdx: Document TDX module update
From: Chao Gao @ 2026-05-20 13:38 UTC (permalink / raw)
  To: kvm, linux-coco, linux-kernel, linux-doc
  Cc: binbin.wu, dave.hansen, djbw, ira.weiny, kai.huang, kas,
	nik.borisov, paulmck, pbonzini, reinette.chatre, rick.p.edgecombe,
	sagis, seanjc, tony.lindgren, vannapurve, vishal.l.verma,
	yilun.xu, xiaoyao.li, yan.y.zhao, Chao Gao, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, x86, H. Peter Anvin,
	Jonathan Corbet, Shuah Khan
In-Reply-To: <20260520133909.409394-1-chao.gao@intel.com>

Document TDX module update as a subsection of "TDX Host Kernel Support" to
provide background information and cover key points that developers and
users may need to know, for example:

 - update is done in stop_machine() context
 - update instructions and results
 - update policy and tooling

Signed-off-by: Chao Gao <chao.gao@intel.com>
Reviewed-by: Kai Huang <kai.huang@intel.com>
Reviewed-by: Kiryl Shutsemau (Meta) <kas@kernel.org>
Reviewed-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
---
 Documentation/arch/x86/tdx.rst | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/Documentation/arch/x86/tdx.rst b/Documentation/arch/x86/tdx.rst
index 1a3b5bac1021..9d2b7db166b5 100644
--- a/Documentation/arch/x86/tdx.rst
+++ b/Documentation/arch/x86/tdx.rst
@@ -73,6 +73,40 @@ initialize::
 
   [..] virt/tdx: TDX-Module initialization failed ...
 
+TDX module Runtime Update
+-------------------------
+
+The TDX architecture includes a persistent SEAM loader (P-SEAMLDR) that
+runs in SEAM mode separately from the TDX module. The kernel can
+communicate with P-SEAMLDR to perform runtime updates of the TDX module.
+
+During updates, the TDX module becomes unresponsive to other TDX
+operations. To prevent components using TDX (such as KVM) from
+experiencing unexpected errors during updates, updates are performed in
+stop_machine() context.
+
+TDX module updates have complex compatibility requirements; the new module
+must be compatible with the current CPU, P-SEAMLDR, and running TDX module.
+Rather than implementing complex module selection and policy enforcement
+logic in the kernel, userspace is responsible for auditing and selecting
+appropriate updates.
+
+Updates use the standard firmware upload interface. See
+Documentation/driver-api/firmware/fw_upload.rst for detailed instructions.
+
+If updates failed, running TDs may be killed and further TDX operations may
+not be possible until reboot. For detailed error information, see
+Documentation/ABI/testing/sysfs-devices-faux-tdx-host.
+
+Given the risk of losing existing TDs, userspace should verify that the
+update is compatible with the current system and properly validated before
+applying it.
+
+A reference userspace tool that implements necessary checks is available
+at:
+
+  https://github.com/intel/tdx-module-binaries
+
 TDX Interaction to Other Kernel Components
 ------------------------------------------
 
-- 
2.52.0


^ permalink raw reply related

* Re: [PATCH v10 00/25] Runtime TDX module update support
From: Chao Gao @ 2026-05-20 13:46 UTC (permalink / raw)
  To: kvm, linux-coco, x86, linux-kernel, linux-rt-devel, linux-doc
  Cc: binbin.wu, dave.hansen, djbw, ira.weiny, kai.huang, kas,
	nik.borisov, paulmck, pbonzini, reinette.chatre, rick.p.edgecombe,
	sagis, seanjc, tony.lindgren, vannapurve, vishal.l.verma,
	yilun.xu, xiaoyao.li, yan.y.zhao, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, H. Peter Anvin, Sebastian Andrzej Siewior,
	Clark Williams, Steven Rostedt, Jonathan Corbet, Shuah Khan
In-Reply-To: <20260520133909.409394-1-chao.gao@intel.com>

On Wed, May 20, 2026 at 06:38:03AM -0700, Chao Gao wrote:
>Hi Dave & Rick,
>
>Thanks for your thorough review of v9. This v10 addresses the issues you
>pointed out. The main changes in this version are polishing changelogs
>and variable renames to improve readability. Specifically:
> 
>   - Patches 1-2 (new): Split the original "Consolidate TDX global
>     initialization states" into two steps — first move the statics to
>     file scope, then clarify the result-caching logic in
>     try_init_module_global().
>   - Patch 6: Removed user-facing Kconfig help text for TDX_HOST_SERVICES
>     (now a silent tristate auto-selected by INTEL_TDX_HOST).
>   - Patch 13: Renamed "size" to "data_len" in seamldr_install_module()
>     and init_seamldr_params(); renamed "HEADER_SIZE" to
>     "TDX_IMAGE_HEADER_SIZE"; renamed "primary" to "is_lead_cpu" in the
>     update state machine.
>   - Patch 13: Added early data_len validation and explicit bounds checks
>     on sigstruct_nr_pages/module_nr_pages against SEAMLDR_MAX_NR_*
>     limits, removing the implicit clamping in populate_pa_list().
>   - Patch 22: Fixed BIT(16) -> BIT_ULL(16) for
>     TDX_SYS_SHUTDOWN_AVOID_COMPAT_SENSITIVE.
>   - Patch 22: Removed unused TDX_FEATURES0_UPDATE_COMPAT definition.
>   - Various patches: Shortened sysfs ABI descriptions, tightened
>     comments across seamldr.h and seamldr.c, and minor style fixes
>     (return 0 -> return false, unfolded conditionals)

FYI, below is the diff between v9 and v10:

diff --git a/Documentation/ABI/testing/sysfs-devices-faux-tdx-host b/Documentation/ABI/testing/sysfs-devices-faux-tdx-host
index 9e08db231da1..5f18ac972468 100644
--- a/Documentation/ABI/testing/sysfs-devices-faux-tdx-host
+++ b/Documentation/ABI/testing/sysfs-devices-faux-tdx-host
@@ -1,16 +1,14 @@
 What:		/sys/devices/faux/tdx_host/version
 Contact:	linux-coco@lists.linux.dev
-Description:	(RO) Report the version of the loaded TDX module. The TDX module
-		version is formatted as x.y.z, where "x" is the major version,
-		"y" is the minor version and "z" is the update version. Versions
-		are used for bug reporting, TDX module updates etc.
+Description:	(RO) Report the version of the loaded TDX module.
+		Formatted as "major.minor.update". Used by TDX module
+		update tooling. Example: "1.2.03".
 
 What:		/sys/devices/faux/tdx_host/seamldr_version
 Contact:	linux-coco@lists.linux.dev
-Description:	(RO) Report the version of the loaded P-SEAMLDR. The P-SEAMLDR
-		version is formatted as x.y.z, where "x" is the major version,
-		"y" is the minor version and "z" is the update version. Versions
-		are used for bug reporting and compatibility checks.
+Description:	(RO) Report the version of the loaded P-SEAMLDR.
+		Formatted as a TDX module version. Used by TDX module
+		update tooling.
 
 What:		/sys/devices/faux/tdx_host/num_remaining_updates
 Contact:	linux-coco@lists.linux.dev
diff --git a/arch/x86/include/asm/seamldr.h b/arch/x86/include/asm/seamldr.h
index ac6f80f7208b..43084e2daa2d 100644
--- a/arch/x86/include/asm/seamldr.h
+++ b/arch/x86/include/asm/seamldr.h
@@ -5,11 +5,10 @@
 #include <linux/types.h>
 
 /*
- * This is called the "SEAMLDR_INFO" data structure and is defined
- * in "SEAM Loader (SEAMLDR) Interface Specification".
+ * This is the "SEAMLDR_INFO" data structure defined in the
+ * "SEAM Loader (SEAMLDR) Interface Specification".
  *
- * The SEAMLDR.INFO documentation requires this to be aligned to a
- * 256-byte boundary.
+ * Must be aligned to a 256-byte boundary.
  */
 struct seamldr_info {
	u32	version;
@@ -32,6 +31,6 @@ struct seamldr_info {
 static_assert(sizeof(struct seamldr_info) == 256);
 
 int seamldr_get_info(struct seamldr_info *seamldr_info);
-int seamldr_install_module(const u8 *data, u32 size);
+int seamldr_install_module(const u8 *data, u32 data_len);
 
 #endif /* _ASM_X86_SEAMLDR_H */
diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index ac042b369843..c848483d815f 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -36,7 +36,6 @@
 /* Bit definitions of TDX_FEATURES0 metadata field */
 #define TDX_FEATURES0_TD_PRESERVING	BIT_ULL(1)
 #define TDX_FEATURES0_NO_RBP_MOD	BIT_ULL(18)
-#define TDX_FEATURES0_UPDATE_COMPAT	BIT_ULL(47)
 
 #ifndef __ASSEMBLER__
 
diff --git a/arch/x86/virt/vmx/tdx/seamldr.c b/arch/x86/virt/vmx/tdx/seamldr.c
index 6a39c9e3ef7d..ff95d8dd1162 100644
--- a/arch/x86/virt/vmx/tdx/seamldr.c
+++ b/arch/x86/virt/vmx/tdx/seamldr.c
@@ -32,10 +32,12 @@
 #define SEAMLDR_SCENARIO_UPDATE		1
 
 /*
- * This is called the "SEAMLDR_PARAMS" data structure and is defined
- * in "SEAM Loader (SEAMLDR) Interface Specification".
+ * This is the "SEAMLDR_PARAMS" data structure defined in the
+ * "SEAM Loader (SEAMLDR) Interface Specification".
  *
- * It describes the TDX module that will be installed.
+ * It is the in-memory ABI that the kernel passes to the P-SEAMLDR
+ * to update the TDX module. It breaks the TDX module image up in
+ * page-size pieces.
  */
 struct seamldr_params {
	u32	version;
@@ -87,7 +89,7 @@ static int seamldr_install(const struct seamldr_params *params)
 #define TDX_IMAGE_VERSION_2		0x200
 
 struct tdx_image_header {
-	u16	version; // This ABI is always 0x200
+	u16	version;
	u16	checksum;
	u8	signature[8];
	u32	sigstruct_nr_pages;
@@ -95,23 +97,28 @@ struct tdx_image_header {
	u8	reserved[4076];
 } __packed;
 
-#define HEADER_SIZE sizeof(struct tdx_image_header)
-static_assert(HEADER_SIZE == 4096);
+#define TDX_IMAGE_HEADER_SIZE sizeof(struct tdx_image_header)
+static_assert(TDX_IMAGE_HEADER_SIZE == 4096);
 
-/* Intel TDX module update ABI structure. aka. "TDX module blob". */
+/*
+ * Intel TDX module update ABI structure. aka. "TDX module blob".
+ *
+ * @payload contains sigstruct pages followed by module pages.
+ */
 struct tdx_image {
	struct tdx_image_header header;
-	u8 payload[]; // Contains sigstruct pages followed by module pages
+	u8 payload[];
 };
 
-static void populate_pa_list(u64 *pa_list, u32 max_entries, const u8 *start, u32 nr_pages)
+static void populate_pa_list(u64 *pa_list, const u8 *vmalloc_addr, u32 vmalloc_len_pages)
 {
	int i;
 
-	nr_pages = MIN(nr_pages, max_entries);
-	for (i = 0; i < nr_pages; i++) {
-		pa_list[i] = vmalloc_to_pfn(start) << PAGE_SHIFT;
-		start += PAGE_SIZE;
+	for (i = 0; i < vmalloc_len_pages; i++) {
+		unsigned long offset = i * PAGE_SIZE;
+		unsigned long pfn = vmalloc_to_pfn(&vmalloc_addr[offset]);
+
+		pa_list[i] = pfn << PAGE_SHIFT;
	}
 }
 
@@ -123,39 +130,43 @@ static void populate_seamldr_params(struct seamldr_params *params,
	params->scenario		= SEAMLDR_SCENARIO_UPDATE;
	params->module_nr_pages		= mod_nr_pages;
 
-	populate_pa_list(params->sigstruct_pages_pa_list, SEAMLDR_MAX_NR_SIG_PAGES,
-			 sig, sig_nr_pages);
-	populate_pa_list(params->module_pages_pa_list, SEAMLDR_MAX_NR_MODULE_PAGES,
-			 mod, mod_nr_pages);
+	populate_pa_list(params->sigstruct_pages_pa_list, sig, sig_nr_pages);
+	populate_pa_list(params->module_pages_pa_list, mod, mod_nr_pages);
 }
 
-static int init_seamldr_params(struct seamldr_params *params, const u8 *data, u32 size)
+/*
+ * @image points to a vmalloc()'d 'struct tdx_image'. Transform
+ * it into @params which is the P-SEAMLDR ABI format.
+ */
+static int init_seamldr_params(struct seamldr_params *params,
+			       const struct tdx_image *image,
+			       u32 image_len)
 {
-	const struct tdx_image *image		= (const void *)data;
	const struct tdx_image_header *header	= &image->header;
 
	u32 sigstruct_len	= header->sigstruct_nr_pages * PAGE_SIZE;
	u32 module_len		= header->module_nr_pages * PAGE_SIZE;
 
	u8 *header_start	= (u8 *)header;
-	u8 *header_end		= header_start + HEADER_SIZE;
+	u8 *header_end		= header_start + TDX_IMAGE_HEADER_SIZE;
 
	u8 *sigstruct_start	= header_end;
	u8 *sigstruct_end	= sigstruct_start + sigstruct_len;
 
	u8 *module_start	= sigstruct_end;
 
-	/* Check the calculated payload size against the data size. */
-	if (HEADER_SIZE + sigstruct_len + module_len != size)
+	/* Check the calculated payload size against the image size. */
+	if (TDX_IMAGE_HEADER_SIZE + sigstruct_len + module_len != image_len)
		return -EINVAL;
 
-	/*
-	 * Don't care about user passing the wrong file, but protect
-	 * kernel ABI by preventing accepting garbage.
-	 */
+	/* Reject unsupported tdx_image ABI versions. */
	if (header->version != TDX_IMAGE_VERSION_2)
		return -EINVAL;
 
+	if (header->sigstruct_nr_pages > SEAMLDR_MAX_NR_SIG_PAGES ||
+	    header->module_nr_pages > SEAMLDR_MAX_NR_MODULE_PAGES)
+		return -EINVAL;
+
	if (memcmp(header->signature, "TDX-BLOB", sizeof(header->signature)))
		return -EINVAL;
 
@@ -163,7 +174,7 @@ static int init_seamldr_params(struct seamldr_params *params, const u8 *data, u3
		return -EINVAL;
 
	populate_seamldr_params(params, sigstruct_start, header->sigstruct_nr_pages,
-				module_start, header->module_nr_pages);
+					module_start,    header->module_nr_pages);
	return 0;
 }
 
@@ -230,14 +241,14 @@ static int do_seamldr_install_module(void *seamldr_params)
 {
	enum module_update_state newstate, curstate = MODULE_UPDATE_START;
	int cpu = smp_processor_id();
-	bool primary;
+	bool is_lead_cpu;
	int ret = 0;
 
	/*
-	 * Use CPU 0 to execute update steps that must run exactly once.
-	 * Note CPU 0 is always online.
+	 * Some steps must be run on exactly one CPU. Pick a "lead" CPU to
+	 * execute those steps. Use CPU 0 because it is always online.
	 */
-	primary = cpu == 0;
+	is_lead_cpu = cpu == 0;
 
	do {
		newstate = READ_ONCE(update_ctrl.state);
@@ -250,7 +261,7 @@ static int do_seamldr_install_module(void *seamldr_params)
		curstate = newstate;
		switch (curstate) {
		case MODULE_UPDATE_SHUTDOWN:
-			if (primary)
+			if (is_lead_cpu)
				ret = tdx_module_shutdown();
			break;
		case MODULE_UPDATE_CPU_INSTALL:
@@ -260,7 +271,7 @@ static int do_seamldr_install_module(void *seamldr_params)
			ret = tdx_cpu_enable();
			break;
		case MODULE_UPDATE_RUN_UPDATE:
-			if (primary)
+			if (is_lead_cpu)
				ret = tdx_module_run_update();
			break;
		default:
@@ -276,20 +287,27 @@ static int do_seamldr_install_module(void *seamldr_params)
 /**
  * seamldr_install_module - Install a new TDX module.
  * @data: Pointer to the TDX module image.
- * @size: Size of the TDX module image.
+ * @data_len: Size of the TDX module image.
  *
  * Returns 0 on success, negative error code on failure.
  */
-int seamldr_install_module(const u8 *data, u32 size)
+int seamldr_install_module(const u8 *data, u32 data_len)
 {
	struct seamldr_params *params;
+	const struct tdx_image *image;
	int ret;
 
+	if (data_len < TDX_IMAGE_HEADER_SIZE)
+		return -EINVAL;
+
+	image = (const struct tdx_image *)data;
+
	params = kzalloc_obj(*params);
	if (!params)
		return -ENOMEM;
 
-	ret = init_seamldr_params(params, data, size);
+	/* Populate 'params' from 'image'. */
+	ret = init_seamldr_params(params, image, data_len);
	if (ret)
		goto out;
 
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index 2ab6f6efe6d1..0c5660c9ab45 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -69,6 +69,8 @@ static LIST_HEAD(tdx_memlist);
 
 static struct tdx_sys_info tdx_sysinfo;
 
+static DEFINE_RAW_SPINLOCK(sysinit_lock);
+
 /*
  * Do the module global initialization once and return its result.
  * It can be done on any cpu, and from task or IRQ context.
@@ -76,29 +78,34 @@ static struct tdx_sys_info tdx_sysinfo;
 static int try_init_module_global(void)
 {
	struct tdx_module_args args = {};
-	static DEFINE_RAW_SPINLOCK(sysinit_lock);
+	int ret;
 
	raw_spin_lock(&sysinit_lock);
 
-	if (tdx_module_state.sysinit_done)
+	/* Return the "cached" return code. */
+	if (tdx_module_state.sysinit_done) {
+		ret = tdx_module_state.sysinit_ret;
		goto out;
+	}
 
	/* RCX is module attributes and all bits are reserved */
	args.rcx = 0;
-	tdx_module_state.sysinit_ret = seamcall_prerr(TDH_SYS_INIT, &args);
+	ret = seamcall_prerr(TDH_SYS_INIT, &args);
 
	/*
	 * The first SEAMCALL also detects the TDX module, thus
	 * it can fail due to the TDX module is not loaded.
	 * Dump message to let the user know.
	 */
-	if (tdx_module_state.sysinit_ret == -ENODEV)
+	if (ret == -ENODEV)
		pr_err("module not loaded\n");
 
+	/* Save the return code for later callers. */
	tdx_module_state.sysinit_done = true;
+	tdx_module_state.sysinit_ret = ret;
 out:
	raw_spin_unlock(&sysinit_lock);
-	return tdx_module_state.sysinit_ret;
+	return ret;
 }
 
 /**
@@ -1267,7 +1274,7 @@ static __init int tdx_enable(void)
 }
 subsys_initcall(tdx_enable);
 
-#define TDX_SYS_SHUTDOWN_AVOID_COMPAT_SENSITIVE BIT(16)
+#define TDX_SYS_SHUTDOWN_AVOID_COMPAT_SENSITIVE BIT_ULL(16)
 
 int tdx_module_shutdown(void)
 {
diff --git a/drivers/virt/coco/tdx-host/Kconfig b/drivers/virt/coco/tdx-host/Kconfig
index ca600a39d97b..57d0c01a4357 100644
--- a/drivers/virt/coco/tdx-host/Kconfig
+++ b/drivers/virt/coco/tdx-host/Kconfig
@@ -1,12 +1,6 @@
 config TDX_HOST_SERVICES
-	tristate "TDX Host Services Driver"
+	tristate
	depends on INTEL_TDX_HOST
	select FW_LOADER
	select FW_UPLOAD
	default m
-	help
-	  Enable access to TDX host services like module update and
-	  extensions (e.g. TDX Connect).
-
-	  Say y or m if enabling support for confidential virtual machine
-	  support (CONFIG_INTEL_TDX_HOST). The module is called tdx_host.ko.
diff --git a/drivers/virt/coco/tdx-host/tdx-host.c b/drivers/virt/coco/tdx-host/tdx-host.c
index ad116e56aa1a..291464490fe0 100644
--- a/drivers/virt/coco/tdx-host/tdx-host.c
+++ b/drivers/virt/coco/tdx-host/tdx-host.c
@@ -76,6 +76,10 @@ static ssize_t num_remaining_updates_show(struct device *dev,
	return sysfs_emit(buf, "%u\n", info.num_remaining_updates);
 }
 
+/*
+ * These attributes are intended for managing TDX module updates. Reading
+ * them issues a slow, serialized P-SEAMLDR query, so keep them admin-only.
+ */
 static DEVICE_ATTR_ADMIN_RO(seamldr_version);
 static DEVICE_ATTR_ADMIN_RO(num_remaining_updates);
 
@@ -90,7 +94,10 @@ static bool supports_runtime_update(void)
	const struct tdx_sys_info *sysinfo = tdx_get_sysinfo();
 
	if (!sysinfo)
-		return 0;
+		return false;
+
+	if (!tdx_supports_runtime_update(sysinfo))
+		return false;
 
	/*
	 * Calling P-SEAMLDR on CPUs with the seamret_invd_vmcs bug clears
@@ -98,14 +105,17 @@ static bool supports_runtime_update(void)
	 * present before exposing P-SEAMLDR features.
	 */
	if (boot_cpu_has_bug(X86_BUG_SEAMRET_INVD_VMCS))
-		return 0;
+		return false;
 
-	return tdx_supports_runtime_update(sysinfo);
+	return true;
 }
 
 static umode_t seamldr_group_visible(struct kobject *kobj, struct attribute *attr, int idx)
 {
-	return supports_runtime_update() ? attr->mode : 0;
+	if (!supports_runtime_update())
+		return 0;
+
+	return attr->mode;
 }
 
 static const struct attribute_group seamldr_group = {
@@ -120,20 +130,20 @@ static const struct attribute_group *tdx_host_groups[] = {
 };
 
 static enum fw_upload_err tdx_fw_prepare(struct fw_upload *fwl,
-					 const u8 *data, u32 size)
+					 const u8 *data, u32 data_len)
 {
	return FW_UPLOAD_ERR_NONE;
 }
 
 static enum fw_upload_err tdx_fw_write(struct fw_upload *fwl, const u8 *data,
-				       u32 offset, u32 size, u32 *written)
+				       u32 offset, u32 data_len, u32 *written)
 {
	int ret;
 
-	ret = seamldr_install_module(data, size);
+	ret = seamldr_install_module(data, data_len);
	switch (ret) {
	case 0:
-		*written = size;
+		*written = data_len;
		return FW_UPLOAD_ERR_NONE;
	case -EBUSY:
		return FW_UPLOAD_ERR_BUSY;


^ permalink raw reply related

* Re: [PATCH v6 07/43] KVM: guest_memfd: Update kvm_gmem_populate() to use gmem attributes
From: Fuad Tabba @ 2026-05-20 13:47 UTC (permalink / raw)
  To: ackerleytng
  Cc: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	ira.weiny, jmattson, jthoughton, michael.roth, oupton,
	pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
	steven.price, willy, wyihan, yan.y.zhao, forkloop, pratyush,
	suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
	Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
	Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
	Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka,
	kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <20260507-gmem-inplace-conversion-v6-7-91ab5a8b19a4@google.com>

On Thu, 7 May 2026 at 21:22, Ackerley Tng via B4 Relay
<devnull+ackerleytng.google.com@kernel.org> wrote:
>
> From: Ackerley Tng <ackerleytng@google.com>
>
> Update the guest_memfd populate() flow to pull memory attributes from the
> gmem instance instead of the VM when KVM is not configured to track
> shared/private status in the VM.
>
> Rename the per-VM API to make it clear that it retrieves per-VM
> attributes, i.e. is not suitable for use outside of flows that are
> specific to generic per-VM attributes.
>
> Co-developed-by: Sean Christopherson <seanjc@google.com>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>

Reviewed-by: Fuad Tabba <tabba@google.com>
/fuad


> ---
>  arch/x86/kvm/mmu/mmu.c   |  2 +-
>  include/linux/kvm_host.h | 14 +++++++++++++-
>  virt/kvm/guest_memfd.c   | 24 +++++++++++++++++++++---
>  virt/kvm/kvm_main.c      |  8 +++-----
>  4 files changed, 38 insertions(+), 10 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 153bcc5369985..bfcf9be25598e 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -7997,7 +7997,7 @@ static bool hugepage_has_attrs(struct kvm *kvm, struct kvm_memory_slot *slot,
>         const unsigned long end = start + KVM_PAGES_PER_HPAGE(level);
>
>         if (level == PG_LEVEL_2M)
> -               return kvm_range_has_memory_attributes(kvm, start, end, ~0, attrs);
> +               return kvm_range_has_vm_memory_attributes(kvm, start, end, ~0, attrs);
>
>         for (gfn = start; gfn < end; gfn += KVM_PAGES_PER_HPAGE(level - 1)) {
>                 if (hugepage_test_mixed(slot, gfn, level - 1) ||
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 28a54298d27db..1deab76dc0a2c 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -2549,12 +2549,24 @@ static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
>  #endif
>
>  #ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
> -bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
> +extern bool vm_memory_attributes;
> +bool kvm_range_has_vm_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
>                                      unsigned long mask, unsigned long attrs);
>  bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
>                                         struct kvm_gfn_range *range);
>  bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
>                                          struct kvm_gfn_range *range);
> +#else
> +#define vm_memory_attributes false
> +static inline bool kvm_range_has_vm_memory_attributes(struct kvm *kvm,
> +                                                     gfn_t start, gfn_t end,
> +                                                     unsigned long mask,
> +                                                     unsigned long attrs)
> +{
> +       WARN_ONCE(1, "Unexpected call to kvm_range_has_vm_memory_attributes()");
> +
> +       return false;
> +}
>  #endif /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
>
>  unsigned long kvm_gmem_get_memory_attributes(struct kvm *kvm, gfn_t gfn);
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index f055e058a3f28..9d025f518c025 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -924,12 +924,31 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
>  EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn);
>
>  #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_POPULATE
> +static bool kvm_gmem_range_is_private(struct gmem_inode *gi, pgoff_t index,
> +                                     size_t nr_pages, struct kvm *kvm, gfn_t gfn)
> +{
> +       pgoff_t end = index + nr_pages - 1;
> +       void *entry;
> +
> +       if (vm_memory_attributes)
> +               return kvm_range_has_vm_memory_attributes(kvm, gfn, gfn + nr_pages,
> +                                                      KVM_MEMORY_ATTRIBUTE_PRIVATE,
> +                                                      KVM_MEMORY_ATTRIBUTE_PRIVATE);
> +
> +       mt_for_each(&gi->attributes, entry, index, end) {
> +               if (xa_to_value(entry) != KVM_MEMORY_ATTRIBUTE_PRIVATE)
> +                       return false;
> +       }
> +
> +       return true;
> +}
>
>  static long __kvm_gmem_populate(struct kvm *kvm, struct kvm_memory_slot *slot,
>                                 struct file *file, gfn_t gfn, struct page *src_page,
>                                 kvm_gmem_populate_cb post_populate, void *opaque)
>  {
>         pgoff_t index = kvm_gmem_get_index(slot, gfn);
> +       struct gmem_inode *gi;
>         struct folio *folio;
>         kvm_pfn_t pfn;
>         int ret;
> @@ -944,9 +963,8 @@ static long __kvm_gmem_populate(struct kvm *kvm, struct kvm_memory_slot *slot,
>
>         folio_unlock(folio);
>
> -       if (!kvm_range_has_memory_attributes(kvm, gfn, gfn + 1,
> -                                            KVM_MEMORY_ATTRIBUTE_PRIVATE,
> -                                            KVM_MEMORY_ATTRIBUTE_PRIVATE)) {
> +       gi = GMEM_I(file_inode(file));
> +       if (!kvm_gmem_range_is_private(gi, index, 1, kvm, gfn)) {
>                 ret = -EINVAL;
>                 goto out_put_folio;
>         }
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 4139e903f756a..0a4024948711a 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -103,9 +103,7 @@ module_param(allow_unsafe_mappings, bool, 0444);
>
>  #ifdef CONFIG_KVM_MEMORY_ATTRIBUTES
>  #ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
> -static bool vm_memory_attributes = true;
> -#else
> -#define vm_memory_attributes false
> +bool vm_memory_attributes = true;
>  #endif
>  DEFINE_STATIC_CALL_RET0(__kvm_get_memory_attributes, kvm_get_memory_attributes_t);
>  EXPORT_SYMBOL_FOR_KVM_INTERNAL(STATIC_CALL_KEY(__kvm_get_memory_attributes));
> @@ -2450,7 +2448,7 @@ static unsigned long kvm_get_vm_memory_attributes(struct kvm *kvm, gfn_t gfn)
>   * Returns true if _all_ gfns in the range [@start, @end) have attributes
>   * such that the bits in @mask match @attrs.
>   */
> -bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
> +bool kvm_range_has_vm_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
>                                      unsigned long mask, unsigned long attrs)
>  {
>         XA_STATE(xas, &kvm->mem_attr_array, start);
> @@ -2584,7 +2582,7 @@ static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
>         mutex_lock(&kvm->slots_lock);
>
>         /* Nothing to do if the entire range has the desired attributes. */
> -       if (kvm_range_has_memory_attributes(kvm, start, end, ~0, attributes))
> +       if (kvm_range_has_vm_memory_attributes(kvm, start, end, ~0, attributes))
>                 goto out_unlock;
>
>         /*
>
> --
> 2.54.0.563.g4f69b47b94-goog
>
>

^ permalink raw reply

* Re: [PATCH v4 1/4] Introducing pw_lock() and per-cpu queue & flush work
From: Sebastian Andrzej Siewior @ 2026-05-20 13:48 UTC (permalink / raw)
  To: Leonardo Bras
  Cc: Jonathan Corbet, Shuah Khan, Peter Zijlstra, Ingo Molnar,
	Will Deacon, Boqun Feng, Waiman Long, Andrew Morton,
	David Hildenbrand, Lorenzo Stoakes, Liam R. Howlett,
	Vlastimil Babka, Mike Rapoport, Suren Baghdasaryan, Michal Hocko,
	Jann Horn, Pedro Falcato, Brendan Jackman, Johannes Weiner,
	Zi Yan, Harry Yoo, Hao Li, Christoph Lameter, David Rientjes,
	Roman Gushchin, Chris Li, Kairui Song, Kemeng Shi, Nhat Pham,
	Baoquan He, Barry Song, Youngjun Park, Qi Zheng, Shakeel Butt,
	Axel Rasmussen, Yuanchu Xie, Wei Xu, Borislav Petkov (AMD),
	Randy Dunlap, Feng Tang, Dapeng Mi, Kees Cook, Marco Elver,
	Jakub Kicinski, Li RongQing, Eric Biggers, Paul E. McKenney,
	Nathan Chancellor, Nicolas Schier, Miguel Ojeda,
	Thomas Weißschuh, Thomas Gleixner, Douglas Anderson,
	Gary Guo, Christian Brauner, Pasha Tatashin, Coiby Xu,
	Masahiro Yamada, Frederic Weisbecker, linux-doc, linux-kernel,
	linux-mm, linux-rt-devel, Marcelo Tosatti
In-Reply-To: <20260519012754.240804-2-leobras.c@gmail.com>

On 2026-05-18 22:27:47 [-0300], Leonardo Bras wrote:
> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> index 4d0f545fb3ec..68c8a6f9d227 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -2810,20 +2810,30 @@ Kernel parameters
>  			  If a queue's affinity mask contains only isolated
>  			  CPUs then this parameter has no effect on the
>  			  interrupt routing decision, though interrupts are
>  			  only delivered when tasks running on those
>  			  isolated CPUs submit IO. IO submitted on
>  			  housekeeping CPUs has no influence on those
>  			  queues.
>  
>  			The format of <cpu-list> is described above.
>  
> +	pwlocks=	[KNL,SMP] Select a behavior on per-CPU resource sharing
> +			and remote interference mechanism on a kernel built with
> +			CONFIG_PWLOCKS.
> +			Format: { "0" | "1" }
> +			0 - local_lock() + queue_work_on(remote_cpu)
> +			1 - spin_lock() for both local and remote operations
> +
> +			Selecting 1 may be interesting for systems that want
> +			to avoid interruption & context switches from IPIs.
> +

This documentation is supposed to be for an administrator/ user of the
system. Exposing him to underlying kernel technique shouldn't happen.
It does not explain the users/ outcome so it sounds like best hope.

>  	iucv=		[HW,NET]
>  
>  	ivrs_ioapic	[HW,X86-64]
>  			Provide an override to the IOAPIC-ID<->DEVICE-ID
>  			mapping provided in the IVRS ACPI table.
>  			By default, PCI segment is 0, and can be omitted.
>  
>  			For example, to map IOAPIC-ID decimal 10 to
>  			PCI segment 0x1 and PCI device 00:14.0,
>  			write the parameter as:
> diff --git a/Documentation/locking/pwlocks.rst b/Documentation/locking/pwlocks.rst
> new file mode 100644
> index 000000000000..09f4a5417bc1
> --- /dev/null
> +++ b/Documentation/locking/pwlocks.rst
> @@ -0,0 +1,76 @@
> +.. SPDX-License-Identifier: GPL-2.0
> +
> +=========
> +PW (Per-CPU Work) locks
> +=========
> +
> +Some places in the kernel implement a parallel programming strategy
> +consisting on local_locks() for most of the work, and some rare remote
> +operations are scheduled on target cpu. This keeps cache bouncing low since
> +cacheline tends to be mostly local, and avoids the cost of locks in non-RT

PREEMPT_RT can be spelled out if you mean it so it is not confused with
other meanings of the two letters.

> +kernels, even though the very few remote operations will be expensive due
> +to scheduling overhead.
> +
> +On the other hand, for RT workloads this can represent a problem:
> +scheduling work on remote cpu that are executing low latency tasks
> +is undesired and can introduce unexpected deadline misses.
> +
> +PW locks help to convert sites that use local_locks (for cpu local operations)
> +and queue_work_on (for queueing work remotely, to be executed
> +locally on the owner cpu of the lock) to a spinlocks.

not spinlocks.

> +
> +The lock is declared pw_lock_t type.
> +The lock is initialized with pw_lock_init.
> +The lock is locked with pw_lock (takes a lock and cpu as a parameter).
> +The lock is unlocked with pw_unlock (takes a lock and cpu as a parameter).

If it is a function, it should end with ()

> +The pw_lock_irqsave function disables interrupts and saves current interrupt state,
> +cpu as a parameter.

CPU.

> +For trylock variant, there is the pw_trylock_t type, initialized with
> +pw_trylock_init. Then the corresponding pw_trylock and pw_trylock_irqsave.
> +
> +work_struct should be replaced by pw_struct, which contains a cpu parameter
> +(owner cpu of the lock), initialized by INIT_PW.
> +
> +The queue work related functions (analogous to queue_work_on and flush_work) are:
> +pw_queue_on and pw_flush.
> +
> +The behaviour of the PW lock functions is as follows:
> +
> +* !CONFIG_PWLOCKS (or CONFIG_PWLOCKS and pwlocks=off kernel boot parameter):
> +        - pw_lock:			local_lock
> +        - pw_lock_irqsave:		local_lock_irqsave
> +        - pw_trylock:			local_trylock
> +        - pw_trylock_irqsave:		local_trylock_irqsave
> +        - pw_unlock:			local_unlock
> +        - pw_lock_local:		local_lock
> +        - pw_trylock_local:		local_trylock
> +        - pw_unlock_local:		local_unlock
> +        - pw_queue_on:         		queue_work_on
> +        - pw_flush:	            	flush_work
> +
> +* CONFIG_PWLOCKS (and CONFIG_PWLOCKS_DEFAULT=y or pwlocks=on kernel boot parameter),
> +        - pw_lock:			spin_lock
> +        - pw_lock_irqsave:		spin_lock_irqsave
> +        - pw_trylock:			spin_trylock
> +        - pw_trylock_irqsave:		spin_trylock_irqsave
> +        - pw_unlock:			spin_unlock
> +        - pw_lock_local:		preempt_disable OR migrate_disable + spin_lock
> +        - pw_trylock_local:		preempt_disable OR migrate_disable + spin_trylock
> +        - pw_unlock_local:		preempt_enable OR migrate_enable + spin_unlock
> +        - pw_queue_on:         		executes work function on caller cpu
> +        - pw_flush:            		empty
> +
> +pw_get_cpu(work_struct), to be called from within per-cpu work function,
> +returns the target cpu.
> +
> +On the locking functions above, there are the local locking functions
> +(pw_lock_local, pw_trylock_local and pw_unlock_local) that must only
> +be used to access per-CPU data from the CPU that owns that data,
> +and never remotely. They disable preemption/migration and don't require
> +a cpu parameter, making them a replacement for local_lock functions that
> +does not introduce overhead.

Why do you need to either the one or the other? My only guess is that
migrate_disable() is sufficient but you prefer preempt_disable() on
!PREEMPT_RT because it is cheaper.

> +These should only be used when accessing per-CPU data of the local CPU.
> +
> diff --git a/init/Kconfig b/init/Kconfig
> index 2937c4d308ae..3fb751dc4530 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -764,20 +764,55 @@ config CPU_ISOLATION
>  	depends on SMP
>  	default y
>  	help
>  	  Make sure that CPUs running critical tasks are not disturbed by
>  	  any source of "noise" such as unbound workqueues, timers, kthreads...
>  	  Unbound jobs get offloaded to housekeeping CPUs. This is driven by
>  	  the "isolcpus=" boot parameter.
>  
>  	  Say Y if unsure.
>  
> +config PWLOCKS
> +	bool "Per-CPU Work locks"
> +	depends on SMP || COMPILE_TEST
> +	default n
> +	help
> +	  Allow changing the behavior on per-CPU resource sharing with cache,
> +	  from the regular local_locks() + queue_work_on(remote_cpu) to using
> +	  per-CPU spinlocks on both local and remote operations.
> +
> +	  This is useful to give user the option on reducing IPIs to CPUs, and
> +	  thus reduce interruptions and context switches. On the other hand, it
> +	  increases generated code and will use atomic operations if spinlocks
> +	  are selected.

I think the goal is to avoid scheduling a task on a remote CPU to get
something done.

> +
> +	  If set, will use the default behavior set in PWLOCKS_DEFAULT unless boot
> +	  parameter pwlocks is passed with a different behavior.
> +
> +	  If unset, will use the local_lock() + queue_work_on() strategy,
> +	  regardless of the boot parameter or PWLOCKS_DEFAULT.

This sounds like it affects the greater kernel.

> +	  Say N if unsure.
> +
> +config PWLOCKS_DEFAULT
> +	bool "Use per-CPU spinlocks by default on PWLOCKS"
> +	depends on PWLOCKS
> +	default n

n is default.

> +	help
> +	  If set, will use per-CPU spinlocks as default behavior for per-CPU
> +	  remote operations.
> +
> +	  If unset, will use local_lock() + queue_work_on(cpu) as default
> +	  behavior for remote operations.
> +
> +	  Say N if unsure
> +
>  source "kernel/rcu/Kconfig"
>  
>  config IKCONFIG
>  	tristate "Kernel .config support"
>  	help
>  	  This option enables the complete Linux kernel ".config" file
>  	  contents to be saved in the kernel. It provides documentation
>  	  of which kernel options are used in a running kernel or in an
>  	  on-disk kernel.  This information can be extracted from the kernel
>  	  image file with the script scripts/extract-ikconfig and used as
> diff --git a/include/linux/pwlocks.h b/include/linux/pwlocks.h
> new file mode 100644
> index 000000000000..3d79621655f9
> --- /dev/null
> +++ b/include/linux/pwlocks.h
> @@ -0,0 +1,265 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _LINUX_PWLOCKS_H
> +#define _LINUX_PWLOCKS_H
> +
> +#include "linux/spinlock.h"
> +#include "linux/local_lock.h"
> +#include "linux/workqueue.h"
> +
> +#ifndef CONFIG_PWLOCKS
> +
> +typedef local_lock_t pw_lock_t;
> +typedef local_trylock_t pw_trylock_t;
> +
> +struct pw_struct {
> +	struct work_struct work;
> +};
> +
> +#define pw_lock_init(lock)				\
> +	local_lock_init(lock)
> +
> +#define pw_trylock_init(lock)				\
> +	local_trylock_init(lock)
> +
> +#define pw_lock(lock, cpu)				\
> +	local_lock(lock)
> +
> +#define pw_lock_local(lock)				\
> +	local_lock(lock)
> +
> +#define pw_lock_irqsave(lock, flags, cpu)		\
> +	local_lock_irqsave(lock, flags)

The part where you have a `cpu' argument which is not used is entirely
confusing.

> +
> +#define pw_lock_local_irqsave(lock, flags)		\
> +	local_lock_irqsave(lock, flags)
> +
> +#define pw_trylock(lock, cpu)				\
> +	local_trylock(lock)
> +
> +#define pw_trylock_local(lock)				\
> +	local_trylock(lock)
> +
> +#define pw_trylock_irqsave(lock, flags, cpu)		\
> +	local_trylock_irqsave(lock, flags)
> +
> +#define pw_unlock(lock, cpu)				\
> +	local_unlock(lock)
> +
> +#define pw_unlock_local(lock)				\
> +	local_unlock(lock)
> +
> +#define pw_unlock_irqrestore(lock, flags, cpu)		\
> +	local_unlock_irqrestore(lock, flags)
> +
> +#define pw_unlock_local_irqrestore(lock, flags)		\
> +	local_unlock_irqrestore(lock, flags)
> +
> +#define pw_lockdep_assert_held(lock)			\
> +	lockdep_assert_held(lock)
> +
> +#define pw_queue_on(c, wq, pw)				\
> +	queue_work_on(c, wq, &(pw)->work)
> +
> +#define pw_flush(pw)					\
> +	flush_work(&(pw)->work)
> +
> +#define pw_get_cpu(pw)	smp_processor_id()
> +
> +#define pw_is_cpu_remote(cpu)		(false)
> +
> +#define INIT_PW(pw, func, c)				\
> +	INIT_WORK(&(pw)->work, (func))
> +
> +#else /* CONFIG_PWLOCKS */
> +
> +DECLARE_STATIC_KEY_MAYBE(CONFIG_PWLOCKS_DEFAULT, pw_sl);
> +
> +typedef union {
> +	spinlock_t sl;
> +	local_lock_t ll;
> +} pw_lock_t;
> +
> +typedef union {
> +	spinlock_t sl;
> +	local_trylock_t ll;
> +} pw_trylock_t;

Why do you use local_trylock_t ? Its use case is different compared to
local_lock_t. _IF_ you are fine with local_trylock_t then you should be
able to deal with a per-CPU spinlock_t and none of this should be
needed.

> +struct pw_struct {
> +	struct work_struct work;
> +	int cpu;
> +};
> +
> +#ifdef CONFIG_PREEMPT_RT
> +#define preempt_or_migrate_disable migrate_disable
> +#define preempt_or_migrate_enable migrate_enable
> +#else
> +#define preempt_or_migrate_disable preempt_disable
> +#define preempt_or_migrate_enable preempt_enable
> +#endif

if then () but this looks terrible.

> +
> +#define pw_lock_init(lock)							\
> +do {										\
> +	if (static_branch_maybe(CONFIG_PWLOCKS_DEFAULT, &pw_sl))		\
> +		spin_lock_init(lock.sl);					\
> +	else									\
> +		local_lock_init(lock.ll);					\
> +} while (0)
> +
> +#define pw_trylock_init(lock)							\
> +do {										\
> +	if (static_branch_maybe(CONFIG_PWLOCKS_DEFAULT, &pw_sl))		\
> +		spin_lock_init(lock.sl);					\
> +	else									\
> +		local_trylock_init(lock.ll);					\
> +} while (0)
> +
> +#define pw_lock(lock, cpu)							\
> +do {										\
> +	if (static_branch_maybe(CONFIG_PWLOCKS_DEFAULT, &pw_sl))		\
> +		spin_lock(per_cpu_ptr(lock.sl, cpu));				\
> +	else									\
> +		local_lock(lock.ll);						\
> +} while (0)
> +
> +#define pw_lock_local(lock)							\
> +do {										\
> +	if (static_branch_maybe(CONFIG_PWLOCKS_DEFAULT, &pw_sl)) {		\
> +		preempt_or_migrate_disable();					\
> +		spin_lock(this_cpu_ptr(lock.sl));				\
> +	} else {								\
> +		local_lock(lock.ll);						\
> +	}									\
> +} while (0)
> +
> +#define pw_lock_irqsave(lock, flags, cpu)					\
> +do {										\
> +	if (static_branch_maybe(CONFIG_PWLOCKS_DEFAULT, &pw_sl))		\
> +		spin_lock_irqsave(per_cpu_ptr(lock.sl, cpu), flags);	\
> +	else									\
> +		local_lock_irqsave(lock.ll, flags);				\
> +} while (0)
> +
> +#define pw_lock_local_irqsave(lock, flags)					\
> +do {										\
> +	if (static_branch_maybe(CONFIG_PWLOCKS_DEFAULT, &pw_sl)) {		\
> +		preempt_or_migrate_disable();					\
> +		spin_lock_irqsave(this_cpu_ptr(lock.sl), flags);		\
> +	} else {								\
> +		local_lock_irqsave(lock.ll, flags);				\
> +	}									\
> +} while (0)
> +
> +#define pw_trylock(lock, cpu)							\
> +({										\
> +	int t;									\
> +	if (static_branch_maybe(CONFIG_PWLOCKS_DEFAULT, &pw_sl))		\
> +		t = spin_trylock(per_cpu_ptr(lock.sl, cpu));			\
> +	else									\
> +		t = local_trylock(lock.ll);					\
> +	t;									\
> +})
> +
> +#define pw_trylock_local(lock)							\
> +({										\
> +	int t;									\
> +	if (static_branch_maybe(CONFIG_PWLOCKS_DEFAULT, &pw_sl)) {		\
> +		preempt_or_migrate_disable();					\
> +		t = spin_trylock(this_cpu_ptr(lock.sl));			\
> +		if (!t)								\
> +			preempt_or_migrate_enable();				\
> +	} else {								\
> +		t = local_trylock(lock.ll);					\
> +	}									\
> +	t;									\
> +})
> +
> +#define pw_trylock_irqsave(lock, flags, cpu)					\
> +({										\
> +	int t;									\
> +	if (static_branch_maybe(CONFIG_PWLOCKS_DEFAULT, &pw_sl))		\
> +		t = spin_trylock_irqsave(per_cpu_ptr(lock.sl, cpu), flags);	\
> +	else									\
> +		t = local_trylock_irqsave(lock.ll, flags);			\
> +	t;									\
> +})
> +
> +#define pw_unlock(lock, cpu)							\
> +do {										\
> +	if (static_branch_maybe(CONFIG_PWLOCKS_DEFAULT, &pw_sl))		\
> +		spin_unlock(per_cpu_ptr(lock.sl, cpu));			\
> +	else									\
> +		local_unlock(lock.ll);					\
> +} while (0)
> +
> +#define pw_unlock_local(lock)							\
> +do {										\
> +	if (static_branch_maybe(CONFIG_PWLOCKS_DEFAULT, &pw_sl)) {		\
> +		spin_unlock(this_cpu_ptr(lock.sl));				\
> +		preempt_or_migrate_enable();					\
> +	} else {								\
> +		local_unlock(lock.ll);						\
> +	}									\
> +} while (0)
> +
> +#define pw_unlock_irqrestore(lock, flags, cpu)					\
> +do {										\
> +	if (static_branch_maybe(CONFIG_PWLOCKS_DEFAULT, &pw_sl))		\
> +		spin_unlock_irqrestore(per_cpu_ptr(lock.sl, cpu), flags);	\
> +	else									\
> +		local_unlock_irqrestore(lock.ll, flags);			\
> +} while (0)
> +
> +#define pw_unlock_local_irqrestore(lock, flags)					\
> +do {										\
> +	if (static_branch_maybe(CONFIG_PWLOCKS_DEFAULT, &pw_sl)) {		\
> +		spin_unlock_irqrestore(this_cpu_ptr(lock.sl), flags);	\
> +		preempt_or_migrate_enable();					\
> +	} else {								\
> +		local_unlock_irqrestore(lock.ll, flags);			\
> +	}									\
> +} while (0)
> +
> +#define pw_lockdep_assert_held(lock)						\
> +do {										\
> +	if (static_branch_maybe(CONFIG_PWLOCKS_DEFAULT, &pw_sl))		\
> +		lockdep_assert_held(this_cpu_ptr(lock.sl));			\
> +	else									\
> +		lockdep_assert_held(this_cpu_ptr(lock.ll));			\
> +} while (0)
> +
> +#define pw_queue_on(c, wq, pw)							\
> +do {										\
> +	int __c = c;								\
> +	struct pw_struct *__pw = (pw);						\
> +	if (static_branch_maybe(CONFIG_PWLOCKS_DEFAULT, &pw_sl)) {		\
> +		WARN_ON((__c) != __pw->cpu);					\
> +		__pw->work.func(&__pw->work);					\
> +	} else {								\
> +		queue_work_on(__c, wq, &(__pw)->work);				\
> +	}									\
> +} while (0)
> +
> +/*
> + * Does nothing if PWLOCKS is set to use spinlock, as the task is already done at the
> + * time pw_queue_on() returns.
> + */
> +#define pw_flush(pw)								\
> +do {										\
> +	struct pw_struct *__pw = (pw);						\
> +	if (!static_branch_maybe(CONFIG_PWLOCKS_DEFAULT, &pw_sl))		\
> +		flush_work(&__pw->work);					\
> +} while (0)

I don't think this should be a collection of macros. Either proper
functions or static inline _if_ this is performance critical for some
reason.

> +
> +#define pw_get_cpu(w)			container_of((w), struct pw_struct, work)->cpu
> +
> +#define pw_is_cpu_remote(cpu)		((cpu) != smp_processor_id())
> +
> +#define INIT_PW(pw, func, c)							\
> +do {										\
> +	struct pw_struct *__pw = (pw);						\
> +	INIT_WORK(&__pw->work, (func));						\
> +	__pw->cpu = (c);							\
> +} while (0)
> +
> +#endif /* CONFIG_PWLOCKS */
> +#endif /* LINUX_PWLOCKS_H */
> diff --git a/kernel/pwlocks.c b/kernel/pwlocks.c
> new file mode 100644
> index 000000000000..1ebf5cb979b9
> --- /dev/null
> +++ b/kernel/pwlocks.c
> @@ -0,0 +1,47 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#include "linux/export.h"
> +#include <linux/sched.h>
> +#include <linux/pwlocks.h>
> +#include <linux/string.h>
> +#include <linux/sched/isolation.h>
> +
> +DEFINE_STATIC_KEY_MAYBE(CONFIG_PWLOCKS_DEFAULT, pw_sl);
> +EXPORT_SYMBOL(pw_sl);
> +
> +static bool pwlocks_param_specified;
> +
> +static int __init pwlocks_setup(char *str)
> +{
> +	int opt;
> +
> +	if (!get_option(&str, &opt)) {
> +		pr_warn("PWLOCKS: invalid pwlocks parameter: %s, ignoring.\n", str);
> +		return 0;
> +	}
> +
> +	if (opt)
> +		static_branch_enable(&pw_sl);
> +	else
> +		static_branch_disable(&pw_sl);
> +
> +	pwlocks_param_specified = true;
> +
> +	return 1;
> +}
> +__setup("pwlocks=", pwlocks_setup);
> +
> +/*
> + * Enable PWLOCKS if CPUs want to avoid kernel noise.
> + */
> +static int __init pwlocks_init(void)
> +{
> +	if (pwlocks_param_specified)
> +		return 0;
> +
> +	if (housekeeping_enabled(HK_TYPE_KERNEL_NOISE))
> +		static_branch_enable(&pw_sl);

How likely is it, that you you had users before late_initcall()? Also
can it happen that one of them uses one function to lock and the other
unlock in this brief window? There is no check if this was used before
static_branch usage.

> +
> +	return 0;
> +}
> +
> +late_initcall(pwlocks_init);

Sebastian

^ permalink raw reply

* Re: [PATCH v6 08/43] KVM: guest_memfd: Only prepare folios for private pages
From: Fuad Tabba @ 2026-05-20 13:51 UTC (permalink / raw)
  To: ackerleytng
  Cc: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	ira.weiny, jmattson, jthoughton, michael.roth, oupton,
	pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
	steven.price, willy, wyihan, yan.y.zhao, forkloop, pratyush,
	suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
	Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
	Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
	Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka,
	kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <20260507-gmem-inplace-conversion-v6-8-91ab5a8b19a4@google.com>

On Thu, 7 May 2026 at 21:22, Ackerley Tng via B4 Relay
<devnull+ackerleytng.google.com@kernel.org> wrote:
>
> From: Ackerley Tng <ackerleytng@google.com>
>
> All-shared guest_memfd used to be only supported for non-CoCo VMs where
> preparation doesn't apply. INIT_SHARED is about to be supported for
> non-CoCo VMs in a later patch in this series.
>
> In addition, KVM_SET_MEMORY_ATTRIBUTES2 is about to be supported in
> guest_memfd in a later patch in this series.
>
> This means that the kvm fault handler may now call kvm_gmem_get_pfn() on a
> shared folio for a CoCo VM where preparation applies.
>
> Add a check to make sure that preparation is only performed for private
> folios.
>
> Preparation will be undone on freeing (see kvm_gmem_free_folio()) and on
> conversion to shared.
>
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>

Reviewed-by: Fuad Tabba <tabba@google.com>

Cheers,
/fuad
> ---
>  virt/kvm/guest_memfd.c | 9 ++++++---
>  1 file changed, 6 insertions(+), 3 deletions(-)
>
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index 9d025f518c025..4f7c4824c3a45 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -888,6 +888,7 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
>                      int *max_order)
>  {
>         pgoff_t index = kvm_gmem_get_index(slot, gfn);
> +       struct inode *inode;
>         struct folio *folio;
>         int r = 0;
>
> @@ -895,7 +896,8 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
>         if (!file)
>                 return -EFAULT;
>
> -       filemap_invalidate_lock_shared(file_inode(file)->i_mapping);
> +       inode = file_inode(file);
> +       filemap_invalidate_lock_shared(inode->i_mapping);
>
>         folio = __kvm_gmem_get_pfn(file, slot, index, pfn, max_order);
>         if (IS_ERR(folio)) {
> @@ -908,7 +910,8 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
>                 folio_mark_uptodate(folio);
>         }
>
> -       r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
> +       if (kvm_gmem_is_private_mem(inode, index))
> +               r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
>
>         folio_unlock(folio);
>
> @@ -918,7 +921,7 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
>                 folio_put(folio);
>
>  out:
> -       filemap_invalidate_unlock_shared(file_inode(file)->i_mapping);
> +       filemap_invalidate_unlock_shared(inode->i_mapping);
>         return r;
>  }
>  EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn);
>
> --
> 2.54.0.563.g4f69b47b94-goog
>
>

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox