Linux Confidential Computing Development

Linux Confidential Computing Development
 help / color / mirror / Atom feed

* Re: [PATCH v6 24/43] KVM: selftests: Rename guest_memfd{,_offset} to gmem_{fd,offset}
From: Fuad Tabba @ 2026-05-21 12:13 UTC (permalink / raw)
  To: ackerleytng
  Cc: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	ira.weiny, jmattson, jthoughton, michael.roth, oupton,
	pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
	steven.price, willy, wyihan, yan.y.zhao, forkloop, pratyush,
	suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
	Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
	Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
	Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka,
	kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <20260507-gmem-inplace-conversion-v6-24-91ab5a8b19a4@google.com>

On Thu, 7 May 2026 at 21:23, Ackerley Tng via B4 Relay
<devnull+ackerleytng.google.com@kernel.org> wrote:
>
> From: Sean Christopherson <seanjc@google.com>
>
> Rename local variables and function parameters for the guest memory file
> descriptor and its offset to use a "gmem_" prefix instead of
> "guest_memfd_".
>
> No functional change intended.
>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>

Reviewed-by: Fuad Tabba <tabba@google.com>

Cheers,
/fuad

> ---
>  tools/testing/selftests/kvm/include/kvm_util.h |  6 +++---
>  tools/testing/selftests/kvm/lib/kvm_util.c     | 26 +++++++++++++-------------
>  2 files changed, 16 insertions(+), 16 deletions(-)
>
> diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
> index 2ecaaa0e99654..f19383376ee8e 100644
> --- a/tools/testing/selftests/kvm/include/kvm_util.h
> +++ b/tools/testing/selftests/kvm/include/kvm_util.h
> @@ -690,17 +690,17 @@ int __vm_set_user_memory_region(struct kvm_vm *vm, u32 slot, u32 flags,
>                                 gpa_t gpa, u64 size, void *hva);
>  void vm_set_user_memory_region2(struct kvm_vm *vm, u32 slot, u32 flags,
>                                 gpa_t gpa, u64 size, void *hva,
> -                               u32 guest_memfd, u64 guest_memfd_offset);
> +                               u32 gmem_fd, u64 gmem_offset);
>  int __vm_set_user_memory_region2(struct kvm_vm *vm, u32 slot, u32 flags,
>                                  gpa_t gpa, u64 size, void *hva,
> -                                u32 guest_memfd, u64 guest_memfd_offset);
> +                                u32 gmem_fd, u64 gmem_offset);
>
>  void vm_userspace_mem_region_add(struct kvm_vm *vm,
>                                  enum vm_mem_backing_src_type src_type,
>                                  gpa_t gpa, u32 slot, u64 npages, u32 flags);
>  void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type,
>                 gpa_t gpa, u32 slot, u64 npages, u32 flags,
> -               int guest_memfd_fd, u64 guest_memfd_offset);
> +               int gmem_fd, u64 gmem_offset);
>
>  #ifndef vm_arch_has_protected_memory
>  static inline bool vm_arch_has_protected_memory(struct kvm_vm *vm)
> diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
> index df73b23a4c66a..11da9b7546d03 100644
> --- a/tools/testing/selftests/kvm/lib/kvm_util.c
> +++ b/tools/testing/selftests/kvm/lib/kvm_util.c
> @@ -947,7 +947,7 @@ void vm_set_user_memory_region(struct kvm_vm *vm, u32 slot, u32 flags,
>
>  int __vm_set_user_memory_region2(struct kvm_vm *vm, u32 slot, u32 flags,
>                                  gpa_t gpa, u64 size, void *hva,
> -                                u32 guest_memfd, u64 guest_memfd_offset)
> +                                u32 gmem_fd, u64 gmem_offset)
>  {
>         struct kvm_userspace_memory_region2 region = {
>                 .slot = slot,
> @@ -955,8 +955,8 @@ int __vm_set_user_memory_region2(struct kvm_vm *vm, u32 slot, u32 flags,
>                 .guest_phys_addr = gpa,
>                 .memory_size = size,
>                 .userspace_addr = (uintptr_t)hva,
> -               .guest_memfd = guest_memfd,
> -               .guest_memfd_offset = guest_memfd_offset,
> +               .guest_memfd = gmem_fd,
> +               .guest_memfd_offset = gmem_offset,
>         };
>
>         TEST_REQUIRE_SET_USER_MEMORY_REGION2();
> @@ -966,10 +966,10 @@ int __vm_set_user_memory_region2(struct kvm_vm *vm, u32 slot, u32 flags,
>
>  void vm_set_user_memory_region2(struct kvm_vm *vm, u32 slot, u32 flags,
>                                 gpa_t gpa, u64 size, void *hva,
> -                               u32 guest_memfd, u64 guest_memfd_offset)
> +                               u32 gmem_fd, u64 gmem_offset)
>  {
>         int ret = __vm_set_user_memory_region2(vm, slot, flags, gpa, size, hva,
> -                                              guest_memfd, guest_memfd_offset);
> +                                              gmem_fd, gmem_offset);
>
>         TEST_ASSERT(!ret, "KVM_SET_USER_MEMORY_REGION2 failed, errno = %d (%s)",
>                     errno, strerror(errno));
> @@ -979,7 +979,7 @@ void vm_set_user_memory_region2(struct kvm_vm *vm, u32 slot, u32 flags,
>  /* FIXME: This thing needs to be ripped apart and rewritten. */
>  void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type,
>                 gpa_t gpa, u32 slot, u64 npages, u32 flags,
> -               int guest_memfd, u64 guest_memfd_offset)
> +               int gmem_fd, u64 gmem_offset)
>  {
>         int ret;
>         struct userspace_mem_region *region;
> @@ -1055,12 +1055,12 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type,
>                 region->mmap_size += alignment;
>
>         if (flags & KVM_MEM_GUEST_MEMFD) {
> -               if (guest_memfd < 0) {
> -                       u32 guest_memfd_flags = 0;
> +               if (gmem_fd < 0) {
> +                       u32 gmem_flags = 0;
>
> -                       TEST_ASSERT(!guest_memfd_offset,
> +                       TEST_ASSERT(!gmem_offset,
>                                     "Offset must be zero when creating new guest_memfd");
> -                       guest_memfd = vm_create_guest_memfd(vm, mem_size, guest_memfd_flags);
> +                       gmem_fd = vm_create_guest_memfd(vm, mem_size, gmem_flags);
>                 } else {
>                         /*
>                          * Install a unique fd for each memslot so that the fd
> @@ -1068,11 +1068,11 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type,
>                          * needing to track if the fd is owned by the framework
>                          * or by the caller.
>                          */
> -                       guest_memfd = kvm_dup(guest_memfd);
> +                       gmem_fd = kvm_dup(gmem_fd);
>                 }
>
> -               region->region.guest_memfd = guest_memfd;
> -               region->region.guest_memfd_offset = guest_memfd_offset;
> +               region->region.guest_memfd = gmem_fd;
> +               region->region.guest_memfd_offset = gmem_offset;
>         } else {
>                 region->region.guest_memfd = -1;
>         }
>
> --
> 2.54.0.563.g4f69b47b94-goog
>
>

^ permalink raw reply

* Re: [PATCH v6 23/43] KVM: selftests: Create gmem fd before "regular" fd when adding memslot
From: Fuad Tabba @ 2026-05-21 12:11 UTC (permalink / raw)
  To: ackerleytng
  Cc: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	ira.weiny, jmattson, jthoughton, michael.roth, oupton,
	pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
	steven.price, willy, wyihan, yan.y.zhao, forkloop, pratyush,
	suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
	Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
	Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
	Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka,
	kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <20260507-gmem-inplace-conversion-v6-23-91ab5a8b19a4@google.com>

On Thu, 7 May 2026 at 21:23, Ackerley Tng via B4 Relay
<devnull+ackerleytng.google.com@kernel.org> wrote:
>
> From: Sean Christopherson <seanjc@google.com>
>
> When adding a memslot associated a guest_memfd instance, create/dup the
> guest_memfd before creating the "normal" backing file.  This will allow
> dup'ing the gmem fd as the normal fd when guest_memfd supports mmap(),
> i.e. to make guest_memfd the _only_ backing source for the memslot.
>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>

Reviewed-by: Fuad Tabba <tabba@google.com>

Cheers,
/fuad

> ---
>  tools/testing/selftests/kvm/lib/kvm_util.c | 45 +++++++++++++++---------------
>  1 file changed, 23 insertions(+), 22 deletions(-)
>
> diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
> index 2a76eca7029d3..df73b23a4c66a 100644
> --- a/tools/testing/selftests/kvm/lib/kvm_util.c
> +++ b/tools/testing/selftests/kvm/lib/kvm_util.c
> @@ -1054,6 +1054,29 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type,
>         if (alignment > 1)
>                 region->mmap_size += alignment;
>
> +       if (flags & KVM_MEM_GUEST_MEMFD) {
> +               if (guest_memfd < 0) {
> +                       u32 guest_memfd_flags = 0;
> +
> +                       TEST_ASSERT(!guest_memfd_offset,
> +                                   "Offset must be zero when creating new guest_memfd");
> +                       guest_memfd = vm_create_guest_memfd(vm, mem_size, guest_memfd_flags);
> +               } else {
> +                       /*
> +                        * Install a unique fd for each memslot so that the fd
> +                        * can be closed when the region is deleted without
> +                        * needing to track if the fd is owned by the framework
> +                        * or by the caller.
> +                        */
> +                       guest_memfd = kvm_dup(guest_memfd);
> +               }
> +
> +               region->region.guest_memfd = guest_memfd;
> +               region->region.guest_memfd_offset = guest_memfd_offset;
> +       } else {
> +               region->region.guest_memfd = -1;
> +       }
> +
>         region->fd = -1;
>         if (backing_src_is_shared(src_type))
>                 region->fd = kvm_memfd_alloc(region->mmap_size,
> @@ -1083,28 +1106,6 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type,
>
>         region->backing_src_type = src_type;
>
> -       if (flags & KVM_MEM_GUEST_MEMFD) {
> -               if (guest_memfd < 0) {
> -                       u32 guest_memfd_flags = 0;
> -                       TEST_ASSERT(!guest_memfd_offset,
> -                                   "Offset must be zero when creating new guest_memfd");
> -                       guest_memfd = vm_create_guest_memfd(vm, mem_size, guest_memfd_flags);
> -               } else {
> -                       /*
> -                        * Install a unique fd for each memslot so that the fd
> -                        * can be closed when the region is deleted without
> -                        * needing to track if the fd is owned by the framework
> -                        * or by the caller.
> -                        */
> -                       guest_memfd = kvm_dup(guest_memfd);
> -               }
> -
> -               region->region.guest_memfd = guest_memfd;
> -               region->region.guest_memfd_offset = guest_memfd_offset;
> -       } else {
> -               region->region.guest_memfd = -1;
> -       }
> -
>         region->unused_phy_pages = sparsebit_alloc();
>         if (vm_arch_has_protected_memory(vm))
>                 region->protected_phy_pages = sparsebit_alloc();
>
> --
> 2.54.0.563.g4f69b47b94-goog
>
>

^ permalink raw reply

* Re: [PATCH v10 22/25] x86/virt/tdx: Reject updates during compatibility-sensitive operations
From: Chao Gao @ 2026-05-21 12:04 UTC (permalink / raw)
  To: Dave Hansen
  Cc: kvm, linux-coco, linux-kernel, binbin.wu, dave.hansen, djbw,
	ira.weiny, kai.huang, kas, nik.borisov, paulmck, pbonzini,
	reinette.chatre, rick.p.edgecombe, sagis, seanjc, tony.lindgren,
	vannapurve, vishal.l.verma, yilun.xu, xiaoyao.li, yan.y.zhao,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, x86,
	H. Peter Anvin
In-Reply-To: <fc5f0061-8402-4f78-ab65-a7c8acc7e82e@intel.com>

>This function is pretty tidy. More or less:
>
> 	ret = get_tdx_sys_info_handoff(&handoff);
>	if (ret)
>		return
>
>	args.foo = handoff.bar;
>	ret = seamcall_prerr(TDH_SYS_SHUTDOWN, &args);
>	if (ret)
>		return
>
>	memset(&tdx_module_state, 0, sizeof(tdx_module_state));
>	for_each_possible_cpu(cpu)
>		per_cpu(tdx_lp_initialized, cpu) = false;
>
>The logic's not bad, right? Get the handoff data, hand it off to
>something, then go set some fields.
>
>Then what does this patch do? It goes and globs a just huge blob of
>TDH_SYS_SHUTDOWN errata handling and implementation details right smack
>in the middle. Our tidy little function is no more.
>
>I really with this would trigger folks' gag reflexes. It's *SO* easy to
>fix. It's *so* easy to keep the code tidy and hide the dead bodies so
>that the logic can still be followed.

Apologies.

FWIW, we can add a tdh_sys_shutdown() helper and hide those details there.

From 987b7107d79e94d1d35be93bfc48cbeb9ce6741b Mon Sep 17 00:00:00 2001
From: Chao Gao <chao.gao@intel.com>
Date: Tue, 31 Mar 2026 05:41:30 -0700
Subject: [PATCH] x86/virt/tdx: Reject updates during compatibility-sensitive
 operations

A TDX module erratum can cause TD state corruption if a module update races
with a compatibility-sensitive operation. For example, if an update races
with TD build, the TD measurement hash may be corrupted, which can later
cause attestation failure.

Handle this by requesting the TDX module to detect such races during
TDH.SYS.SHUTDOWN and reject the update when one is found. Report the
failure to userspace as -EBUSY so the update can be retried.

The downside is that module updates can be blocked indefinitely if
compatibility-sensitive operations do not quiesce. In that case,
userspace must resolve the conflict and retry the update.

Do not pre-check whether the TDX module supports this race-detection
capability. If it does not, rely on the TDX module to reject module
shutdown.

== Alternatives ==

Two alternatives were considered and rejected [1]:

  a. Fail TD build when the race occurs. This would complicate KVM error
     handling and risk KVM uABI instability.

  b. Allow the issue to leak through. This would make the problem harder to
     detect and recover from.

Signed-off-by: Chao Gao <chao.gao@intel.com>
Link: https://lore.kernel.org/linux-coco/aQIbM5m09G0FYTzE@google.com/ # [1]
---
v10:
 - Don't add a "dead" TDX_FEATURE0 bit [Sashiko]
 - s/BIT/BIT_ULL
---
 arch/x86/include/asm/tdx.h            |  5 ++--
 arch/x86/virt/vmx/tdx/tdx.c           | 34 ++++++++++++++++++++++++++-
 drivers/virt/coco/tdx-host/tdx-host.c |  2 ++
 3 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index e5a9cf656c07..c848483d815f 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -29,8 +29,9 @@
 /*
  * TDX module SEAMCALL leaf function error codes
  */
-#define TDX_SUCCESS		0ULL
-#define TDX_RND_NO_ENTROPY	0x8000020300000000ULL
+#define TDX_SUCCESS			0ULL
+#define TDX_RND_NO_ENTROPY		0x8000020300000000ULL
+#define TDX_UPDATE_COMPAT_SENSITIVE	0x8000051200000000ULL
 
 /* Bit definitions of TDX_FEATURES0 metadata field */
 #define TDX_FEATURES0_TD_PRESERVING	BIT_ULL(1)
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index ce548400f7f5..ed974106ecfa 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -1274,6 +1274,38 @@ static __init int tdx_enable(void)
 }
 subsys_initcall(tdx_enable);
 
+#define TDX_SYS_SHUTDOWN_AVOID_COMPAT_SENSITIVE BIT_ULL(16)
+
+static int tdh_sys_shutdown(struct tdx_module_args *args)
+{
+	u64 err;
+
+	/*
+	 * This flag tells the TDX module to reject shutdown if it races
+	 * with a "sensitive" ongoing operation. That eliminates exposure
+	 * to a TDX erratum which can corrupt TDX guest states.
+	 *
+	 * This flag is not supported by all TDX modules and may cause
+	 * the shutdown (and subsequent update procedure) to fail.
+	 */
+	args->rcx |= TDX_SYS_SHUTDOWN_AVOID_COMPAT_SENSITIVE;
+
+	err = seamcall(TDH_SYS_SHUTDOWN, args);
+	/*
+	 * The shutdown ran into a "sensitive" ongoing operation. Signal
+	 * to userspace that it can retry.
+	 */
+	if ((err & TDX_SEAMCALL_STATUS_MASK) == TDX_UPDATE_COMPAT_SENSITIVE)
+		return -EBUSY;
+
+	if (err) {
+		seamcall_err(TDH_SYS_SHUTDOWN, err, args);
+		return -EIO;
+	}
+
+	return 0;
+}
+
 int tdx_module_shutdown(void)
 {
	struct tdx_sys_info_handoff handoff = {};
@@ -1295,7 +1327,7 @@ int tdx_module_shutdown(void)
	 */
	args.rcx = handoff.module_hv;
 
-	ret = seamcall_prerr(TDH_SYS_SHUTDOWN, &args);
+	ret = tdh_sys_shutdown(&args);
	if (ret)
		return ret;
 
diff --git a/drivers/virt/coco/tdx-host/tdx-host.c b/drivers/virt/coco/tdx-host/tdx-host.c
index f8075efff11f..9f68a8aa5380 100644
--- a/drivers/virt/coco/tdx-host/tdx-host.c
+++ b/drivers/virt/coco/tdx-host/tdx-host.c
@@ -137,6 +137,8 @@ static enum fw_upload_err tdx_fw_write(struct fw_upload *fwl, const u8 *data,
	case 0:
		*written = data_len;
		return FW_UPLOAD_ERR_NONE;
+	case -EBUSY:
+		return FW_UPLOAD_ERR_BUSY;
	default:
		return FW_UPLOAD_ERR_FW_INVALID;
	}
-- 
2.52.0



^ permalink raw reply related

* Re: [PATCH v4 13/13] x86/amd-gart: preserve the direct DMA address until GART mapping succeeds
From: Aneesh Kumar K.V @ 2026-05-21 11:54 UTC (permalink / raw)
  To: iommu, linux-arm-kernel, linux-kernel, linux-coco
  Cc: Robin Murphy, Marek Szyprowski, Will Deacon, Marc Zyngier,
	Steven Price, Suzuki K Poulose, Catalin Marinas, Jiri Pirko,
	Jason Gunthorpe, Mostafa Saleh, Petr Tesarik,
	Alexey Kardashevskiy, Dan Williams, Xu Yilun, linuxppc-dev,
	linux-s390, Madhavan Srinivasan, Michael Ellerman,
	Nicholas Piggin, Christophe Leroy (CS GROUP), Alexander Gordeev,
	Gerald Schaefer, Heiko Carstens, Vasily Gorbik,
	Christian Borntraeger, Sven Schnelle, x86
In-Reply-To: <20260512090408.794195-14-aneesh.kumar@kernel.org>

"Aneesh Kumar K.V (Arm)" <aneesh.kumar@kernel.org> writes:

> gart_alloc_coherent() first allocates memory through dma_direct_alloc(),
> which returns a direct-mapped DMA address in dma_addr. When force_iommu is
> enabled, the buffer is then remapped.
>
> Do not overwrite dma_addr before dma_map_area() has succeeded. Keep the
> dma_map_area result in a temporary variable so the direct DMA address
> remains available for dma_direct_free() on the error path.
>
> Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
> ---
>  arch/x86/kernel/amd_gart_64.c | 10 ++++++----
>  1 file changed, 6 insertions(+), 4 deletions(-)
>
> diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
> index b5f1f031d45b..a109649c5649 100644
> --- a/arch/x86/kernel/amd_gart_64.c
> +++ b/arch/x86/kernel/amd_gart_64.c
> @@ -467,18 +467,20 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr,
>  		    gfp_t flag, unsigned long attrs)
>  {
>  	void *vaddr;
> +	dma_addr_t dma_map_addr;
>  
>  	vaddr = dma_direct_alloc(dev, size, dma_addr, flag, attrs);
>  	if (!vaddr ||
>  	    !force_iommu || dev->coherent_dma_mask <= DMA_BIT_MASK(24))
>  		return vaddr;
>  
> -	*dma_addr = dma_map_area(dev, virt_to_phys(vaddr), size,
> -				 DMA_BIDIRECTIONAL,
> -				 (1UL << get_order(size)) - 1, attrs);
> +	dma_map_addr = dma_map_area(dev, virt_to_phys(vaddr), size,
> +				     DMA_BIDIRECTIONAL,
> +				     (1UL << get_order(size)) - 1, attrs);
>  	flush_gart();
> -	if (unlikely(*dma_addr == DMA_MAPPING_ERROR))
> +	if (unlikely(dma_map_addr == DMA_MAPPING_ERROR))
>  		goto out_free;
> +	*dma_addr = dma_map_addr;
>  	return vaddr;
>  out_free:
>  	dma_direct_free(dev, size, vaddr, *dma_addr, attrs);
> -- 
> 2.43.0
>

This needs corresponding changes on the gart_free_coherent() side as well.

https://sashiko.dev/#/patchset/20260512090408.794195-1-aneesh.kumar%40kernel.org?part=13

I will avoid making that change as part of this series, since I assume
it would require specific testing.

-aneesh

^ permalink raw reply

* Re: [PATCH v14 02/44] kvm: arm64: Avoid including linux/kvm_host.h in kvm_pgtable.h
From: Marc Zyngier @ 2026-05-21 10:26 UTC (permalink / raw)
  To: Steven Price
  Cc: kvm, kvmarm, Catalin Marinas, Will Deacon, James Morse,
	Oliver Upton, Suzuki K Poulose, Zenghui Yu, linux-arm-kernel,
	linux-kernel, Joey Gouly, Alexandru Elisei, Christoffer Dall,
	Fuad Tabba, linux-coco, Ganapatrao Kulkarni, Gavin Shan,
	Shanker Donthineni, Alper Gun, Aneesh Kumar K . V, Emi Kisanuki,
	Vishal Annapurve, WeiLin.Chang, Lorenzo.Pieralisi2
In-Reply-To: <20260513131757.116630-3-steven.price@arm.com>

On Wed, 13 May 2026 14:17:10 +0100,
Steven Price <steven.price@arm.com> wrote:
> 
> To avoid future include cycles, drop the linux/kvm_host.h include in
> kvm_pgtable.h and include two _types.h headers for the types that are
> actually used. Additionally provide a forward declaration for struct
> kvm_s2_mmu as it's only used as a pointer in this file.
> 
> Both pgtable.c and kvm_pkvm.h relied on the indirect inclusion of
> kvm_host.h, so make that explicit.
> 
> Signed-off-by: Steven Price <steven.price@arm.com>
> ---
> New patch in v13
> ---
>  arch/arm64/include/asm/kvm_pgtable.h | 5 ++++-
>  arch/arm64/include/asm/kvm_pkvm.h    | 2 +-
>  arch/arm64/kvm/hyp/pgtable.c         | 1 +
>  3 files changed, 6 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
> index 41a8687938eb..e4770ce2ccf6 100644
> --- a/arch/arm64/include/asm/kvm_pgtable.h
> +++ b/arch/arm64/include/asm/kvm_pgtable.h
> @@ -8,9 +8,12 @@
>  #define __ARM64_KVM_PGTABLE_H__
>  
>  #include <linux/bits.h>
> -#include <linux/kvm_host.h>
> +#include <linux/kvm_types.h>
> +#include <linux/rbtree_types.h>

I'm surprised by this. Where is the rbtree_type.h requirement coming
from?

Thanks,

	M.

-- 
Without deviation from the norm, progress is not possible.

^ permalink raw reply

* Re: [PATCH v14 01/44] kvm: arm64: Include kvm_emulate.h in kvm/arm_psci.h
From: Marc Zyngier @ 2026-05-21 10:19 UTC (permalink / raw)
  To: Steven Price
  Cc: kvm, kvmarm, Suzuki K Poulose, Catalin Marinas, Will Deacon,
	James Morse, Oliver Upton, Zenghui Yu, linux-arm-kernel,
	linux-kernel, Joey Gouly, Alexandru Elisei, Christoffer Dall,
	Fuad Tabba, linux-coco, Ganapatrao Kulkarni, Gavin Shan,
	Shanker Donthineni, Alper Gun, Aneesh Kumar K . V, Emi Kisanuki,
	Vishal Annapurve, WeiLin.Chang, Lorenzo.Pieralisi2
In-Reply-To: <20260513131757.116630-2-steven.price@arm.com>

On Wed, 13 May 2026 14:17:09 +0100,
Steven Price <steven.price@arm.com> wrote:
> 
> From: Suzuki K Poulose <suzuki.poulose@arm.com>
> 
> Fix a potential build error (like below, when asm/kvm_emulate.h gets
> included after the kvm/arm_psci.h) by including the missing header file
> in kvm/arm_psci.h:
> 
> ./include/kvm/arm_psci.h: In function ‘kvm_psci_version’:
> ./include/kvm/arm_psci.h:29:13: error: implicit declaration of function
>    ‘vcpu_has_feature’; did you mean ‘cpu_have_feature’? [-Werror=implicit-function-declaration]
>    29 |         if (vcpu_has_feature(vcpu, KVM_ARM_VCPU_PSCI_0_2)) {
> 	         |             ^~~~~~~~~~~~~~~~
> 			       |             cpu_have_feature
> 
> Reviewed-by: Gavin Shan <gshan@redhat.com>
> Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
> Signed-off-by: Steven Price <steven.price@arm.com>

Unrelated to this patch, but really easy to fix: the standard prefix
for patches targeting KVM/arm64 is:

"KVM: arm64: [opt subsys:] Something starting with a capital letter"

where "opt subsys" could be "CCA" where applicable.

It'd be good to have some consistency.

Thanks,

	M.

-- 
Without deviation from the norm, progress is not possible.

^ permalink raw reply

* Re: [PATCH v10 04/25] x86/virt/tdx: Move TDX_FEATURES0 bits to asm/tdx.h
From: Xiaoyao Li @ 2026-05-21 10:12 UTC (permalink / raw)
  To: Chao Gao, kvm, linux-coco, linux-kernel
  Cc: binbin.wu, dave.hansen, djbw, ira.weiny, kai.huang, kas,
	nik.borisov, paulmck, pbonzini, reinette.chatre, rick.p.edgecombe,
	sagis, seanjc, tony.lindgren, vannapurve, vishal.l.verma,
	yilun.xu, yan.y.zhao, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, x86, H. Peter Anvin
In-Reply-To: <20260520133909.409394-5-chao.gao@intel.com>

On 5/20/2026 9:38 PM, Chao Gao wrote:
> Future changes will add support for new TDX features exposed as
> TDX_FEATURES0 bits. The presence of these features will need to be checked
> outside of arch/x86/virt. So the feature query helpers, and the
> TDX_FEATURES0 defines they reference, will need to live in the widely
> accessible asm/tdx.h header. Move the existing TDX_FEATURES0 to asm/tdx.h
> so that they can all be kept together.
> 
> Opportunistically switch to BIT_ULL() since TDX_FEATURES0 is 64-bit.
> 
> No functional change intended.
> 
> Signed-off-by: Chao Gao <chao.gao@intel.com>
> Link: https://lore.kernel.org/kvm/20260427152854.101171-17-chao.gao@intel.com/ # [1]
> Link: https://lore.kernel.org/kvm/20251121005125.417831-16-rick.p.edgecombe@intel.com/ # [2]
Reviewed-by: Xiaoyao Li <xiaoyao.li@intel.com>

^ permalink raw reply

* Re: [PATCH v10 03/25] x86/virt/tdx: Consolidate TDX global initialization states
From: Xiaoyao Li @ 2026-05-21 10:11 UTC (permalink / raw)
  To: Chao Gao, kvm, linux-coco, linux-kernel
  Cc: binbin.wu, dave.hansen, djbw, ira.weiny, kai.huang, kas,
	nik.borisov, paulmck, pbonzini, reinette.chatre, rick.p.edgecombe,
	sagis, seanjc, tony.lindgren, vannapurve, vishal.l.verma,
	yilun.xu, yan.y.zhao, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, x86, H. Peter Anvin
In-Reply-To: <20260520133909.409394-4-chao.gao@intel.com>

On 5/20/2026 9:38 PM, Chao Gao wrote:
> The kernel uses several global flags to guard one-time TDX initialization
> flows and prevent them from being repeated.
> 
> When the TDX module is updated, all of those states must be reset so that
> the module can be initialized again. Today those states are kept as
> separate global variables, which makes the reset path awkward and easy to
> miss when a new state is added.
> 
> Group the states into a single structure so they can be reset together, for
> example with memset(), and so a newly added state won't be missed.
> 
> Drop the __ro_after_init annotation from tdx_module_initialized because
> the other two states do not have it. And with TDX module update support,
> all the states need to be writable at runtime.
> 
> Signed-off-by: Chao Gao <chao.gao@intel.com>
Reviewed-by: Xiaoyao Li <xiaoyao.li@intel.com>

^ permalink raw reply

* Re: [PATCH v10 02/25] x86/virt/tdx: Move TDX global initialization states to file scope
From: Xiaoyao Li @ 2026-05-21 10:10 UTC (permalink / raw)
  To: Chao Gao, kvm, linux-coco, linux-kernel
  Cc: binbin.wu, dave.hansen, djbw, ira.weiny, kai.huang, kas,
	nik.borisov, paulmck, pbonzini, reinette.chatre, rick.p.edgecombe,
	sagis, seanjc, tony.lindgren, vannapurve, vishal.l.verma,
	yilun.xu, yan.y.zhao, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, x86, H. Peter Anvin
In-Reply-To: <20260520133909.409394-3-chao.gao@intel.com>

On 5/20/2026 9:38 PM, Chao Gao wrote:
> TDX module global initialization is executed only once. The first call
> caches both the result and the "done" state, and later callers reuse the
> saved result. A lock protects that cached states.
> 
> Those states and the lock are currently kept as function-local statics
> because they are used only by try_init_module_global().
> 
> TDX module updates need to reset the cached states so TDX global
> initialization can be run again after an update. That will add another
> access site in the same file.
> 
> Move the cached states to file scope so it is accessible outside
> try_init_module_global(), and move the lock along with the states it
> protects.
> 
> No functional change intended.
> 
> Signed-off-by: Chao Gao <chao.gao@intel.com>

Reviewed-by: Xiaoyao Li <xiaoyao.li@intel.com>

^ permalink raw reply

* Re: [PATCH v10 01/25] x86/virt/tdx: Clarify try_init_module_global() result caching
From: Xiaoyao Li @ 2026-05-21 10:03 UTC (permalink / raw)
  To: Chao Gao, kvm, linux-coco, linux-kernel
  Cc: binbin.wu, dave.hansen, djbw, ira.weiny, kai.huang, kas,
	nik.borisov, paulmck, pbonzini, reinette.chatre, rick.p.edgecombe,
	sagis, seanjc, tony.lindgren, vannapurve, vishal.l.verma,
	yilun.xu, yan.y.zhao, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, x86, H. Peter Anvin
In-Reply-To: <20260520133909.409394-2-chao.gao@intel.com>

On 5/20/2026 9:38 PM, Chao Gao wrote:
> TDX module global initialization is executed only once. The first call
> caches both the result and the "done" state, and later callers reuse the
> saved result. A lock protects that cached state.
> 
> The current code is hard to read because sysinit_done is accessed under
> the lock, while sysinit_ret is not.
> 
> To improve readability, move sysinit_ret accesses within the lock.
> 
> Group sysinit_ret/sysinit_done updates right after initialization so
> Caching the result is separate from the initialization itself.
> 
> Signed-off-by: Chao Gao <chao.gao@intel.com>
Reviewed-by: Xiaoyao Li <xiaoyao.li@intel.com>

^ permalink raw reply

* Re: [PATCH v6 21/43] KVM: SEV: Make 'uaddr' parameter optional for KVM_SEV_SNP_LAUNCH_UPDATE
From: Fuad Tabba @ 2026-05-21  9:55 UTC (permalink / raw)
  To: ackerleytng
  Cc: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	ira.weiny, jmattson, jthoughton, michael.roth, oupton,
	pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
	steven.price, willy, wyihan, yan.y.zhao, forkloop, pratyush,
	suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
	Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
	Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
	Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka,
	kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <20260507-gmem-inplace-conversion-v6-21-91ab5a8b19a4@google.com>

Hi,

On Thu, 7 May 2026 at 21:22, Ackerley Tng via B4 Relay
<devnull+ackerleytng.google.com@kernel.org> wrote:
>
> From: Michael Roth <michael.roth@amd.com>
>
> For vm_memory_attributes=1, in-place conversion/population is not
> supported, so the initial contents necessarily must need to come
> from a separate src address, which is enforced by the current
> implementation. However, for vm_memory_attributes=0, it is possible for
> guest memory to be initialized directly from userspace by mmap()'ing the
> guest_memfd and writing to it while the corresponding GPA ranges are in
> a 'shared' state before converting them to the 'private' state expected
> by KVM_SEV_SNP_LAUNCH_UPDATE.
>
> Update the handling/documentation for KVM_SEV_SNP_LAUNCH_UPDATE to allow
> for 'uaddr' to be set to NULL when vm_memory_attributes=0, which
> SNP_LAUNCH_UPDATE will then use to determine when it should/shouldn't
> copy in data from a separate memory location. Continue to enforce
> non-NULL for the original vm_memory_attributes=1 case.
>
> Signed-off-by: Michael Roth <michael.roth@amd.com>
> [Added src_page check in error handling path when the firmware command fails]
> [Dropped ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES]
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>

I'm not very familiar with the SEV-SNP populate flows, but it looks
like Sashiko is on to something:
https://sashiko.dev/#/patchset/20260507-gmem-inplace-conversion-v6-0-91ab5a8b19a4%40google.com?part=21

- a potential read-only page overwrite, because src_page is acquired
via get_user_pages_fast() without the FOLL_WRITE flag, but is then
overwritten via memcpy
- an ordering violation with the kunmap_local() calls

These predate this patch series and are just being touched by the
'src_page' addition, but if Sashiko's right, these should probably be
fixed sooner rather than later.

Cheers,
/fuad



> ---
>  Documentation/virt/kvm/x86/amd-memory-encryption.rst | 15 +++++++++++----
>  arch/x86/kvm/svm/sev.c                               | 18 +++++++++++++-----
>  virt/kvm/kvm_main.c                                  |  1 +
>  3 files changed, 25 insertions(+), 9 deletions(-)
>
> diff --git a/Documentation/virt/kvm/x86/amd-memory-encryption.rst b/Documentation/virt/kvm/x86/amd-memory-encryption.rst
> index b2395dd4769de..43085f65b2d85 100644
> --- a/Documentation/virt/kvm/x86/amd-memory-encryption.rst
> +++ b/Documentation/virt/kvm/x86/amd-memory-encryption.rst
> @@ -503,7 +503,8 @@ secrets.
>
>  It is required that the GPA ranges initialized by this command have had the
>  KVM_MEMORY_ATTRIBUTE_PRIVATE attribute set in advance. See the documentation
> -for KVM_SET_MEMORY_ATTRIBUTES for more details on this aspect.
> +for KVM_SET_MEMORY_ATTRIBUTES/KVM_SET_MEMORY_ATTRIBUTES2 for more details on
> +this aspect.
>
>  Upon success, this command is not guaranteed to have processed the entire
>  range requested. Instead, the ``gfn_start``, ``uaddr``, and ``len`` fields of
> @@ -511,9 +512,15 @@ range requested. Instead, the ``gfn_start``, ``uaddr``, and ``len`` fields of
>  remaining range that has yet to be processed. The caller should continue
>  calling this command until those fields indicate the entire range has been
>  processed, e.g. ``len`` is 0, ``gfn_start`` is equal to the last GFN in the
> -range plus 1, and ``uaddr`` is the last byte of the userspace-provided source
> -buffer address plus 1. In the case where ``type`` is KVM_SEV_SNP_PAGE_TYPE_ZERO,
> -``uaddr`` will be ignored completely.
> +range plus 1, and ``uaddr`` (if specified) is the last byte of the
> +userspace-provided source buffer address plus 1.
> +
> +In the case where ``type`` is KVM_SEV_SNP_PAGE_TYPE_ZERO, ``uaddr`` will be
> +ignored completely. Otherwise, ``uaddr`` is required if
> +kvm.vm_memory_attributes=1 and optional if kvm.vm_memory_attributes=0, since
> +in the latter case guest memory can be initialized directly from userspace
> +prior to converting it to private and passing the GPA range on to this
> +interface.
>
>  Parameters (in): struct  kvm_sev_snp_launch_update
>
> diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
> index c2126b3c30724..bf10d24907a00 100644
> --- a/arch/x86/kvm/svm/sev.c
> +++ b/arch/x86/kvm/svm/sev.c
> @@ -2343,7 +2343,15 @@ static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
>         int level;
>         int ret;
>
> -       if (WARN_ON_ONCE(sev_populate_args->type != KVM_SEV_SNP_PAGE_TYPE_ZERO && !src_page))
> +       /*
> +        * For vm_memory_attributes=1, in-place conversion/population is not
> +        * supported, so the initial contents necessarily need to come from a
> +        * separate src address. For vm_memory_attributes=0, this isn't
> +        * necessarily the case, since the pages may have been populated
> +        * directly from userspace before calling KVM_SEV_SNP_LAUNCH_UPDATE.
> +        */
> +       if (vm_memory_attributes &&
> +           sev_populate_args->type != KVM_SEV_SNP_PAGE_TYPE_ZERO && !src_page)
>                 return -EINVAL;
>
>         ret = snp_lookup_rmpentry((u64)pfn, &assigned, &level);
> @@ -2390,7 +2398,7 @@ static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
>          */
>         if (ret && !snp_page_reclaim(kvm, pfn) &&
>             sev_populate_args->type == KVM_SEV_SNP_PAGE_TYPE_CPUID &&
> -           sev_populate_args->fw_error == SEV_RET_INVALID_PARAM) {
> +           sev_populate_args->fw_error == SEV_RET_INVALID_PARAM && src_page) {
>                 void *src_vaddr = kmap_local_page(src_page);
>                 void *dst_vaddr = kmap_local_pfn(pfn);
>
> @@ -2422,8 +2430,8 @@ static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp)
>         if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
>                 return -EFAULT;
>
> -       pr_debug("%s: GFN start 0x%llx length 0x%llx type %d flags %d\n", __func__,
> -                params.gfn_start, params.len, params.type, params.flags);
> +       pr_debug("%s: GFN start 0x%llx length 0x%llx type %d flags %d src %llx\n", __func__,
> +                params.gfn_start, params.len, params.type, params.flags, params.uaddr);
>
>         if (!params.len || !PAGE_ALIGNED(params.len) || params.flags ||
>             (params.type != KVM_SEV_SNP_PAGE_TYPE_NORMAL &&
> @@ -2479,7 +2487,7 @@ static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp)
>
>         params.gfn_start += count;
>         params.len -= count * PAGE_SIZE;
> -       if (params.type != KVM_SEV_SNP_PAGE_TYPE_ZERO)
> +       if (src && params.type != KVM_SEV_SNP_PAGE_TYPE_ZERO)
>                 params.uaddr += count * PAGE_SIZE;
>
>         if (copy_to_user(u64_to_user_ptr(argp->data), &params, sizeof(params)))
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index ba195bb239aaa..3bf212fd99193 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -105,6 +105,7 @@ module_param(allow_unsafe_mappings, bool, 0444);
>  #ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
>  bool vm_memory_attributes = true;
>  module_param(vm_memory_attributes, bool, 0444);
> +EXPORT_SYMBOL_FOR_KVM_INTERNAL(vm_memory_attributes);
>  #endif
>  DEFINE_STATIC_CALL_RET0(__kvm_get_memory_attributes, kvm_get_memory_attributes_t);
>  EXPORT_SYMBOL_FOR_KVM_INTERNAL(STATIC_CALL_KEY(__kvm_get_memory_attributes));
>
> --
> 2.54.0.563.g4f69b47b94-goog
>
>

^ permalink raw reply

* Re: [PATCH v3 37/41] x86/kvmclock: Use TSC for sched_clock if it's constant and non-stop
From: Dongli Zhang @ 2026-05-21  9:14 UTC (permalink / raw)
  To: Sean Christopherson, kvm
  Cc: Rick Edgecombe, Vitaly Kuznetsov,
	Broadcom internal kernel review list, Boris Ostrovsky,
	Stephen Boyd, x86, linux-coco, linux-hyperv, virtualization,
	linux-kernel, xen-devel, Kiryl Shutsemau, Paolo Bonzini,
	K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Ajay Kaher, Alexey Makhalov, Jan Kiszka, Dave Hansen,
	Andy Lutomirski, Peter Zijlstra, Juergen Gross, Daniel Lezcano,
	Thomas Gleixner, John Stultz, Michael Kelley, Tom Lendacky,
	Nikunj A Dadhania, Thomas Gleixner, David Woodhouse
In-Reply-To: <20260515191942.1892718-38-seanjc@google.com>



On 2026-05-15 12:19 PM, Sean Christopherson wrote:
> Prefer the TSC over kvmclock for sched_clock if the TSC is constant,
> nonstop, and not marked unstable via command line.  I.e. use the same
> criteria as tweaking the clocksource rating so that TSC is preferred over
> kvmclock.  Per the below comment from native_sched_clock(), sched_clock
> is more tolerant of slop than clocksource; using TSC for clocksource but
> not sched_clock makes little to no sense, especially now that KVM CoCo
> guests with a trusted TSC use TSC, not kvmclock.
> 
>         /*
>          * Fall back to jiffies if there's no TSC available:
>          * ( But note that we still use it if the TSC is marked
>          *   unstable. We do this because unlike Time Of Day,
>          *   the scheduler clock tolerates small errors and it's
>          *   very important for it to be as fast as the platform
>          *   can achieve it. )
>          */
> 
> The only advantage of using kvmclock is that doing so allows for early
> and common detection of PVCLOCK_GUEST_STOPPED, but that code has been
> broken for over two years with nary a complaint, i.e. it can't be
> _that_ valuable.  And as above, certain types of KVM guests are losing
> the functionality regardless, i.e. acknowledging PVCLOCK_GUEST_STOPPED
> needs to be decoupled from sched_clock() no matter what.

Has it been broken for two years because of pvclock_clocksource_read_nowd()?

Thank you very much!

Dongli Zhang

^ permalink raw reply

* Re: [PATCH v6 20/43] KVM: guest_memfd: Enable INIT_SHARED on guest_memfd for x86 Coco VMs
From: Fuad Tabba @ 2026-05-21  8:54 UTC (permalink / raw)
  To: ackerleytng
  Cc: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	ira.weiny, jmattson, jthoughton, michael.roth, oupton,
	pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
	steven.price, willy, wyihan, yan.y.zhao, forkloop, pratyush,
	suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
	Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
	Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
	Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka,
	kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <20260507-gmem-inplace-conversion-v6-20-91ab5a8b19a4@google.com>

On Thu, 7 May 2026 at 21:22, Ackerley Tng via B4 Relay
<devnull+ackerleytng.google.com@kernel.org> wrote:
>
> From: Sean Christopherson <seanjc@google.com>
>
> Now that guest_memfd supports tracking private vs. shared within gmem
> itself, allow userspace to specify INIT_SHARED on a guest_memfd instance
> for x86 Confidential Computing (CoCo) VMs, so long as per-VM attributes
> are disabled, i.e. when it's actually possible for a guest_memfd instance
> to contain shared memory.
>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>

Reviewed-by: Fuad Tabba <tabba@google.com>

Cheers,
/fuad


> ---
>  arch/x86/kvm/x86.c | 11 +++++------
>  1 file changed, 5 insertions(+), 6 deletions(-)
>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 1560de1e95be0..6609957ecfea3 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -14172,14 +14172,13 @@ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
>  }
>
>  #ifdef CONFIG_KVM_GUEST_MEMFD
> -/*
> - * KVM doesn't yet support initializing guest_memfd memory as shared for VMs
> - * with private memory (the private vs. shared tracking needs to be moved into
> - * guest_memfd).
> - */
>  bool kvm_arch_supports_gmem_init_shared(struct kvm *kvm)
>  {
> -       return !kvm_arch_has_private_mem(kvm);
> +       /*
> +        * INIT_SHARED isn't supported if the memory attributes are per-VM,
> +        * in which case guest_memfd can _only_ be used for private memory.
> +        */
> +       return !vm_memory_attributes || !kvm_arch_has_private_mem(kvm);
>  }
>
>  #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE
>
> --
> 2.54.0.563.g4f69b47b94-goog
>
>

^ permalink raw reply

* Re: [PATCH v6 19/43] KVM: Let userspace disable per-VM mem attributes, enable per-gmem attributes
From: Fuad Tabba @ 2026-05-21  8:44 UTC (permalink / raw)
  To: ackerleytng
  Cc: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	ira.weiny, jmattson, jthoughton, michael.roth, oupton,
	pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
	steven.price, willy, wyihan, yan.y.zhao, forkloop, pratyush,
	suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
	Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
	Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
	Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka,
	kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <20260507-gmem-inplace-conversion-v6-19-91ab5a8b19a4@google.com>

Hi Ackerley,

On Thu, 7 May 2026 at 21:22, Ackerley Tng via B4 Relay
<devnull+ackerleytng.google.com@kernel.org> wrote:
>
> From: Sean Christopherson <seanjc@google.com>
>
> Make vm_memory_attributes a module parameter so that userspace can disable
> the use of memory attributes on the VM level.
>
> To avoid inconsistencies in the way memory attributes are tracked in KVM
> and guest_memfd, the vm_memory_attributes module_param is made
> read-only (0444).
>
> Make CONFIG_KVM_VM_MEMORY_ATTRIBUTES selectable, only for (CoCo) VM types
> that might use vm_memory_attributes.
>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>

Config files always confuse me, but Sashiko might be onto something:

https://sashiko.dev/#/patchset/20260507-gmem-inplace-conversion-v6-0-91ab5a8b19a4%40google.com?part=19

I think this partially goes back to commit 6, the one I flagged
yesterday. But also adding "default y" to KVM_VM_MEMORY_ATTRIBUTES?
The default value should at least fix this issue, but I'm not sure if
it would cause other problems...

Cheers,
/fuad


> ---
>  arch/x86/kvm/Kconfig | 13 +++++++++----
>  virt/kvm/kvm_main.c  |  1 +
>  2 files changed, 10 insertions(+), 4 deletions(-)
>
> diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
> index b6d65ee664d0f..8b97d341bd33f 100644
> --- a/arch/x86/kvm/Kconfig
> +++ b/arch/x86/kvm/Kconfig
> @@ -82,13 +82,20 @@ config KVM_WERROR
>
>  config KVM_VM_MEMORY_ATTRIBUTES
>         select KVM_MEMORY_ATTRIBUTES
> -       bool
> +       depends on KVM_SW_PROTECTED_VM || KVM_INTEL_TDX || KVM_AMD_SEV
> +       bool "Enable per-VM memory attributes (for CoCo VMs)"
> +       help
> +         Enable support for per-VM memory attributes, which are deprecated in
> +         favor of tracking memory attributes in guest_memfd.  Select this if
> +         you need to run CoCo VMs using a VMM that doesn't support guest_memfd
> +         memory attributes.
> +
> +         If unsure, say N.
>
>  config KVM_SW_PROTECTED_VM
>         bool "Enable support for KVM software-protected VMs"
>         depends on EXPERT
>         depends on KVM_X86 && X86_64
> -       select KVM_VM_MEMORY_ATTRIBUTES
>         help
>           Enable support for KVM software-protected VMs.  Currently, software-
>           protected VMs are purely a development and testing vehicle for
> @@ -139,7 +146,6 @@ config KVM_INTEL_TDX
>         bool "Intel Trust Domain Extensions (TDX) support"
>         default y
>         depends on INTEL_TDX_HOST
> -       select KVM_VM_MEMORY_ATTRIBUTES
>         select HAVE_KVM_ARCH_GMEM_POPULATE
>         help
>           Provides support for launching Intel Trust Domain Extensions (TDX)
> @@ -163,7 +169,6 @@ config KVM_AMD_SEV
>         depends on KVM_AMD && X86_64
>         depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m)
>         select ARCH_HAS_CC_PLATFORM
> -       select KVM_VM_MEMORY_ATTRIBUTES
>         select HAVE_KVM_ARCH_GMEM_PREPARE
>         select HAVE_KVM_ARCH_GMEM_INVALIDATE
>         select HAVE_KVM_ARCH_GMEM_POPULATE
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index cec02d68d7039..ba195bb239aaa 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -104,6 +104,7 @@ module_param(allow_unsafe_mappings, bool, 0444);
>  #ifdef CONFIG_KVM_MEMORY_ATTRIBUTES
>  #ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
>  bool vm_memory_attributes = true;
> +module_param(vm_memory_attributes, bool, 0444);
>  #endif
>  DEFINE_STATIC_CALL_RET0(__kvm_get_memory_attributes, kvm_get_memory_attributes_t);
>  EXPORT_SYMBOL_FOR_KVM_INTERNAL(STATIC_CALL_KEY(__kvm_get_memory_attributes));
>
> --
> 2.54.0.563.g4f69b47b94-goog
>
>

^ permalink raw reply

* Re: [PATCH v6 18/43] KVM: Move KVM_VM_MEMORY_ATTRIBUTES config definition to x86
From: Fuad Tabba @ 2026-05-21  8:07 UTC (permalink / raw)
  To: ackerleytng
  Cc: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	ira.weiny, jmattson, jthoughton, michael.roth, oupton,
	pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
	steven.price, willy, wyihan, yan.y.zhao, forkloop, pratyush,
	suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
	Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
	Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
	Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka,
	kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <20260507-gmem-inplace-conversion-v6-18-91ab5a8b19a4@google.com>

On Thu, 7 May 2026 at 21:22, Ackerley Tng via B4 Relay
<devnull+ackerleytng.google.com@kernel.org> wrote:
>
> From: Sean Christopherson <seanjc@google.com>
>
> Bury KVM_VM_MEMORY_ATTRIBUTES in x86 to discourage other architectures
> from adding support for per-VM memory attributes, because tracking private
> vs. shared memory on a per-VM basis is now deprecated in favor of tracking
> on a per-guest_memfd basis, and no other memory attributes are on the
> horizon.
>
> This will also allow modifying KVM_VM_MEMORY_ATTRIBUTES to be
> user-selectable (in x86) without creating weirdness in KVM's Kconfigs.
> Now that guest_memfd support memory attributes, it's entirely possible to
> run x86 CoCo VMs without support for KVM_VM_MEMORY_ATTRIBUTES.
>
> Leave the code itself in common KVM so that it's trivial to undo this
> change if new per-VM attributes do come along.
>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>

Reviewed-by: Fuad Tabba <tabba@google.com>

Cheers,
/fuad
> ---
>  arch/x86/kvm/Kconfig | 4 ++++
>  virt/kvm/Kconfig     | 4 ----
>  2 files changed, 4 insertions(+), 4 deletions(-)
>
> diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
> index 26f6afd51bbdc..b6d65ee664d0f 100644
> --- a/arch/x86/kvm/Kconfig
> +++ b/arch/x86/kvm/Kconfig
> @@ -80,6 +80,10 @@ config KVM_WERROR
>
>           If in doubt, say "N".
>
> +config KVM_VM_MEMORY_ATTRIBUTES
> +       select KVM_MEMORY_ATTRIBUTES
> +       bool
> +
>  config KVM_SW_PROTECTED_VM
>         bool "Enable support for KVM software-protected VMs"
>         depends on EXPERT
> diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
> index e371e079e2c50..663de6421eda2 100644
> --- a/virt/kvm/Kconfig
> +++ b/virt/kvm/Kconfig
> @@ -103,10 +103,6 @@ config KVM_MMU_LOCKLESS_AGING
>  config KVM_MEMORY_ATTRIBUTES
>         bool
>
> -config KVM_VM_MEMORY_ATTRIBUTES
> -       select KVM_MEMORY_ATTRIBUTES
> -       bool
> -
>  config KVM_GUEST_MEMFD
>         select XARRAY_MULTI
>         select KVM_MEMORY_ATTRIBUTES
>
> --
> 2.54.0.563.g4f69b47b94-goog
>
>

^ permalink raw reply

* Re: [PATCH v6 17/43] KVM: guest_memfd: Determine invalidation filter from memory attributes
From: Fuad Tabba @ 2026-05-21  7:56 UTC (permalink / raw)
  To: ackerleytng
  Cc: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	ira.weiny, jmattson, jthoughton, michael.roth, oupton,
	pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
	steven.price, willy, wyihan, yan.y.zhao, forkloop, pratyush,
	suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
	Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
	Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
	Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka,
	kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <20260507-gmem-inplace-conversion-v6-17-91ab5a8b19a4@google.com>

On Thu, 7 May 2026 at 21:22, Ackerley Tng via B4 Relay
<devnull+ackerleytng.google.com@kernel.org> wrote:
>
> From: Ackerley Tng <ackerleytng@google.com>
>
> Before conversion, the range filter doesn't really matter:
>
> + For non-CoCo VMs that use guest_memfd, they have no mirrored tdp, so
>   KVM_DIRECT_ROOTS would have been invalidated anyway.
> + CoCo VMs could not use INIT_SHARED, and there's no conversion support, so
>   always using KVM_FILTER_PRIVATE would have worked.
>
> Now with conversion support, update kvm_gmem_get_invalidate_filter to
> inspect the memory attributes maple tree for a given range.
>
> Instead of determining the invalidation filter based on static inode
> flags, iterate through the attributes maple tree for the specific range
> being invalidated. This allows KVM to identify if the range contains
> private pages, shared pages, or both, and set the filter bits
> accordingly.
>
> Update kvm_gmem_invalidate_begin and kvm_gmem_release to pass the range
> parameters to the filter helper to ensure invalidation accurately
> targets the memory types present in the affected range.
>
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>

Reviewed-by: Fuad Tabba <tabba@google.com>

Cheers,
/fuad
> ---
>  virt/kvm/guest_memfd.c | 27 ++++++++++++++++++++-------
>  1 file changed, 20 insertions(+), 7 deletions(-)
>
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index 9f6eebfb68f6b..c9f155c2dc5c5 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -193,12 +193,24 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
>         return folio;
>  }
>
> -static enum kvm_gfn_range_filter kvm_gmem_get_invalidate_filter(struct inode *inode)
> +static enum kvm_gfn_range_filter kvm_gmem_get_invalidate_filter(
> +               struct inode *inode, pgoff_t start, pgoff_t end)
>  {
> -       if (GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED)
> -               return KVM_FILTER_SHARED;
> +       struct gmem_inode *gi = GMEM_I(inode);
> +       enum kvm_gfn_range_filter filter = 0;
> +       void *entry;
> +
> +       lockdep_assert(mt_lock_is_held(&gi->attributes));
> +
> +       mt_for_each(&gi->attributes, entry, start, end - 1) {
> +               filter |= (xa_to_value(entry) & KVM_MEMORY_ATTRIBUTE_PRIVATE) ?
> +                         KVM_FILTER_PRIVATE : KVM_FILTER_SHARED;
> +
> +               if (filter == (KVM_FILTER_PRIVATE | KVM_FILTER_SHARED))
> +                       break;
> +       }
>
> -       return KVM_FILTER_PRIVATE;
> +       return filter;
>  }
>
>  static void __kvm_gmem_invalidate_begin(struct gmem_file *f, pgoff_t start,
> @@ -244,7 +256,7 @@ static void kvm_gmem_invalidate_begin(struct inode *inode, pgoff_t start,
>         enum kvm_gfn_range_filter attr_filter;
>         struct gmem_file *f;
>
> -       attr_filter = kvm_gmem_get_invalidate_filter(inode);
> +       attr_filter = kvm_gmem_get_invalidate_filter(inode, start, end);
>
>         kvm_gmem_for_each_file(f, inode)
>                 __kvm_gmem_invalidate_begin(f, start, end, attr_filter);
> @@ -367,6 +379,7 @@ static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
>  static int kvm_gmem_release(struct inode *inode, struct file *file)
>  {
>         struct gmem_file *f = file->private_data;
> +       enum kvm_gfn_range_filter filter;
>         struct kvm_memory_slot *slot;
>         struct kvm *kvm = f->kvm;
>         unsigned long index;
> @@ -398,8 +411,8 @@ static int kvm_gmem_release(struct inode *inode, struct file *file)
>          * memory, as its lifetime is associated with the inode, not the file.
>          */
>         end = i_size_read(inode) >> PAGE_SHIFT;
> -       __kvm_gmem_invalidate_begin(f, 0, end,
> -                                   kvm_gmem_get_invalidate_filter(inode));
> +       filter = kvm_gmem_get_invalidate_filter(inode, 0, end);
> +       __kvm_gmem_invalidate_begin(f, 0, end, filter);
>         __kvm_gmem_invalidate_end(f, 0, end);
>
>         list_del(&f->entry);
>
> --
> 2.54.0.563.g4f69b47b94-goog
>
>

^ permalink raw reply

* [PATCH kernel] crypto/ccp/tsm: Enable the root port after the endpoint
From: Alexey Kardashevskiy @ 2026-05-21  7:43 UTC (permalink / raw)
  To: linux-crypto
  Cc: linux-kernel, Tom Lendacky, Herbert Xu, David S. Miller,
	Dan Williams, Alexey Kardashevskiy, x86, linux-coco,
	Pratik R . Sampat, Xu Yilun, Aneesh Kumar K . V

The PCIe r7.0, chapter "6.33.8 Other IDE Rules" mandates if selective IDE
is enabled for config requersts, a stream must be enabled on the endpoint
before enabling it on the rootport:

===
For Selective IDE, the Stream must not be used until it has been enabled in
both Partner Ports. For cases where one of the Partner Ports is a Root Port
and Selective IDE for Configuration Requests is enabled, the other
Partner Port must be enabled prior to the Root Port. For other scenarios,
the mechanisms to satisfy this requirement are implementation-specific.
===

Do what the spec says.

Fixes: 4be423572da1 ("crypto/ccp: Implement SEV-TIO PCIe IDE (phase1)")
Signed-off-by: Alexey Kardashevskiy <aik@amd.com>
---
 drivers/crypto/ccp/sev-dev-tsm.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/crypto/ccp/sev-dev-tsm.c b/drivers/crypto/ccp/sev-dev-tsm.c
index b07ae529b591..10549a2cc2ae 100644
--- a/drivers/crypto/ccp/sev-dev-tsm.c
+++ b/drivers/crypto/ccp/sev-dev-tsm.c
@@ -58,13 +58,13 @@ static int stream_enable(struct pci_ide *ide)
 	struct pci_dev *rp = pcie_find_root_port(ide->pdev);
 	int ret;
 
-	ret = pci_ide_stream_enable(rp, ide);
-	if (ret)
+	ret = pci_ide_stream_enable(ide->pdev, ide);
+	if (ret && ret != -ENXIO)
 		return ret;
 
-	ret = pci_ide_stream_enable(ide->pdev, ide);
-	if (ret)
-		pci_ide_stream_disable(rp, ide);
+	ret = pci_ide_stream_enable(rp, ide);
+	if (ret && ret != -ENXIO)
+		pci_ide_stream_disable(ide->pdev, ide);
 
 	return ret;
 }
-- 
2.53.0


^ permalink raw reply related

* Re: [PATCH v6 16/43] KVM: guest_memfd: Use actual size for invalidation in kvm_gmem_release()
From: Fuad Tabba @ 2026-05-21  7:30 UTC (permalink / raw)
  To: ackerleytng
  Cc: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	ira.weiny, jmattson, jthoughton, michael.roth, oupton,
	pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
	steven.price, willy, wyihan, yan.y.zhao, forkloop, pratyush,
	suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
	Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
	Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
	Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka,
	kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <20260507-gmem-inplace-conversion-v6-16-91ab5a8b19a4@google.com>

Hi Ackerley,

On Thu, 7 May 2026 at 21:22, Ackerley Tng via B4 Relay
<devnull+ackerleytng.google.com@kernel.org> wrote:
>
> From: Ackerley Tng <ackerleytng@google.com>
>
> __kvm_gmem_invalidate_begin() and __kvm_gmem_invalidate_end() actually do
> not specially handle -1ul. -1ul is used as a huge number, which legal
> indices do not exceed, and hence the invalidation works as expected.
>
> Since a later patch is going to make use of the exact range, calculate the
> size of the guest_memfd inode and use it as the end range for invalidating
> SPTEs.
>
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>

Want to look at what Sashiko has to say? Seems to be a real issue:

https://sashiko.dev/#/patchset/20260507-gmem-inplace-conversion-v6-0-91ab5a8b19a4%40google.com?part=16

If I understand correctly, the fix should simple: use
check_add_overflow() to validate the offset and size parameters in
kvm_gmem_bind()

   int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
             unsigned int fd, loff_t offset)
   {
       loff_t size = slot->npages << PAGE_SHIFT;
   +    loff_t end;
       unsigned long start, end_index;
       struct gmem_file *f;
...
   -    if (offset < 0 || !PAGE_ALIGNED(offset) ||
   -        offset + size > i_size_read(inode))
   +    if (offset < 0 || !PAGE_ALIGNED(offset) ||
   +        check_add_overflow(offset, size, &end) ||
   +        end > i_size_read(inode))
           goto err;

What do you think?

/fuad

> ---
>  virt/kvm/guest_memfd.c | 6 ++++--
>  1 file changed, 4 insertions(+), 2 deletions(-)
>
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index 050a8c092b1a3..9f6eebfb68f6b 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -370,6 +370,7 @@ static int kvm_gmem_release(struct inode *inode, struct file *file)
>         struct kvm_memory_slot *slot;
>         struct kvm *kvm = f->kvm;
>         unsigned long index;
> +       pgoff_t end;
>
>         /*
>          * Prevent concurrent attempts to *unbind* a memslot.  This is the last
> @@ -396,9 +397,10 @@ static int kvm_gmem_release(struct inode *inode, struct file *file)
>          * Zap all SPTEs pointed at by this file.  Do not free the backing
>          * memory, as its lifetime is associated with the inode, not the file.
>          */
> -       __kvm_gmem_invalidate_begin(f, 0, -1ul,
> +       end = i_size_read(inode) >> PAGE_SHIFT;
> +       __kvm_gmem_invalidate_begin(f, 0, end,
>                                     kvm_gmem_get_invalidate_filter(inode));
> -       __kvm_gmem_invalidate_end(f, 0, -1ul);
> +       __kvm_gmem_invalidate_end(f, 0, end);
>
>         list_del(&f->entry);
>
>
> --
> 2.54.0.563.g4f69b47b94-goog
>
>

^ permalink raw reply

* Re: [PATCH v6 05/43] KVM: guest_memfd: Wire up kvm_get_memory_attributes() to per-gmem attributes
From: Fuad Tabba @ 2026-05-21  7:19 UTC (permalink / raw)
  To: Ackerley Tng
  Cc: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	ira.weiny, jmattson, jthoughton, michael.roth, oupton,
	pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
	steven.price, willy, wyihan, yan.y.zhao, forkloop, pratyush,
	suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
	Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
	Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
	Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka,
	kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <CAEvNRgGQvMdDmVfbk42EY_PGN0ybTp-x21Zj+pg_X1mk9iCRtA@mail.gmail.com>

On Wed, 20 May 2026 at 22:44, Ackerley Tng <ackerleytng@google.com> wrote:
>
> Fuad Tabba <tabba@google.com> writes:
>
> >
> > [...snip...]
> >
> >> +unsigned long kvm_gmem_get_memory_attributes(struct kvm *kvm, gfn_t gfn)
> >> +{
> >> +       struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
> >> +       struct inode *inode;
> >> +
> >> +       /*
> >> +        * If this gfn has no associated memslot, there's no chance of the gfn
> >> +        * being backed by private memory, since guest_memfd must be used for
> >> +        * private memory, and guest_memfd must be associated with some memslot.
> >> +        */
> >> +       if (!slot)
> >> +               return 0;
> >> +
> >> +       CLASS(gmem_get_file, file)(slot);
> >> +       if (!file)
> >> +               return 0;
> >> +
> >> +       inode = file_inode(file);
> >> +
> >> +       /*
> >> +        * Rely on the maple tree's internal RCU lock to ensure a
> >> +        * stable result. This result can become stale as soon as the
> >> +        * lock is dropped, so the caller _must_ still protect
> >> +        * consumption of private vs. shared by checking
> >> +        * mmu_invalidate_retry_gfn() under mmu_lock to serialize
> >> +        * against ongoing attribute updates.
> >> +        */
> >> +       return kvm_gmem_get_attributes(inode, kvm_gmem_get_index(slot, gfn));
> >> +}
> >
> > Doesn't this imply that all consumers of kvm_mem_is_private() should
> > validate the result using mmu_lock and the invalidation sequence?
>
> Let me know how I can improve the comment.

Given Sean's context, the comment is good I think. I would quibble
with the the "_must_ still protect" phrasing being a bit too strict.

Maybe just soften it slightly to acknowledge the exception? Something like:

  * lock is dropped, so callers that require a strict result _must_ protect
  * consumption of private vs. shared by checking mmu_invalidate_retry_gfn()
  * under mmu_lock to serialize against ongoing attribute updates. Callers
  * doing lockless reads must be able to tolerate a stale result.

That aligns the comment with how KVM is actually using it today. That
said, this is nitpicking. Feel free to use or ignore.

>
> I think the "consumption" of private vs shared here actually means
> something like "don't commit a page being faulted into page tables based
> on the result of kvm_gmem_get_memory_attributes() without checking
> kvm->mmu_invalidate_in_progress.", since a racing conversion may
> complete before you commit.
>
> kvm_mem_is_private() is used from these places:
>
> 1. Fault handling in KVM, like page_fault_can_be_fast(),
>    kvm_mmu_faultin_pfn(), kvm_mmu_page_fault(): this already handles the
>    entire mmu_lock and invalidation dance. No fault will be committed if
>    a racing conversion happened after kvm_mem_is_private() but before
>    the commit.
>
> 2. kvm_mmu_max_mapping_level() from recovering huge pages after
>    disabling dirty logging: Other than that it can't be used with
>    guest_memfd now since dirty logging can't be used with guest_memfd
>    and guest_memfd memslots are not updatable, this holds mmu_lock
>    throughout until the huge page recovery is done. invalidate_begin
>    also involves zapping the pages in the range, so if the order of
>    events is
>
>    | Thread A                     | Thread B          |
>    |------------------------------|-------------------|
>    | invalidate_begin + zap       |                   |
>    | update attributes maple_tree | recover huge page |
>    | invalidate_end               |                   |
>
>    Then recovering will never see the zapped pages, nothing to
>    recover, no kvm_mem_is_private() lookup.
>
> 3. kvm_arch_vcpu_pre_fault_memory()
>
>    This eventually calls kvm_tdp_mmu_page_fault(), which checks
>    is_page_fault_stale(), so it does check before committing.
>
> Were there any other calls I missed?

The one I was looking at was `sev_handle_rmp_fault()`, which does a lockless
read without the retry loop. But as Sean just pointed out, that path can
tolerate false positives/negatives and relies on the guest faulting again,
so the lack of synchronization there is existing behavior and considered "fine".

>
> > sev_handle_rmp_fault() calls kvm_mem_is_private() without holding
> > mmu_lock and without any retry mechanism. Is that a problem?
> >
>
> Sean already replied on your actual question separately :)
>
> > Cheers,
> > /fuad
> >
> >
> >>
> >> [...snip...]
> >>

^ permalink raw reply

* Re: [PATCH v6 15/43] KVM: guest_memfd: Handle lru_add fbatch refcounts during conversion safety check
From: Fuad Tabba @ 2026-05-21  7:13 UTC (permalink / raw)
  To: ackerleytng
  Cc: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	ira.weiny, jmattson, jthoughton, michael.roth, oupton,
	pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
	steven.price, willy, wyihan, yan.y.zhao, forkloop, pratyush,
	suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
	Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
	Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
	Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka,
	kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <20260507-gmem-inplace-conversion-v6-15-91ab5a8b19a4@google.com>

On Thu, 7 May 2026 at 21:22, Ackerley Tng via B4 Relay
<devnull+ackerleytng.google.com@kernel.org> wrote:
>
> From: Ackerley Tng <ackerleytng@google.com>
>
> When checking if a guest_memfd folio is safe for conversion, its refcount
> is examined. A folio may be present in a per-CPU lru_add fbatch, which
> temporarily increases its refcount. This can lead to a false positive,
> incorrectly indicating that the folio is in use and preventing the
> conversion, even if it is otherwise safe. The conversion process might not
> be on the same CPU that holds the folio in its fbatch, making a simple
> per-CPU check insufficient.
>
> To address this, drain all CPUs' lru_add fbatches if an unexpectedly high
> refcount is encountered during the safety check. This is performed at most
> once per conversion request. Draining only if the folio in question may be
> lru cached.
>
> guest_memfd folios are unevictable, so they can only reside in the lru_add
> fbatch. If the folio's refcount is still unsafe after draining, then the
> conversion is truly deemed unsafe.
>
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>

Not an area I've worked with that much, but it seems right to me:

Reviewed-by: Fuad Tabba <tabba@google.com>

Cheers,
/fuad


> ---
>  mm/swap.c              |  2 ++
>  virt/kvm/guest_memfd.c | 18 ++++++++++++++----
>  2 files changed, 16 insertions(+), 4 deletions(-)
>
> diff --git a/mm/swap.c b/mm/swap.c
> index 5cc44f0de9877..3134d9d3d7c30 100644
> --- a/mm/swap.c
> +++ b/mm/swap.c
> @@ -37,6 +37,7 @@
>  #include <linux/page_idle.h>
>  #include <linux/local_lock.h>
>  #include <linux/buffer_head.h>
> +#include <linux/kvm_types.h>
>
>  #include "internal.h"
>
> @@ -904,6 +905,7 @@ void lru_add_drain_all(void)
>         lru_add_drain();
>  }
>  #endif /* CONFIG_SMP */
> +EXPORT_SYMBOL_FOR_KVM(lru_add_drain_all);
>
>  atomic_t lru_disable_count = ATOMIC_INIT(0);
>
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index 034b72b4947fb..050a8c092b1a3 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -8,6 +8,7 @@
>  #include <linux/mempolicy.h>
>  #include <linux/pseudo_fs.h>
>  #include <linux/pagemap.h>
> +#include <linux/swap.h>
>
>  #include "kvm_mm.h"
>
> @@ -596,18 +597,27 @@ static bool kvm_gmem_is_safe_for_conversion(struct inode *inode, pgoff_t start,
>         const int filemap_get_folios_refcount = 1;
>         pgoff_t last = start + nr_pages - 1;
>         struct folio_batch fbatch;
> +       bool lru_drained = false;
>         bool safe = true;
>         int i;
>
>         folio_batch_init(&fbatch);
>         while (safe && filemap_get_folios(mapping, &start, last, &fbatch)) {
>
> -               for (i = 0; i < folio_batch_count(&fbatch); ++i) {
> +               for (i = 0; i < folio_batch_count(&fbatch);) {
>                         struct folio *folio = fbatch.folios[i];
>
> -                       if (folio_ref_count(folio) !=
> -                           folio_nr_pages(folio) + filemap_get_folios_refcount) {
> -                               safe = false;
> +                       safe = (folio_ref_count(folio) ==
> +                               folio_nr_pages(folio) +
> +                               filemap_get_folios_refcount);
> +
> +                       if (safe) {
> +                               ++i;
> +                       } else if (folio_may_be_lru_cached(folio) &&
> +                                  !lru_drained) {
> +                               lru_add_drain_all();
> +                               lru_drained = true;
> +                       } else {
>                                 *err_index = folio->index;
>                                 break;
>                         }
>
> --
> 2.54.0.563.g4f69b47b94-goog
>
>

^ permalink raw reply

* Re: [PATCH v6 11/43] KVM: guest_memfd: Ensure pages are not in use before conversion
From: Fuad Tabba @ 2026-05-21  7:09 UTC (permalink / raw)
  To: ackerleytng
  Cc: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	ira.weiny, jmattson, jthoughton, michael.roth, oupton,
	pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
	steven.price, willy, wyihan, yan.y.zhao, forkloop, pratyush,
	suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
	Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
	Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
	Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka,
	kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <20260507-gmem-inplace-conversion-v6-11-91ab5a8b19a4@google.com>

Hi Ackerley,

On Thu, 7 May 2026 at 21:22, Ackerley Tng via B4 Relay
<devnull+ackerleytng.google.com@kernel.org> wrote:
>
> From: Ackerley Tng <ackerleytng@google.com>
>
> When converting memory to private in guest_memfd, it is necessary to ensure
> that the pages are not currently being accessed by any other part of the
> kernel or userspace to avoid any current user writing to guest private
> memory.
>
> guest_memfd checks for unexpected refcounts to determine whether a page is
> still in use. The only expected refcounts after unmapping the range
> requested for conversion are those that are held by guest_memfd itself.
>
> Update the kvm_memory_attributes2 structure to include an error_offset
> field. This allows KVM to report the exact offset where a conversion
> failed to userspace. If the safety check fails, return -EAGAIN and copy
> the error_offset back to userspace so that it can potentially retry the
> operation or handle the failure gracefully.
>
> Suggested-by: David Hildenbrand <david@kernel.org>
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>
> Co-developed-by: Vishal Annapurve <vannapurve@google.com>
> Signed-off-by: Vishal Annapurve <vannapurve@google.com>
> ---
>  include/uapi/linux/kvm.h |  3 ++-
>  virt/kvm/guest_memfd.c   | 65 ++++++++++++++++++++++++++++++++++++++++++++----
>  2 files changed, 62 insertions(+), 6 deletions(-)
>
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index e6bbf68a83813..0b55258573d3d 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -1658,7 +1658,8 @@ struct kvm_memory_attributes2 {
>         __u64 size;
>         __u64 attributes;
>         __u64 flags;
> -       __u64 reserved[12];
> +       __u64 error_offset;
> +       __u64 reserved[11];
>  };
>
>  #define KVM_MEMORY_ATTRIBUTE_PRIVATE           (1ULL << 3)
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index 91e89b188f583..9d82642a025e9 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -572,9 +572,42 @@ static int kvm_gmem_mas_preallocate(struct ma_state *mas, u64 attributes,
>         return mas_preallocate(mas, xa_mk_value(attributes), GFP_KERNEL);
>  }
>
> +static bool kvm_gmem_is_safe_for_conversion(struct inode *inode, pgoff_t start,
> +                                           size_t nr_pages, pgoff_t *err_index)
> +{
> +       struct address_space *mapping = inode->i_mapping;
> +       const int filemap_get_folios_refcount = 1;
> +       pgoff_t last = start + nr_pages - 1;
> +       struct folio_batch fbatch;
> +       bool safe = true;
> +       int i;
> +
> +       folio_batch_init(&fbatch);
> +       while (safe && filemap_get_folios(mapping, &start, last, &fbatch)) {
> +
> +               for (i = 0; i < folio_batch_count(&fbatch); ++i) {
> +                       struct folio *folio = fbatch.folios[i];
> +
> +                       if (folio_ref_count(folio) !=
> +                           folio_nr_pages(folio) + filemap_get_folios_refcount) {
> +                               safe = false;
> +                               *err_index = folio->index;
> +                               break;

https://sashiko.dev/#/patchset/20260507-gmem-inplace-conversion-v6-0-91ab5a8b19a4%40google.com?part=11

Sashiko raised a few issues here, but I think this one might be
genuine. Can you look into it please?

If that's right, when huge page support lands, if start falls in the
middle of a large folio, returning folio->index as the err_index will
return an offset strictly less than the requested start. A naive
userspace retry loop resuming from error_offset would step backwards
and corrupt attributes on memory it didn't intend to convert.
err_index should be clamped to max(start, folio->index).

Cheers,
/fuad

> +                       }
> +               }
> +
> +               folio_batch_release(&fbatch);
> +               cond_resched();
> +       }
> +
> +       return safe;
> +}
> +
>  static int __kvm_gmem_set_attributes(struct inode *inode, pgoff_t start,
> -                                    size_t nr_pages, uint64_t attrs)
> +                                    size_t nr_pages, uint64_t attrs,
> +                                    pgoff_t *err_index)
>  {
> +       bool to_private = attrs & KVM_MEMORY_ATTRIBUTE_PRIVATE;
>         struct address_space *mapping = inode->i_mapping;
>         struct gmem_inode *gi = GMEM_I(inode);
>         pgoff_t end = start + nr_pages;
> @@ -588,8 +621,21 @@ static int __kvm_gmem_set_attributes(struct inode *inode, pgoff_t start,
>
>         mas_init(&mas, mt, start);
>         r = kvm_gmem_mas_preallocate(&mas, attrs, start, nr_pages);
> -       if (r)
> +       if (r) {
> +               *err_index = start;
>                 goto out;
> +       }
> +
> +       if (to_private) {
> +               unmap_mapping_pages(mapping, start, nr_pages, false);
> +
> +               if (!kvm_gmem_is_safe_for_conversion(inode, start, nr_pages,
> +                                                    err_index)) {
> +                       mas_destroy(&mas);
> +                       r = -EAGAIN;
> +                       goto out;
> +               }
> +       }
>
>         /*
>          * From this point on guest_memfd has performed necessary
> @@ -609,9 +655,10 @@ static long kvm_gmem_set_attributes(struct file *file, void __user *argp)
>         struct gmem_file *f = file->private_data;
>         struct inode *inode = file_inode(file);
>         struct kvm_memory_attributes2 attrs;
> +       pgoff_t err_index;
>         size_t nr_pages;
>         pgoff_t index;
> -       int i;
> +       int i, r;
>
>         if (copy_from_user(&attrs, argp, sizeof(attrs)))
>                 return -EFAULT;
> @@ -635,8 +682,16 @@ static long kvm_gmem_set_attributes(struct file *file, void __user *argp)
>
>         nr_pages = attrs.size >> PAGE_SHIFT;
>         index = attrs.offset >> PAGE_SHIFT;
> -       return __kvm_gmem_set_attributes(inode, index, nr_pages,
> -                                        attrs.attributes);
> +       r = __kvm_gmem_set_attributes(inode, index, nr_pages, attrs.attributes,
> +                                     &err_index);
> +       if (r) {
> +               attrs.error_offset = ((uint64_t)err_index) << PAGE_SHIFT;
> +
> +               if (copy_to_user(argp, &attrs, sizeof(attrs)))
> +                       return -EFAULT;
> +       }
> +
> +       return r;
>  }
>
>  static long kvm_gmem_ioctl(struct file *file, unsigned int ioctl,
>
> --
> 2.54.0.563.g4f69b47b94-goog
>
>

^ permalink raw reply

* Re: [PATCH v14 10/44] arm64: RMI: Add support for SRO
From: Gavin Shan @ 2026-05-21  4:38 UTC (permalink / raw)
  To: Steven Price, kvm, kvmarm
  Cc: Catalin Marinas, Marc Zyngier, Will Deacon, James Morse,
	Oliver Upton, Suzuki K Poulose, Zenghui Yu, linux-arm-kernel,
	linux-kernel, Joey Gouly, Alexandru Elisei, Christoffer Dall,
	Fuad Tabba, linux-coco, Ganapatrao Kulkarni, Shanker Donthineni,
	Alper Gun, Aneesh Kumar K . V, Emi Kisanuki, Vishal Annapurve,
	WeiLin.Chang, Lorenzo.Pieralisi2
In-Reply-To: <20260513131757.116630-11-steven.price@arm.com>

Hi Steven,

On 5/13/26 11:17 PM, Steven Price wrote:
> RMM v2.0 introduces the concept of "Stateful RMI Operations" (SRO). This
> means that an SMC can return with an operation still in progress. The
> host is excepted to continue the operation until is reaches a conclusion
> (either success or failure). During this process the RMM can request
> additional memory ('donate') or hand memory back to the host
> ('reclaim'). The host can request an in progress operation is cancelled,
> but still continue the operation until it has completed (otherwise the
> incomplete operation may cause future RMM operations to fail).
> 
> The SRO is tracked using a struct rmi_sro_state object which keeps track
> of any memory which has been allocated but not yet consumed by the RMM
> or reclaimed from the RMM. This allows the memory to be reused in a
> future request within the same operation. It will also permit an
> operation to be done in a context where memory allocation may be
> difficult (e.g. atomic context) with the option to abort the operation
> and retry the memory allocation outside of the atomic context. The
> memory stored in the struct rmi_sro_state object can then be reused on
> the subsequent attempt.
> 
> Signed-off-by: Steven Price <steven.price@arm.com>
> ---
> v14:
>   * SRO support has improved although is still not fully complete. The
>     infrastructure has been moved out of KVM.
> ---
>   arch/arm64/include/asm/rmi_cmds.h |   1 +
>   arch/arm64/kernel/rmi.c           | 359 ++++++++++++++++++++++++++++++
>   2 files changed, 360 insertions(+)
> 
> diff --git a/arch/arm64/include/asm/rmi_cmds.h b/arch/arm64/include/asm/rmi_cmds.h
> index eb213c8e6f26..1a7b0c8f1e38 100644
> --- a/arch/arm64/include/asm/rmi_cmds.h
> +++ b/arch/arm64/include/asm/rmi_cmds.h
> @@ -35,6 +35,7 @@ struct rmi_sro_state {
>   
>   int rmi_delegate_range(phys_addr_t phys, unsigned long size);
>   int rmi_undelegate_range(phys_addr_t phys, unsigned long size);
> +int free_delegated_page(phys_addr_t phys);
>   
>   static inline int rmi_delegate_page(phys_addr_t phys)
>   {
> diff --git a/arch/arm64/kernel/rmi.c b/arch/arm64/kernel/rmi.c
> index 08cef54acadb..a8107ca9bb6d 100644
> --- a/arch/arm64/kernel/rmi.c
> +++ b/arch/arm64/kernel/rmi.c
> @@ -48,6 +48,365 @@ int rmi_undelegate_range(phys_addr_t phys, unsigned long size)
>   	return ret;
>   }
>   
> +static unsigned long donate_req_to_size(unsigned long donatereq)
> +{
> +	unsigned long unit_size = RMI_DONATE_SIZE(donatereq);
> +
> +	switch (unit_size) {
> +	case 0:
> +		return PAGE_SIZE;
> +	case 1:
> +		return PMD_SIZE;
> +	case 2:
> +		return PUD_SIZE;
> +	case 3:
> +		return P4D_SIZE;
> +	}
> +	unreachable();
> +}
> +

It's worthy to have 'inline'. {P4D, PUD, PMD}_SIZE can be equal if there are
no P4D and PUD, depending on CONFIG_PGTABLE_LEVELS. In this case, can the
'unit_size' be translated to wrong value?

> +static void rmi_smccc_invoke(struct arm_smccc_1_2_regs *regs_in,
> +			     struct arm_smccc_1_2_regs *regs_out)
> +{
> +	struct arm_smccc_1_2_regs regs = *regs_in;
> +	unsigned long status;
> +
> +	do {
> +		arm_smccc_1_2_invoke(&regs, regs_out);
> +		status = RMI_RETURN_STATUS(regs_out->a0);
> +	} while (status == RMI_BUSY || status == RMI_BLOCKED);
> +}
> +
> +int free_delegated_page(phys_addr_t phys)
> +{
> +	if (WARN_ON(rmi_undelegate_page(phys))) {
> +		/* Undelegate failed: leak the page */
> +		return -EBUSY;
> +	}
> +
> +	free_page((unsigned long)phys_to_virt(phys));
> +
> +	return 0;
> +}
> +
> +static int rmi_sro_ensure_capacity(struct rmi_sro_state *sro,
> +				   unsigned long count)
> +{
> +	if (WARN_ON_ONCE(sro->addr_count > RMI_MAX_ADDR_LIST))
> +		return -EOVERFLOW;
> +
> +	if (count > RMI_MAX_ADDR_LIST - sro->addr_count)
> +		return -ENOSPC;
> +
> +	return 0;
> +}
> +
> +static int rmi_sro_donate_contig(struct rmi_sro_state *sro,
> +				 unsigned long sro_handle,
> +				 unsigned long donatereq,
> +				 struct arm_smccc_1_2_regs *out_regs,
> +				 gfp_t gfp)
> +{
> +	unsigned long unit_size = RMI_DONATE_SIZE(donatereq);
> +	unsigned long unit_size_bytes = donate_req_to_size(donatereq);
> +	unsigned long count = RMI_DONATE_COUNT(donatereq);
> +	unsigned long state = RMI_DONATE_STATE(donatereq);
> +	unsigned long size = unit_size_bytes * count;
> +	unsigned long addr_range;
> +	int ret;
> +	void *virt;
> +	phys_addr_t phys;
> +	struct arm_smccc_1_2_regs regs = {
> +		SMC_RMI_OP_MEM_DONATE,
> +		sro_handle
> +	};
> +
> +	for (int i = 0; i < sro->addr_count; i++) {
> +		unsigned long entry = sro->addr_list[i];
> +
> +		if (RMI_ADDR_RANGE_SIZE(entry) == unit_size &&
> +		    RMI_ADDR_RANGE_COUNT(entry) == count &&
> +		    RMI_ADDR_RANGE_STATE(entry) == state) {
> +			sro->addr_count--;
> +			swap(sro->addr_list[sro->addr_count],
> +			     sro->addr_list[i]);
> +
> +			goto out;
> +		}
> +	}
> +
> +	ret = rmi_sro_ensure_capacity(sro, 1);
> +	if (ret)
> +		return ret;
> +
> +	virt = alloc_pages_exact(size, gfp);
> +	if (!virt)
> +		return -ENOMEM;
> +	phys = virt_to_phys(virt);
> +

alloc_pages_exact() will fail if the requested size exceeds the maximal allowed
size (1 << MAX_PAGE_ORDER). The maximal size is usually smaller than PUD_SIZE
but PUD_SIZE is allowed by the RMM.

> +	if (state == RMI_OP_MEM_DELEGATED) {
> +		if (rmi_delegate_range(phys, size)) {
> +			free_pages_exact(virt, size);
> +			return -ENXIO;
> +		}
> +	}
> +
> +	addr_range = phys & RMI_ADDR_RANGE_ADDR_MASK;
> +	FIELD_MODIFY(RMI_ADDR_RANGE_SIZE_MASK, &addr_range, unit_size);
> +	FIELD_MODIFY(RMI_ADDR_RANGE_COUNT_MASK, &addr_range, count);
> +	FIELD_MODIFY(RMI_ADDR_RANGE_STATE_MASK, &addr_range, state);
> +
> +	sro->addr_list[sro->addr_count] = addr_range;
> +
> +out:
> +	regs.a2 = virt_to_phys(&sro->addr_list[sro->addr_count]);
> +	regs.a3 = 1;
> +	rmi_smccc_invoke(&regs, out_regs);
> +
> +	unsigned long donated_granules = out_regs->a1;
> +	unsigned long donated_size = donated_granules << PAGE_SHIFT;
> +
> +	if (donated_granules == 0) {
> +		/* No pages used by the RMM */
> +		sro->addr_count++;
> +	} else if (donated_size < size) {
> +		phys = sro->addr_list[sro->addr_count] & RMI_ADDR_RANGE_ADDR_MASK;
> +
> +		/* Not all granules used by the RMM, free the remaining pages */
> +		for (long i = donated_size; i < size; i += PAGE_SIZE) {
> +			if (state == RMI_OP_MEM_DELEGATED)
> +				free_delegated_page(phys + i);
> +			else
> +				__free_page(phys_to_page(phys + i));
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +static int rmi_sro_donate_noncontig(struct rmi_sro_state *sro,
> +				    unsigned long sro_handle,
> +				    unsigned long donatereq,
> +				    struct arm_smccc_1_2_regs *out_regs,
> +				    gfp_t gfp)
> +{
> +	unsigned long unit_size = RMI_DONATE_SIZE(donatereq);
> +	unsigned long unit_size_bytes = donate_req_to_size(donatereq);
> +	unsigned long count = RMI_DONATE_COUNT(donatereq);
> +	unsigned long state = RMI_DONATE_STATE(donatereq);
> +	unsigned long found = 0;
> +	unsigned long addr_list_start = sro->addr_count;
> +	int ret;
> +	struct arm_smccc_1_2_regs regs = {
> +		SMC_RMI_OP_MEM_DONATE,
> +		sro_handle
> +	};
> +
> +	for (int i = 0; i < addr_list_start && found < count; i++) {
> +		unsigned long entry = sro->addr_list[i];
> +
> +		if (RMI_ADDR_RANGE_SIZE(entry) == unit_size &&
> +		    RMI_ADDR_RANGE_COUNT(entry) == 1 &&
> +		    RMI_ADDR_RANGE_STATE(entry) == state) {
> +			addr_list_start--;
> +			swap(sro->addr_list[addr_list_start],
> +			     sro->addr_list[i]);
> +			found++;
> +			i--;
> +		}
> +	}
> +
> +	ret = rmi_sro_ensure_capacity(sro, count - found);
> +	if (ret)
> +		return ret;
> +
> +	while (found < count) {
> +		unsigned long addr_range;
> +		void *virt = alloc_pages_exact(unit_size_bytes, gfp);
> +		phys_addr_t phys;
> +
> +		if (!virt)
> +			return -ENOMEM;
> +
> +		phys = virt_to_phys(virt);
> +
> +		if (state == RMI_OP_MEM_DELEGATED) {
> +			if (rmi_delegate_range(phys, unit_size_bytes)) {
> +				free_pages_exact(virt, unit_size_bytes);
> +				return -ENXIO;
> +			}
> +		}
> +
> +		addr_range = phys & RMI_ADDR_RANGE_ADDR_MASK;
> +		FIELD_MODIFY(RMI_ADDR_RANGE_SIZE_MASK, &addr_range, unit_size);
> +		FIELD_MODIFY(RMI_ADDR_RANGE_COUNT_MASK, &addr_range, 1);
> +		FIELD_MODIFY(RMI_ADDR_RANGE_STATE_MASK, &addr_range, state);
> +
> +		sro->addr_list[sro->addr_count++] = addr_range;
> +		found++;
> +	}
> +
> +	regs.a2 = virt_to_phys(&sro->addr_list[addr_list_start]);
> +	regs.a3 = found;
> +	rmi_smccc_invoke(&regs, out_regs);
> +
> +	unsigned long donated_granules = out_regs->a1;
> +
> +	if (WARN_ON(donated_granules & ((unit_size_bytes >> PAGE_SHIFT) - 1))) {
> +		/*
> +		 * FIXME: RMM has only consumed part of a huge page, this leaks
> +		 * the rest of the huge page
> +		 */
> +		donated_granules = ALIGN(donated_granules,
> +					 (unit_size_bytes >> PAGE_SHIFT));
> +	}
> +	unsigned long donated_blocks = donated_granules / (unit_size_bytes >> PAGE_SHIFT);
> +
> +	if (WARN_ON(donated_blocks > found))
> +		donated_blocks = found;
> +
> +	unsigned long undonated_blocks = found - donated_blocks;
> +
> +	while (donated_blocks && undonated_blocks) {
> +		sro->addr_count--;
> +		swap(sro->addr_list[addr_list_start],
> +		     sro->addr_list[sro->addr_count]);
> +		addr_list_start++;
> +
> +		donated_blocks--;
> +		undonated_blocks--;
> +	}
> +	sro->addr_count -= donated_blocks;
> +
> +	return 0;
> +}
> +
> +static int rmi_sro_donate(struct rmi_sro_state *sro,
> +			  unsigned long sro_handle,
> +			  unsigned long donatereq,
> +			  struct arm_smccc_1_2_regs *regs,
> +			  gfp_t gfp)
> +{
> +	unsigned long count = RMI_DONATE_COUNT(donatereq);
> +
> +	if (WARN_ON(!count))
> +		return 0;
> +
> +	if (RMI_DONATE_CONTIG(donatereq)) {
> +		return rmi_sro_donate_contig(sro, sro_handle, donatereq,
> +					     regs, gfp);
> +	} else {
> +		return rmi_sro_donate_noncontig(sro, sro_handle, donatereq,
> +						regs, gfp);
> +	}
> +}
> +
> +static int rmi_sro_reclaim(struct rmi_sro_state *sro,
> +			   unsigned long sro_handle,
> +			   struct arm_smccc_1_2_regs *out_regs)
> +{
> +	unsigned long capacity;
> +	struct arm_smccc_1_2_regs regs;
> +	int ret;
> +
> +	ret = rmi_sro_ensure_capacity(sro, 1);
> +	if (ret)
> +		rmi_sro_free(sro);
> +
> +	capacity = RMI_MAX_ADDR_LIST - sro->addr_count;
> +
> +	regs = (struct arm_smccc_1_2_regs){
> +		SMC_RMI_OP_MEM_RECLAIM,
> +		sro_handle,
> +		virt_to_phys(&sro->addr_list[sro->addr_count]),
> +		capacity
> +	};
> +	rmi_smccc_invoke(&regs, out_regs);
> +
> +	if (WARN_ON_ONCE(out_regs->a1 > capacity))
> +		out_regs->a1 = capacity;
> +
> +	sro->addr_count += out_regs->a1;
> +
> +	return 0;
> +}
> +
> +void rmi_sro_free(struct rmi_sro_state *sro)
> +{
> +	for (int i = 0; i < sro->addr_count; i++) {
> +		unsigned long entry = sro->addr_list[i];
> +		unsigned long addr = RMI_ADDR_RANGE_ADDR(entry);
> +		unsigned long unit_size = RMI_ADDR_RANGE_SIZE(entry);
> +		unsigned long count = RMI_ADDR_RANGE_COUNT(entry);
> +		unsigned long state = RMI_ADDR_RANGE_STATE(entry);
> +		unsigned long size = donate_req_to_size(unit_size) * count;
> +
> +		if (state == RMI_OP_MEM_DELEGATED) {
> +			if (WARN_ON(rmi_undelegate_range(addr, size))) {
> +				/* Leak the pages */
> +				continue;
> +			}
> +		}
> +		free_pages_exact(phys_to_virt(addr), size);
> +	}
> +
> +	sro->addr_count = 0;
> +}
> +
> +unsigned long rmi_sro_execute(struct rmi_sro_state *sro, gfp_t gfp)
> +{
> +	unsigned long sro_handle;
> +	struct arm_smccc_1_2_regs regs;
> +	struct arm_smccc_1_2_regs *regs_in = &sro->regs;
> +
> +	rmi_smccc_invoke(regs_in, &regs);
> +
> +	sro_handle = regs.a1;
> +
> +	while (RMI_RETURN_STATUS(regs.a0) == RMI_INCOMPLETE) {
> +		bool can_cancel = RMI_RETURN_CAN_CANCEL(regs.a0);
> +		int ret;
> +
> +		switch (RMI_RETURN_MEMREQ(regs.a0)) {
> +		case RMI_OP_MEM_REQ_NONE:
> +			regs = (struct arm_smccc_1_2_regs){
> +				SMC_RMI_OP_CONTINUE, sro_handle, 0
> +			};
> +			rmi_smccc_invoke(&regs, &regs);
> +			break;

'ret' isn't initialized for case RMI_OP_MEM_REQ_NONE.

> +		case RMI_OP_MEM_REQ_DONATE:
> +			ret = rmi_sro_donate(sro, sro_handle, regs.a2, &regs,
> +					     gfp);
> +			break;
> +		case RMI_OP_MEM_REQ_RECLAIM:
> +			ret = rmi_sro_reclaim(sro, sro_handle, &regs);
> +			break;
> +		default:
> +			ret = WARN_ON(1);
> +			break;
> +		}
> +
> +		if (ret) {
> +			if (can_cancel) {
> +				/*
> +				 * FIXME: Handle cancelling properly!
> +				 *
> +				 * If the operation has failed due to memory
> +				 * allocation failure then the information on
> +				 * the memory allocation should be saved, so
> +				 * that the allocation can be repeated outside
> +				 * of any context which prevented the
> +				 * allocation.
> +				 */
> +			}
> +			if (WARN_ON(ret))
> +				return ret;
> +		}
> +	}
> +
> +	return regs.a0;
> +}
> +
>   static int rmi_check_version(void)
>   {
>   	struct arm_smccc_res res;

Thanks,
Gavin


^ permalink raw reply

* Re: [PATCH v14 08/44] arm64: RMI: Ensure that the RMM has GPT entries for memory
From: Gavin Shan @ 2026-05-21  0:58 UTC (permalink / raw)
  To: Steven Price, kvm, kvmarm
  Cc: Catalin Marinas, Marc Zyngier, Will Deacon, James Morse,
	Oliver Upton, Suzuki K Poulose, Zenghui Yu, linux-arm-kernel,
	linux-kernel, Joey Gouly, Alexandru Elisei, Christoffer Dall,
	Fuad Tabba, linux-coco, Ganapatrao Kulkarni, Shanker Donthineni,
	Alper Gun, Aneesh Kumar K . V, Emi Kisanuki, Vishal Annapurve,
	WeiLin.Chang, Lorenzo.Pieralisi2
In-Reply-To: <20260513131757.116630-9-steven.price@arm.com>

Hi Steven,

On 5/13/26 11:17 PM, Steven Price wrote:
> The RMM maintains the state of all the granules in the system to make
> sure that the host is abiding by the rules. This state can be maintained
> at different granularity, per page (TRACKING_FINE) or per region
> (TRACKING_COARSE). The region size depends on the underlying
> "RMI_GRANULE_SIZE". For a "coarse" region all pages in the region must
> be of the same state, this implies we need to have "fine" tracking for
> DRAM, so that we can delegated individual pages.
> 
> For now we only support a statically carved out memory for tracking
> granules for the "fine" regions. This can be extended in the future to
> allow modifying the tracking granularity and remove the need for a
> static allocation.
> 
> Similarly, the firmware may create L0 GPT entries describing the total
> address space. But if we change the "PAS" (Physical Address Space) of a
> granule then the firmware may need to create L1 tables to track the PAS
> at a finer granularity.
> 
> Note: support is currently missing for SROs which means that if the RMM
> needs memory donating this will fail (and render CCA unusable in Linux).
> This effectively means that the L1 GPT tables must be created before
> Linux starts.
> 
> Signed-off-by: Steven Price <steven.price@arm.com>
> ---
> Changes since v13:
>   * Moved out of KVM
> ---
>   arch/arm64/include/asm/rmi_cmds.h |   2 +
>   arch/arm64/kernel/rmi.c           | 103 ++++++++++++++++++++++++++++++
>   2 files changed, 105 insertions(+)
> 
> diff --git a/arch/arm64/include/asm/rmi_cmds.h b/arch/arm64/include/asm/rmi_cmds.h
> index 9179934925c5..9078a2920a7c 100644
> --- a/arch/arm64/include/asm/rmi_cmds.h
> +++ b/arch/arm64/include/asm/rmi_cmds.h
> @@ -33,6 +33,8 @@ struct rmi_sro_state {
>   } while (RMI_RETURN_STATUS(res.a0) == RMI_BUSY ||			\
>   	 RMI_RETURN_STATUS(res.a0) == RMI_BLOCKED)
>   
> +bool rmi_is_available(void);
> +
>   unsigned long rmi_sro_execute(struct rmi_sro_state *sro, gfp_t gfp);
>   void rmi_sro_free(struct rmi_sro_state *sro);
>   
> diff --git a/arch/arm64/kernel/rmi.c b/arch/arm64/kernel/rmi.c
> index a14ead5dedda..52a415e99500 100644
> --- a/arch/arm64/kernel/rmi.c
> +++ b/arch/arm64/kernel/rmi.c
> @@ -7,6 +7,8 @@
>   
>   #include <asm/rmi_cmds.h>
>   
> +static bool arm64_rmi_is_available;
> +
>   unsigned long rmm_feat_reg0;
>   unsigned long rmm_feat_reg1;
>   
> @@ -88,6 +90,102 @@ static int rmi_configure(void)
>   	return 0;
>   }
>   
> +/*
> + * For now we set the tracking_region_size to 0 for RMI_RMM_CONFIG_SET().
> + * TODO: Support other tracking sizes (via Kconfig option).
> + */
> +#ifdef CONFIG_PAGE_SIZE_4KB
> +#define RMM_GRANULE_TRACKING_SIZE	SZ_1G
> +#elif defined(CONFIG_PAGE_SIZE_16KB)
> +#define RMM_GRANULE_TRACKING_SIZE	SZ_32M
> +#elif defined(CONFIG_PAGE_SIZE_64KB)
> +#define RMM_GRANULE_TRACKING_SIZE	SZ_512M
> +#endif
> +

RMM_GRANULE_TRACKING_SIZE is never used in this series.

> +/*
> + * Make sure the area is tracked by RMM at FINE granularity.
> + * We do not support changing the tracking yet.
> + */
> +static int rmi_verify_memory_tracking(phys_addr_t start, phys_addr_t end)
> +{
> +	while (start < end) {
> +		unsigned long ret, category, state, next;
> +
> +		ret = rmi_granule_tracking_get(start, end, &category, &state, &next);
> +		if (ret != RMI_SUCCESS ||
> +		    state != RMI_TRACKING_FINE ||
> +		    category != RMI_MEM_CATEGORY_CONVENTIONAL) {
> +			/* TODO: Set granule tracking in this case */
> +			pr_err("Granule tracking for region isn't fine/conventional: %llx",
> +			       start);
> +			return -ENODEV;
> +		}
> +		start = next;
> +	}
> +
> +	return 0;
> +}
> +
> +static unsigned long rmi_l0gpt_size(void)
> +{
> +	return 1UL << (30 + FIELD_GET(RMI_FEATURE_REGISTER_1_L0GPTSZ,
> +				      rmm_feat_reg1));
> +}
> +

rmi_l0gpt_size() is only used by rmi_create_gpts(), its logic can be
combined to that function.

> +static int rmi_create_gpts(phys_addr_t start, phys_addr_t end)
> +{
> +	unsigned long l0gpt_sz = rmi_l0gpt_size();
> +
> +	start = ALIGN_DOWN(start, l0gpt_sz);
> +	end = ALIGN(end, l0gpt_sz);
> +
> +	while (start < end) {
> +		int ret = rmi_gpt_l1_create(start);
> +
> +		/*
> +		 * Make sure the L1 GPT tables are created for the region.
> +		 * RMI_ERROR_GPT indicates the L1 table already exists.
> +		 */
> +		if (ret && ret != RMI_ERROR_GPT) {
> +			/*
> +			 * FIXME: Handle SRO so that memory can be donated for
> +			 * the tables.
> +			 */
> +			pr_err("GPT Level1 table missing for %llx\n", start);
> +			return -ENOMEM;
> +		}
> +		start += l0gpt_sz;
> +	}
> +
> +	return 0;
> +}
> +
> +static int rmi_init_metadata(void)
> +{
> +	phys_addr_t start, end;
> +	const struct memblock_region *r;
> +
> +	for_each_mem_region(r) {
> +		int ret;
> +
> +		start = memblock_region_memory_base_pfn(r) << PAGE_SHIFT;
> +		end = memblock_region_memory_end_pfn(r) << PAGE_SHIFT;
> +		ret = rmi_verify_memory_tracking(start, end);
> +		if (ret)
> +			return ret;
> +		ret = rmi_create_gpts(start, end);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	return 0;
> +}
> +
> +bool rmi_is_available(void)
> +{
> +	return arm64_rmi_is_available;
> +}
> +
>   static int __init arm64_init_rmi(void)
>   {
>   	/* Continue without realm support if we can't agree on a version */
> @@ -101,6 +199,11 @@ static int __init arm64_init_rmi(void)
>   
>   	if (rmi_configure())
>   		return 0;
> +	if (rmi_init_metadata())
> +		return 0;
> +
> +	arm64_rmi_is_available = true;
> +	pr_info("RMI configured");
>   
>   	return 0;
>   }

Thanks,
Gavin


^ permalink raw reply

* Re: [PATCH v14 07/44] arm64: RMI: Configure the RMM with the host's page size
From: Gavin Shan @ 2026-05-21  0:51 UTC (permalink / raw)
  To: Steven Price, kvm, kvmarm
  Cc: Catalin Marinas, Marc Zyngier, Will Deacon, James Morse,
	Oliver Upton, Suzuki K Poulose, Zenghui Yu, linux-arm-kernel,
	linux-kernel, Joey Gouly, Alexandru Elisei, Christoffer Dall,
	Fuad Tabba, linux-coco, Ganapatrao Kulkarni, Shanker Donthineni,
	Alper Gun, Aneesh Kumar K . V, Emi Kisanuki, Vishal Annapurve,
	WeiLin.Chang, Lorenzo.Pieralisi2
In-Reply-To: <20260513131757.116630-8-steven.price@arm.com>

Hi Steven,

On 5/13/26 11:17 PM, Steven Price wrote:
> RMM v2.0 brings the ability to set the RMM's granule size. Check the
> feature registers and configure the RMM so that it matches the host's
> page size. This means that operations can be done with a granulatity
> equal to PAGE_SIZE.
> 
> Signed-off-by: Steven Price <steven.price@arm.com>
> ---
> Changes since v13:
>   * Moved out of KVM.
> ---
>   arch/arm64/kernel/rmi.c | 42 +++++++++++++++++++++++++++++++++++++++++
>   1 file changed, 42 insertions(+)
> 
> diff --git a/arch/arm64/kernel/rmi.c b/arch/arm64/kernel/rmi.c
> index 99c1ccc35c11..a14ead5dedda 100644
> --- a/arch/arm64/kernel/rmi.c
> +++ b/arch/arm64/kernel/rmi.c
> @@ -49,6 +49,45 @@ static int rmi_check_version(void)
>   	return 0;
>   }
>   
> +static int rmi_configure(void)
> +{
> +	struct rmm_config *config __free(free_page) = NULL;
> +	unsigned long ret;
> +
> +	config = (struct rmm_config *)get_zeroed_page(GFP_KERNEL);
> +	if (!config)
> +		return -ENOMEM;
> +
> +	switch (PAGE_SIZE) {
> +	case SZ_4K:
> +		config->rmi_granule_size = RMI_GRANULE_SIZE_4KB;
> +		break;
> +	case SZ_16K:
> +		config->rmi_granule_size = RMI_GRANULE_SIZE_16KB;
> +		break;
> +	case SZ_64K:
> +		config->rmi_granule_size = RMI_GRANULE_SIZE_64KB;
> +		break;
> +	default:
> +		pr_err("Unsupported PAGE_SIZE for RMM\n");
> +		return -EINVAL;
> +	}
> +
> +	ret = rmi_rmm_config_set(virt_to_phys(config));
> +	if (ret) {
> +		pr_err("RMM config set failed\n");
> +		return -EINVAL;
> +	}
> +

Looking at branch 'topics/rmm-v2.0-poc_2' of RMM implementation, the granule size
is fixed to be 4KB at present. I'm not sure if I have looked into correct RMM
implementation, but 'topics/rmm-v2.0-poc_2' is recommended one in the cover
letter.

Besides, there has checks in the handler of the RMI command to make sure that
struct rmm_config::tracking_region_size to be 1GB, indicated by zero. It maybe
worthy to set it before call to rmi_rmm_config_set().

	config.tracking_region_size = 0; /* 1GB */
	ret = rmi_rmm_config_set(virt_to_phys(config));


> +	ret = rmi_rmm_activate();
> +	if (ret) {
> +		pr_err("RMM activate failed\n");
> +		return -ENXIO;
> +	}
> +
> +	return 0;
> +}
> +
>   static int __init arm64_init_rmi(void)
>   {
>   	/* Continue without realm support if we can't agree on a version */
> @@ -60,6 +99,9 @@ static int __init arm64_init_rmi(void)
>   	if (WARN_ON(rmi_features(1, &rmm_feat_reg1)))
>   		return 0;
>   
> +	if (rmi_configure())
> +		return 0;
> +
>   	return 0;
>   }
>   subsys_initcall(arm64_init_rmi);

Thanks,
Gavin


^ permalink raw reply

* Re: [PATCH v14 06/44] arm64: RMI: Check for RMI support at init
From: Gavin Shan @ 2026-05-21  0:39 UTC (permalink / raw)
  To: Steven Price, kvm, kvmarm
  Cc: Catalin Marinas, Marc Zyngier, Will Deacon, James Morse,
	Oliver Upton, Suzuki K Poulose, Zenghui Yu, linux-arm-kernel,
	linux-kernel, Joey Gouly, Alexandru Elisei, Christoffer Dall,
	Fuad Tabba, linux-coco, Ganapatrao Kulkarni, Shanker Donthineni,
	Alper Gun, Aneesh Kumar K . V, Emi Kisanuki, Vishal Annapurve,
	WeiLin.Chang, Lorenzo.Pieralisi2
In-Reply-To: <20260513131757.116630-7-steven.price@arm.com>

Hi Steven,

On 5/13/26 11:17 PM, Steven Price wrote:
> Query the RMI version number and check if it is a compatible version.
> The first two feature registers are read and exposed for future code to
> use.
> 
> Signed-off-by: Steven Price <steven.price@arm.com>
> ---
> v14:
>   * This moves the basic RMI setup into the 'kernel' directory. This is
>     because RMI will be used for some features outside of KVM so should
>     be available even if KVM isn't compiled in.
> ---
>   arch/arm64/include/asm/rmi_cmds.h |  3 ++
>   arch/arm64/kernel/Makefile        |  2 +-
>   arch/arm64/kernel/cpufeature.c    |  1 +
>   arch/arm64/kernel/rmi.c           | 65 +++++++++++++++++++++++++++++++
>   4 files changed, 70 insertions(+), 1 deletion(-)
>   create mode 100644 arch/arm64/kernel/rmi.c
> 

[...]

> diff --git a/arch/arm64/kernel/rmi.c b/arch/arm64/kernel/rmi.c
> new file mode 100644
> index 000000000000..99c1ccc35c11
> --- /dev/null
> +++ b/arch/arm64/kernel/rmi.c
> @@ -0,0 +1,65 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (C) 2023-2025 ARM Ltd.
> + */
> +
> +#include <linux/memblock.h>
> +
> +#include <asm/rmi_cmds.h>
> +
> +unsigned long rmm_feat_reg0;
> +unsigned long rmm_feat_reg1;
> +
> +static int rmi_check_version(void)
> +{
> +	struct arm_smccc_res res;
> +	unsigned short version_major, version_minor;
> +	unsigned long host_version = RMI_ABI_VERSION(RMI_ABI_MAJOR_VERSION,
> +						     RMI_ABI_MINOR_VERSION);
> +	unsigned long aa64pfr0 = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1);
> +
> +	/* If RME isn't supported, then RMI can't be */
> +	if (cpuid_feature_extract_unsigned_field(aa64pfr0, ID_AA64PFR0_EL1_RME_SHIFT) == 0)
> +		return -ENXIO;
> +
> +	arm_smccc_1_1_invoke(SMC_RMI_VERSION, host_version, &res);
> +
> +	if (res.a0 == SMCCC_RET_NOT_SUPPORTED)
> +		return -ENXIO;
> +
> +	version_major = RMI_ABI_VERSION_GET_MAJOR(res.a1);
> +	version_minor = RMI_ABI_VERSION_GET_MINOR(res.a1);
> +
> +	if (res.a0 != RMI_SUCCESS) {
> +		unsigned short high_version_major, high_version_minor;
> +
> +		high_version_major = RMI_ABI_VERSION_GET_MAJOR(res.a2);
> +		high_version_minor = RMI_ABI_VERSION_GET_MINOR(res.a2);
> +
> +		pr_err("Unsupported RMI ABI (v%d.%d - v%d.%d) we want v%d.%d\n",
> +		       version_major, version_minor,
> +		       high_version_major, high_version_minor,
> +		       RMI_ABI_MAJOR_VERSION,
> +		       RMI_ABI_MINOR_VERSION);
> +		return -ENXIO;
> +	}
> +
> +	pr_info("RMI ABI version %d.%d\n", version_major, version_minor);
> +
> +	return 0;
> +}
> +
> +static int __init arm64_init_rmi(void)
> +{
> +	/* Continue without realm support if we can't agree on a version */
> +	if (rmi_check_version())
> +		return 0;

Is this still a valid point that we have to return zero on errors returned
from rmi_check_version() or other other function calls like rmi_features()?
arm64_init_rmi() is triggered by subsys_initcall() where the return value
needs to indicate success or failure. It's fine to return error code from
arm64_init_rmi() in the path.

> +
> +	if (WARN_ON(rmi_features(0, &rmm_feat_reg0)))
> +		return 0;
> +	if (WARN_ON(rmi_features(1, &rmm_feat_reg1)))
> +		return 0;
> +
> +	return 0;
> +}
> +subsys_initcall(arm64_init_rmi);

Thanks,
Gavin


^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox