* [PATCH v6 02/20] dma-direct: swiotlb: handle swiotlb alloc/free outside __dma_direct_alloc_pages
From: Aneesh Kumar K.V (Arm) @ 2026-06-04 8:39 UTC (permalink / raw)
To: iommu, linux-arm-kernel, linux-kernel, linux-coco
Cc: Aneesh Kumar K.V (Arm), Robin Murphy, Marek Szyprowski,
Will Deacon, Marc Zyngier, Steven Price, Suzuki K Poulose,
Catalin Marinas, Jiri Pirko, Jason Gunthorpe, Mostafa Saleh,
Petr Tesarik, Alexey Kardashevskiy, Dan Williams, Xu Yilun,
linuxppc-dev, linux-s390, Madhavan Srinivasan, Michael Ellerman,
Nicholas Piggin, Christophe Leroy (CS GROUP), Alexander Gordeev,
Gerald Schaefer, Heiko Carstens, Vasily Gorbik,
Christian Borntraeger, Sven Schnelle, x86, Jiri Pirko,
Michael Kelley
In-Reply-To: <20260604083959.1265923-1-aneesh.kumar@kernel.org>
Move swiotlb allocation out of __dma_direct_alloc_pages() and handle it in
dma_direct_alloc() / dma_direct_alloc_pages().
This is needed for follow-up changes that simplify the handling of
memory encryption/decryption based on the DMA attribute flags.
swiotlb backing pages are already mapped decrypted by
swiotlb_update_mem_attributes() and rmem_swiotlb_device_init(), so
dma-direct should not call dma_set_decrypted() on allocation nor
dma_set_encrypted() on free for swiotlb-backed memory.
Update alloc/free paths to detect swiotlb-backed pages and skip
encrypt/decrypt transitions for those paths. Keep the existing highmem
rejection in dma_direct_alloc_pages() for swiotlb allocations.
Only for "restricted-dma-pool", we currently set `for_alloc = true`, while
rmem_swiotlb_device_init() decrypts the whole pool up front. This pool is
typically used together with "shared-dma-pool", where the shared region is
accessed after remap/ioremap and the returned address is suitable for
decrypted memory access. So existing code paths remain valid.
Tested-by: Jiri Pirko <jiri@nvidia.com>
Tested-by: Michael Kelley <mhklinux@outlook.com>
Tested-by: Mostafa Saleh <smostafa@google.com>
Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
---
include/linux/swiotlb.h | 6 ++++
kernel/dma/direct.c | 71 ++++++++++++++++++++++++++++++-----------
kernel/dma/swiotlb.c | 6 ++++
3 files changed, 65 insertions(+), 18 deletions(-)
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 3dae0f592063..133bb8ca9032 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -284,6 +284,8 @@ extern void swiotlb_print_info(void);
#ifdef CONFIG_DMA_RESTRICTED_POOL
struct page *swiotlb_alloc(struct device *dev, size_t size);
bool swiotlb_free(struct device *dev, struct page *page, size_t size);
+void swiotlb_free_from_pool(struct device *dev, phys_addr_t tlb_addr,
+ size_t size, struct io_tlb_pool *pool);
static inline bool is_swiotlb_for_alloc(struct device *dev)
{
@@ -299,6 +301,10 @@ static inline bool swiotlb_free(struct device *dev, struct page *page,
{
return false;
}
+static inline void swiotlb_free_from_pool(struct device *dev, phys_addr_t tlb_addr,
+ size_t size, struct io_tlb_pool *pool)
+{
+}
static inline bool is_swiotlb_for_alloc(struct device *dev)
{
return false;
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 583c5922bca2..a741c8a2ee66 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -96,14 +96,6 @@ static int dma_set_encrypted(struct device *dev, void *vaddr, size_t size)
return ret;
}
-static void __dma_direct_free_pages(struct device *dev, struct page *page,
- size_t size)
-{
- if (swiotlb_free(dev, page, size))
- return;
- dma_free_contiguous(dev, page, size);
-}
-
static struct page *dma_direct_alloc_swiotlb(struct device *dev, size_t size)
{
struct page *page = swiotlb_alloc(dev, size);
@@ -125,9 +117,6 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
WARN_ON_ONCE(!PAGE_ALIGNED(size));
- if (is_swiotlb_for_alloc(dev))
- return dma_direct_alloc_swiotlb(dev, size);
-
gfp |= dma_direct_optimal_gfp_mask(dev, &phys_limit);
page = dma_alloc_contiguous(dev, size, gfp);
if (page) {
@@ -204,6 +193,7 @@ void *dma_direct_alloc(struct device *dev, size_t size,
dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
{
bool remap = false, set_uncached = false;
+ bool mark_mem_decrypt = true;
struct page *page;
void *ret;
@@ -250,11 +240,21 @@ void *dma_direct_alloc(struct device *dev, size_t size,
dma_direct_use_pool(dev, gfp))
return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
+ if (is_swiotlb_for_alloc(dev)) {
+ page = dma_direct_alloc_swiotlb(dev, size);
+ if (page) {
+ mark_mem_decrypt = false;
+ goto setup_page;
+ }
+ return NULL;
+ }
+
/* we always manually zero the memory once we are done */
page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO, true);
if (!page)
return NULL;
+setup_page:
/*
* dma_alloc_contiguous can return highmem pages depending on a
* combination the cma= arguments and per-arch setup. These need to be
@@ -281,7 +281,7 @@ void *dma_direct_alloc(struct device *dev, size_t size,
goto out_free_pages;
} else {
ret = page_address(page);
- if (dma_set_decrypted(dev, ret, size))
+ if (mark_mem_decrypt && dma_set_decrypted(dev, ret, size))
goto out_leak_pages;
}
@@ -298,10 +298,11 @@ void *dma_direct_alloc(struct device *dev, size_t size,
return ret;
out_encrypt_pages:
- if (dma_set_encrypted(dev, page_address(page), size))
+ if (mark_mem_decrypt && dma_set_encrypted(dev, page_address(page), size))
return NULL;
out_free_pages:
- __dma_direct_free_pages(dev, page, size);
+ if (!swiotlb_free(dev, page, size))
+ dma_free_contiguous(dev, page, size);
return NULL;
out_leak_pages:
return NULL;
@@ -310,6 +311,9 @@ void *dma_direct_alloc(struct device *dev, size_t size,
void dma_direct_free(struct device *dev, size_t size,
void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs)
{
+ phys_addr_t phys;
+ bool mark_mem_encrypted = true;
+ struct io_tlb_pool *swiotlb_pool;
unsigned int page_order = get_order(size);
if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
@@ -338,16 +342,25 @@ void dma_direct_free(struct device *dev, size_t size,
dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size)))
return;
+ phys = dma_to_phys(dev, dma_addr);
+ swiotlb_pool = swiotlb_find_pool(dev, phys);
+ if (swiotlb_pool)
+ /* Swiotlb doesn't need a page attribute update on free */
+ mark_mem_encrypted = false;
+
if (is_vmalloc_addr(cpu_addr)) {
vunmap(cpu_addr);
} else {
if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_CLEAR_UNCACHED))
arch_dma_clear_uncached(cpu_addr, size);
- if (dma_set_encrypted(dev, cpu_addr, size))
+ if (mark_mem_encrypted && dma_set_encrypted(dev, cpu_addr, size))
return;
}
- __dma_direct_free_pages(dev, dma_direct_to_page(dev, dma_addr), size);
+ if (swiotlb_pool)
+ swiotlb_free_from_pool(dev, phys, size, swiotlb_pool);
+ else
+ dma_free_contiguous(dev, dma_direct_to_page(dev, dma_addr), size);
}
struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
@@ -359,6 +372,15 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
if (force_dma_unencrypted(dev) && dma_direct_use_pool(dev, gfp))
return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
+ if (is_swiotlb_for_alloc(dev)) {
+ page = dma_direct_alloc_swiotlb(dev, size);
+ if (!page)
+ return NULL;
+
+ ret = page_address(page);
+ goto setup_page;
+ }
+
page = __dma_direct_alloc_pages(dev, size, gfp, false);
if (!page)
return NULL;
@@ -366,6 +388,7 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
ret = page_address(page);
if (dma_set_decrypted(dev, ret, size))
goto out_leak_pages;
+setup_page:
memset(ret, 0, size);
*dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
return page;
@@ -377,16 +400,28 @@ void dma_direct_free_pages(struct device *dev, size_t size,
struct page *page, dma_addr_t dma_addr,
enum dma_data_direction dir)
{
+ phys_addr_t phys;
void *vaddr = page_address(page);
+ struct io_tlb_pool *swiotlb_pool;
+ bool mark_mem_encrypted = true;
/* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */
if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
dma_free_from_pool(dev, vaddr, size))
return;
- if (dma_set_encrypted(dev, vaddr, size))
+ phys = page_to_phys(page);
+ swiotlb_pool = swiotlb_find_pool(dev, phys);
+ if (swiotlb_pool)
+ mark_mem_encrypted = false;
+
+ if (mark_mem_encrypted && dma_set_encrypted(dev, vaddr, size))
return;
- __dma_direct_free_pages(dev, page, size);
+
+ if (swiotlb_pool)
+ swiotlb_free_from_pool(dev, phys, size, swiotlb_pool);
+ else
+ dma_free_contiguous(dev, page, size);
}
#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 1abd3e6146f4..ac03a6856c2e 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -1809,6 +1809,12 @@ bool swiotlb_free(struct device *dev, struct page *page, size_t size)
return true;
}
+void swiotlb_free_from_pool(struct device *dev, phys_addr_t tlb_addr, size_t size,
+ struct io_tlb_pool *pool)
+{
+ swiotlb_release_slots(dev, tlb_addr, pool);
+}
+
static int rmem_swiotlb_device_init(struct reserved_mem *rmem,
struct device *dev)
{
--
2.43.0
^ permalink raw reply related
* [PATCH v6 01/20] s390: Expose protected virtualization through cc_platform_has()
From: Aneesh Kumar K.V (Arm) @ 2026-06-04 8:39 UTC (permalink / raw)
To: iommu, linux-arm-kernel, linux-kernel, linux-coco
Cc: Aneesh Kumar K.V (Arm), Robin Murphy, Marek Szyprowski,
Will Deacon, Marc Zyngier, Steven Price, Suzuki K Poulose,
Catalin Marinas, Jiri Pirko, Jason Gunthorpe, Mostafa Saleh,
Petr Tesarik, Alexey Kardashevskiy, Dan Williams, Xu Yilun,
linuxppc-dev, linux-s390, Madhavan Srinivasan, Michael Ellerman,
Nicholas Piggin, Christophe Leroy (CS GROUP), Alexander Gordeev,
Gerald Schaefer, Heiko Carstens, Vasily Gorbik,
Christian Borntraeger, Sven Schnelle, x86, Halil Pasic,
Matthew Rosato, Jaehoon Kim
In-Reply-To: <20260604083959.1265923-1-aneesh.kumar@kernel.org>
Protected virtualization guests use memory encryption, so advertise that to
the rest of the kernel through cc_platform_has(CC_ATTR_MEM_ENCRYPT).
s390 already forces DMA mappings to be unencrypted for protected
virtualization guests through force_dma_unencrypted(). Add
ARCH_HAS_CC_PLATFORM and provide the matching cc_platform_has()
implementation.
Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
---
Cc: Halil Pasic <pasic@linux.ibm.com>
Cc: Matthew Rosato <mjrosato@linux.ibm.com>
Cc: Jaehoon Kim <jhkim@linux.ibm.com>
---
arch/s390/Kconfig | 1 +
arch/s390/mm/init.c | 14 ++++++++++++++
2 files changed, 15 insertions(+)
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index ecbcbb781e40..9b5e6029e043 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -87,6 +87,7 @@ config S390
select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2
select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE
select ARCH_HAS_CC_CAN_LINK
+ select ARCH_HAS_CC_PLATFORM
select ARCH_HAS_CPU_FINALIZE_INIT
select ARCH_HAS_CURRENT_STACK_POINTER
select ARCH_HAS_DEBUG_VIRTUAL
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 1f72efc2a579..ad3c6d92b801 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -50,6 +50,7 @@
#include <linux/virtio_anchor.h>
#include <linux/virtio_config.h>
#include <linux/execmem.h>
+#include <linux/cc_platform.h>
pgd_t swapper_pg_dir[PTRS_PER_PGD] __section(".bss..swapper_pg_dir");
pgd_t invalid_pg_dir[PTRS_PER_PGD] __section(".bss..invalid_pg_dir");
@@ -140,6 +141,19 @@ bool force_dma_unencrypted(struct device *dev)
return is_prot_virt_guest();
}
+
+bool cc_platform_has(enum cc_attr attr)
+{
+ switch (attr) {
+ case CC_ATTR_MEM_ENCRYPT:
+ return is_prot_virt_guest();
+
+ default:
+ return false;
+ }
+}
+EXPORT_SYMBOL_GPL(cc_platform_has);
+
/* protected virtualization */
static void __init pv_init(void)
{
--
2.43.0
^ permalink raw reply related
* [PATCH v6 00/20] dma-mapping: Use DMA_ATTR_CC_SHARED through direct, pool and swiotlb paths
From: Aneesh Kumar K.V (Arm) @ 2026-06-04 8:39 UTC (permalink / raw)
To: iommu, linux-arm-kernel, linux-kernel, linux-coco
Cc: Aneesh Kumar K.V (Arm), Robin Murphy, Marek Szyprowski,
Will Deacon, Marc Zyngier, Steven Price, Suzuki K Poulose,
Catalin Marinas, Jiri Pirko, Jason Gunthorpe, Mostafa Saleh,
Petr Tesarik, Alexey Kardashevskiy, Dan Williams, Xu Yilun,
linuxppc-dev, linux-s390, Madhavan Srinivasan, Michael Ellerman,
Nicholas Piggin, Christophe Leroy (CS GROUP), Alexander Gordeev,
Gerald Schaefer, Heiko Carstens, Vasily Gorbik,
Christian Borntraeger, Sven Schnelle, x86
This series propagates DMA_ATTR_CC_SHARED through the dma-direct,
dma-pool, and swiotlb paths so that encrypted and decrypted DMA buffers
are handled consistently.
Today, the direct DMA path mostly relies on force_dma_unencrypted() for
shared/decrypted buffer handling. This series consolidates the
force_dma_unencrypted() checks in the top-level functions and ensures
that the remaining DMA interfaces use DMA attributes to make the correct
decisions.
The series:
- moves swiotlb-backed allocations out of __dma_direct_alloc_pages(),
- propagates DMA_ATTR_CC_SHARED through the dma-direct alloc/free
paths
- teaches the atomic DMA pools to track encrypted versus decrypted
state
- tracks swiotlb pool encryption state and enforces strict pool
selection
- centralizes encrypted/decrypted pgprot handling in dma_pgprot() using
DMA attributes
- passes DMA attributes down to dma_capable() so capability checks can
validate whether the selected DMA address encoding matches
DMA_ATTR_CC_SHARED
- makes dma_direct_map_phys() choose the DMA address encoding from
DMA_ATTR_CC_SHARED and fall back to swiotlb when a shared DMA request
cannot use the direct mapping, which lets arm64 and x86 CCA guests stop
relying on SWIOTLB_FORCE for DMA mappings
- uses the selected swiotlb pool state to derive the returned DMA
address.
Changes since v5:
https://lore.kernel.org/all/20260522042815.370873-1-aneesh.kumar@kernel.org
* Add Tested-by
* Drop the pKVM patch, which has now been posted separately:
https://lore.kernel.org/all/20260603110522.3331819-1-smostafa@google.com
* Remove the DO_NOT_MERGE tag from the s390 change.
* Add a patch to drop the SWIOTLB_FORCE flag.
* Rebase onto the latest kernel.
Changes since v4:
https://lore.kernel.org/all/20260512090408.794195-1-aneesh.kumar@kernel.org
* Add new patches based on Sashiko review:
swiotlb: Preserve allocation virtual address for dynamic pools
dma: free atomic pool pages by physical address
dma: swiotlb: handle set_memory_decrypted() failures
dma: swiotlb: free dynamic pools from process context
iommu/dma: Check atomic pool allocation result directly
* Include pKVM and s390 changes as dependent patches. These are not yet
ready to merge and are waiting for subsystem testing feedback.
* Drop the AMD GART patch because it requires wider testing.
* Update swiotlb_tbl_map_single() to take attrs by reference.
* Switch swiotlb_free() to use rcu_work.
* Avoid calling swiotlb_find_pool() multiple times in the free path.
* Make DMA_ATTR_MMIO imply DMA_ATTR_CC_SHARED for devices requiring unencrypted DMA.
Changes from v3:
https://lore.kernel.org/all/20260427055509.898190-1-aneesh.kumar@kernel.org
* Handle DMA_ATTR_MMIO correctly in dma_direct_map_phys()
* Address most of sashiko review
* Rebase to latest kernel
* drop SWIOTLB_FORCE for s390 and powerpc secure guest.
Changes from v2:
https://lore.kernel.org/all/20260420061415.3650870-1-aneesh.kumar@kernel.org
* pass attrs to dma_capable() and update direct, swiotlb, Xen swiotlb, and
x86 GART paths so the capability checks see the DMA address attr value
DMA_ATTR_CC_SHARED.
* rework dma_direct_map_phys() so DMA_ATTR_CC_SHARED selects
phys_to_dma_unencrypted() while the default path uses
phys_to_dma_encrypted(), with swiotlb fallback when the requested
shared/private state cannot be satisfied by a direct DMA address.
* stop relying on SWIOTLB_FORCE for arm64 and x86 CC guest DMA mappings;
swiotlb is still enabled there, but shared mappings are now selected
through the generic dma_direct_map_phys()/dma_capable() decision instead
of a global force-bounce flag.
Changes from v1:
https://lore.kernel.org/all/20260417085900.3062416-1-aneesh.kumar@kernel.org
* rebased to latest kernel (change from DMA_ATTR_CC_DECRYPTED -> DMA_ATTR_CC_SHARED)
* update the alloc path so DMA_ATTR_CC_SHARED is not a caller-visible attribute.
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Will Deacon <will@kernel.org>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Steven Price <steven.price@arm.com>
Cc: Suzuki K Poulose <Suzuki.Poulose@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Jiri Pirko <jiri@resnulli.us>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Mostafa Saleh <smostafa@google.com>
Cc: Petr Tesarik <ptesarik@suse.com>
Cc: Alexey Kardashevskiy <aik@amd.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Xu Yilun <yilun.xu@linux.intel.com>
Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux-s390@vger.kernel.org
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: "Christophe Leroy (CS GROUP)" <chleroy@kernel.org>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: x86@kernel.org
Aneesh Kumar K.V (Arm) (20):
s390: Expose protected virtualization through cc_platform_has()
dma-direct: swiotlb: handle swiotlb alloc/free outside
__dma_direct_alloc_pages
dma-direct: use DMA_ATTR_CC_SHARED in alloc/free paths
dma-pool: track decrypted atomic pools and select them via attrs
dma: swiotlb: pass mapping attributes by reference
dma: swiotlb: track pool encryption state and honor DMA_ATTR_CC_SHARED
dma-mapping: make dma_pgprot() honor DMA_ATTR_CC_SHARED
dma-direct: pass attrs to dma_capable() for DMA_ATTR_CC_SHARED checks
dma-direct: make dma_direct_map_phys() honor DMA_ATTR_CC_SHARED
dma-direct: set decrypted flag for remapped DMA allocations
dma-direct: select DMA address encoding from DMA_ATTR_CC_SHARED
dma-pool: fix page leak in atomic_pool_expand() cleanup
dma-direct: rename ret to cpu_addr in alloc helpers
dma-direct: return struct page from dma_direct_alloc_from_pool()
iommu/dma: Check atomic pool allocation result directly
dma: swiotlb: free dynamic pools from process context
dma: swiotlb: handle set_memory_decrypted() failures
dma: free atomic pool pages by physical address
swiotlb: Preserve allocation virtual address for dynamic pools
swiotlb: remove unused SWIOTLB_FORCE flag
arch/arm64/mm/init.c | 4 +-
arch/powerpc/platforms/pseries/svm.c | 2 +-
arch/s390/Kconfig | 1 +
arch/s390/mm/init.c | 16 +-
arch/x86/kernel/amd_gart_64.c | 30 +--
arch/x86/kernel/pci-dma.c | 4 +-
drivers/iommu/dma-iommu.c | 15 +-
drivers/xen/swiotlb-xen.c | 8 +-
include/linux/dma-direct.h | 20 +-
include/linux/dma-map-ops.h | 3 +-
include/linux/swiotlb.h | 21 +-
kernel/dma/direct.c | 275 +++++++++++++++++++--------
kernel/dma/direct.h | 47 ++---
kernel/dma/mapping.c | 16 +-
kernel/dma/pool.c | 221 +++++++++++++++------
kernel/dma/swiotlb.c | 273 ++++++++++++++++++++------
16 files changed, 692 insertions(+), 264 deletions(-)
base-commit: ba3e43a9e601636f5edb54e259a74f96ca3b8fd8
--
2.43.0
^ permalink raw reply
* Re: [PATCH v7 00/42] guest_memfd: In-place conversion support
From: Ackerley Tng @ 2026-06-03 21:27 UTC (permalink / raw)
To: Ackerley Tng via B4 Relay, aik, andrew.jones, binbin.wu, brauner,
chao.p.peng, david, ira.weiny, jmattson, jthoughton, michael.roth,
oupton, pankaj.gupta, qperret, rick.p.edgecombe, rientjes,
shivankg, steven.price, tabba, willy, wyihan, yan.y.zhao,
forkloop, pratyush, suzuki.poulose, aneesh.kumar, liam,
Paolo Bonzini, Sean Christopherson, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka
Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
linux-mm, linux-coco
In-Reply-To: <20260522-gmem-inplace-conversion-v7-0-2f0fae496530@google.com>
Ackerley Tng via B4 Relay <devnull+ackerleytng.google.com@kernel.org>
writes:
> This is v7 of guest_memfd in-place conversion support.
>
Here are the outstanding items after going over everyone's comments
including Sashiko's:
+ KVM: TDX: Make source page optional for KVM_TDX_INIT_MEM_REGION
+ Need to move page clearing into __kvm_gmem_get_pfn to resolve
leak where populate can put initialized kernel memory into TDX
guest
+ See suggested fix at [1]
+ KVM: guest_memfd: Only prepare folios for private pages,
+ s/non-CoCo/CoCo in commit message "INIT_SHARED is about to be
supported for non-CoCo VMs in a later patch in this series"
+ Use Suggested-by: Michael Roth <michael.roth@amd.com>
+ KVM: selftests: Test that shared/private status is consistent across
processes
+ Improve test reliability using pthread_mutex
+ I have a fixup patch offline.
I would like feedback on these:
+ KVM: selftests: Test conversion with elevated page refcount
+ Askar pointed out that soon vmsplice may not pin pages. Should I
pin pages through CONFIG_GUP_TEST like in [2]? I prefer not to
take a dependency on CONFIG_GUP_TEST.
+ KVM: selftests: Add script to exercise private_mem_conversions_test
+ Would like to know what people think of a wrapper script before
I address Sashiko's comments.
[1] https://lore.kernel.org/all/CAEvNRgEVC=fFuKVgZYvWyZD7t_zvUZihFG8hrACjvtkD5cwugw@mail.gmail.com/
[2] https://lore.kernel.org/all/baa8838f623102931e755cf34c86314b305af49c.1747264138.git.ackerleytng@google.com/
>
> [...snip...]
>
^ permalink raw reply
* Re: [PATCH v6 1/4] firmware: smccc: Add an Arm SMCCC bus
From: Sudeep Holla @ 2026-06-03 18:52 UTC (permalink / raw)
To: Aneesh Kumar K.V (Arm)
Cc: linux-coco, linux-arm-kernel, linux-kernel, Catalin Marinas,
Greg KH, Jeremy Linton, Jonathan Cameron, Lorenzo Pieralisi,
Mark Rutland, Will Deacon, Steven Price, Suzuki K Poulose
In-Reply-To: <20260527100233.428018-2-aneesh.kumar@kernel.org>
On Wed, May 27, 2026 at 03:32:30PM +0530, Aneesh Kumar K.V (Arm) wrote:
> SMCCC-discovered firmware services are currently represented by separate
> platform devices, such as smccc_trng and arm-cca-dev. Those devices do not
> represent independent DT/ACPI-described platform resources; they are
> features of the SMCCC firmware interface.
>
> Add an Arm SMCCC bus for services discovered through the SMCCC firmware
> interface. The bus provides SMCCC device and driver registration helpers,
> name-based matching, modalias generation, and a sysfs modalias attribute so
> SMCCC service drivers can bind to discovered firmware services and autoload
> as modules.
>
> Follow-up changes can then register SMCCC firmware services as arm-smccc
> devices instead of creating independent per-feature platform devices.
>
This looks good to me.
> Based on arm_ffa code
>
> Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
> ---
> drivers/firmware/smccc/smccc.c | 158 ++++++++++++++++++++++++++++++
I think it is better to keep it separate, say in bus.c?
--
Regards,
Sudeep
^ permalink raw reply
* Re: SVSM Development Call June 3rd, 2026
From: Jörg Rödel @ 2026-06-03 18:03 UTC (permalink / raw)
To: coconut-svsm, linux-coco
In-Reply-To: <ah7_t33cbOVFJcbo@8bytes.org>
Meeting minutes are now ready for review:
https://github.com/coconut-svsm/governance/pull/110
-Joerg
^ permalink raw reply
* Re: [PATCH v14 08/44] arm64: RMI: Ensure that the RMM has GPT entries for memory
From: Steven Price @ 2026-06-03 15:48 UTC (permalink / raw)
To: Suzuki K Poulose, Marc Zyngier
Cc: kvm, kvmarm, Catalin Marinas, Will Deacon, James Morse,
Oliver Upton, Zenghui Yu, linux-arm-kernel, linux-kernel,
Joey Gouly, Alexandru Elisei, Christoffer Dall, Fuad Tabba,
linux-coco, Ganapatrao Kulkarni, Gavin Shan, Shanker Donthineni,
Alper Gun, Aneesh Kumar K . V, Emi Kisanuki, Vishal Annapurve,
WeiLin.Chang, Lorenzo.Pieralisi2
In-Reply-To: <e175cb7b-b4fa-4139-b46d-1986e2372d16@arm.com>
On 21/05/2026 16:39, Suzuki K Poulose wrote:
> On 21/05/2026 14:47, Marc Zyngier wrote:
>> On Wed, 13 May 2026 14:17:16 +0100,
>> Steven Price <steven.price@arm.com> wrote:
>>>
>>> The RMM maintains the state of all the granules in the system to make
>>> sure that the host is abiding by the rules. This state can be maintained
>>> at different granularity, per page (TRACKING_FINE) or per region
>>> (TRACKING_COARSE). The region size depends on the underlying
>>> "RMI_GRANULE_SIZE". For a "coarse" region all pages in the region must
>>> be of the same state, this implies we need to have "fine" tracking for
>>> DRAM, so that we can delegate individual pages.
>>>
>>> For now we only support a statically carved out memory for tracking
>>> granules for the "fine" regions. This can be extended in the future to
>>> allow modifying the tracking granularity and remove the need for a
>>> static allocation.
>>>
>>> Similarly, the firmware may create L0 GPT entries describing the total
>>> address space. But if we change the "PAS" (Physical Address Space) of a
>>> granule then the firmware may need to create L1 tables to track the PAS
>>> at a finer granularity.
>>>
>>> Note: support is currently missing for SROs which means that if the RMM
>>> needs memory donating this will fail (and render CCA unusable in Linux).
>>> This effectively means that the L1 GPT tables must be created before
>>> Linux starts.
>>>
>>> Signed-off-by: Steven Price <steven.price@arm.com>
>>> ---
>>> Changes since v13:
>>> * Moved out of KVM
>>> ---
>>> arch/arm64/include/asm/rmi_cmds.h | 2 +
>>> arch/arm64/kernel/rmi.c | 103 ++++++++++++++++++++++++++++++
>>> 2 files changed, 105 insertions(+)
>>>
>>> diff --git a/arch/arm64/include/asm/rmi_cmds.h b/arch/arm64/include/
>>> asm/rmi_cmds.h
>>> index 9179934925c5..9078a2920a7c 100644
>>> --- a/arch/arm64/include/asm/rmi_cmds.h
>>> +++ b/arch/arm64/include/asm/rmi_cmds.h
>>> @@ -33,6 +33,8 @@ struct rmi_sro_state {
>>> } while (RMI_RETURN_STATUS(res.a0) == RMI_BUSY || \
>>> RMI_RETURN_STATUS(res.a0) == RMI_BLOCKED)
>>> +bool rmi_is_available(void);
>>> +
>>> unsigned long rmi_sro_execute(struct rmi_sro_state *sro, gfp_t gfp);
>>> void rmi_sro_free(struct rmi_sro_state *sro);
>>> diff --git a/arch/arm64/kernel/rmi.c b/arch/arm64/kernel/rmi.c
>>> index a14ead5dedda..52a415e99500 100644
>>> --- a/arch/arm64/kernel/rmi.c
>>> +++ b/arch/arm64/kernel/rmi.c
>>> @@ -7,6 +7,8 @@
>>> #include <asm/rmi_cmds.h>
>>> +static bool arm64_rmi_is_available;
>>> +
>>> unsigned long rmm_feat_reg0;
>>> unsigned long rmm_feat_reg1;
>>> @@ -88,6 +90,102 @@ static int rmi_configure(void)
>>> return 0;
>>> }
>>> +/*
>>> + * For now we set the tracking_region_size to 0 for
>>> RMI_RMM_CONFIG_SET().
>>> + * TODO: Support other tracking sizes (via Kconfig option).
>>> + */
>>> +#ifdef CONFIG_PAGE_SIZE_4KB
>>> +#define RMM_GRANULE_TRACKING_SIZE SZ_1G
>>> +#elif defined(CONFIG_PAGE_SIZE_16KB)
>>> +#define RMM_GRANULE_TRACKING_SIZE SZ_32M
>>> +#elif defined(CONFIG_PAGE_SIZE_64KB)
>>> +#define RMM_GRANULE_TRACKING_SIZE SZ_512M
>>> +#endif
>>
>> Basically, a level 2 mapping. Which means this whole block really is:
>>
>> #define RMM_GRANULE_TRACKING_SIZE (2 * PAGE_SHIFT - 3)
>>
>> (adjust for D128 as needed).
>
> True,
As Gavin pointed out we actually don't need this anymore because of the
move to a range based API.
It's also not quite that simple because for 4K PAGE_SIZE the RMM
doesn't support 2MB (which would be the level 2 size), instead jumping
to 1GB. And if we add a Kconfig option in the future then this could
change because of that.
For now I'll just delete this block since it's unused.
>>
>>> +
>>> +/*
>>> + * Make sure the area is tracked by RMM at FINE granularity.
>>> + * We do not support changing the tracking yet.
>>> + */
>>> +static int rmi_verify_memory_tracking(phys_addr_t start, phys_addr_t
>>> end)
>>> +{
>>> + while (start < end) {
>>> + unsigned long ret, category, state, next;
>>> +
>>> + ret = rmi_granule_tracking_get(start, end, &category,
>>> &state, &next);
>>> + if (ret != RMI_SUCCESS ||
>>> + state != RMI_TRACKING_FINE ||
>>> + category != RMI_MEM_CATEGORY_CONVENTIONAL) {
>>> + /* TODO: Set granule tracking in this case */
>>> + pr_err("Granule tracking for region isn't fine/
>>> conventional: %llx",
>>> + start);
>>> + return -ENODEV;
>>
>> How is this triggered? Do we really need to spam the console with
>> this? A PA doesn't mean much, and there is no context (stack trace).
I'm not sure 1 message really counts as 'spam' - it provides the
information on why the RMI interface (and therefore realm guests) is
unavailable. The PA might help track down whether this physical region
was intended to be given to Linux.
> This could be triggered if the RMM doesn't have static carveout
> for tracking the DRAM granules. (state != RMI_TRACKING_FINE).
> This is not worth WARN_ONCE(), we could simply not enable KVM.
> We plan to add support for donating memory to the RMM in
> the future. (Primarily we don't yet have an RMM implementation
> that does dynamic management via SRO. This can be added later
> as a separate series)
As Suzuki says - this case should be handled in the future - so it's a
limitation in the current implementation. So a WARN_ONCE is a bit strong
- it's not a "can never happen" situation - it's a "Linux doesn't
support this (yet)".
>>
>> If that's not expected, turn this into a WARN_ONCE().
>
>
>
>
>>
>>> + }
>>> + start = next;
>>> + }
>>> +
>>> + return 0;
>>> +}
>>> +
>>> +static unsigned long rmi_l0gpt_size(void)
>>> +{
>>> + return 1UL << (30 + FIELD_GET(RMI_FEATURE_REGISTER_1_L0GPTSZ,
>>> + rmm_feat_reg1));
>>> +}
>>> +
>>> +static int rmi_create_gpts(phys_addr_t start, phys_addr_t end)
>>> +{
>>> + unsigned long l0gpt_sz = rmi_l0gpt_size();
>>> +
>>> + start = ALIGN_DOWN(start, l0gpt_sz);
>>> + end = ALIGN(end, l0gpt_sz);
>>> +
>>> + while (start < end) {
>>> + int ret = rmi_gpt_l1_create(start);
>>> +
>>> + /*
>>> + * Make sure the L1 GPT tables are created for the region.
>>> + * RMI_ERROR_GPT indicates the L1 table already exists.
>>> + */
>>> + if (ret && ret != RMI_ERROR_GPT) {
>>> + /*
>>> + * FIXME: Handle SRO so that memory can be donated for
>>> + * the tables.
>>> + */
>>> + pr_err("GPT Level1 table missing for %llx\n", start);
>>> + return -ENOMEM;
>>
>> If any of this fails, where is the cleanup done? Is that part of the
>> missing SRO support that's indicated in the commit message?
>>
>
> For now, there is no cleanup required. What we essentially do here is
> making sure that the GPT tables have been created up to L1 (i.e.,
> by checking ret == RMI_ERROR_GPT).
>
> We do not donate any memory now, but only support RMMs with static
> memory carved out for L1 GPT. Support for dynamic RMMs could be added as
> a separate series, at which point, we could defer the table creation to
> the actual use case (e.g, RMI_GRANULE_DELEGATE).
>
> Clean up would be required when we donate memory to the RMM.
The missing SRO support is why we're not donating memory - with that
missing the clean up is unnecessary as Suzuki says.
>>> + }
>>> + start += l0gpt_sz;
>>> + }
>>> +
>>> + return 0;
>>> +}
>>> +
>>> +static int rmi_init_metadata(void)
>>> +{
>>> + phys_addr_t start, end;
>>> + const struct memblock_region *r;
>>> +
>>> + for_each_mem_region(r) {
>>> + int ret;
>>> +
>>> + start = memblock_region_memory_base_pfn(r) << PAGE_SHIFT;
>>> + end = memblock_region_memory_end_pfn(r) << PAGE_SHIFT;
>>> + ret = rmi_verify_memory_tracking(start, end);
>>> + if (ret)
>>> + return ret;
>>> + ret = rmi_create_gpts(start, end);
>>> + if (ret)
>>> + return ret;
>>> + }
>>
>> How does this work with, say, memory hotplug?
>
> Good point, we need a hook for hotplug to make sure this is taken care
> of. As mentioned above, when we add support for RMM with support for
> dynamic Tracking/GPT with SRO, this could be deferred to the actual
> use (handling RMI return codes, RMI_ERROR_TRACKING/RMI_ERROR_GPT)
Yep, that was an oversight - we definitely will need to handle hotplug.
Thanks,
Steve
> Suzuki
>
>
>>
>>> +
>>> + return 0;
>>> +}
>>> +
>>> +bool rmi_is_available(void)
>>> +{
>>> + return arm64_rmi_is_available;
>>> +}
>>> +
>>> static int __init arm64_init_rmi(void)
>>> {
>>> /* Continue without realm support if we can't agree on a
>>> version */
>>> @@ -101,6 +199,11 @@ static int __init arm64_init_rmi(void)
>>> if (rmi_configure())
>>> return 0;
>>> + if (rmi_init_metadata())
>>> + return 0;
>>> +
>>> + arm64_rmi_is_available = true;
>>> + pr_info("RMI configured");
>>> return 0;
>>> }
>>
>> Thanks,
>>
>> M.
>>
>
^ permalink raw reply
* Re: [PATCH v14 08/44] arm64: RMI: Ensure that the RMM has GPT entries for memory
From: Steven Price @ 2026-06-03 15:48 UTC (permalink / raw)
To: Gavin Shan, kvm, kvmarm
Cc: Catalin Marinas, Marc Zyngier, Will Deacon, James Morse,
Oliver Upton, Suzuki K Poulose, Zenghui Yu, linux-arm-kernel,
linux-kernel, Joey Gouly, Alexandru Elisei, Christoffer Dall,
Fuad Tabba, linux-coco, Ganapatrao Kulkarni, Shanker Donthineni,
Alper Gun, Aneesh Kumar K . V, Emi Kisanuki, Vishal Annapurve,
WeiLin.Chang, Lorenzo.Pieralisi2
In-Reply-To: <2ad282da-88fd-49ab-8145-964ff298ca83@redhat.com>
On 21/05/2026 01:58, Gavin Shan wrote:
> Hi Steven,
>
> On 5/13/26 11:17 PM, Steven Price wrote:
>> The RMM maintains the state of all the granules in the system to make
>> sure that the host is abiding by the rules. This state can be maintained
>> at different granularity, per page (TRACKING_FINE) or per region
>> (TRACKING_COARSE). The region size depends on the underlying
>> "RMI_GRANULE_SIZE". For a "coarse" region all pages in the region must
>> be of the same state, this implies we need to have "fine" tracking for
>> DRAM, so that we can delegate individual pages.
>>
>> For now we only support a statically carved out memory for tracking
>> granules for the "fine" regions. This can be extended in the future to
>> allow modifying the tracking granularity and remove the need for a
>> static allocation.
>>
>> Similarly, the firmware may create L0 GPT entries describing the total
>> address space. But if we change the "PAS" (Physical Address Space) of a
>> granule then the firmware may need to create L1 tables to track the PAS
>> at a finer granularity.
>>
>> Note: support is currently missing for SROs which means that if the RMM
>> needs memory donating this will fail (and render CCA unusable in Linux).
>> This effectively means that the L1 GPT tables must be created before
>> Linux starts.
>>
>> Signed-off-by: Steven Price <steven.price@arm.com>
>> ---
>> Changes since v13:
>> * Moved out of KVM
>> ---
>> arch/arm64/include/asm/rmi_cmds.h | 2 +
>> arch/arm64/kernel/rmi.c | 103 ++++++++++++++++++++++++++++++
>> 2 files changed, 105 insertions(+)
>>
>> diff --git a/arch/arm64/include/asm/rmi_cmds.h b/arch/arm64/include/
>> asm/rmi_cmds.h
>> index 9179934925c5..9078a2920a7c 100644
>> --- a/arch/arm64/include/asm/rmi_cmds.h
>> +++ b/arch/arm64/include/asm/rmi_cmds.h
>> @@ -33,6 +33,8 @@ struct rmi_sro_state {
>> } while (RMI_RETURN_STATUS(res.a0) == RMI_BUSY || \
>> RMI_RETURN_STATUS(res.a0) == RMI_BLOCKED)
>> +bool rmi_is_available(void);
>> +
>> unsigned long rmi_sro_execute(struct rmi_sro_state *sro, gfp_t gfp);
>> void rmi_sro_free(struct rmi_sro_state *sro);
>> diff --git a/arch/arm64/kernel/rmi.c b/arch/arm64/kernel/rmi.c
>> index a14ead5dedda..52a415e99500 100644
>> --- a/arch/arm64/kernel/rmi.c
>> +++ b/arch/arm64/kernel/rmi.c
>> @@ -7,6 +7,8 @@
>> #include <asm/rmi_cmds.h>
>> +static bool arm64_rmi_is_available;
>> +
>> unsigned long rmm_feat_reg0;
>> unsigned long rmm_feat_reg1;
>> @@ -88,6 +90,102 @@ static int rmi_configure(void)
>> return 0;
>> }
>> +/*
>> + * For now we set the tracking_region_size to 0 for
>> RMI_RMM_CONFIG_SET().
>> + * TODO: Support other tracking sizes (via Kconfig option).
>> + */
>> +#ifdef CONFIG_PAGE_SIZE_4KB
>> +#define RMM_GRANULE_TRACKING_SIZE SZ_1G
>> +#elif defined(CONFIG_PAGE_SIZE_16KB)
>> +#define RMM_GRANULE_TRACKING_SIZE SZ_32M
>> +#elif defined(CONFIG_PAGE_SIZE_64KB)
>> +#define RMM_GRANULE_TRACKING_SIZE SZ_512M
>> +#endif
>> +
>
> RMM_GRANULE_TRACKING_SIZE is never used in this series.
Ah, good spot. In a previous version the tracking size was necessary
when walking below. But the spec was updated to a range based API so
this is no longer necessary.
>> +/*
>> + * Make sure the area is tracked by RMM at FINE granularity.
>> + * We do not support changing the tracking yet.
>> + */
>> +static int rmi_verify_memory_tracking(phys_addr_t start, phys_addr_t
>> end)
>> +{
>> + while (start < end) {
>> + unsigned long ret, category, state, next;
>> +
>> + ret = rmi_granule_tracking_get(start, end, &category, &state,
>> &next);
>> + if (ret != RMI_SUCCESS ||
>> + state != RMI_TRACKING_FINE ||
>> + category != RMI_MEM_CATEGORY_CONVENTIONAL) {
>> + /* TODO: Set granule tracking in this case */
>> + pr_err("Granule tracking for region isn't fine/
>> conventional: %llx",
>> + start);
>> + return -ENODEV;
>> + }
>> + start = next;
>> + }
>> +
>> + return 0;
>> +}
>> +
>> +static unsigned long rmi_l0gpt_size(void)
>> +{
>> + return 1UL << (30 + FIELD_GET(RMI_FEATURE_REGISTER_1_L0GPTSZ,
>> + rmm_feat_reg1));
>> +}
>> +
>
> rmi_l0gpt_size() is only used by rmi_create_gpts(), its logic can be
> combined to that function.
True - I think partly due to the long line I split this into a separate
function. But I could do something like:
unsigned long l0gpt_sz;
l0gpt_sz = 1UL << (30 + FIELD_GET(RMI_FEATURE_REGISTER_1_L0GPTSZ,
rmi_feat_reg(1)));
which isn't too bad.
Thanks,
Steve
>> +static int rmi_create_gpts(phys_addr_t start, phys_addr_t end)
>> +{
>> + unsigned long l0gpt_sz = rmi_l0gpt_size();
>> +
>> + start = ALIGN_DOWN(start, l0gpt_sz);
>> + end = ALIGN(end, l0gpt_sz);
>> +
>> + while (start < end) {
>> + int ret = rmi_gpt_l1_create(start);
>> +
>> + /*
>> + * Make sure the L1 GPT tables are created for the region.
>> + * RMI_ERROR_GPT indicates the L1 table already exists.
>> + */
>> + if (ret && ret != RMI_ERROR_GPT) {
>> + /*
>> + * FIXME: Handle SRO so that memory can be donated for
>> + * the tables.
>> + */
>> + pr_err("GPT Level1 table missing for %llx\n", start);
>> + return -ENOMEM;
>> + }
>> + start += l0gpt_sz;
>> + }
>> +
>> + return 0;
>> +}
>> +
>> +static int rmi_init_metadata(void)
>> +{
>> + phys_addr_t start, end;
>> + const struct memblock_region *r;
>> +
>> + for_each_mem_region(r) {
>> + int ret;
>> +
>> + start = memblock_region_memory_base_pfn(r) << PAGE_SHIFT;
>> + end = memblock_region_memory_end_pfn(r) << PAGE_SHIFT;
>> + ret = rmi_verify_memory_tracking(start, end);
>> + if (ret)
>> + return ret;
>> + ret = rmi_create_gpts(start, end);
>> + if (ret)
>> + return ret;
>> + }
>> +
>> + return 0;
>> +}
>> +
>> +bool rmi_is_available(void)
>> +{
>> + return arm64_rmi_is_available;
>> +}
>> +
>> static int __init arm64_init_rmi(void)
>> {
>> /* Continue without realm support if we can't agree on a version */
>> @@ -101,6 +199,11 @@ static int __init arm64_init_rmi(void)
>> if (rmi_configure())
>> return 0;
>> + if (rmi_init_metadata())
>> + return 0;
>> +
>> + arm64_rmi_is_available = true;
>> + pr_info("RMI configured");
>> return 0;
>> }
>
> Thanks,
> Gavin
>
^ permalink raw reply
* Re: [PATCH v14 08/44] arm64: RMI: Ensure that the RMM has GPT entries for memory
From: Steven Price @ 2026-06-03 15:48 UTC (permalink / raw)
To: Aneesh Kumar K.V, kvm, kvmarm
Cc: Catalin Marinas, Marc Zyngier, Will Deacon, James Morse,
Oliver Upton, Suzuki K Poulose, Zenghui Yu, linux-arm-kernel,
linux-kernel, Joey Gouly, Alexandru Elisei, Christoffer Dall,
Fuad Tabba, linux-coco, Ganapatrao Kulkarni, Gavin Shan,
Shanker Donthineni, Alper Gun, Emi Kisanuki, Vishal Annapurve,
WeiLin.Chang, Lorenzo.Pieralisi2
In-Reply-To: <yq5aa4twt03b.fsf@kernel.org>
On 19/05/2026 06:55, Aneesh Kumar K.V wrote:
>> +
>> +bool rmi_is_available(void)
>> +{
>> + return arm64_rmi_is_available;
>> +}
>> +
>
> Can we rename to is_rmi_available(void) ?
Sure, will do.
Thanks,
Steve
^ permalink raw reply
* Re: [PATCH v14 07/44] arm64: RMI: Configure the RMM with the host's page size
From: Steven Price @ 2026-06-03 15:48 UTC (permalink / raw)
To: Marc Zyngier
Cc: kvm, kvmarm, Catalin Marinas, Will Deacon, James Morse,
Oliver Upton, Suzuki K Poulose, Zenghui Yu, linux-arm-kernel,
linux-kernel, Joey Gouly, Alexandru Elisei, Christoffer Dall,
Fuad Tabba, linux-coco, Ganapatrao Kulkarni, Gavin Shan,
Shanker Donthineni, Alper Gun, Aneesh Kumar K . V, Emi Kisanuki,
Vishal Annapurve, WeiLin.Chang, Lorenzo.Pieralisi2
In-Reply-To: <86a4tsx536.wl-maz@kernel.org>
On 21/05/2026 14:30, Marc Zyngier wrote:
> On Wed, 13 May 2026 14:17:15 +0100,
> Steven Price <steven.price@arm.com> wrote:
>>
>> RMM v2.0 brings the ability to set the RMM's granule size. Check the
>> feature registers and configure the RMM so that it matches the host's
>> page size. This means that operations can be done with a granularity
>> equal to PAGE_SIZE.
>>
>> Signed-off-by: Steven Price <steven.price@arm.com>
>> ---
>> Changes since v13:
>> * Moved out of KVM.
>> ---
>> arch/arm64/kernel/rmi.c | 42 +++++++++++++++++++++++++++++++++++++++++
>> 1 file changed, 42 insertions(+)
>>
>> diff --git a/arch/arm64/kernel/rmi.c b/arch/arm64/kernel/rmi.c
>> index 99c1ccc35c11..a14ead5dedda 100644
>> --- a/arch/arm64/kernel/rmi.c
>> +++ b/arch/arm64/kernel/rmi.c
>> @@ -49,6 +49,45 @@ static int rmi_check_version(void)
>> return 0;
>> }
>>
>> +static int rmi_configure(void)
>> +{
>> + struct rmm_config *config __free(free_page) = NULL;
>> + unsigned long ret;
>> +
>> + config = (struct rmm_config *)get_zeroed_page(GFP_KERNEL);
>> + if (!config)
>> + return -ENOMEM;
>
> This is the sort of buggy construct that is highlighted in
> include/linux/cleanup.h: initialising the object for cleanup with
> NULL, and only later assigning the expected value.
>
> It may not matter here, but it will catch you (or more probably me) in
> the future.
Good spot. I have to admit I'm still getting the hang of these cleanup
handlers.
>> +
>> + switch (PAGE_SIZE) {
>> + case SZ_4K:
>> + config->rmi_granule_size = RMI_GRANULE_SIZE_4KB;
>> + break;
>> + case SZ_16K:
>> + config->rmi_granule_size = RMI_GRANULE_SIZE_16KB;
>> + break;
>> + case SZ_64K:
>> + config->rmi_granule_size = RMI_GRANULE_SIZE_64KB;
>> + break;
>> + default:
>> + pr_err("Unsupported PAGE_SIZE for RMM\n");
>
> Do you really anticipate PAGE_SIZE being any other value? This is 100%
> dead code. If you want to be extra cautious, have a BUILD_BUG_ON().
No, but falling through is clearly wrong (and likely to trigger AI
review comments if nothing else) - BUILD_BUG() sounds like a good solution.
>> + return -EINVAL;
>> + }
>> +
>> + ret = rmi_rmm_config_set(virt_to_phys(config));
>> + if (ret) {
>> + pr_err("RMM config set failed\n");
>> + return -EINVAL;
>> + }
>
> What is the life cycle of the page when the call succeeds? Is it
> switched back to the NS PAS and allowed to be freed?
Yes, as Suzuki answered - it never leaves the NS PAS. The RMM just reads it.
Thanks,
Steve
>> +
>> + ret = rmi_rmm_activate();
>> + if (ret) {
>> + pr_err("RMM activate failed\n");
>> + return -ENXIO;
>> + }
>> +
>> + return 0;
>> +}
>> +
>> static int __init arm64_init_rmi(void)
>> {
>> /* Continue without realm support if we can't agree on a version */
>> @@ -60,6 +99,9 @@ static int __init arm64_init_rmi(void)
>> if (WARN_ON(rmi_features(1, &rmm_feat_reg1)))
>> return 0;
>>
>> + if (rmi_configure())
>> + return 0;
>> +
>> return 0;
>> }
>> subsys_initcall(arm64_init_rmi);
>
> Thanks,
>
> M.
>
^ permalink raw reply
* Re: [PATCH v7 07/42] KVM: guest_memfd: Only prepare folios for private pages
From: Michael Roth @ 2026-06-03 13:54 UTC (permalink / raw)
To: Ackerley Tng
Cc: Suzuki K Poulose, aik, andrew.jones, binbin.wu, brauner,
chao.p.peng, david, ira.weiny, jmattson, jthoughton, oupton,
pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
steven.price, tabba, willy, wyihan, yan.y.zhao, forkloop,
pratyush, aneesh.kumar, liam, Paolo Bonzini, Sean Christopherson,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
H. Peter Anvin, Steven Rostedt, Masami Hiramatsu,
Mathieu Desnoyers, Jonathan Corbet, Shuah Khan, Shuah Khan,
Vishal Annapurve, Andrew Morton, Chris Li, Kairui Song,
Kemeng Shi, Nhat Pham, Baoquan He, Barry Song, Axel Rasmussen,
Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng, Shakeel Butt,
Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka, kvm,
linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
linux-mm, linux-coco
In-Reply-To: <CAEvNRgGzOnA34WyOHtkOx5MZDZhOHaXAe+nD75AiJsZ-PsTSFQ@mail.gmail.com>
On Tue, Jun 02, 2026 at 01:46:09PM -0700, Ackerley Tng wrote:
> Suzuki K Poulose <suzuki.poulose@arm.com> writes:
>
> > On 23/05/2026 01:17, Ackerley Tng via B4 Relay wrote:
> >> From: Ackerley Tng <ackerleytng@google.com>
> >>
> >> All-shared guest_memfd used to be only supported for non-CoCo VMs where
> >> preparation doesn't apply. INIT_SHARED is about to be supported for
> >> non-CoCo VMs in a later patch in this series.
> >
> > nit: s/non-CoCo/CoCo ?
> >
>
> Yes, thanks!
>
> >>
> >> In addition, KVM_SET_MEMORY_ATTRIBUTES2 is about to be supported in
> >> guest_memfd in a later patch in this series.
> >>
> >> This means that the kvm fault handler may now call kvm_gmem_get_pfn() on a
> >> shared folio for a CoCo VM where preparation applies.
> >>
> >> Add a check to make sure that preparation is only performed for private
> >> folios.
> >>
> >> Preparation will be undone on freeing (see kvm_gmem_free_folio()) and on
> >> conversion to shared.
> >>
> >> Signed-off-by: Michael Roth <michael.roth@amd.com>
> >
> > nit: Missing Co-Developed-by: ?
> >
>
> IIRC this should have been
>
> Suggested-by: Michael Roth <michael.roth@amd.com>
>
> IIRC Michael suggested this on one of the guest_memfd calls, Michael
> please let me know if you remember otherwise!
That rings a bell. Feel free to add, or just drop the stray SoB, either
way.
-Mike
>
> >>
> >> [...snip...]
> >>
^ permalink raw reply
* Re: [PATCH v7 07/42] KVM: guest_memfd: Only prepare folios for private pages
From: Michael Roth @ 2026-06-03 13:51 UTC (permalink / raw)
To: Suzuki K Poulose
Cc: Ackerley Tng, aik, andrew.jones, binbin.wu, brauner, chao.p.peng,
david, ira.weiny, jmattson, jthoughton, oupton, pankaj.gupta,
qperret, rick.p.edgecombe, rientjes, shivankg, steven.price,
tabba, willy, wyihan, yan.y.zhao, forkloop, pratyush,
aneesh.kumar, liam, Paolo Bonzini, Sean Christopherson,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
H. Peter Anvin, Steven Rostedt, Masami Hiramatsu,
Mathieu Desnoyers, Jonathan Corbet, Shuah Khan, Shuah Khan,
Vishal Annapurve, Andrew Morton, Chris Li, Kairui Song,
Kemeng Shi, Nhat Pham, Baoquan He, Barry Song, Axel Rasmussen,
Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng, Shakeel Butt,
Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka, kvm,
linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
linux-mm, linux-coco
In-Reply-To: <88cae738-18e9-4ed3-8414-506a1ad8fb18@arm.com>
On Wed, Jun 03, 2026 at 09:58:45AM +0100, Suzuki K Poulose wrote:
> On 02/06/2026 23:41, Ackerley Tng wrote:
> > Suzuki K Poulose <suzuki.poulose@arm.com> writes:
> >
> > >
> > > [...snip...]
> > >
> > > > > @@ -914,7 +916,8 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct
> > > > > kvm_memory_slot *slot,
> > > > > folio_mark_uptodate(folio);
> > > > > }
> > > > > - r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
> > > > > + if (kvm_gmem_is_private_mem(inode, index))
> > > >
> > > > Don't we need to make sure the entire folio is private ? Not just the
> > > > page at the index ?
> > > > if (kvm_gmem_range_is_private(, index, folio_nr_pages(folio)) ?
> >
> > I was thinking to fix this when I do huge pages, for now guest_memfd is
> > always just PAGE_SIZE, so just looking up index is fine.
> >
> > Is that okay?
>
> That's fine, but would be good to enforce that here, so that we don't miss
> out when we add support for multi page folios.
We sort of already enforce that in kvm_gmem_get_folio():
/*
* External interfaces like kvm_gmem_get_pfn() support dealing
* with hugepages to a degree, but internally, guest_memfd currently
* assumes that all folios are order-0 and handling would need
* to be updated for anything otherwise (e.g. page-clearing
* operations).
*/
WARN_ON_ONCE(!IS_ERR(folio) && folio_order(folio));
which was done as part of:
commit 6538b6221cc2feda415ca1946e66a5ef02dc6a0a
Author: Michael Roth <michael.roth@amd.com>
Date: Thu Jan 8 15:46:18 2026 -0600
KVM: guest_memfd: Remove partial hugepage handling from kvm_gmem_populate()
and that should trigger before you even reach the prepare path, so I think
that's covered.
In general, there was some previous discussion where we decided we would stop wasting
time guessing at what we'll need to do for hugepages and instead just strip out
the partial support. Sean wanted the folio order kept as part of the internal API
since we know MMU will need that one way or another, but elsewhere within
guest_memfd we are okay to assume 4K. If we *know* certain points that will need
to change then a comment mentioning it isn't a bad idea, but even those comments
have tended to be wrong so far about exactly what changes are supposed to happen.
I'm not sure where the original discussion happened but there's some aftermath
discussion here[1] that I think summarizes current [non-]plans around
prepare+hugepages.
[1] https://lore.kernel.org/kvm/20250711163440.kwjebnzd7zeb4bxt@amd.com/
>
> >
> > >
> > > Or rather, we should go through the individual pages and apply the
> > > prepare for ones that are private ?
> > >
> > > Suzuki
> > >
> >
> > IIRC the plan was to make kvm_gmem_prepare_folio() idempotent, as in, if
> > a page is already private, just skip. Currently sev_gmem_prepare() does
> > a pr_debug(), which I guess is technically still idempotent.
> >
> > I'm thinking that the information that needs tracking to make
> > .gmem_prepare() idempotent should be tracked by arch code.
> >
> > Does this work for ARM CCA?
>
> We don't hook into the prepare yet, but have plans to do that. We should
> be able to handle the pages that are already private. (For CCA context,
> RMI_GRANULE_DELEGATE_RANGE can skip over already REALM pages). So this
> should be fine.
>
> My point is, in a given folio, there may be pages that are shared.
> Like you said, this could be dealt with when we support hugepages.
Sounds good, that's also what SNP will do once hugepages come along.
-Mike
>
> Suzuki
>
>
> >
> > > >
> > > > [...snip...]
> > > >
>
^ permalink raw reply
* Re: [PATCH v14 06/44] arm64: RMI: Check for RMI support at init
From: Steven Price @ 2026-06-03 10:57 UTC (permalink / raw)
To: Marc Zyngier
Cc: kvm, kvmarm, Catalin Marinas, Will Deacon, James Morse,
Oliver Upton, Suzuki K Poulose, Zenghui Yu, linux-arm-kernel,
linux-kernel, Joey Gouly, Alexandru Elisei, Christoffer Dall,
Fuad Tabba, linux-coco, Ganapatrao Kulkarni, Gavin Shan,
Shanker Donthineni, Alper Gun, Aneesh Kumar K . V, Emi Kisanuki,
Vishal Annapurve, WeiLin.Chang, Lorenzo.Pieralisi2
In-Reply-To: <86bje8x6dj.wl-maz@kernel.org>
On 21/05/2026 14:02, Marc Zyngier wrote:
> On Wed, 13 May 2026 14:17:14 +0100,
> Steven Price <steven.price@arm.com> wrote:
>>
>> Query the RMI version number and check if it is a compatible version.
>> The first two feature registers are read and exposed for future code to
>> use.
>>
>> Signed-off-by: Steven Price <steven.price@arm.com>
>> ---
>> v14:
>> * This moves the basic RMI setup into the 'kernel' directory. This is
>> because RMI will be used for some features outside of KVM so should
>> be available even if KVM isn't compiled in.
>> ---
>> arch/arm64/include/asm/rmi_cmds.h | 3 ++
>> arch/arm64/kernel/Makefile | 2 +-
>> arch/arm64/kernel/cpufeature.c | 1 +
>> arch/arm64/kernel/rmi.c | 65 +++++++++++++++++++++++++++++++
>> 4 files changed, 70 insertions(+), 1 deletion(-)
>> create mode 100644 arch/arm64/kernel/rmi.c
>>
>> diff --git a/arch/arm64/include/asm/rmi_cmds.h b/arch/arm64/include/asm/rmi_cmds.h
>> index 04f7066894e9..9179934925c5 100644
>> --- a/arch/arm64/include/asm/rmi_cmds.h
>> +++ b/arch/arm64/include/asm/rmi_cmds.h
>> @@ -10,6 +10,9 @@
>>
>> #include <asm/rmi_smc.h>
>>
>> +extern unsigned long rmm_feat_reg0;
>> +extern unsigned long rmm_feat_reg1;
>> +
>> struct rtt_entry {
>> unsigned long walk_level;
>> unsigned long desc;
>> diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
>> index 74b76bb70452..d68f351aae75 100644
>> --- a/arch/arm64/kernel/Makefile
>> +++ b/arch/arm64/kernel/Makefile
>> @@ -34,7 +34,7 @@ obj-y := debug-monitors.o entry.o irq.o fpsimd.o \
>> cpufeature.o alternative.o cacheinfo.o \
>> smp.o smp_spin_table.o topology.o smccc-call.o \
>> syscall.o proton-pack.o idle.o patching.o pi/ \
>> - rsi.o jump_label.o
>> + rsi.o jump_label.o rmi.o
>>
>> obj-$(CONFIG_COMPAT) += sys32.o signal32.o \
>> sys_compat.o
>> diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
>> index 6d53bb15cf7b..8bdd95a8c2de 100644
>> --- a/arch/arm64/kernel/cpufeature.c
>> +++ b/arch/arm64/kernel/cpufeature.c
>> @@ -292,6 +292,7 @@ static const struct arm64_ftr_bits ftr_id_aa64isar3[] = {
>> static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = {
>> ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_CSV3_SHIFT, 4, 0),
>> ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_CSV2_SHIFT, 4, 0),
>> + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_RME_SHIFT, 4, 0),
>> ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_DIT_SHIFT, 4, 0),
>> ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_AMU_SHIFT, 4, 0),
>> ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_MPAM_SHIFT, 4, 0),
>> diff --git a/arch/arm64/kernel/rmi.c b/arch/arm64/kernel/rmi.c
>> new file mode 100644
>> index 000000000000..99c1ccc35c11
>> --- /dev/null
>> +++ b/arch/arm64/kernel/rmi.c
>> @@ -0,0 +1,65 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/*
>> + * Copyright (C) 2023-2025 ARM Ltd.
>> + */
>> +
>> +#include <linux/memblock.h>
>> +
>> +#include <asm/rmi_cmds.h>
>> +
>> +unsigned long rmm_feat_reg0;
>> +unsigned long rmm_feat_reg1;
>
> What is the requirement for making those globally accessible? Can't
> they be made static and use an accessor that returns them? Can the
> variables be made __ro_after_init?
Good point - there's no requirement. Also the name isn't quite right -
these should be named rmi_ as there is a different set for RSI.
>> +
>> +static int rmi_check_version(void)
>> +{
>> + struct arm_smccc_res res;
>> + unsigned short version_major, version_minor;
>> + unsigned long host_version = RMI_ABI_VERSION(RMI_ABI_MAJOR_VERSION,
>> + RMI_ABI_MINOR_VERSION);
>> + unsigned long aa64pfr0 = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1);
>> +
>> + /* If RME isn't supported, then RMI can't be */
>> + if (cpuid_feature_extract_unsigned_field(aa64pfr0, ID_AA64PFR0_EL1_RME_SHIFT) == 0)
>> + return -ENXIO;
>> +
>> + arm_smccc_1_1_invoke(SMC_RMI_VERSION, host_version, &res);
>> +
>> + if (res.a0 == SMCCC_RET_NOT_SUPPORTED)
>> + return -ENXIO;
>> +
>> + version_major = RMI_ABI_VERSION_GET_MAJOR(res.a1);
>> + version_minor = RMI_ABI_VERSION_GET_MINOR(res.a1);
>> +
>> + if (res.a0 != RMI_SUCCESS) {
>> + unsigned short high_version_major, high_version_minor;
>> +
>> + high_version_major = RMI_ABI_VERSION_GET_MAJOR(res.a2);
>> + high_version_minor = RMI_ABI_VERSION_GET_MINOR(res.a2);
>> +
>> + pr_err("Unsupported RMI ABI (v%d.%d - v%d.%d) we want v%d.%d\n",
>> + version_major, version_minor,
>> + high_version_major, high_version_minor,
>> + RMI_ABI_MAJOR_VERSION,
>> + RMI_ABI_MINOR_VERSION);
>> + return -ENXIO;
>> + }
>> +
>> + pr_info("RMI ABI version %d.%d\n", version_major, version_minor);
>> +
>> + return 0;
>> +}
>> +
>> +static int __init arm64_init_rmi(void)
>> +{
>> + /* Continue without realm support if we can't agree on a version */
>> + if (rmi_check_version())
>> + return 0;
>> +
>> + if (WARN_ON(rmi_features(0, &rmm_feat_reg0)))
>> + return 0;
>> + if (WARN_ON(rmi_features(1, &rmm_feat_reg1)))
>> + return 0;
>> +
>> + return 0;
>> +}
>> +subsys_initcall(arm64_init_rmi);
>
> Is there any reliance on this being executed before or after KVM's own
> initialisation? If so, this should be captured.
Yes I'm expecting this to be called before KVM's initialisation.
kvm_init_rmi() calls rmi_is_available() to check if CCA is supported and
only enables the KVM side of things if that check passes. So if the
initialisation was the other way round then Realm guests would be
unsupported. I'll add a comment
/*
* Note arm64_init_rmi() must be called before kvm_init_rmi() otherwise KVM
* will not support realm guests. subsys_initcall() is called before
* module_init() (used for KVM) so this is OK.
*/
Thanks,
Steve
^ permalink raw reply
* Re: [PATCH v14 06/44] arm64: RMI: Check for RMI support at init
From: Steven Price @ 2026-06-03 10:57 UTC (permalink / raw)
To: Gavin Shan, kvm, kvmarm
Cc: Catalin Marinas, Marc Zyngier, Will Deacon, James Morse,
Oliver Upton, Suzuki K Poulose, Zenghui Yu, linux-arm-kernel,
linux-kernel, Joey Gouly, Alexandru Elisei, Christoffer Dall,
Fuad Tabba, linux-coco, Ganapatrao Kulkarni, Shanker Donthineni,
Alper Gun, Aneesh Kumar K . V, Emi Kisanuki, Vishal Annapurve,
WeiLin.Chang, Lorenzo.Pieralisi2
In-Reply-To: <3a0f6277-2b68-45db-a07f-16a177b0586d@redhat.com>
On 25/05/2026 07:58, Gavin Shan wrote:
> Hi Steve,
>
> On 5/22/26 1:49 AM, Steven Price wrote:
>> On 21/05/2026 01:39, Gavin Shan wrote:
>>> On 5/13/26 11:17 PM, Steven Price wrote:
>>>> Query the RMI version number and check if it is a compatible version.
>>>> The first two feature registers are read and exposed for future code to
>>>> use.
>>>>
>>>> Signed-off-by: Steven Price <steven.price@arm.com>
>>>> ---
>>>> v14:
>>>> * This moves the basic RMI setup into the 'kernel' directory.
>>>> This is
>>>> because RMI will be used for some features outside of KVM so
>>>> should
>>>> be available even if KVM isn't compiled in.
>>>> ---
>>>> arch/arm64/include/asm/rmi_cmds.h | 3 ++
>>>> arch/arm64/kernel/Makefile | 2 +-
>>>> arch/arm64/kernel/cpufeature.c | 1 +
>>>> arch/arm64/kernel/rmi.c | 65 ++++++++++++++++++++++++++
>>>> +++++
>>>> 4 files changed, 70 insertions(+), 1 deletion(-)
>>>> create mode 100644 arch/arm64/kernel/rmi.c
>>>>
>>>
>>> [...]
>>>
>>>> diff --git a/arch/arm64/kernel/rmi.c b/arch/arm64/kernel/rmi.c
>>>> new file mode 100644
>>>> index 000000000000..99c1ccc35c11
>>>> --- /dev/null
>>>> +++ b/arch/arm64/kernel/rmi.c
>>>> @@ -0,0 +1,65 @@
>>>> +// SPDX-License-Identifier: GPL-2.0
>>>> +/*
>>>> + * Copyright (C) 2023-2025 ARM Ltd.
>>>> + */
>>>> +
>>>> +#include <linux/memblock.h>
>>>> +
>>>> +#include <asm/rmi_cmds.h>
>>>> +
>>>> +unsigned long rmm_feat_reg0;
>>>> +unsigned long rmm_feat_reg1;
>>>> +
>>>> +static int rmi_check_version(void)
>>>> +{
>>>> + struct arm_smccc_res res;
>>>> + unsigned short version_major, version_minor;
>>>> + unsigned long host_version =
>>>> RMI_ABI_VERSION(RMI_ABI_MAJOR_VERSION,
>>>> + RMI_ABI_MINOR_VERSION);
>>>> + unsigned long aa64pfr0 =
>>>> read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1);
>>>> +
>>>> + /* If RME isn't supported, then RMI can't be */
>>>> + if (cpuid_feature_extract_unsigned_field(aa64pfr0,
>>>> ID_AA64PFR0_EL1_RME_SHIFT) == 0)
>>>> + return -ENXIO;
>>>> +
>>>> + arm_smccc_1_1_invoke(SMC_RMI_VERSION, host_version, &res);
>>>> +
>>>> + if (res.a0 == SMCCC_RET_NOT_SUPPORTED)
>>>> + return -ENXIO;
>>>> +
>>>> + version_major = RMI_ABI_VERSION_GET_MAJOR(res.a1);
>>>> + version_minor = RMI_ABI_VERSION_GET_MINOR(res.a1);
>>>> +
>>>> + if (res.a0 != RMI_SUCCESS) {
>>>> + unsigned short high_version_major, high_version_minor;
>>>> +
>>>> + high_version_major = RMI_ABI_VERSION_GET_MAJOR(res.a2);
>>>> + high_version_minor = RMI_ABI_VERSION_GET_MINOR(res.a2);
>>>> +
>>>> + pr_err("Unsupported RMI ABI (v%d.%d - v%d.%d) we want v%d.
>>>> %d\n",
>>>> + version_major, version_minor,
>>>> + high_version_major, high_version_minor,
>>>> + RMI_ABI_MAJOR_VERSION,
>>>> + RMI_ABI_MINOR_VERSION);
>>>> + return -ENXIO;
>>>> + }
>>>> +
>>>> + pr_info("RMI ABI version %d.%d\n", version_major, version_minor);
>>>> +
>>>> + return 0;
>>>> +}
>>>> +
>>>> +static int __init arm64_init_rmi(void)
>>>> +{
>>>> + /* Continue without realm support if we can't agree on a
>>>> version */
>>>> + if (rmi_check_version())
>>>> + return 0;
>>>
>>> Is this still a valid point that we have to return zero on errors
>>> returned
>>> from rmi_check_version() or other function calls like
>>> rmi_features()?
>>> arm64_init_rmi() is triggered by subsys_initcall() where the return
>>> value
>>> needs to indicate success or failure. It's fine to return error code
>>> from
>>> arm64_init_rmi() in the path.
>>
>> Hmm, I guess now this is moved to arm64 code this indeed doesn't need
>> to. Within a module I believe an error return can fail the module
>> loading.
>>
>> I'm not sure it really makes much difference though - if this
>> initialisation fails then it's not really an error - it just means the
>> feature is unavailable.
>>
>
> I think the return value would be consistent to the value of
> 'arm64_rmi_is_available'.
> 'arm64_rmi_is_available' is true when zero is returned, otherwise,
> 'arm64_rmi_is_available'
> is false.
>
> With the consistency between the return value and
> 'arm64_rmi_is_available', users are
> able to know the value of 'arm64_rmi_is_available' through kernel
> parameter 'initcall_debug'.
> With the kernel parameter, the initcalls including arm64_init_rmi() are
> traced and its
> return value is outputted in the traced messages, seeing
> do_trace_initcall_start().
Fair enough, and actually refactoring this function to pass error codes
up the call stack I think does improve the look.
Thanks,
Steve
>> Thanks,
>> Steve
>>
>>>> +
>>>> + if (WARN_ON(rmi_features(0, &rmm_feat_reg0)))
>>>> + return 0;
>>>> + if (WARN_ON(rmi_features(1, &rmm_feat_reg1)))
>>>> + return 0;
>>>> +
>>>> + return 0;
>>>> +}
>>>> +subsys_initcall(arm64_init_rmi);
>>>
>
> Thanks,
> Gavin
>
^ permalink raw reply
* Re: [PATCH v14 04/44] arm64: RMI: Add SMC definitions for calling the RMM
From: Steven Price @ 2026-06-03 10:15 UTC (permalink / raw)
To: Marc Zyngier
Cc: kvm, kvmarm, Catalin Marinas, Will Deacon, James Morse,
Oliver Upton, Suzuki K Poulose, Zenghui Yu, linux-arm-kernel,
linux-kernel, Joey Gouly, Alexandru Elisei, Christoffer Dall,
Fuad Tabba, linux-coco, Ganapatrao Kulkarni, Gavin Shan,
Shanker Donthineni, Alper Gun, Aneesh Kumar K . V, Emi Kisanuki,
Vishal Annapurve, WeiLin.Chang, Lorenzo.Pieralisi2
In-Reply-To: <87jysvahpb.wl-maz@kernel.org>
On 22/05/2026 10:58, Marc Zyngier wrote:
> On Thu, 21 May 2026 16:33:09 +0100,
> Steven Price <steven.price@arm.com> wrote:
>>
>> On 21/05/2026 13:40, Marc Zyngier wrote:
>>> On Wed, 13 May 2026 14:17:12 +0100,
>>> Steven Price <steven.price@arm.com> wrote:
>>>>
>>>> The RMM (Realm Management Monitor) provides functionality that can be
>>>> accessed by SMC calls from the host.
>>>>
>>>> The SMC definitions are based on DEN0137[1] version 2.0-bet1
>>>>
>>>> [1] https://developer.arm.com/documentation/den0137/2-0bet1/
>>>>
>>>> Signed-off-by: Steven Price <steven.price@arm.com>
>>>> ---
>>>> Changes since v13:
>>>> * Updated to RMM spec v2.0-bet1
>>>> Changes since v12:
>>>> * Updated to RMM spec v2.0-bet0
>>>> Changes since v9:
>>>> * Corrected size of 'ripas_value' in struct rec_exit. The spec states
>>>> this is an 8-bit type with padding afterwards (rather than a u64).
>>>> Changes since v8:
>>>> * Added RMI_PERMITTED_GICV3_HCR_BITS to define which bits the RMM
>>>> permits to be modified.
>>>> Changes since v6:
>>>> * Renamed REC_ENTER_xxx defines to include 'FLAG' to make it obvious
>>>> these are flag values.
>>>> Changes since v5:
>>>> * Sorted the SMC #defines by value.
>>>> * Renamed SMI_RxI_CALL to SMI_RMI_CALL since the macro is only used for
>>>> RMI calls.
>>>> * Renamed REC_GIC_NUM_LRS to REC_MAX_GIC_NUM_LRS since the actual
>>>> number of available list registers could be lower.
>>>> * Provided a define for the reserved fields of FeatureRegister0.
>>>> * Fix inconsistent names for padding fields.
>>>> Changes since v4:
>>>> * Update to point to final released RMM spec.
>>>> * Minor rearrangements.
>>>> Changes since v3:
>>>> * Update to match RMM spec v1.0-rel0-rc1.
>>>> Changes since v2:
>>>> * Fix specification link.
>>>> * Rename rec_entry->rec_enter to match spec.
>>>> * Fix size of pmu_ovf_status to match spec.
>>>> ---
>>>> arch/arm64/include/asm/rmi_smc.h | 448 +++++++++++++++++++++++++++++++
>>>> 1 file changed, 448 insertions(+)
>>>> create mode 100644 arch/arm64/include/asm/rmi_smc.h
>>>>
>>>> diff --git a/arch/arm64/include/asm/rmi_smc.h b/arch/arm64/include/asm/rmi_smc.h
>>>> new file mode 100644
>>>> index 000000000000..a09b7a631fef
>>>> --- /dev/null
>>>> +++ b/arch/arm64/include/asm/rmi_smc.h
>>>> @@ -0,0 +1,448 @@
>>>> +/* SPDX-License-Identifier: GPL-2.0 */
>>>> +/*
>>>> + * Copyright (C) 2023-2026 ARM Ltd.
>>>> + *
>>>> + * The values and structures in this file are from the Realm Management Monitor
>>>> + * specification (DEN0137) version 2.0-bet1:
>>>> + * https://developer.arm.com/documentation/den0137/2-0bet1/
>>>
>>> How long is this spec going to be available on the ARM web site, which
>>> has a tendency of being reorganised every other week? And there is
>>> already a beta2.
>>
>> Obviously I can't predict the next reorganisation - but at least it's a
>> link that could be fed into archive.org or similar.
>
> I found that the PDF spec was less susceptible to creative nonsense,
> and people can download it for future reference, whereas ARM has
> happily *deleted* specs from the website over time (try to find PSCI
> 0.1, for example...).
Sadly the nearest I found to a link directly to the PDF is:
https://documentation-service.arm.com/static/69cb945ac1586b7c59b1c00c
But I have 0 confidence that that link will work for long (if indeed it
even works for others now!). If you know of any way of getting a better
link out of the Arm website then I'm all ears!
> [...]
>
>>>> +struct realm_params {
>>>> + union { /* 0x0 */
>>>> + struct {
>>>> + u64 flags;
>>>> + u64 s2sz;
>>>> + u64 sve_vl;
>>>> + u64 num_bps;
>>>> + u64 num_wps;
>>>> + u64 pmu_num_ctrs;
>>>> + u64 hash_algo;
>>>> + u64 num_aux_planes;
>>>> + };
>>>> + u8 padding0[0x400];
>>>
>>> SZ_1K? And similarly all over the shop?
>>
>> I'm a bit less sure that makes the code more readable - these structures
>> are a bit of a pain because they are somewhat sparse. I've left a
>> comment where the beginning of each union is, and personally I find it
>> easier to see 0x0 + 0x400 == 0x400 rather than trying to work out what
>> SZ_1K is in hex. This is particularly the case in terms of:
>>
>>> struct rec_params {
>>> union { /* 0x0 */
>>> u64 flags;
>>> u8 padding0[0x100];
>>> };
>>> union { /* 0x100 */
>>> u64 mpidr;
>>> u8 padding1[0x100];
>>> };
>>> union { /* 0x200 */
>>> u64 pc;
>>> u8 padding2[0x100];
>>> };
>>> union { /* 0x300 */
>>> u64 gprs[REC_CREATE_NR_GPRS];
>>> u8 padding3[0xd00];
>>> };
>>> };
>>
>> Where 0xd00 doesn't even have a corresponding SZ_ define.
>
> Indeed, but it is (SZ_4K - SZ_256 * 3).
Do you really think
u8 padding3[SZ_4K - SZ_256 * 3];
is better? I certainly don't. I'll give you (SZ_4K - 0x300) is tempting.
Although it then makes the BUILD_BUG_ON idea below somewhat pointless.
> And a lot of these structures seem to be designed to form a 4kB blob.
> I'm sure we can make use of that information (BUILD_BUG_ON?).
BUILD_BUG_ON requires being in a function. But static_assert() can be
used in the header by the struct definitions - I'll add that, thanks for
the suggestion.
>>
>> The RMM deals with this with macro magic:
>>
>>> struct rmi_rec_params {
>>> /* Flags */
>>> SET_MEMBER_RMI(unsigned long flags, 0, 0x100); /* Offset 0 */
>>> /* MPIDR of the REC */
>>> SET_MEMBER_RMI(unsigned long mpidr, 0x100, 0x200); /* 0x100 */
>>> /* Program counter */
>>> SET_MEMBER_RMI(unsigned long pc, 0x200, 0x300); /* 0x200 */
>>> /* General-purpose registers */
>>> SET_MEMBER_RMI(unsigned long gprs[REC_CREATE_NR_GPRS], 0x300, 0x1000); /* 0x300 */
>>> };
>>
>> where the offsets are just directly encoded in the macro - but it's not
>> an especially robust macro and I'm not convinced it's more readable.
>
> I think this is just as horrible, but at least it seems to take the
> boundaries of the structure into account.
>
>>
>> I'm happy to hear other suggestions on how to encode this neatly.
>
> Honestly, I wouldn't mind having the structures described in a more
> abstract way and then pre-processed to generate the include files. If
> the architectural MRS wasn't so huge, I would have added it to the
> kernel and used that directly for KVM.
>
>>
>>> I haven't checked the details of the encodings (life is too short),
>>> but I wonder how much of this exists as an MRS and could be
>>> automatically generated?
>>
>> Automatically generating this would be good - I'm not sure whether we
>> have a (public) source available to generate from at the moment. I have
>> tried to methodically work through the spec when updating this file, but
>> as Gavin has already pointed out there was at least one mistake (in
>> currently unused definitions) this time.
>
> I'm slightly baffled that even the RMM is written this way. Given the
> formalism used in the RMM spec, I was expecting that you'd have a
> bunch of JSON at hand and able to generate any output from that. Doing
> this stuff by hand is both incredibly dull work *and* extremely error
> prone.
I'll look into the possibility of generating the headers. While dull and
error prone I have found it is sometimes useful for forcing a review of
the spec itself. There have been a number of bugs I've found (and have
been corrected) in the spec while writing the header files - it's very
easy to skim read those parts of the document otherwise.
Writing the structures out in a "more abstract way" might be a good
idea, but I'm just a little wary of writing another tool which is only
used in this one spot. The RMM structures are somewhat unusual in being
so sparse.
Thanks,
Steve
> Thanks,
>
> M.
>
^ permalink raw reply
* Re: [PATCH v4 07/47] x86/tdx: Force TSC frequency with CPUID-based info provided by the TDX-Module
From: Kiryl Shutsemau @ 2026-06-03 10:02 UTC (permalink / raw)
To: Sean Christopherson
Cc: Paolo Bonzini, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
Dave Hansen, x86, K. Y. Srinivasan, Haiyang Zhang, Wei Liu,
Dexuan Cui, Long Li, Ajay Kaher, Alexey Makhalov, Jan Kiszka,
Andy Lutomirski, Peter Zijlstra, Juergen Gross, Daniel Lezcano,
John Stultz, H. Peter Anvin, Rick Edgecombe, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, kvm, linux-kernel, linux-coco, linux-hyperv,
virtualization, xen-devel, David Woodhouse, Tom Lendacky,
Nikunj A Dadhania, David Woodhouse, Michael Kelley,
Thomas Gleixner
In-Reply-To: <20260529144435.704127-8-seanjc@google.com>
On Fri, May 29, 2026 at 07:43:54AM -0700, Sean Christopherson wrote:
> When running as a TDX guest, explicitly set the TSC frequency to a known
> value, using CPUID-based information, instead of potentially relying on a
> hypervisor-controlled PV routine. For TDX guests, CPUID.0x15 is always
> emulated by the TDX-Module, i.e. the information from CPUID is more
> trustworthy than the information provided by the hypervisor.
Right. EBX is configurable by TD_PARAMS.TSC_FREQUENCY at TD build. The
rest is fixed.
> To maintain backwards compatibility with TDX guest kernels that use native
> calibration, and because it's the least awful option, retain
> native_calibrate_tsc()'s stuffing of the local APIC bus period using the
> core crystal frequency. While it's entirely possible for the hypervisor
> to emulate the APIC timer at a different frequency than the core crystal
> frequency, the commonly accepted interpretation of Intel's SDM is that APIC
> timer runs at the core crystal frequency when the latter is enumerated via
> CPUID:
>
> The APIC timer frequency will be the processor’s bus clock or core
> crystal clock frequency (when TSC/core crystal clock ratio is enumerated
> in CPUID leaf 0x15).
>
> If the hypervisor is malicious and deliberately runs the APIC timer at the
> wrong frequency, nothing would stop the hypervisor from modifying the
> frequency at any time, i.e. attempting to manually calibrate the frequency
> out of paranoia would be futile.
Agreed.
> Deliberately leave CPU frequency calibration as is, since the TDX-Module
> doesn't provide any guarantees with respect to CPUID.0x16.
It is fixed to zeros. Sounds like a guarantee to me :P
> Signed-off-by: Sean Christopherson <seanjc@google.com>
Looks sane to me. Including your reasoning about tsc_early_khz= in reply
to Sashiko.
Reviewed-by: Kiryl Shutsemau (Meta) <kas@kernel.org>
--
Kiryl Shutsemau / Kirill A. Shutemov
^ permalink raw reply
* Re: [PATCH v7 07/42] KVM: guest_memfd: Only prepare folios for private pages
From: Suzuki K Poulose @ 2026-06-03 8:58 UTC (permalink / raw)
To: Ackerley Tng, aik, andrew.jones, binbin.wu, brauner, chao.p.peng,
david, ira.weiny, jmattson, jthoughton, michael.roth, oupton,
pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
steven.price, tabba, willy, wyihan, yan.y.zhao, forkloop,
pratyush, aneesh.kumar, liam, Paolo Bonzini, Sean Christopherson,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
H. Peter Anvin, Steven Rostedt, Masami Hiramatsu,
Mathieu Desnoyers, Jonathan Corbet, Shuah Khan, Shuah Khan,
Vishal Annapurve, Andrew Morton, Chris Li, Kairui Song,
Kemeng Shi, Nhat Pham, Baoquan He, Barry Song, Axel Rasmussen,
Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng, Shakeel Butt,
Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka
Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
linux-mm, linux-coco
In-Reply-To: <CAEvNRgE1dCVAxJWd_hyFa8N=m9JLfn97ip9tAmvHxspWJ50oGg@mail.gmail.com>
On 02/06/2026 23:41, Ackerley Tng wrote:
> Suzuki K Poulose <suzuki.poulose@arm.com> writes:
>
>>
>> [...snip...]
>>
>>>> @@ -914,7 +916,8 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct
>>>> kvm_memory_slot *slot,
>>>> folio_mark_uptodate(folio);
>>>> }
>>>> - r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
>>>> + if (kvm_gmem_is_private_mem(inode, index))
>>>
>>> Don't we need to make sure the entire folio is private ? Not just the
>>> page at the index ?
>>> if (kvm_gmem_range_is_private(, index, folio_nr_pages(folio)) ?
>
> I was thinking to fix this when I do huge pages, for now guest_memfd is
> always just PAGE_SIZE, so just looking up index is fine.
>
> Is that okay?
That's fine, but would be good to enforce that here, so that we don't
miss out when we add support for multi page folios.
>
>>
>> Or rather, we should go through the individual pages and apply the
>> prepare for ones that are private ?
>>
>> Suzuki
>>
>
> IIRC the plan was to make kvm_gmem_prepare_folio() idempotent, as in, if
> a page is already private, just skip. Currently sev_gmem_prepare() does
> a pr_debug(), which I guess is technically still idempotent.
>
> I'm thinking that the information that needs tracking to make
> .gmem_prepare() idempotent should be tracked by arch code.
>
> Does this work for ARM CCA?
We don't hook into the prepare yet, but have plans to do that. We should
be able to handle the pages that are already private. (For CCA context,
RMI_GRANULE_DELEGATE_RANGE can skip over already REALM pages). So this
should be fine.
My point is, in a given folio, there may be pages that are shared.
Like you said, this could be dealt with when we support hugepages.
Suzuki
>
>>>
>>> [...snip...]
>>>
^ permalink raw reply
* Re: [PATCH v5 05/20] dma-pool: track decrypted atomic pools and select them via attrs
From: Jason Gunthorpe @ 2026-06-03 0:54 UTC (permalink / raw)
To: Michael Kelley
Cc: Aneesh Kumar K.V, iommu@lists.linux.dev,
linux-arm-kernel@lists.infradead.org,
linux-kernel@vger.kernel.org, linux-coco@lists.linux.dev,
Robin Murphy, Marek Szyprowski, Will Deacon, Marc Zyngier,
Steven Price, Suzuki K Poulose, Catalin Marinas, Jiri Pirko,
Mostafa Saleh, Petr Tesarik, Alexey Kardashevskiy, Dan Williams,
Xu Yilun, linuxppc-dev@lists.ozlabs.org,
linux-s390@vger.kernel.org, Madhavan Srinivasan, Michael Ellerman,
Nicholas Piggin, Christophe Leroy (CS GROUP), Alexander Gordeev,
Gerald Schaefer, Heiko Carstens, Vasily Gorbik,
Christian Borntraeger, Sven Schnelle, x86@kernel.org, Jiri Pirko
In-Reply-To: <SN6PR02MB4157D9955A93244014AB7978D4122@SN6PR02MB4157.namprd02.prod.outlook.com>
On Tue, Jun 02, 2026 at 02:24:40PM +0000, Michael Kelley wrote:
> Except that in a normal VM, the "unencrypted" pool attribute does *not*
> describe the state of the memory itself. In a normal VM, the memory is
> unencrypted, but the "unencrypted" pool attribute is false. That
> contradiction is the essence of my concern.
I would argue no..
When CC is enabled the default state of memory in a Linux environment
is "encrypted". You have to take a special action to "decrypt" it.
Thus the default state of memory in a non-CC environment is also
paradoxically "encrypted" too. "decryption" is impossible.
Therefore the "unencrypted" state is a special state that only memory
inside a CC VM can have. A normal VM can never have "unencrypted"
memory at all, so having it be false in the pool is accurate as far as
the APIs go.
un-encrypted = true means "the memory in this pool was transformed with
set_memory_decrypted()" - which is impossible on a normal VM.
Jason
^ permalink raw reply
* Re: [PATCH v7 07/42] KVM: guest_memfd: Only prepare folios for private pages
From: Ackerley Tng @ 2026-06-02 22:41 UTC (permalink / raw)
To: Suzuki K Poulose, aik, andrew.jones, binbin.wu, brauner,
chao.p.peng, david, ira.weiny, jmattson, jthoughton, michael.roth,
oupton, pankaj.gupta, qperret, rick.p.edgecombe, rientjes,
shivankg, steven.price, tabba, willy, wyihan, yan.y.zhao,
forkloop, pratyush, aneesh.kumar, liam, Paolo Bonzini,
Sean Christopherson, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka
Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
linux-mm, linux-coco
In-Reply-To: <144bbb9f-39a2-4c90-8903-51521e022da0@arm.com>
Suzuki K Poulose <suzuki.poulose@arm.com> writes:
>
> [...snip...]
>
>>> @@ -914,7 +916,8 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct
>>> kvm_memory_slot *slot,
>>> folio_mark_uptodate(folio);
>>> }
>>> - r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
>>> + if (kvm_gmem_is_private_mem(inode, index))
>>
>> Don't we need to make sure the entire folio is private ? Not just the
>> page at the index ?
>> if (kvm_gmem_range_is_private(, index, folio_nr_pages(folio)) ?
I was thinking to fix this when I do huge pages, for now guest_memfd is
always just PAGE_SIZE, so just looking up index is fine.
Is that okay?
>
> Or rather, we should go through the individual pages and apply the
> prepare for ones that are private ?
>
> Suzuki
>
IIRC the plan was to make kvm_gmem_prepare_folio() idempotent, as in, if
a page is already private, just skip. Currently sev_gmem_prepare() does
a pr_debug(), which I guess is technically still idempotent.
I'm thinking that the information that needs tracking to make
.gmem_prepare() idempotent should be tracked by arch code.
Does this work for ARM CCA?
>>
>> [...snip...]
>>
^ permalink raw reply
* Re: [PATCH v7 34/42] KVM: selftests: Test conversion with elevated page refcount
From: Askar Safin @ 2026-06-02 21:26 UTC (permalink / raw)
To: devnull+ackerleytng.google.com
Cc: ackerleytng, aik, akpm, andrew.jones, aneesh.kumar, axelrasmussen,
baohua, bhe, binbin.wu, bp, brauner, chao.p.peng, chrisl, corbet,
dave.hansen, david, forkloop, hpa, ira.weiny, jgg, jmattson,
jthoughton, kas, kasong, kvm, liam, linux-coco, linux-doc,
linux-kernel, linux-kselftest, linux-mm, linux-trace-kernel,
mathieu.desnoyers, mhiramat, michael.roth, mingo, nphamcs, oupton,
pankaj.gupta, pbonzini, pratyush, qi.zheng, qperret,
rick.p.edgecombe, rientjes, rostedt, seanjc, shakeel.butt,
shikemeng, shivankg, shuah, skhan, steven.price, suzuki.poulose,
tabba, tglx, vannapurve, vbabka, weixugc, willy, wyihan, x86,
yan.y.zhao, youngjun.park, yuanchu
In-Reply-To: <20260522-gmem-inplace-conversion-v7-34-2f0fae496530@google.com>
Ackerley Tng via B4 Relay <devnull+ackerleytng.google.com@kernel.org>:
> This test uses vmsplice to increment the refcount of a specific page
I recently submitted a patch, which makes vmsplice equivalent to
preadv2/pwritev2, and it was accepted to next.
For now it is just an experiment, it is possible it will be reverted.
https://lore.kernel.org/all/20260601-aufweichen-dissens-ausrechnen-0d9b84728113@brauner/
--
Askar Safin
^ permalink raw reply
* Re: [PATCH v7 07/42] KVM: guest_memfd: Only prepare folios for private pages
From: Ackerley Tng @ 2026-06-02 20:46 UTC (permalink / raw)
To: Suzuki K Poulose, aik, andrew.jones, binbin.wu, brauner,
chao.p.peng, david, ira.weiny, jmattson, jthoughton, michael.roth,
oupton, pankaj.gupta, qperret, rick.p.edgecombe, rientjes,
shivankg, steven.price, tabba, willy, wyihan, yan.y.zhao,
forkloop, pratyush, aneesh.kumar, liam, Paolo Bonzini,
Sean Christopherson, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka
Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
linux-mm, linux-coco
In-Reply-To: <d01cf1ec-b85d-4af6-9810-8107c0e2a4ec@arm.com>
Suzuki K Poulose <suzuki.poulose@arm.com> writes:
> On 23/05/2026 01:17, Ackerley Tng via B4 Relay wrote:
>> From: Ackerley Tng <ackerleytng@google.com>
>>
>> All-shared guest_memfd used to be only supported for non-CoCo VMs where
>> preparation doesn't apply. INIT_SHARED is about to be supported for
>> non-CoCo VMs in a later patch in this series.
>
> nit: s/non-CoCo/CoCo ?
>
Yes, thanks!
>>
>> In addition, KVM_SET_MEMORY_ATTRIBUTES2 is about to be supported in
>> guest_memfd in a later patch in this series.
>>
>> This means that the kvm fault handler may now call kvm_gmem_get_pfn() on a
>> shared folio for a CoCo VM where preparation applies.
>>
>> Add a check to make sure that preparation is only performed for private
>> folios.
>>
>> Preparation will be undone on freeing (see kvm_gmem_free_folio()) and on
>> conversion to shared.
>>
>> Signed-off-by: Michael Roth <michael.roth@amd.com>
>
> nit: Missing Co-Developed-by: ?
>
IIRC this should have been
Suggested-by: Michael Roth <michael.roth@amd.com>
IIRC Michael suggested this on one of the guest_memfd calls, Michael
please let me know if you remember otherwise!
>>
>> [...snip...]
>>
^ permalink raw reply
* [PATCH v6 6/6] x86/sev: Add debugfs support for RMPOPT
From: Ashish Kalra @ 2026-06-02 20:02 UTC (permalink / raw)
To: tglx, mingo, bp, dave.hansen, x86, hpa, seanjc, peterz,
thomas.lendacky, herbert, davem, ardb
Cc: pbonzini, aik, Michael.Roth, KPrateek.Nayak, Tycho.Andersen,
Nathan.Fontenot, ackerleytng, jackyli, pgonda, rientjes, jacobhxu,
xin, pawan.kumar.gupta, babu.moger, dyoung, nikunj, john.allen,
darwi, linux-kernel, linux-crypto, kvm, linux-coco
In-Reply-To: <cover.1780427587.git.ashish.kalra@amd.com>
From: Ashish Kalra <ashish.kalra@amd.com>
Add a debugfs interface to report per-CPU RMPOPT status across all
system RAM.
To dump the per-CPU RMPOPT status for all system RAM:
/sys/kernel/debug/rmpopt# cat rmpopt-table
Memory @ 0GB: CPU(s): none
Memory @ 1GB: CPU(s): none
Memory @ 2GB: CPU(s): 0-1023
Memory @ 3GB: CPU(s): 0-1023
Memory @ 4GB: CPU(s): none
Memory @ 5GB: CPU(s): 0-1023
Memory @ 6GB: CPU(s): 0-1023
Memory @ 7GB: CPU(s): 0-1023
...
Memory @1025GB: CPU(s): 0-1023
Memory @1026GB: CPU(s): 0-1023
Memory @1027GB: CPU(s): 0-1023
Memory @1028GB: CPU(s): 0-1023
Memory @1029GB: CPU(s): 0-1023
Memory @1030GB: CPU(s): 0-1023
Memory @1031GB: CPU(s): 0-1023
Memory @1032GB: CPU(s): 0-1023
Memory @1033GB: CPU(s): 0-1023
Memory @1034GB: CPU(s): 0-1023
Memory @1035GB: CPU(s): 0-1023
Memory @1036GB: CPU(s): 0-1023
Memory @1037GB: CPU(s): 0-1023
Memory @1038GB: CPU(s): none
Suggested-by: Thomas Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Ashish Kalra <ashish.kalra@amd.com>
---
arch/x86/virt/svm/sev.c | 128 ++++++++++++++++++++++++++++++++++++++++
1 file changed, 128 insertions(+)
diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c
index 4442ecae3d18..29695bb18991 100644
--- a/arch/x86/virt/svm/sev.c
+++ b/arch/x86/virt/svm/sev.c
@@ -20,6 +20,8 @@
#include <linux/amd-iommu.h>
#include <linux/nospec.h>
#include <linux/workqueue.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
#include <asm/sev.h>
#include <asm/processor.h>
@@ -144,6 +146,15 @@ static DEFINE_SPINLOCK(snp_leaked_pages_list_lock);
static unsigned long snp_nr_leaked_pages;
+/* All users of rmpopt_report_cpumask must hold rmpopt_show_mutex. */
+static cpumask_t rmpopt_report_cpumask;
+static struct dentry *rmpopt_debugfs;
+static DEFINE_MUTEX(rmpopt_show_mutex);
+
+struct seq_paddr {
+ phys_addr_t next_seq_paddr;
+};
+
#undef pr_fmt
#define pr_fmt(fmt) "SEV-SNP: " fmt
@@ -585,6 +596,8 @@ static void rmpopt_cleanup(void)
cancel_delayed_work_sync(&rmpopt_delayed_work);
destroy_workqueue(rmpopt_wq);
+ debugfs_remove_recursive(rmpopt_debugfs);
+ rmpopt_debugfs = NULL;
cpus_read_lock();
@@ -622,6 +635,10 @@ static inline bool __rmpopt(u64 pa_start, u64 op_type)
: "a" (pa_start), "c" (op_type)
: "memory", "cc");
+ if (op_type == RMPOPT_FUNC_REPORT_STATUS)
+ assign_cpu(smp_processor_id(), &rmpopt_report_cpumask,
+ optimized);
+
return optimized;
}
@@ -641,6 +658,115 @@ static void rmpopt_smp(void *val)
rmpopt((u64)val);
}
+/*
+ * 'val' is a system physical address.
+ */
+static void rmpopt_report_status(void *val)
+{
+ u64 pa_start = ALIGN_DOWN((u64)val, SZ_1G);
+ u64 op_type = RMPOPT_FUNC_REPORT_STATUS;
+
+ __rmpopt(pa_start, op_type);
+}
+
+/*
+ * start() can be called multiple times if allocated buffer has overflowed
+ * and bigger buffer is allocated.
+ */
+static void *rmpopt_table_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ phys_addr_t end_paddr = rmpopt_pa_end;
+ struct seq_paddr *p = seq->private;
+
+ if (*pos == 0) {
+ p->next_seq_paddr = rmpopt_pa_start;
+ if (p->next_seq_paddr >= end_paddr)
+ return NULL;
+ return &p->next_seq_paddr;
+ }
+
+ if (p->next_seq_paddr >= end_paddr)
+ return NULL;
+
+ return &p->next_seq_paddr;
+}
+
+static void *rmpopt_table_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ phys_addr_t end_paddr = rmpopt_pa_end;
+ phys_addr_t *curr_paddr = v;
+
+ (*pos)++;
+ *curr_paddr += SZ_1G;
+ if (*curr_paddr >= end_paddr)
+ return NULL;
+
+ return curr_paddr;
+}
+
+static void rmpopt_table_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+static int rmpopt_table_seq_show(struct seq_file *seq, void *v)
+{
+ phys_addr_t *curr_paddr = v;
+
+ guard(mutex)(&rmpopt_show_mutex);
+
+ seq_printf(seq, "Memory @%3lluGB: ",
+ *curr_paddr >> (get_order(SZ_1G) + PAGE_SHIFT));
+
+ /*
+ * Query all online CPUs rather than just rmpopt_cpumask (primary
+ * threads only). The RMPOPT instruction only needs to run on one
+ * thread per core for the optimization to take effect, but debugfs
+ * reporting requires the RMPOPT status across all CPUs.
+ * Performance is not a concern for this diagnostic interface.
+ *
+ * This is safe because RMPOPT_BASE MSR is per-core and
+ * snp_prepare() ensures all CPUs are online when the MSR is
+ * programmed during snp_setup_rmpopt().
+ */
+ cpumask_clear(&rmpopt_report_cpumask);
+ on_each_cpu_mask(cpu_online_mask, rmpopt_report_status,
+ (void *)*curr_paddr, true);
+
+ if (cpumask_empty(&rmpopt_report_cpumask))
+ seq_puts(seq, "CPU(s): none\n");
+ else
+ seq_printf(seq, "CPU(s): %*pbl\n", cpumask_pr_args(&rmpopt_report_cpumask));
+
+ return 0;
+}
+
+static const struct seq_operations rmpopt_table_seq_ops = {
+ .start = rmpopt_table_seq_start,
+ .next = rmpopt_table_seq_next,
+ .stop = rmpopt_table_seq_stop,
+ .show = rmpopt_table_seq_show
+};
+
+static int rmpopt_table_open(struct inode *inode, struct file *file)
+{
+ return seq_open_private(file, &rmpopt_table_seq_ops, sizeof(struct seq_paddr));
+}
+
+static const struct file_operations rmpopt_table_fops = {
+ .open = rmpopt_table_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+static void rmpopt_debugfs_setup(void)
+{
+ rmpopt_debugfs = debugfs_create_dir("rmpopt", arch_debugfs_dir);
+
+ debugfs_create_file("rmpopt-table", 0400, rmpopt_debugfs,
+ NULL, &rmpopt_table_fops);
+}
+
/*
* RMPOPT optimizations skip RMP checks at 1GB granularity if this
* range of memory does not contain any SNP guest memory.
@@ -833,6 +959,8 @@ void snp_setup_rmpopt(void)
* optimizations on all physical memory.
*/
queue_delayed_work(rmpopt_wq, &rmpopt_delayed_work, 0);
+
+ rmpopt_debugfs_setup();
}
EXPORT_SYMBOL_FOR_MODULES(snp_setup_rmpopt, "ccp");
--
2.43.0
^ permalink raw reply related
* [PATCH v6 5/6] KVM: SEV: Perform RMP optimizations on SNP guest shutdown
From: Ashish Kalra @ 2026-06-02 20:02 UTC (permalink / raw)
To: tglx, mingo, bp, dave.hansen, x86, hpa, seanjc, peterz,
thomas.lendacky, herbert, davem, ardb
Cc: pbonzini, aik, Michael.Roth, KPrateek.Nayak, Tycho.Andersen,
Nathan.Fontenot, ackerleytng, jackyli, pgonda, rientjes, jacobhxu,
xin, pawan.kumar.gupta, babu.moger, dyoung, nikunj, john.allen,
darwi, linux-kernel, linux-crypto, kvm, linux-coco
In-Reply-To: <cover.1780427587.git.ashish.kalra@amd.com>
From: Ashish Kalra <ashish.kalra@amd.com>
Pages are converted from shared to private as SNP guests are launched.
This destroys existing RMPOPT optimizations in the regions where
pages are converted.
Conversely, guest pages are converted back to shared during SNP guest
termination and their region may become eligible for RMPOPT
optimization.
To take advantage of this, perform RMPOPT after guest termination.
Do it after a delay so that a single RMPOPT pass can be done if
multiple guests terminate in a short period of time.
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Ackerley Tng <ackerleytng@google.com>
Signed-off-by: Ashish Kalra <ashish.kalra@amd.com>
---
arch/x86/kvm/svm/sev.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index e107f368ed2d..29af6f6e603c 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -3005,6 +3005,8 @@ void sev_vm_destroy(struct kvm *kvm)
*/
if (snp_decommission_context(kvm))
return;
+
+ snp_rmpopt_all_physmem();
} else {
sev_unbind_asid(kvm, sev->handle);
}
--
2.43.0
^ permalink raw reply related
* [PATCH v6 4/6] x86/sev: Add interface to re-enable RMP optimizations.
From: Ashish Kalra @ 2026-06-02 20:01 UTC (permalink / raw)
To: tglx, mingo, bp, dave.hansen, x86, hpa, seanjc, peterz,
thomas.lendacky, herbert, davem, ardb
Cc: pbonzini, aik, Michael.Roth, KPrateek.Nayak, Tycho.Andersen,
Nathan.Fontenot, ackerleytng, jackyli, pgonda, rientjes, jacobhxu,
xin, pawan.kumar.gupta, babu.moger, dyoung, nikunj, john.allen,
darwi, linux-kernel, linux-crypto, kvm, linux-coco
In-Reply-To: <cover.1780427587.git.ashish.kalra@amd.com>
From: Ashish Kalra <ashish.kalra@amd.com>
RMPOPT table is a per-CPU table which indicates if 1GB regions of
physical memory are entirely hypervisor-owned or not.
When performing host memory accesses in hypervisor mode as well as
non-SNP guest mode, the processor may consult the RMPOPT table to
potentially skip an RMP access and improve performance.
Events such as RMPUPDATE can clear RMP optimizations. Add an interface
to re-enable those optimizations.
Reviewed-by: Ackerley Tng <ackerleytng@google.com>
Signed-off-by: Ashish Kalra <ashish.kalra@amd.com>
---
arch/x86/include/asm/sev.h | 2 ++
arch/x86/virt/svm/sev.c | 15 +++++++++++++++
2 files changed, 17 insertions(+)
diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h
index 6fd72a44a51e..09b1c5d33790 100644
--- a/arch/x86/include/asm/sev.h
+++ b/arch/x86/include/asm/sev.h
@@ -662,6 +662,7 @@ static inline void snp_leak_pages(u64 pfn, unsigned int pages)
__snp_leak_pages(pfn, pages, true);
}
int snp_prepare(void);
+void snp_rmpopt_all_physmem(void);
void snp_setup_rmpopt(void);
void snp_shutdown(void);
#else
@@ -681,6 +682,7 @@ static inline void snp_leak_pages(u64 pfn, unsigned int npages) {}
static inline void kdump_sev_callback(void) { }
static inline void snp_fixup_e820_tables(void) {}
static inline int snp_prepare(void) { return -ENODEV; }
+static inline void snp_rmpopt_all_physmem(void) {}
static inline void snp_setup_rmpopt(void) {}
static inline void snp_shutdown(void) {}
#endif
diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c
index d7e40a5fe5ca..4442ecae3d18 100644
--- a/arch/x86/virt/svm/sev.c
+++ b/arch/x86/virt/svm/sev.c
@@ -741,6 +741,21 @@ static void rmpopt_work_handler(struct work_struct *work)
free_cpumask_var(follower_mask);
}
+void snp_rmpopt_all_physmem(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_RMPOPT))
+ return;
+
+ guard(mutex)(&rmpopt_wq_mutex);
+
+ if (!rmpopt_wq)
+ return;
+
+ queue_delayed_work(rmpopt_wq, &rmpopt_delayed_work,
+ msecs_to_jiffies(RMPOPT_WORK_TIMEOUT));
+}
+EXPORT_SYMBOL_GPL(snp_rmpopt_all_physmem);
+
void snp_setup_rmpopt(void)
{
u64 rmpopt_base;
--
2.43.0
^ permalink raw reply related
* [PATCH v6 3/6] x86/sev: Add support to perform RMP optimizations asynchronously
From: Ashish Kalra @ 2026-06-02 20:01 UTC (permalink / raw)
To: tglx, mingo, bp, dave.hansen, x86, hpa, seanjc, peterz,
thomas.lendacky, herbert, davem, ardb
Cc: pbonzini, aik, Michael.Roth, KPrateek.Nayak, Tycho.Andersen,
Nathan.Fontenot, ackerleytng, jackyli, pgonda, rientjes, jacobhxu,
xin, pawan.kumar.gupta, babu.moger, dyoung, nikunj, john.allen,
darwi, linux-kernel, linux-crypto, kvm, linux-coco
In-Reply-To: <cover.1780427587.git.ashish.kalra@amd.com>
From: Ashish Kalra <ashish.kalra@amd.com>
When SEV-SNP is enabled, all writes to memory are checked to ensure
integrity of SNP guest memory. This imposes performance overhead on the
whole system.
RMPOPT is a new instruction that minimizes the performance overhead of
RMP checks on the hypervisor and on non-SNP guests by allowing RMP
checks to be skipped for 1GB regions of memory that are known not to
contain any SEV-SNP guest memory.
Add support for performing RMP optimizations asynchronously using a
dedicated workqueue.
Enable RMPOPT optimizations for up to 2TB of system RAM starting from
the lowest physical memory address aligned down to a 1GB boundary at
RMP initialization time. RMP checks can initially be skipped for 1GB
memory ranges that do not contain SEV-SNP guest memory (excluding
preassigned pages such as the RMP table and firmware pages). As SNP
guests are launched, RMPUPDATE will disable the corresponding RMPOPT
optimizations.
Suggested-by: Thomas Lendacky <thomas.lendacky@amd.com>
Suggested-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Ackerley Tng <ackerleytng@google.com>
Signed-off-by: Ashish Kalra <ashish.kalra@amd.com>
---
arch/x86/virt/svm/sev.c | 196 +++++++++++++++++++++++++++++++++++++++-
1 file changed, 193 insertions(+), 3 deletions(-)
diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c
index 089c9a14edc7..d7e40a5fe5ca 100644
--- a/arch/x86/virt/svm/sev.c
+++ b/arch/x86/virt/svm/sev.c
@@ -19,6 +19,7 @@
#include <linux/iommu.h>
#include <linux/amd-iommu.h>
#include <linux/nospec.h>
+#include <linux/workqueue.h>
#include <asm/sev.h>
#include <asm/processor.h>
@@ -125,7 +126,18 @@ static void *rmp_bookkeeping __ro_after_init;
static u64 probed_rmp_base, probed_rmp_size;
static cpumask_t rmpopt_cpumask;
-static phys_addr_t rmpopt_pa_start;
+static phys_addr_t rmpopt_pa_start, rmpopt_pa_end;
+
+enum rmpopt_function {
+ RMPOPT_FUNC_VERIFY_AND_REPORT_STATUS,
+ RMPOPT_FUNC_REPORT_STATUS
+};
+
+#define RMPOPT_WORK_TIMEOUT 10000
+
+static struct workqueue_struct *rmpopt_wq;
+static struct delayed_work rmpopt_delayed_work;
+static DEFINE_MUTEX(rmpopt_wq_mutex);
static LIST_HEAD(snp_leaked_pages_list);
static DEFINE_SPINLOCK(snp_leaked_pages_list_lock);
@@ -566,6 +578,14 @@ static void rmpopt_cleanup(void)
{
int cpu;
+ guard(mutex)(&rmpopt_wq_mutex);
+
+ if (!rmpopt_wq)
+ return;
+
+ cancel_delayed_work_sync(&rmpopt_delayed_work);
+ destroy_workqueue(rmpopt_wq);
+
cpus_read_lock();
for_each_cpu(cpu, &rmpopt_cpumask)
@@ -574,7 +594,8 @@ static void rmpopt_cleanup(void)
cpus_read_unlock();
cpumask_clear(&rmpopt_cpumask);
- rmpopt_pa_start = 0;
+ rmpopt_pa_start = rmpopt_pa_end = 0;
+ rmpopt_wq = NULL;
}
void snp_shutdown(void)
@@ -592,6 +613,134 @@ void snp_shutdown(void)
}
EXPORT_SYMBOL_FOR_MODULES(snp_shutdown, "ccp");
+static inline bool __rmpopt(u64 pa_start, u64 op_type)
+{
+ bool optimized;
+
+ asm volatile(".byte 0xf2, 0x0f, 0x01, 0xfc"
+ : "=@ccc" (optimized)
+ : "a" (pa_start), "c" (op_type)
+ : "memory", "cc");
+
+ return optimized;
+}
+
+static void rmpopt(u64 pa)
+{
+ u64 pa_start = ALIGN_DOWN(pa, SZ_1G);
+ u64 op_type = RMPOPT_FUNC_VERIFY_AND_REPORT_STATUS;
+
+ __rmpopt(pa_start, op_type);
+}
+
+/*
+ * 'val' is a system physical address.
+ */
+static void rmpopt_smp(void *val)
+{
+ rmpopt((u64)val);
+}
+
+/*
+ * RMPOPT optimizations skip RMP checks at 1GB granularity if this
+ * range of memory does not contain any SNP guest memory.
+ */
+static void rmpopt_work_handler(struct work_struct *work)
+{
+ cpumask_var_t follower_mask;
+ phys_addr_t pa;
+ int this_cpu;
+
+ pr_info("Attempt RMP optimizations on physical address range @1GB alignment [0x%016llx - 0x%016llx]\n",
+ rmpopt_pa_start, rmpopt_pa_end);
+
+ if (!alloc_cpumask_var(&follower_mask, GFP_KERNEL))
+ return;
+
+ /*
+ * RMPOPT scans the RMP table and stores the result of the scan in
+ * reserved processor memory. The RMP scan is the most expensive
+ * part. If a second RMPOPT occurs, it can skip the expensive scan
+ * if it sees a cached result in the reserved processor memory.
+ *
+ * Do RMPOPT on one CPU alone. Then, follow that up with RMPOPT
+ * on every other primary thread. Followers are "designed to"
+ * skip the scan if they see the "cached" scan results.
+ */
+ cpumask_copy(follower_mask, &rmpopt_cpumask);
+
+ /*
+ * Pin the worker to the current CPU for the leader loop so that
+ * this_cpu remains valid and the RMPOPT instruction executes on
+ * the correct CPU.
+ *
+ * Use migrate_disable() rather than get_cpu() to prevent
+ * migration while still allowing preemption.
+ */
+ migrate_disable();
+ this_cpu = smp_processor_id();
+
+ if (cpumask_test_cpu(this_cpu, follower_mask)) {
+ /*
+ * Current CPU is a primary thread in rmpopt_cpumask.
+ * Run leader locally and remove from follower mask.
+ */
+ cpumask_clear_cpu(this_cpu, follower_mask);
+
+ for (pa = rmpopt_pa_start; pa < rmpopt_pa_end; pa += SZ_1G)
+ rmpopt(pa);
+ } else if (cpumask_intersects(topology_sibling_cpumask(this_cpu),
+ follower_mask)) {
+ /*
+ * Current CPU is a sibling thread whose primary is in
+ * rmpopt_cpumask. RMPOPT_BASE MSR is per-core, so it
+ * is safe to run the leader locally. Remove the sibling's
+ * primary from the follower mask as this core is already
+ * covered by the leader.
+ */
+ cpumask_andnot(follower_mask, follower_mask,
+ topology_sibling_cpumask(this_cpu));
+
+ for (pa = rmpopt_pa_start; pa < rmpopt_pa_end; pa += SZ_1G)
+ rmpopt(pa);
+ } else {
+ /*
+ * Current CPU does not have RMPOPT_BASE MSR programmed.
+ * Pick an explicit leader from the cpumask to avoid #UD.
+ */
+ int leader_cpu = cpumask_first(follower_mask);
+
+ if (WARN_ON_ONCE(leader_cpu >= nr_cpu_ids)) {
+ migrate_enable();
+ goto out;
+ }
+
+ cpumask_clear_cpu(leader_cpu, follower_mask);
+
+ cpus_read_lock();
+ for (pa = rmpopt_pa_start; pa < rmpopt_pa_end; pa += SZ_1G)
+ smp_call_function_single(leader_cpu, rmpopt_smp,
+ (void *)pa, true);
+ cpus_read_unlock();
+ }
+
+ migrate_enable();
+
+ /* Followers: run RMPOPT on remaining cores */
+ cpus_read_lock();
+ for (pa = rmpopt_pa_start; pa < rmpopt_pa_end; pa += SZ_1G) {
+ on_each_cpu_mask(follower_mask, rmpopt_smp,
+ (void *)pa, true);
+
+ /* Give a chance for other threads to run */
+ cond_resched();
+ }
+ cpus_read_unlock();
+
+out:
+ free_cpumask_var(follower_mask);
+}
+
void snp_setup_rmpopt(void)
{
u64 rmpopt_base;
@@ -600,11 +749,35 @@ void snp_setup_rmpopt(void)
if (!cpu_feature_enabled(X86_FEATURE_RMPOPT))
return;
+ guard(mutex)(&rmpopt_wq_mutex);
+
+ /*
+ * Guard against re-initialization. When SNP_SHUTDOWN_EX is issued
+ * with x86_snp_shutdown=0, snp_shutdown() is not called and
+ * rmpopt_cleanup() is skipped, but snp_initialized is still cleared.
+ * A subsequent __sev_snp_init_locked() would call snp_setup_rmpopt()
+ * again, leaking the existing workqueue, delayed work, debugfs
+ * entries, and cpumask state.
+ */
+ if (rmpopt_wq)
+ return;
+
+ /*
+ * Create an RMPOPT-specific workqueue to avoid scheduling
+ * RMPOPT workitem on the global system workqueue.
+ */
+ rmpopt_wq = alloc_workqueue("rmpopt_wq", WQ_UNBOUND, 1);
+ if (!rmpopt_wq) {
+ pr_err("Failed to allocate RMPOPT workqueue\n");
+ return;
+ }
+
cpus_read_lock();
/*
* The RMPOPT_BASE MSR is per-core, so only one thread per core needs
- * to set up the RMPOPT_BASE MSR.
+ * to set up the RMPOPT_BASE MSR. Likewise, only one thread per core
+ * needs to issue the RMPOPT instruction.
*
* Note: only online primary threads are included. If a core's
* primary thread is offline, that core is not covered. CPU hotplug
@@ -628,6 +801,23 @@ void snp_setup_rmpopt(void)
wrmsrq_on_cpu(cpu, MSR_AMD64_RMPOPT_BASE, rmpopt_base);
cpus_read_unlock();
+
+ INIT_DELAYED_WORK(&rmpopt_delayed_work, rmpopt_work_handler);
+
+ rmpopt_pa_end = ALIGN(PFN_PHYS(max_pfn), SZ_1G);
+
+ /* Limit memory scanning to 2TB of RAM */
+ if ((rmpopt_pa_end - rmpopt_pa_start) > SZ_2T) {
+ pr_info("RMPOPT coverage limited to 2TB; memory above 0x%llx not optimized\n",
+ rmpopt_pa_start + SZ_2T);
+ rmpopt_pa_end = rmpopt_pa_start + SZ_2T;
+ }
+
+ /*
+ * Once all per-CPU RMPOPT tables have been configured, queue the work
+ * that enables RMPOPT optimizations on [rmpopt_pa_start, rmpopt_pa_end).
+ */
+ queue_delayed_work(rmpopt_wq, &rmpopt_delayed_work, 0);
}
EXPORT_SYMBOL_FOR_MODULES(snp_setup_rmpopt, "ccp");
--
2.43.0
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox