* Re: [PATCH v4 04/13] dma: swiotlb: track pool encryption state and honor DMA_ATTR_CC_SHARED
From: Alexey Kardashevskiy @ 2026-05-18 8:19 UTC (permalink / raw)
To: Aneesh Kumar K.V (Arm), iommu, linux-arm-kernel, linux-kernel,
linux-coco
Cc: Robin Murphy, Marek Szyprowski, Will Deacon, Marc Zyngier,
Steven Price, Suzuki K Poulose, Catalin Marinas, Jiri Pirko,
Jason Gunthorpe, Mostafa Saleh, Petr Tesarik, Dan Williams,
Xu Yilun, linuxppc-dev, linux-s390, Madhavan Srinivasan,
Michael Ellerman, Nicholas Piggin, Christophe Leroy (CS GROUP),
Alexander Gordeev, Gerald Schaefer, Heiko Carstens, Vasily Gorbik,
Christian Borntraeger, Sven Schnelle, x86
In-Reply-To: <20260512090408.794195-5-aneesh.kumar@kernel.org>
On 12/5/26 19:03, Aneesh Kumar K.V (Arm) wrote:
> Teach swiotlb to distinguish between encrypted and decrypted bounce
> buffer pools, and make allocation and mapping paths select a pool whose
> state matches the requested DMA attributes.
>
> Add a decrypted flag to io_tlb_mem, initialize it for the default and
> restricted pools, and propagate DMA_ATTR_CC_SHARED into swiotlb pool
> allocation. Reject swiotlb alloc/map requests when the selected pool does
> not match the required encrypted/decrypted state.
>
> Also return DMA addresses with the matching phys_to_dma_{encrypted,
> unencrypted} helper so the DMA address encoding stays consistent with the
> chosen pool.
>
> Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
> ---
> include/linux/dma-direct.h | 10 ++++
> include/linux/swiotlb.h | 8 ++-
> kernel/dma/direct.c | 14 +++--
> kernel/dma/swiotlb.c | 108 +++++++++++++++++++++++++++----------
> 4 files changed, 107 insertions(+), 33 deletions(-)
>
> diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
> index c249912456f9..94fad4e7c11e 100644
> --- a/include/linux/dma-direct.h
> +++ b/include/linux/dma-direct.h
> @@ -77,6 +77,10 @@ static inline dma_addr_t dma_range_map_max(const struct bus_dma_region *map)
> #ifndef phys_to_dma_unencrypted
> #define phys_to_dma_unencrypted phys_to_dma
> #endif
> +
> +#ifndef phys_to_dma_encrypted
> +#define phys_to_dma_encrypted phys_to_dma
> +#endif
> #else
> static inline dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t paddr)
> {
> @@ -90,6 +94,12 @@ static inline dma_addr_t phys_to_dma_unencrypted(struct device *dev,
> {
> return dma_addr_unencrypted(__phys_to_dma(dev, paddr));
> }
> +
> +static inline dma_addr_t phys_to_dma_encrypted(struct device *dev,
> + phys_addr_t paddr)
> +{
> + return dma_addr_encrypted(__phys_to_dma(dev, paddr));
> +}
> /*
> * If memory encryption is supported, phys_to_dma will set the memory encryption
> * bit in the DMA address, and dma_to_phys will clear it.
> diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
> index 3dae0f592063..b3fa3c6e0169 100644
> --- a/include/linux/swiotlb.h
> +++ b/include/linux/swiotlb.h
> @@ -81,6 +81,7 @@ struct io_tlb_pool {
> struct list_head node;
> struct rcu_head rcu;
> bool transient;
> + bool unencrypted;
> #endif
> };
>
> @@ -111,6 +112,7 @@ struct io_tlb_mem {
> struct dentry *debugfs;
> bool force_bounce;
> bool for_alloc;
> + bool unencrypted;
> #ifdef CONFIG_SWIOTLB_DYNAMIC
> bool can_grow;
> u64 phys_limit;
> @@ -282,7 +284,8 @@ static inline void swiotlb_sync_single_for_cpu(struct device *dev,
> extern void swiotlb_print_info(void);
>
> #ifdef CONFIG_DMA_RESTRICTED_POOL
> -struct page *swiotlb_alloc(struct device *dev, size_t size);
> +struct page *swiotlb_alloc(struct device *dev, size_t size,
> + unsigned long attrs);
> bool swiotlb_free(struct device *dev, struct page *page, size_t size);
>
> static inline bool is_swiotlb_for_alloc(struct device *dev)
> @@ -290,7 +293,8 @@ static inline bool is_swiotlb_for_alloc(struct device *dev)
> return dev->dma_io_tlb_mem->for_alloc;
> }
> #else
> -static inline struct page *swiotlb_alloc(struct device *dev, size_t size)
> +static inline struct page *swiotlb_alloc(struct device *dev, size_t size,
> + unsigned long attrs)
> {
> return NULL;
> }
> diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
> index dc2907439b3d..97ae4fa10521 100644
> --- a/kernel/dma/direct.c
> +++ b/kernel/dma/direct.c
> @@ -104,9 +104,10 @@ static void __dma_direct_free_pages(struct device *dev, struct page *page,
> dma_free_contiguous(dev, page, size);
> }
>
> -static struct page *dma_direct_alloc_swiotlb(struct device *dev, size_t size)
> +static struct page *dma_direct_alloc_swiotlb(struct device *dev, size_t size,
> + unsigned long attrs)
> {
> - struct page *page = swiotlb_alloc(dev, size);
> + struct page *page = swiotlb_alloc(dev, size, attrs);
>
> if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
> swiotlb_free(dev, page, size);
> @@ -266,8 +267,12 @@ void *dma_direct_alloc(struct device *dev, size_t size,
> gfp, attrs);
>
> if (is_swiotlb_for_alloc(dev)) {
here we know it is shared so ...
> - page = dma_direct_alloc_swiotlb(dev, size);
> + page = dma_direct_alloc_swiotlb(dev, size, attrs);
> if (page) {
> + /*
> + * swiotlb allocations comes from pool already marked
> + * decrypted
> + */
... is not this needed here
attrs |= DMA_ATTR_CC_SHARED;
?
and then the setup_page label below can do the right thing, which I tried, with enforcing io_tlb_default_mem.for_alloc=1, it works - accepted device can still do DMA to shared memory. Thanks,
> mark_mem_decrypt = false;
> goto setup_page;
> }
> @@ -374,6 +379,7 @@ void dma_direct_free(struct device *dev, size_t size,
> return;
>
> if (swiotlb_find_pool(dev, dma_to_phys(dev, dma_addr)))
> + /* Swiotlb doesn't need a page attribute update on free */
> mark_mem_encrypted = false;
>
> if (is_vmalloc_addr(cpu_addr)) {
> @@ -403,7 +409,7 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
> gfp, attrs);
>
> if (is_swiotlb_for_alloc(dev)) {
> - page = dma_direct_alloc_swiotlb(dev, size);
> + page = dma_direct_alloc_swiotlb(dev, size, attrs);
> if (!page)
> return NULL;
>
> diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
> index ab4eccbaa076..065663be282c 100644
> --- a/kernel/dma/swiotlb.c
> +++ b/kernel/dma/swiotlb.c
> @@ -259,10 +259,21 @@ void __init swiotlb_update_mem_attributes(void)
> struct io_tlb_pool *mem = &io_tlb_default_mem.defpool;
> unsigned long bytes;
>
> + /*
> + * if platform support memory encryption, swiotlb buffers are
> + * decrypted by default.
> + */
> + if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
> + io_tlb_default_mem.unencrypted = true;
> + else
> + io_tlb_default_mem.unencrypted = false;
> +
> if (!mem->nslabs || mem->late_alloc)
> return;
> bytes = PAGE_ALIGN(mem->nslabs << IO_TLB_SHIFT);
> - set_memory_decrypted((unsigned long)mem->vaddr, bytes >> PAGE_SHIFT);
> +
> + if (io_tlb_default_mem.unencrypted)
> + set_memory_decrypted((unsigned long)mem->vaddr, bytes >> PAGE_SHIFT);
> }
>
> static void swiotlb_init_io_tlb_pool(struct io_tlb_pool *mem, phys_addr_t start,
> @@ -505,8 +516,10 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask,
> if (!mem->slots)
> goto error_slots;
>
> - set_memory_decrypted((unsigned long)vstart,
> - (nslabs << IO_TLB_SHIFT) >> PAGE_SHIFT);
> + if (io_tlb_default_mem.unencrypted)
> + set_memory_decrypted((unsigned long)vstart,
> + (nslabs << IO_TLB_SHIFT) >> PAGE_SHIFT);
> +
> swiotlb_init_io_tlb_pool(mem, virt_to_phys(vstart), nslabs, true,
> nareas);
> add_mem_pool(&io_tlb_default_mem, mem);
> @@ -539,7 +552,9 @@ void __init swiotlb_exit(void)
> tbl_size = PAGE_ALIGN(mem->end - mem->start);
> slots_size = PAGE_ALIGN(array_size(sizeof(*mem->slots), mem->nslabs));
>
> - set_memory_encrypted(tbl_vaddr, tbl_size >> PAGE_SHIFT);
> + if (io_tlb_default_mem.unencrypted)
> + set_memory_encrypted(tbl_vaddr, tbl_size >> PAGE_SHIFT);
> +
> if (mem->late_alloc) {
> area_order = get_order(array_size(sizeof(*mem->areas),
> mem->nareas));
> @@ -563,6 +578,7 @@ void __init swiotlb_exit(void)
> * @gfp: GFP flags for the allocation.
> * @bytes: Size of the buffer.
> * @phys_limit: Maximum allowed physical address of the buffer.
> + * @unencrypted: true to allocate unencrypted memory, false for encrypted memory
> *
> * Allocate pages from the buddy allocator. If successful, make the allocated
> * pages decrypted that they can be used for DMA.
> @@ -570,7 +586,8 @@ void __init swiotlb_exit(void)
> * Return: Decrypted pages, %NULL on allocation failure, or ERR_PTR(-EAGAIN)
> * if the allocated physical address was above @phys_limit.
> */
> -static struct page *alloc_dma_pages(gfp_t gfp, size_t bytes, u64 phys_limit)
> +static struct page *alloc_dma_pages(gfp_t gfp, size_t bytes,
> + u64 phys_limit, bool unencrypted)
> {
> unsigned int order = get_order(bytes);
> struct page *page;
> @@ -588,13 +605,13 @@ static struct page *alloc_dma_pages(gfp_t gfp, size_t bytes, u64 phys_limit)
> }
>
> vaddr = phys_to_virt(paddr);
> - if (set_memory_decrypted((unsigned long)vaddr, PFN_UP(bytes)))
> + if (unencrypted && set_memory_decrypted((unsigned long)vaddr, PFN_UP(bytes)))
> goto error;
> return page;
>
> error:
> /* Intentional leak if pages cannot be encrypted again. */
> - if (!set_memory_encrypted((unsigned long)vaddr, PFN_UP(bytes)))
> + if (unencrypted && !set_memory_encrypted((unsigned long)vaddr, PFN_UP(bytes)))
> __free_pages(page, order);
> return NULL;
> }
> @@ -604,30 +621,26 @@ static struct page *alloc_dma_pages(gfp_t gfp, size_t bytes, u64 phys_limit)
> * @dev: Device for which a memory pool is allocated.
> * @bytes: Size of the buffer.
> * @phys_limit: Maximum allowed physical address of the buffer.
> + * @attrs: DMA attributes for the allocation.
> * @gfp: GFP flags for the allocation.
> *
> * Return: Allocated pages, or %NULL on allocation failure.
> */
> static struct page *swiotlb_alloc_tlb(struct device *dev, size_t bytes,
> - u64 phys_limit, gfp_t gfp)
> + u64 phys_limit, unsigned long attrs, gfp_t gfp)
> {
> struct page *page;
> - unsigned long attrs = 0;
>
> /*
> * Allocate from the atomic pools if memory is encrypted and
> * the allocation is atomic, because decrypting may block.
> */
> - if (!gfpflags_allow_blocking(gfp) && dev && force_dma_unencrypted(dev)) {
> + if (!gfpflags_allow_blocking(gfp) && (attrs & DMA_ATTR_CC_SHARED)) {
> void *vaddr;
>
> if (!IS_ENABLED(CONFIG_DMA_COHERENT_POOL))
> return NULL;
>
> - /* swiotlb considered decrypted by default */
> - if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
> - attrs = DMA_ATTR_CC_SHARED;
> -
> return dma_alloc_from_pool(dev, bytes, &vaddr, gfp,
> attrs, dma_coherent_ok);
> }
> @@ -638,7 +651,8 @@ static struct page *swiotlb_alloc_tlb(struct device *dev, size_t bytes,
> else if (phys_limit <= DMA_BIT_MASK(32))
> gfp |= __GFP_DMA32;
>
> - while (IS_ERR(page = alloc_dma_pages(gfp, bytes, phys_limit))) {
> + while (IS_ERR(page = alloc_dma_pages(gfp, bytes, phys_limit,
> + !!(attrs & DMA_ATTR_CC_SHARED)))) {
> if (IS_ENABLED(CONFIG_ZONE_DMA32) &&
> phys_limit < DMA_BIT_MASK(64) &&
> !(gfp & (__GFP_DMA32 | __GFP_DMA)))
> @@ -657,15 +671,18 @@ static struct page *swiotlb_alloc_tlb(struct device *dev, size_t bytes,
> * swiotlb_free_tlb() - free a dynamically allocated IO TLB buffer
> * @vaddr: Virtual address of the buffer.
> * @bytes: Size of the buffer.
> + * @unencrypted: true if @vaddr was allocated decrypted and must be
> + * re-encrypted before being freed
> */
> -static void swiotlb_free_tlb(void *vaddr, size_t bytes)
> +static void swiotlb_free_tlb(void *vaddr, size_t bytes, bool unencrypted)
> {
> if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
> dma_free_from_pool(NULL, vaddr, bytes))
> return;
>
> /* Intentional leak if pages cannot be encrypted again. */
> - if (!set_memory_encrypted((unsigned long)vaddr, PFN_UP(bytes)))
> + if (!unencrypted ||
> + !set_memory_encrypted((unsigned long)vaddr, PFN_UP(bytes)))
> __free_pages(virt_to_page(vaddr), get_order(bytes));
> }
>
> @@ -676,6 +693,7 @@ static void swiotlb_free_tlb(void *vaddr, size_t bytes)
> * @nslabs: Desired (maximum) number of slabs.
> * @nareas: Number of areas.
> * @phys_limit: Maximum DMA buffer physical address.
> + * @attrs: DMA attributes for the allocation.
> * @gfp: GFP flags for the allocations.
> *
> * Allocate and initialize a new IO TLB memory pool. The actual number of
> @@ -686,7 +704,8 @@ static void swiotlb_free_tlb(void *vaddr, size_t bytes)
> */
> static struct io_tlb_pool *swiotlb_alloc_pool(struct device *dev,
> unsigned long minslabs, unsigned long nslabs,
> - unsigned int nareas, u64 phys_limit, gfp_t gfp)
> + unsigned int nareas, u64 phys_limit, unsigned long attrs,
> + gfp_t gfp)
> {
> struct io_tlb_pool *pool;
> unsigned int slot_order;
> @@ -704,9 +723,10 @@ static struct io_tlb_pool *swiotlb_alloc_pool(struct device *dev,
> if (!pool)
> goto error;
> pool->areas = (void *)pool + sizeof(*pool);
> + pool->unencrypted = !!(attrs & DMA_ATTR_CC_SHARED);
>
> tlb_size = nslabs << IO_TLB_SHIFT;
> - while (!(tlb = swiotlb_alloc_tlb(dev, tlb_size, phys_limit, gfp))) {
> + while (!(tlb = swiotlb_alloc_tlb(dev, tlb_size, phys_limit, attrs, gfp))) {
> if (nslabs <= minslabs)
> goto error_tlb;
> nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE);
> @@ -724,7 +744,8 @@ static struct io_tlb_pool *swiotlb_alloc_pool(struct device *dev,
> return pool;
>
> error_slots:
> - swiotlb_free_tlb(page_address(tlb), tlb_size);
> + swiotlb_free_tlb(page_address(tlb), tlb_size,
> + !!(attrs & DMA_ATTR_CC_SHARED));
> error_tlb:
> kfree(pool);
> error:
> @@ -742,7 +763,9 @@ static void swiotlb_dyn_alloc(struct work_struct *work)
> struct io_tlb_pool *pool;
>
> pool = swiotlb_alloc_pool(NULL, IO_TLB_MIN_SLABS, default_nslabs,
> - default_nareas, mem->phys_limit, GFP_KERNEL);
> + default_nareas, mem->phys_limit,
> + mem->unencrypted ? DMA_ATTR_CC_SHARED : 0,
> + GFP_KERNEL);
> if (!pool) {
> pr_warn_ratelimited("Failed to allocate new pool");
> return;
> @@ -762,7 +785,7 @@ static void swiotlb_dyn_free(struct rcu_head *rcu)
> size_t tlb_size = pool->end - pool->start;
>
> free_pages((unsigned long)pool->slots, get_order(slots_size));
> - swiotlb_free_tlb(pool->vaddr, tlb_size);
> + swiotlb_free_tlb(pool->vaddr, tlb_size, pool->unencrypted);
> kfree(pool);
> }
>
> @@ -1232,6 +1255,7 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
> nslabs = nr_slots(alloc_size);
> phys_limit = min_not_zero(*dev->dma_mask, dev->bus_dma_limit);
> pool = swiotlb_alloc_pool(dev, nslabs, nslabs, 1, phys_limit,
> + mem->unencrypted ? DMA_ATTR_CC_SHARED : 0,
> GFP_NOWAIT);
> if (!pool)
> return -1;
> @@ -1394,6 +1418,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
> enum dma_data_direction dir, unsigned long attrs)
> {
> struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
> + bool require_decrypted = false;
> unsigned int offset;
> struct io_tlb_pool *pool;
> unsigned int i;
> @@ -1411,6 +1436,16 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
> if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
> pr_warn_once("Memory encryption is active and system is using DMA bounce buffers\n");
>
> + /*
> + * if we are trying to swiotlb map a decrypted paddr or the paddr is encrypted
> + * but the device is forcing decryption, use decrypted io_tlb_mem
> + */
> + if ((attrs & DMA_ATTR_CC_SHARED) || force_dma_unencrypted(dev))
> + require_decrypted = true;
> +
> + if (require_decrypted != mem->unencrypted)
> + return (phys_addr_t)DMA_MAPPING_ERROR;
> +
> /*
> * The default swiotlb memory pool is allocated with PAGE_SIZE
> * alignment. If a mapping is requested with larger alignment,
> @@ -1608,8 +1643,14 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size,
> if (swiotlb_addr == (phys_addr_t)DMA_MAPPING_ERROR)
> return DMA_MAPPING_ERROR;
>
> - /* Ensure that the address returned is DMA'ble */
> - dma_addr = phys_to_dma_unencrypted(dev, swiotlb_addr);
> + /*
> + * Use the allocated io_tlb_mem encryption type to determine dma addr.
> + */
> + if (dev->dma_io_tlb_mem->unencrypted)
> + dma_addr = phys_to_dma_unencrypted(dev, swiotlb_addr);
> + else
> + dma_addr = phys_to_dma_encrypted(dev, swiotlb_addr);
> +
> if (unlikely(!dma_capable(dev, dma_addr, size, true))) {
> __swiotlb_tbl_unmap_single(dev, swiotlb_addr, size, dir,
> attrs | DMA_ATTR_SKIP_CPU_SYNC,
> @@ -1773,7 +1814,8 @@ static inline void swiotlb_create_debugfs_files(struct io_tlb_mem *mem,
>
> #ifdef CONFIG_DMA_RESTRICTED_POOL
>
> -struct page *swiotlb_alloc(struct device *dev, size_t size)
> +struct page *swiotlb_alloc(struct device *dev, size_t size,
> + unsigned long attrs)
> {
> struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
> struct io_tlb_pool *pool;
> @@ -1784,6 +1826,9 @@ struct page *swiotlb_alloc(struct device *dev, size_t size)
> if (!mem)
> return NULL;
>
> + if (mem->unencrypted != !!(attrs & DMA_ATTR_CC_SHARED))
> + return NULL;
> +
> align = (1 << (get_order(size) + PAGE_SHIFT)) - 1;
> index = swiotlb_find_slots(dev, 0, size, align, &pool);
> if (index == -1)
> @@ -1853,9 +1898,18 @@ static int rmem_swiotlb_device_init(struct reserved_mem *rmem,
> kfree(mem);
> return -ENOMEM;
> }
> + /*
> + * if platform supports memory encryption,
> + * restricted mem pool is decrypted by default
> + */
> + if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) {
> + mem->unencrypted = true;
> + set_memory_decrypted((unsigned long)phys_to_virt(rmem->base),
> + rmem->size >> PAGE_SHIFT);
> + } else {
> + mem->unencrypted = false;
> + }
>
> - set_memory_decrypted((unsigned long)phys_to_virt(rmem->base),
> - rmem->size >> PAGE_SHIFT);
> swiotlb_init_io_tlb_pool(pool, rmem->base, nslabs,
> false, nareas);
> mem->force_bounce = true;
--
Alexey
^ permalink raw reply
* Re: [PATCH v9 02/23] x86/virt/tdx: Move TDX_FEATURES0 bits to asm/tdx.h
From: Chao Gao @ 2026-05-18 7:52 UTC (permalink / raw)
To: Dave Hansen
Cc: kvm, linux-coco, linux-kernel, binbin.wu, dave.hansen, djbw,
ira.weiny, kai.huang, kas, nik.borisov, paulmck, pbonzini,
reinette.chatre, rick.p.edgecombe, sagis, seanjc, tony.lindgren,
vannapurve, vishal.l.verma, yilun.xu, xiaoyao.li, yan.y.zhao,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, x86,
H. Peter Anvin
In-Reply-To: <c7d64460-d03c-4198-9320-8afceafcecb5@intel.com>
On Fri, May 15, 2026 at 09:15:47AM -0700, Dave Hansen wrote:
>On 5/13/26 08:09, Chao Gao wrote:
>> This prepares for TDX module update [1] and Dynamic PAMT [2] support. Both
>> add new TDX_FEATURES0 capability bits, and both need those capabilities to
>> be queried from code outside arch/x86/virt. The corresponding feature-query
>> helpers therefore need to live in the public asm/tdx.h header, so move the
>> existing bit definitions there first.
>
>Please don't add unnecessary changelog cruft. If you need this move for
>this series, that's enough.
Sure. Will remove "Dynamic PAMT" stuff from the changelog.
^ permalink raw reply
* Re: [PATCH v9 01/23] x86/virt/tdx: Consolidate TDX global initialization states
From: Chao Gao @ 2026-05-18 7:43 UTC (permalink / raw)
To: Dave Hansen
Cc: kvm, linux-coco, linux-kernel, binbin.wu, dave.hansen, djbw,
ira.weiny, kai.huang, kas, nik.borisov, paulmck, pbonzini,
reinette.chatre, rick.p.edgecombe, sagis, seanjc, tony.lindgren,
vannapurve, vishal.l.verma, yilun.xu, xiaoyao.li, yan.y.zhao,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, x86,
H. Peter Anvin
In-Reply-To: <ab35e6e4-b57a-4d49-8521-ca69a9135d3c@intel.com>
On Fri, May 15, 2026 at 09:14:02AM -0700, Dave Hansen wrote:
>On 5/13/26 08:09, Chao Gao wrote:
>...
>> Group the states into a single structure so they can be reset together, for
>> example with memset(), and so a newly added state won't be missed.
>...
>> +struct tdx_module_state {
>> + bool initialized;
>> + bool sysinit_done;
>> + int sysinit_ret;
>> +};
>...
>> @@ -113,30 +119,28 @@ static int try_init_module_global(void)
>> {
>> struct tdx_module_args args = {};
>> static DEFINE_RAW_SPINLOCK(sysinit_lock);
>> - static bool sysinit_done;
>> - static int sysinit_ret;
>
>This doesn't look right to me.
>
>> raw_spin_lock(&sysinit_lock);
>>
>> - if (sysinit_done)
>> + if (tdx_module_state.sysinit_done)
>> goto out;
>>
>> /* RCX is module attributes and all bits are reserved */
>> args.rcx = 0;
>> - sysinit_ret = seamcall_prerr(TDH_SYS_INIT, &args);
>> + tdx_module_state.sysinit_ret = seamcall_prerr(TDH_SYS_INIT, &args);
>>
>> /*
>> * The first SEAMCALL also detects the TDX module, thus
>> * it can fail due to the TDX module is not loaded.
>> * Dump message to let the user know.
>> */
>> - if (sysinit_ret == -ENODEV)
>> + if (tdx_module_state.sysinit_ret == -ENODEV)
>> pr_err("module not loaded\n");
>>
>> - sysinit_done = true;
>> + tdx_module_state.sysinit_done = true;
>> out:
>> raw_spin_unlock(&sysinit_lock);
>> - return sysinit_ret;
>> + return tdx_module_state.sysinit_ret;
>> }
>
>But I think it's because 'sysinit_ret' is really a funky thing. It's
>just here so that the module only _tries_ to be initialized once. That
>one-time init records its error code in sysinit_ret and then secondary
>callers pick it up.
>
>Here's how you do that in a non-confusing way (some context chopped out):
>
>static int try_init_module_global(void)
>{
> struct tdx_module_args args = {};
> static DEFINE_RAW_SPINLOCK(sysinit_lock);
> int ret;
>
> raw_spin_lock(&sysinit_lock);
>
> /* Return the "cached" return code: */
> if (tdx_module_state.sysinit_done) {
> ret = tdx_module_state.sysinit_ret;
> goto out;
> }
>
> ret = seamcall_prerr(TDH_SYS_INIT, &args);
>
> /* Save the return code for later callers: */
> tdx_module_state.sysinit_ret = ret;
> tdx_module_state.sysinit_done = true;
>out:
> raw_spin_unlock(&sysinit_lock);
> return ret;
>}
>
>See how it sets the module state in _one_ place? It also only touches
>the module state under the lock so it's more obvious that it is correct
>and there are no races or tearing or other nonsense.
>
>*That* is a proper refactoring.
Yes, that is a clear readability improvement.
To follow the "one change per patch" rule, I am inclined to do that as a
separate preparatory cleanup before moving all states into a structure.
commit 26bb389c5762fd6a496fbed1cc55e4978e99a5cb
Author: Chao Gao <chao.gao@intel.com>
Date: Sun May 17 20:03:00 2026 -0700
x86/virt/tdx: Clarify try_init_module_global() result caching
TDX module global initialization is executed only once. The first call
caches both the result and the "done" state, and later callers reuse the
saved result. A lock protects that cached state.
The current code is harder to read because sysinit_done is accessed under
the lock, while sysinit_ret is not.
To improve readability, move sysinit_ret accesses within the lock.
Group sysinit_ret/sysinit_done updates right after initialization so
Caching the result is separate from the initialization itself.
Signed-off-by: Chao Gao <chao.gao@intel.com>
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index c0c6281b08a5..ad56f142dd0b 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -115,28 +115,34 @@ static int try_init_module_global(void)
static DEFINE_RAW_SPINLOCK(sysinit_lock);
static bool sysinit_done;
static int sysinit_ret;
+ int ret;
raw_spin_lock(&sysinit_lock);
- if (sysinit_done)
+ /* Return the "cached" return code. */
+ if (sysinit_done) {
+ ret = sysinit_ret;
goto out;
+ }
/* RCX is module attributes and all bits are reserved */
args.rcx = 0;
- sysinit_ret = seamcall_prerr(TDH_SYS_INIT, &args);
+ ret = seamcall_prerr(TDH_SYS_INIT, &args);
/*
* The first SEAMCALL also detects the TDX module, thus
* it can fail due to the TDX module is not loaded.
* Dump message to let the user know.
*/
- if (sysinit_ret == -ENODEV)
+ if (ret == -ENODEV)
pr_err("module not loaded\n");
+ /* Save the return code for later callers. */
sysinit_done = true;
+ sysinit_ret = ret;
out:
raw_spin_unlock(&sysinit_lock);
- return sysinit_ret;
+ return ret;
}
/**
^ permalink raw reply related
* Re: [PATCH v2 03/15] KVM: x86/xen: Don't truncate RAX when handling hypercall from protected guest
From: David Woodhouse @ 2026-05-18 7:15 UTC (permalink / raw)
To: Binbin Wu, Sean Christopherson
Cc: Paolo Bonzini, Vitaly Kuznetsov, Kiryl Shutsemau, Paul Durrant,
Dave Hansen, Rick Edgecombe, kvm, x86, linux-coco, linux-kernel,
Yosry Ahmed, Kai Huang
In-Reply-To: <52fdc61a-60e8-4547-8ff7-f249b4d667b9@linux.intel.com>
[-- Attachment #1: Type: text/plain, Size: 1015 bytes --]
On Mon, 2026-05-18 at 10:19 +0800, Binbin Wu wrote:
>
> > > > longmode = is_64_bit_hypercall(vcpu);
> > >
> > > Is the variable name misleading?
> >
> > It most definitely is. However, @longmode is passed around quite a few locations
> > in xen.c, and so I don't want to opportunistically fix this one variable. Though
> > I'm definitely not opposed to a separate patch to rename them all to is_64bit or
> > something.
>
> OK, I can do it.
This one (as shown above) is clearly indicating whether this particular
vCPU is in 64-bit mode for this particular hypercall. Changing that to
is_64bit makes sense.
However, there is a separate overall mode for the VM, which is stored
in 'kvm->arch.xen.long_mode' and accessed by userspace using the
KVM_XEN_ATTR_TYPE_LONG_MODE attribute. It affects the datatypes used by
shared memory data structures, and is also latched by the kernel when
the guest writes the MSR for the hypercall page. That one should
probably keep its name.
[-- Attachment #2: smime.p7s --]
[-- Type: application/pkcs7-signature, Size: 5069 bytes --]
^ permalink raw reply
* Re: [PATCH v14 04/44] arm64: RMI: Add SMC definitions for calling the RMM
From: Gavin Shan @ 2026-05-18 7:08 UTC (permalink / raw)
To: Steven Price, kvm, kvmarm
Cc: Catalin Marinas, Marc Zyngier, Will Deacon, James Morse,
Oliver Upton, Suzuki K Poulose, Zenghui Yu, linux-arm-kernel,
linux-kernel, Joey Gouly, Alexandru Elisei, Christoffer Dall,
Fuad Tabba, linux-coco, Ganapatrao Kulkarni, Shanker Donthineni,
Alper Gun, Aneesh Kumar K . V, Emi Kisanuki, Vishal Annapurve,
WeiLin.Chang, Lorenzo.Pieralisi2
In-Reply-To: <20260513131757.116630-5-steven.price@arm.com>
Hi Steven,
On 5/13/26 11:17 PM, Steven Price wrote:
> The RMM (Realm Management Monitor) provides functionality that can be
> accessed by SMC calls from the host.
>
> The SMC definitions are based on DEN0137[1] version 2.0-bet1
>
> [1] https://developer.arm.com/documentation/den0137/2-0bet1/
>
> Signed-off-by: Steven Price <steven.price@arm.com>
> ---
> Changes since v13:
> * Updated to RMM spec v2.0-bet1
> Changes since v12:
> * Updated to RMM spec v2.0-bet0
> Changes since v9:
> * Corrected size of 'ripas_value' in struct rec_exit. The spec states
> this is an 8-bit type with padding afterwards (rather than a u64).
> Changes since v8:
> * Added RMI_PERMITTED_GICV3_HCR_BITS to define which bits the RMM
> permits to be modified.
> Changes since v6:
> * Renamed REC_ENTER_xxx defines to include 'FLAG' to make it obvious
> these are flag values.
> Changes since v5:
> * Sorted the SMC #defines by value.
> * Renamed SMI_RxI_CALL to SMI_RMI_CALL since the macro is only used for
> RMI calls.
> * Renamed REC_GIC_NUM_LRS to REC_MAX_GIC_NUM_LRS since the actual
> number of available list registers could be lower.
> * Provided a define for the reserved fields of FeatureRegister0.
> * Fix inconsistent names for padding fields.
> Changes since v4:
> * Update to point to final released RMM spec.
> * Minor rearrangements.
> Changes since v3:
> * Update to match RMM spec v1.0-rel0-rc1.
> Changes since v2:
> * Fix specification link.
> * Rename rec_entry->rec_enter to match spec.
> * Fix size of pmu_ovf_status to match spec.
> ---
> arch/arm64/include/asm/rmi_smc.h | 448 +++++++++++++++++++++++++++++++
> 1 file changed, 448 insertions(+)
> create mode 100644 arch/arm64/include/asm/rmi_smc.h
>
> diff --git a/arch/arm64/include/asm/rmi_smc.h b/arch/arm64/include/asm/rmi_smc.h
> new file mode 100644
> index 000000000000..a09b7a631fef
> --- /dev/null
> +++ b/arch/arm64/include/asm/rmi_smc.h
> @@ -0,0 +1,448 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Copyright (C) 2023-2026 ARM Ltd.
> + *
> + * The values and structures in this file are from the Realm Management Monitor
> + * specification (DEN0137) version 2.0-bet1:
> + * https://developer.arm.com/documentation/den0137/2-0bet1/
> + */
> +
> +#ifndef __ASM_RMI_SMC_H
> +#define __ASM_RMI_SMC_H
> +
> +#include <linux/arm-smccc.h>
> +
> +#define SMC_RMI_CALL(func) \
> + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \
> + ARM_SMCCC_SMC_64, \
> + ARM_SMCCC_OWNER_STANDARD, \
> + (func))
> +
> +#define SMC_RMI_VERSION SMC_RMI_CALL(0x0150)
> +
> +#define SMC_RMI_RTT_DATA_MAP_INIT SMC_RMI_CALL(0x0153)
> +
> +#define SMC_RMI_REALM_ACTIVATE SMC_RMI_CALL(0x0157)
> +#define SMC_RMI_REALM_CREATE SMC_RMI_CALL(0x0158)
> +#define SMC_RMI_REALM_DESTROY SMC_RMI_CALL(0x0159)
> +#define SMC_RMI_REC_CREATE SMC_RMI_CALL(0x015a)
> +#define SMC_RMI_REC_DESTROY SMC_RMI_CALL(0x015b)
> +#define SMC_RMI_REC_ENTER SMC_RMI_CALL(0x015c)
> +#define SMC_RMI_RTT_CREATE SMC_RMI_CALL(0x015d)
> +#define SMC_RMI_RTT_DESTROY SMC_RMI_CALL(0x015e)
> +
> +#define SMC_RMI_RTT_READ_ENTRY SMC_RMI_CALL(0x0161)
> +
> +#define SMC_RMI_RTT_DEV_VALIDATE SMC_RMI_CALL(0x0163)
> +#define SMC_RMI_PSCI_COMPLETE SMC_RMI_CALL(0x0164)
> +#define SMC_RMI_FEATURES SMC_RMI_CALL(0x0165)
> +#define SMC_RMI_RTT_FOLD SMC_RMI_CALL(0x0166)
> +
> +#define SMC_RMI_RTT_INIT_RIPAS SMC_RMI_CALL(0x0168)
> +#define SMC_RMI_RTT_SET_RIPAS SMC_RMI_CALL(0x0169)
> +#define SMC_RMI_VSMMU_CREATE SMC_RMI_CALL(0x016a)
> +#define SMC_RMI_VSMMU_DESTROY SMC_RMI_CALL(0x016b)
> +#define SMC_RMI_RMM_CONFIG_SET SMC_RMI_CALL(0x016e)
> +#define SMC_RMI_PSMMU_IRQ_NOTIFY SMC_RMI_CALL(0x016f)
> +
> +#define SMC_RMI_PDEV_ABORT SMC_RMI_CALL(0x0174)
> +#define SMC_RMI_PDEV_COMMUNICATE SMC_RMI_CALL(0x0175)
> +#define SMC_RMI_PDEV_CREATE SMC_RMI_CALL(0x0176)
> +#define SMC_RMI_PDEV_DESTROY SMC_RMI_CALL(0x0177)
> +#define SMC_RMI_PDEV_GET_STATE SMC_RMI_CALL(0x0178)
> +
> +#define SMC_RMI_PDEV_STREAM_KEY_REFRESH SMC_RMI_CALL(0x017a)
> +#define SMC_RMI_PDEV_SET_PUBKEY SMC_RMI_CALL(0x017b)
> +#define SMC_RMI_PDEV_STOP SMC_RMI_CALL(0x017c)
> +#define SMC_RMI_RTT_AUX_CREATE SMC_RMI_CALL(0x017d)
> +#define SMC_RMI_RTT_AUX_DESTROY SMC_RMI_CALL(0x017e)
> +#define SMC_RMI_RTT_AUX_FOLD SMC_RMI_CALL(0x017f)
> +
> +#define SMC_RMI_VDEV_ABORT SMC_RMI_CALL(0x0185)
> +#define SMC_RMI_VDEV_COMMUNICATE SMC_RMI_CALL(0x0186)
> +#define SMC_RMI_VDEV_CREATE SMC_RMI_CALL(0x0187)
> +#define SMC_RMI_VDEV_DESTROY SMC_RMI_CALL(0x0188)
> +#define SMC_RMI_VDEV_GET_STATE SMC_RMI_CALL(0x0189)
> +#define SMC_RMI_VDEV_UNLOCK SMC_RMI_CALL(0x018a)
> +#define SMC_RMI_RTT_SET_S2AP SMC_RMI_CALL(0x018b)
> +#define SMC_RMI_VDEV_COMPLETE SMC_RMI_CALL(0x018e)
> +
> +#define SMC_RMI_VDEV_GET_INTERFACE_REPORT SMC_RMI_CALL(0x01d0)
> +#define SMC_RMI_VDEV_GET_MEASUREMENTS SMC_RMI_CALL(0x01d1)
> +#define SMC_RMI_VDEV_LOCK SMC_RMI_CALL(0x01d2)
> +#define SMC_RMI_VDEV_START SMC_RMI_CALL(0x01d3)
> +
> +#define SMC_RMI_VSMMU_EVENT_NOTIFY SMC_RMI_CALL(0x01d6)
> +#define SMC_RMI_PSMMU_ACTIVATE SMC_RMI_CALL(0x01d7)
> +#define SMC_RMI_PSMMU_DEACTIVATE SMC_RMI_CALL(0x01d8)
> +
> +#define SMC_RMI_PSMMU_ST_L2_CREATE SMC_RMI_CALL(0x01db)
> +#define SMC_RMI_PSMMU_ST_L2_DESTROY SMC_RMI_CALL(0x01dc)
> +#define SMC_RMI_DPT_L0_CREATE SMC_RMI_CALL(0x01dd)
> +#define SMC_RMI_DPT_L0_DESTROY SMC_RMI_CALL(0x01de)
> +#define SMC_RMI_DPT_L1_CREATE SMC_RMI_CALL(0x01df)
> +#define SMC_RMI_DPT_L1_DESTROY SMC_RMI_CALL(0x01e0)
> +#define SMC_RMI_GRANULE_TRACKING_GET SMC_RMI_CALL(0x01e1)
> +
> +#define SMC_RMI_GRANULE_TRACKING_SET SMC_RMI_CALL(0x01e3)
> +
> +#define SMC_RMI_RMM_CONFIG_GET SMC_RMI_CALL(0x01ec)
> +
> +#define SMC_RMI_RMM_STATE_GET SMC_RMI_CALL(0x01ee)
> +
> +#define SMC_RMI_PSMMU_EVENT_CONSUME SMC_RMI_CALL(0x01f0)
> +#define SMC_RMI_GRANULE_RANGE_DELEGATE SMC_RMI_CALL(0x01f1)
> +#define SMC_RMI_GRANULE_RANGE_UNDELEGATE SMC_RMI_CALL(0x01f2)
> +#define SMC_RMI_GPT_L1_CREATE SMC_RMI_CALL(0x01f3)
> +#define SMC_RMI_GPT_L1_DESTROY SMC_RMI_CALL(0x01f4)
> +#define SMC_RMI_RTT_DATA_MAP SMC_RMI_CALL(0x01f5)
> +#define SMC_RMI_RTT_DATA_UNMAP SMC_RMI_CALL(0x01f6)
> +#define SMC_RMI_RTT_DEV_MAP SMC_RMI_CALL(0x01f7)
> +#define SMC_RMI_RTT_DEV_UNMAP SMC_RMI_CALL(0x01f8)
> +#define SMC_RMI_RTT_ARCH_DEV_MAP SMC_RMI_CALL(0x01f9)
> +#define SMC_RMI_RTT_ARCH_DEV_UNMAP SMC_RMI_CALL(0x01fa)
> +#define SMC_RMI_RTT_UNPROT_MAP SMC_RMI_CALL(0x01fb)
> +#define SMC_RMI_RTT_UNPROT_UNMAP SMC_RMI_CALL(0x01fc)
> +#define SMC_RMI_RTT_AUX_PROT_MAP SMC_RMI_CALL(0x01fd)
> +#define SMC_RMI_RTT_AUX_PROT_UNMAP SMC_RMI_CALL(0x01fe)
> +#define SMC_RMI_RTT_AUX_UNPROT_MAP SMC_RMI_CALL(0x01ff)
> +#define SMC_RMI_RTT_AUX_UNPROT_UNMAP SMC_RMI_CALL(0x0200)
> +#define SMC_RMI_REALM_TERMINATE SMC_RMI_CALL(0x0201)
> +#define SMC_RMI_RMM_ACTIVATE SMC_RMI_CALL(0x0202)
> +#define SMC_RMI_OP_CONTINUE SMC_RMI_CALL(0x0203)
> +#define SMC_RMI_PDEV_STREAM_CONNECT SMC_RMI_CALL(0x0204)
> +#define SMC_RMI_PDEV_STREAM_DISCONNECT SMC_RMI_CALL(0x0205)
> +#define SMC_RMI_PDEV_STREAM_COMPLETE SMC_RMI_CALL(0x0206)
> +#define SMC_RMI_PDEV_STREAM_KEY_PURGE SMC_RMI_CALL(0x0207)
> +#define SMC_RMI_OP_MEM_DONATE SMC_RMI_CALL(0x0208)
> +#define SMC_RMI_OP_MEM_RECLAIM SMC_RMI_CALL(0x0209)
> +#define SMC_RMI_OP_CANCEL SMC_RMI_CALL(0x020a)
> +#define SMC_RMI_VSMMU_FEATURES SMC_RMI_CALL(0x020b)
> +#define SMC_RMI_VSMMU_CMD_GET SMC_RMI_CALL(0x020c)
> +#define SMC_RMI_VSMMU_CMD_COMPLETE SMC_RMI_CALL(0x020d)
> +#define SMC_RMI_PSMMU_INFO SMC_RMI_CALL(0x020e)
> +
> +#define RMI_ABI_MAJOR_VERSION 2
> +#define RMI_ABI_MINOR_VERSION 0
> +
> +#define RMI_ABI_VERSION_GET_MAJOR(version) ((version) >> 16)
> +#define RMI_ABI_VERSION_GET_MINOR(version) ((version) & 0xFFFF)
> +#define RMI_ABI_VERSION(major, minor) (((major) << 16) | (minor))
> +
> +#define RMI_UNASSIGNED 0
> +#define RMI_ASSIGNED 1
> +#define RMI_TABLE 2
> +
Those definations are inconsistent to those defined in tf-rmm/lib/smc/include/smc-rmi.h
where their size are 64-bits. Also, other two definations are missed here and perhaps
worthy to be added here.
#define RMI_ASSIGNED_DEV UL(3)
#define RMI_AUX_DESTROYED UL(5)
> +#define RMI_RETURN_STATUS(ret) ((ret) & 0xFF)
> +#define RMI_RETURN_INDEX(ret) (((ret) >> 8) & 0xFF)
> +#define RMI_RETURN_MEMREQ(ret) (((ret) >> 8) & 0x3)
> +#define RMI_RETURN_CAN_CANCEL(ret) (((ret) >> 10) & 0x1)
> +
> +#define RMI_SUCCESS 0
> +#define RMI_ERROR_INPUT 1
> +#define RMI_ERROR_REALM 2
> +#define RMI_ERROR_REC 3
> +#define RMI_ERROR_RTT 4
> +#define RMI_ERROR_NOT_SUPPORTED 5
> +#define RMI_ERROR_DEVICE 6
> +#define RMI_ERROR_RTT_AUX 7
> +#define RMI_ERROR_PSMMU_ST 8
> +#define RMI_ERROR_DPT 9
> +#define RMI_BUSY 10
> +#define RMI_ERROR_GLOBAL 11
> +#define RMI_ERROR_TRACKING 12
> +#define RMI_INCOMPLETE 13
> +#define RMI_BLOCKED 14
> +#define RMI_ERROR_GPT 15
> +#define RMI_ERROR_GRANULE 16
> +
> +#define RMI_OP_MEM_REQ_NONE 0
> +#define RMI_OP_MEM_REQ_DONATE 1
> +#define RMI_OP_MEM_REQ_RECLAIM 2
> +
The size of those definations are 32-bits, different to that of them defined
in tf-rmm/lib/smc/include/smc-rmi.h
#define RMI_OP_MEM_REQ_NONE (0UL)
#define RMI_OP_MEM_REQ_DONATE (1UL)
#define RMI_OP_MEM_REQ_RECLAIM (2UL)
> +#define RMI_DONATE_SIZE(req) ((req) & 0x3)
> +#define RMI_DONATE_COUNT_MASK GENMASK(15, 2)
> +#define RMI_DONATE_COUNT(req) (((req) & RMI_DONATE_COUNT_MASK) >> 2)
> +#define RMI_DONATE_CONTIG(req) (!!((req) & BIT(16)))
> +#define RMI_DONATE_STATE(req) (!!((req) & BIT(17)))
> +
> +#define RMI_OP_MEM_DELEGATED 0
> +#define RMI_OP_MEM_UNDELEGATED 1
> +
As above, inconsistent size to those definations in tf-rmm/lib/smc/include/smc-rmi.h
> +#define RMI_ADDR_TYPE_NONE 0
> +#define RMI_ADDR_TYPE_SINGLE 1
> +#define RMI_ADDR_TYPE_LIST 2
> +
As above, inconsistent size to those definations in tf-rmm/lib/smc/include/smc-rmi.h
> +#define RMI_ADDR_RANGE_SIZE_MASK GENMASK(1, 0)
> +#define RMI_ADDR_RANGE_COUNT_MASK GENMASK(PAGE_SHIFT - 1, 2)
> +#define RMI_ADDR_RANGE_ADDR_MASK (PAGE_MASK & GENMASK(51, 0))
> +#define RMI_ADDR_RANGE_STATE_MASK BIT(63)
> +
> +#define RMI_ADDR_RANGE_SIZE(ar) (FIELD_GET(RMI_ADDR_RANGE_SIZE_MASK, \
> + (ar)))
> +#define RMI_ADDR_RANGE_COUNT(ar) (FIELD_GET(RMI_ADDR_RANGE_COUNT_MASK, \
> + (ar)))
> +#define RMI_ADDR_RANGE_ADDR(ar) ((ar) & RMI_ADDR_RANGE_ADDR_MASK)
> +#define RMI_ADDR_RANGE_STATE(ar) (FIELD_GET(RMI_ADDR_RANGE_STATE_MASK, \
> + (ar)))
> +
> +enum rmi_ripas {
> + RMI_EMPTY = 0,
> + RMI_RAM = 1,
> + RMI_DESTROYED = 2,
> + RMI_DEV = 3,
> +};
> +
> +#define RMI_NO_MEASURE_CONTENT 0
> +#define RMI_MEASURE_CONTENT 1
> +
> +#define RMI_FEATURE_REGISTER_0_S2SZ GENMASK(7, 0)
> +#define RMI_FEATURE_REGISTER_0_LPA2 BIT(8)
> +#define RMI_FEATURE_REGISTER_0_SVE BIT(9)
> +#define RMI_FEATURE_REGISTER_0_SVE_VL GENMASK(13, 10)
> +#define RMI_FEATURE_REGISTER_0_NUM_BPS GENMASK(19, 14)
> +#define RMI_FEATURE_REGISTER_0_NUM_WPS GENMASK(25, 20)
> +#define RMI_FEATURE_REGISTER_0_PMU BIT(26)
> +#define RMI_FEATURE_REGISTER_0_PMU_NUM_CTRS GENMASK(31, 27)
> +
> +#define RMI_FEATURE_REGISTER_1_RMI_GRAN_SZ_4KB BIT(0)
> +#define RMI_FEATURE_REGISTER_1_RMI_GRAN_SZ_16KB BIT(1)
> +#define RMI_FEATURE_REGISTER_1_RMI_GRAN_SZ_64KB BIT(2)
> +#define RMI_FEATURE_REGISTER_1_HASH_SHA_256 BIT(3)
> +#define RMI_FEATURE_REGISTER_1_HASH_SHA_384 BIT(4)
> +#define RMI_FEATURE_REGISTER_1_HASH_SHA_512 BIT(5)
> +#define RMI_FEATURE_REGISTER_1_MAX_RECS_ORDER GENMASK(9, 6)
> +#define RMI_FEATURE_REGISTER_1_L0GPTSZ GENMASK(13, 10)
> +#define RMI_FEATURE_REGISTER_1_PPS GENMASK(16, 14)
> +
> +#define RMI_FEATURE_REGISTER_2_DA BIT(0)
> +#define RMI_FEATURE_REGISTER_2_DA_COH BIT(1)
> +#define RMI_FEATURE_REGISTER_2_VSMMU BIT(2)
> +#define RMI_FEATURE_REGISTER_2_ATS BIT(3)
> +#define RMI_FEATURE_REGISTER_2_MAX_VDEVS_ORDER GENMASK(7, 4)
> +#define RMI_FEATURE_REGISTER_2_VDEV_KROU BIT(8)
> +#define RMI_FEATURE_REGISTER_2_NON_TEE_STREAM BIT(9)
> +
> +#define RMI_FEATURE_REGISTER_3_MAX_NUM_AUX_PLANES GENMASK(3, 0)
> +#define RMI_FEATURE_REGISTER_3_RTT_PLAN GENMASK(5, 4)
> +#define RMI_FEATURE_REGISTER_3_RTT_S2AP_INDIRECT BIT(6)
> +
> +#define RMI_FEATURE_REGISTER_4_MEC_COUNT GENMASK(63, 0)
> +
> +#define RMI_MEM_CATEGORY_CONVENTIONAL 0
> +#define RMI_MEM_CATEGORY_DEV_NCOH 1
> +#define RMI_MEM_CATEGORY_DEV_COH 2
> +
> +#define RMI_TRACKING_RESERVED 0
> +#define RMI_TRACKING_NONE 1
> +#define RMI_TRACKING_FINE 2
> +#define RMI_TRACKING_COARSE 3
> +
> +#define RMI_GRANULE_SIZE_4KB 0
> +#define RMI_GRANULE_SIZE_16KB 1
> +#define RMI_GRANULE_SIZE_64KB 2
> +
> +/*
> + * Note many of these fields are smaller than u64 but all fields have u64
> + * alignment, so use u64 to ensure correct alignment.
> + */
> +struct rmm_config {
> + union { /* 0x0 */
> + struct {
> + u64 tracking_region_size;
> + u64 rmi_granule_size;
> + };
> + u8 sizer[0x1000];
> + };
> +};
> +
> +#define RMI_REALM_PARAM_FLAG_LPA2 BIT(0)
> +#define RMI_REALM_PARAM_FLAG_SVE BIT(1)
> +#define RMI_REALM_PARAM_FLAG_PMU BIT(2)
> +
> +struct realm_params {
> + union { /* 0x0 */
> + struct {
> + u64 flags;
> + u64 s2sz;
> + u64 sve_vl;
> + u64 num_bps;
> + u64 num_wps;
> + u64 pmu_num_ctrs;
> + u64 hash_algo;
> + u64 num_aux_planes;
> + };
> + u8 padding0[0x400];
> + };
> + union { /* 0x400 */
> + struct {
> + u8 rpv[64];
> + u64 ats_plane;
> + };
> + u8 padding1[0x400];
> + };
> + union { /* 0x800 */
> + struct {
> + u64 padding;
> + u64 rtt_base;
> + s64 rtt_level_start;
> + u64 rtt_num_start;
> + u64 flags1;
> + u64 aux_rtt_base[3];
> + };
> + u8 padding2[0x800];
> + };
> +};
> +
> +/*
> + * The number of GPRs (starting from X0) that are
> + * configured by the host when a REC is created.
> + */
> +#define REC_CREATE_NR_GPRS 8
> +
> +#define REC_PARAMS_FLAG_RUNNABLE BIT_ULL(0)
> +
> +struct rec_params {
> + union { /* 0x0 */
> + u64 flags;
> + u8 padding0[0x100];
> + };
> + union { /* 0x100 */
> + u64 mpidr;
> + u8 padding1[0x100];
> + };
> + union { /* 0x200 */
> + u64 pc;
> + u8 padding2[0x100];
> + };
> + union { /* 0x300 */
> + u64 gprs[REC_CREATE_NR_GPRS];
> + u8 padding3[0xd00];
> + };
> +};
> +
> +#define REC_ENTER_FLAG_EMULATED_MMIO BIT(0)
> +#define REC_ENTER_FLAG_INJECT_SEA BIT(1)
> +#define REC_ENTER_FLAG_TRAP_WFI BIT(2)
> +#define REC_ENTER_FLAG_TRAP_WFE BIT(3)
> +#define REC_ENTER_FLAG_RIPAS_RESPONSE BIT(4)
> +#define REC_ENTER_FLAG_S2AP_RESPONSE BIT(5)
> +#define REC_ENTER_FLAG_DEV_MEM_RESPONSE BIT(6)
> +#define REC_ENTER_FLAG_FORCE_P0 BIT(7)
> +
> +#define REC_RUN_GPRS 31
> +#define REC_MAX_GIC_NUM_LRS 16
> +
> +#define RMI_PERMITTED_GICV3_HCR_BITS (ICH_HCR_EL2_UIE | \
> + ICH_HCR_EL2_LRENPIE | \
> + ICH_HCR_EL2_NPIE | \
> + ICH_HCR_EL2_VGrp0EIE | \
> + ICH_HCR_EL2_VGrp0DIE | \
> + ICH_HCR_EL2_VGrp1EIE | \
> + ICH_HCR_EL2_VGrp1DIE | \
> + ICH_HCR_EL2_TDIR)
> +
> +struct rec_enter {
> + union { /* 0x000 */
> + u64 flags;
> + u8 padding0[0x200];
> + };
> + union { /* 0x200 */
> + u64 gprs[REC_RUN_GPRS];
> + u8 padding1[0x100];
> + };
> + u8 padding3[0x500];
> +};
> +
> +#define RMI_EXIT_SYNC 0x00
> +#define RMI_EXIT_IRQ 0x01
> +#define RMI_EXIT_FIQ 0x02
> +#define RMI_EXIT_PSCI 0x03
> +#define RMI_EXIT_RIPAS_CHANGE 0x04
> +#define RMI_EXIT_HOST_CALL 0x05
> +#define RMI_EXIT_SERROR 0x06
> +#define RMI_EXIT_S2AP_CHANGE 0x07
> +#define RMI_EXIT_VDEV_REQUEST 0x08
> +#define RMI_EXIT_VDEV_VALIDATE_MAPPING 0x09
> +#define RMI_EXIT_VSMMU_COMMAND 0x0a
> +
> +struct rec_exit {
> + union { /* 0x000 */
> + u8 exit_reason;
> + u8 padding0[0x100];
> + };
> + union { /* 0x100 */
> + struct {
> + u64 esr;
> + u64 far;
> + u64 hpfar;
> + u64 rtt_tree;
> + };
> + u8 padding1[0x100];
> + };
> + union { /* 0x200 */
> + u64 gprs[REC_RUN_GPRS];
> + u8 padding2[0x100];
> + };
> + union { /* 0x300 */
> + u8 padding3[0x100];
> + };
> + union { /* 0x400 */
> + struct {
> + u64 cntp_ctl;
> + u64 cntp_cval;
> + u64 cntv_ctl;
> + u64 cntv_cval;
> + };
> + u8 padding4[0x100];
> + };
> + union { /* 0x500 */
> + struct {
> + u64 ripas_base;
> + u64 ripas_top;
> + u8 ripas_value;
> + u8 padding8[15];
> + u64 s2ap_base;
> + u64 s2ap_top;
> + u64 vdev_id_1;
> + u64 vdev_id_2;
> + u64 dev_mem_base;
> + u64 dev_mem_top;
> + u64 dev_mem_pa;
> + };
> + u8 padding5[0x100];
> + };
> + union { /* 0x600 */
> + struct {
> + u16 imm;
> + u16 padding9;
> + u64 plane;
> + };
> + u8 padding6[0x100];
> + };
> + union { /* 0x700 */
> + struct {
> + u8 pmu_ovf_status;
> + u8 padding10[15];
> + u64 vsmmu;
> + };
> + u8 padding7[0x100];
> + };
> +};
> +
> +struct rec_run {
> + struct rec_enter enter;
> + struct rec_exit exit;
> +};
> +
> +/* RMI_RTT_UNPROT_MAP_FLAGS definitions */
> +#define RMI_RTT_UNPROT_MAP_FLAGS_OADDR_TYPE GENMASK(1, 0)
> +#define RMI_RTT_UNPROT_MAP_FLAGS_LIST_COUNT GENMASK(15, 2)
> +#define RMI_RTT_UNPROT_MAP_FLAGS_MEMATTR GENMASK(18, 16)
> +#define RMI_RTT_UNPROT_MAP_FLAGS_S2AP GENMASK(22, 19)
> +
> +/* S2AP Direct Encodings, used in RMI_RTT_UNPROT_MAP_FLAGS_S2AP */
> +#define RMI_S2AP_DIRECT_WRITE BIT(0)
> +#define RMI_S2AP_DIRECT_READ BIT(1)
> +
> +#endif /* __ASM_RMI_SMC_H */
Thanks,
Gavin
^ permalink raw reply
* Re: [PATCH v5 1/3] firmware: smccc: coco: Manage arm-smccc platform device and CCA auxiliary drivers
From: Aneesh Kumar K.V @ 2026-05-18 6:29 UTC (permalink / raw)
To: Greg KH
Cc: Suzuki K Poulose, linux-coco, linux-arm-kernel, linux-kernel,
Catalin Marinas, Jeremy Linton, Jonathan Cameron,
Lorenzo Pieralisi, Mark Rutland, Sudeep Holla, Will Deacon,
Steven Price
In-Reply-To: <2026051404-headboard-cabbage-e09a@gregkh>
Greg KH <gregkh@linuxfoundation.org> writes:
> On Thu, May 14, 2026 at 08:07:27PM +0530, Aneesh Kumar K.V wrote:
>> Greg KH <gregkh@linuxfoundation.org> writes:
>>
>> > On Thu, May 14, 2026 at 12:04:13PM +0100, Suzuki K Poulose wrote:
>> >> Hi Aneesh
>> >>
>> >> On 14/05/2026 10:40, Aneesh Kumar K.V (Arm) wrote:
>> >> > Make the SMCCC driver responsible for registering the arm-smccc platform
>> >> > device and after confirming the relevant SMCCC function IDs, create
>> >> > the arm_cca_guest auxiliary device.
>> >> >
>> >>
>> >> There are a few changes squashed in to this patch. Please could we
>> >> split the patch in the following order ?
>> >>
>> >> 1. Add platform device for arm-smccc
>> >
>> > Do not make any more "fake" platform devices please.
>> >
>> >> 2. Move TRNG to Auxilliary Device - (Even though it is a later patch, move
>> >> it before the RSI changes)
>> >
>> > No, move it to the faux api please.
>> >
>>
>>
>> Maybe I was not complete in my previous reply. I did not want to repeat
>> the entire thread, so I quoted the lore link for more details.
>>
>> 1. We have platform firmware-provided SMCCC interfaces. Based on the
>> support/availability of these function IDs, we want to load multiple
>> drivers.
>> 2. This patch series adds a platform device to represent the
>> firmware-provided SMCCC resource.
>> 3. Different SMCCC ranges are now represented as auxiliary devices.
>> 4. Different subsystems, such as TSM, can autoload their backend drivers
>> based on the availability of these SMCCC ranges, which are now
>> represented as auxiliary devices.
>>
>> You had agreed to all of this in the previous discussion here:
>> https://lore.kernel.org/all/2025101516-handbook-hyphen-62ec@gregkh
>
> Then why did someone say "this is a fake platform device with no actual
> resources"? That's what I was triggering off of.
>
> Again, if you have actual platform resources, GREAT, use a platform
> device and aux. If you do not, then do NOT use a platform device.
>
> totally confused,
>
> greg k-h
I have now rewritten the cover letter as below. Let me know if this
helps.
Switch Arm SMCCC firmware services to auxiliary devices
As discussed here:
https://lore.kernel.org/all/20250728135216.48084-12-aneesh.kumar@kernel.org
The earlier CCA guest support used an arm-cca-dev platform device as a pure
software anchor for the TSM class device. That platform device did not
correspond to a DT/ACPI described device, MMIO range, interrupt, or other
platform resource; it existed only to make the CCA guest driver bind and to
place the resulting TSM device in the driver model. The same pattern also
exists for smccc_trng. Creating separate platform devices for such
SMCCC-discovered features is misleading, because those features are not
independent platform devices.
This series changes the model so that there is a single arm-smccc platform
device representing the SMCCC firmware interface itself. The firmware
interface, including its discoverable SMCCC function space, is the
resource: after PSCI/SMCCC conduit discovery, the kernel can query SMCCC
function IDs and determine whether optional firmware services are present.
Services such as SMCCC TRNG and Realm Services Interface (RSI) are
therefore represented as children of the arm-smccc device, and are created
only when the required SMCCC function IDs and ABI checks succeed.
The child devices use the auxiliary bus deliberately: they are intended to
bind independent feature drivers, not just to provide a driverless object for
sysfs or other class-device anchoring. They are firmware-provided functions
of the parent SMCCC interface that are consumed by separate kernel drivers
in different subsystems, such as hwrng and virt/coco/TSM. Those drivers
need normal driver-core matching, probe/remove lifetime, and module
autoloading based on the discovered firmware feature. The auxiliary bus
provides a MODALIAS and id-table based binding model for that case, while
keeping the feature drivers off the platform bus. A faux device was
considered, but not used because it is suited for simple software objects
that do not need independent bus/driver binding. The faux bus has no
feature-driver id-table or MODALIAS matching, so it would not preserve the
module-autoload flow that the current platform-device based users rely on.
In other words, the parent arm-smccc device represents the firmware
resource exposed through the SMCCC conduit, and each auxiliary child
represents one discovered firmware service of that parent. This removes the
unnecessary per-feature platform devices while retaining automatic loading
and independent subsystem drivers for the SMCCC services.
The TSM framework uses the device abstraction to provide cross-architecture
TSM and TEE I/O functionality, including enumerating available platform TEE
I/O capabilities and provisioning connections between the platform TSM and
device DSMs. For Arm CCA, the RSI auxiliary device continues to provide the
device anchor used by the CCA guest TSM provider.
For the CCA platform, the resulting device hierarchy appears as follows.
Note that the auxiliary device is parented by the arm-smccc platform device,
so the sysfs path remains under /devices/platform/arm-smccc/:
$ cd /sys/class/tsm/
$ ls -al
total 0
drwxr-xr-x 2 root root 0 Jan 1 00:02 .
drwxr-xr-x 23 root root 0 Jan 1 00:00 ..
lrwxrwxrwx 1 root root 0 Jan 1 00:03 tsm0 -> ../../devices/platform/arm-smccc/arm_cca_guest.arm-rsi-dev.0/tsm/tsm0
$
The series also replaces the old arm-cca-dev userspace-visible dummy device
with /sys/firmware/cca/realm_guest for detecting whether the kernel is
running in a Realm. This keeps the guest-state ABI under /sys/firmware and
separates it from the internal driver-binding device used by the CCA guest
TSM provider.
-aneesh
^ permalink raw reply
* Re: [PATCH v2 03/15] KVM: x86/xen: Don't truncate RAX when handling hypercall from protected guest
From: Binbin Wu @ 2026-05-18 2:19 UTC (permalink / raw)
To: Sean Christopherson
Cc: Paolo Bonzini, Vitaly Kuznetsov, Kiryl Shutsemau, David Woodhouse,
Paul Durrant, Dave Hansen, Rick Edgecombe, kvm, x86, linux-coco,
linux-kernel, Yosry Ahmed, Kai Huang
In-Reply-To: <agcXr2FP4C3I_jbQ@google.com>
On 5/15/2026 8:55 PM, Sean Christopherson wrote:
> On Fri, May 15, 2026, Binbin Wu wrote:
>>
>>
>> On 5/15/2026 5:53 AM, Sean Christopherson wrote:
>>> Don't truncate RAX when handling a Xen hypercall for a guest with protected
>>> state, as KVM's ABI is to assume the guest is in 64-bit for such cases
>>> (the guest leaving garbage in 63:32 after a transition to 32-bit mode is
>>> far less likely than 63:32 being necessary to complete the hypercall).
>>>
>>> Fixes: b5aead0064f3 ("KVM: x86: Assume a 64-bit hypercall for guests with protected state")
>>> Signed-off-by: Sean Christopherson <seanjc@google.com>
>>
>> The patch looks good to me, but one question below.
>>
>>> ---
>>> arch/x86/kvm/xen.c | 6 +++---
>>> 1 file changed, 3 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
>>> index 6d9be74bb673..895095dc684e 100644
>>> --- a/arch/x86/kvm/xen.c
>>> +++ b/arch/x86/kvm/xen.c
>>> @@ -1678,15 +1678,14 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
>>> bool handled = false;
>>> u8 cpl;
>>>
>>> - input = (u64)kvm_register_read(vcpu, VCPU_REGS_RAX);
>>> -
>>> /* Hyper-V hypercalls get bit 31 set in EAX */
>>> - if ((input & 0x80000000) &&
>>> + if ((kvm_rax_read(vcpu) & 0x80000000) &&
>>> kvm_hv_hypercall_enabled(vcpu))
>>> return kvm_hv_hypercall(vcpu);
>>>
>>> longmode = is_64_bit_hypercall(vcpu);
>>
>> Is the variable name misleading?
>
> It most definitely is. However, @longmode is passed around quite a few locations
> in xen.c, and so I don't want to opportunistically fix this one variable. Though
> I'm definitely not opposed to a separate patch to rename them all to is_64bit or
> something.
OK, I can do it.
>
>> If the vcpu is in compatible mode (when guest state is not protected),
>> it's in long mode, but the code goes to !longmode path.
>>
>>> if (!longmode) {
>>> + input = (u32)kvm_rax_read(vcpu);
>>> params[0] = (u32)kvm_rbx_read(vcpu);
>>> params[1] = (u32)kvm_rcx_read(vcpu);
>>> params[2] = (u32)kvm_rdx_read(vcpu);
>>> @@ -1696,6 +1695,7 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
>>> }
>>> else {
>>> #ifdef CONFIG_X86_64
>>> + input = (u64)kvm_rax_read(vcpu);
>>> params[0] = (u64)kvm_rdi_read(vcpu);
>>> params[1] = (u64)kvm_rsi_read(vcpu);
>>> params[2] = (u64)kvm_rdx_read(vcpu);
>>
>
^ permalink raw reply
* Re: [PATCH v4 00/13] dma-mapping: Use DMA_ATTR_CC_SHARED through direct, pool and swiotlb paths
From: Jiri Pirko @ 2026-05-17 6:19 UTC (permalink / raw)
To: Aneesh Kumar K.V (Arm)
Cc: iommu, linux-arm-kernel, linux-kernel, linux-coco, Robin Murphy,
Marek Szyprowski, Will Deacon, Marc Zyngier, Steven Price,
Suzuki K Poulose, Catalin Marinas, Jason Gunthorpe, Mostafa Saleh,
Petr Tesarik, Alexey Kardashevskiy, Dan Williams, Xu Yilun,
linuxppc-dev, linux-s390, Madhavan Srinivasan, Michael Ellerman,
Nicholas Piggin, Christophe Leroy (CS GROUP), Alexander Gordeev,
Gerald Schaefer, Heiko Carstens, Vasily Gorbik,
Christian Borntraeger, Sven Schnelle, x86
In-Reply-To: <20260512090408.794195-1-aneesh.kumar@kernel.org>
Tue, May 12, 2026 at 11:03:55AM +0200, aneesh.kumar@kernel.org wrote:
>This series propagates DMA_ATTR_CC_SHARED through the dma-direct,
>dma-pool, and swiotlb paths so that encrypted and decrypted DMA buffers
>are handled consistently.
>
>Today, the direct DMA path mostly relies on force_dma_unencrypted() for
>shared/decrypted buffer handling. This series consolidates the
>force_dma_unencrypted() checks in the top-level functions and ensures
>that the remaining DMA interfaces use DMA attributes to make the correct
>decisions.
FWIW, the patchset in general looks good to me. I tested this with my
system_cc_shared dmabuf flow, works flawlessly.
Thanks!
^ permalink raw reply
* Re: [PATCH v4 03/13] dma-pool: track decrypted atomic pools and select them via attrs
From: Alexey Kardashevskiy @ 2026-05-16 12:53 UTC (permalink / raw)
To: Aneesh Kumar K.V (Arm), iommu, linux-arm-kernel, linux-kernel,
linux-coco
Cc: Robin Murphy, Marek Szyprowski, Will Deacon, Marc Zyngier,
Steven Price, Suzuki K Poulose, Catalin Marinas, Jiri Pirko,
Jason Gunthorpe, Mostafa Saleh, Petr Tesarik, Dan Williams,
Xu Yilun, linuxppc-dev, linux-s390, Madhavan Srinivasan,
Michael Ellerman, Nicholas Piggin, Christophe Leroy (CS GROUP),
Alexander Gordeev, Gerald Schaefer, Heiko Carstens, Vasily Gorbik,
Christian Borntraeger, Sven Schnelle, x86
In-Reply-To: <20260512090408.794195-4-aneesh.kumar@kernel.org>
On 12/5/26 19:03, Aneesh Kumar K.V (Arm) wrote:
> Teach the atomic DMA pool code to distinguish between encrypted and
> decrypted pools, and make pool allocation select the matching pool based
> on DMA attributes.
>
> Introduce a dma_gen_pool wrapper that records whether a pool is
> decrypted, initialize that state when the atomic pools are created, and
> use it when expanding and resizing the pools. Update dma_alloc_from_pool()
> to take attrs and skip pools whose encrypted/decrypted state does not
> match DMA_ATTR_CC_SHARED. Update dma_free_from_pool() accordingly.
>
> Also pass DMA_ATTR_CC_SHARED from the swiotlb atomic allocation path
> so decrypted swiotlb allocations are taken from the correct atomic pool.
>
> Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
> ---
> drivers/iommu/dma-iommu.c | 2 +-
> include/linux/dma-map-ops.h | 2 +-
> kernel/dma/direct.c | 11 ++-
> kernel/dma/pool.c | 163 +++++++++++++++++++++++-------------
> kernel/dma/swiotlb.c | 7 +-
> 5 files changed, 122 insertions(+), 63 deletions(-)
>
> diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
> index 54d96e847f16..c2595bee3d41 100644
> --- a/drivers/iommu/dma-iommu.c
> +++ b/drivers/iommu/dma-iommu.c
> @@ -1673,7 +1673,7 @@ void *iommu_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
> if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
> !gfpflags_allow_blocking(gfp) && !coherent)
> page = dma_alloc_from_pool(dev, PAGE_ALIGN(size), &cpu_addr,
> - gfp, NULL);
> + gfp, attrs, NULL);
> else
> cpu_addr = iommu_dma_alloc_pages(dev, size, &page, gfp, attrs);
> if (!cpu_addr)
> diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
> index 6a1832a73cad..696b2c3a2305 100644
> --- a/include/linux/dma-map-ops.h
> +++ b/include/linux/dma-map-ops.h
> @@ -212,7 +212,7 @@ void *dma_common_pages_remap(struct page **pages, size_t size, pgprot_t prot,
> void dma_common_free_remap(void *cpu_addr, size_t size);
>
> struct page *dma_alloc_from_pool(struct device *dev, size_t size,
> - void **cpu_addr, gfp_t flags,
> + void **cpu_addr, gfp_t flags, unsigned long attrs,
> bool (*phys_addr_ok)(struct device *, phys_addr_t, size_t));
> bool dma_free_from_pool(struct device *dev, void *start, size_t size);
>
> diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
> index 0c2e1f8436ce..dc2907439b3d 100644
> --- a/kernel/dma/direct.c
> +++ b/kernel/dma/direct.c
> @@ -162,7 +162,7 @@ static bool dma_direct_use_pool(struct device *dev, gfp_t gfp)
> }
>
> static void *dma_direct_alloc_from_pool(struct device *dev, size_t size,
> - dma_addr_t *dma_handle, gfp_t gfp)
> + dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
> {
> struct page *page;
> u64 phys_limit;
> @@ -172,7 +172,8 @@ static void *dma_direct_alloc_from_pool(struct device *dev, size_t size,
> return NULL;
>
> gfp |= dma_direct_optimal_gfp_mask(dev, &phys_limit);
> - page = dma_alloc_from_pool(dev, size, &ret, gfp, dma_coherent_ok);
> + page = dma_alloc_from_pool(dev, size, &ret, gfp, attrs,
> + dma_coherent_ok);
> if (!page)
> return NULL;
> *dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
> @@ -261,7 +262,8 @@ void *dma_direct_alloc(struct device *dev, size_t size,
> */
> if ((remap || (attrs & DMA_ATTR_CC_SHARED)) &&
> dma_direct_use_pool(dev, gfp))
> - return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
> + return dma_direct_alloc_from_pool(dev, size, dma_handle,
> + gfp, attrs);
>
> if (is_swiotlb_for_alloc(dev)) {
> page = dma_direct_alloc_swiotlb(dev, size);
> @@ -397,7 +399,8 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
> attrs |= DMA_ATTR_CC_SHARED;
>
> if ((attrs & DMA_ATTR_CC_SHARED) && dma_direct_use_pool(dev, gfp))
> - return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
> + return dma_direct_alloc_from_pool(dev, size, dma_handle,
> + gfp, attrs);
>
> if (is_swiotlb_for_alloc(dev)) {
> page = dma_direct_alloc_swiotlb(dev, size);
> diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
> index 2b2fbb709242..75f0eba48a23 100644
> --- a/kernel/dma/pool.c
> +++ b/kernel/dma/pool.c
> @@ -12,12 +12,18 @@
> #include <linux/set_memory.h>
> #include <linux/slab.h>
> #include <linux/workqueue.h>
> +#include <linux/cc_platform.h>
>
> -static struct gen_pool *atomic_pool_dma __ro_after_init;
> +struct dma_gen_pool {
> + bool unencrypted;
> + struct gen_pool *pool;
> +};
> +
> +static struct dma_gen_pool atomic_pool_dma __ro_after_init;
> static unsigned long pool_size_dma;
> -static struct gen_pool *atomic_pool_dma32 __ro_after_init;
> +static struct dma_gen_pool atomic_pool_dma32 __ro_after_init;
> static unsigned long pool_size_dma32;
> -static struct gen_pool *atomic_pool_kernel __ro_after_init;
> +static struct dma_gen_pool atomic_pool_kernel __ro_after_init;
> static unsigned long pool_size_kernel;
>
> /* Size can be defined by the coherent_pool command line */
> @@ -76,7 +82,7 @@ static bool cma_in_zone(gfp_t gfp)
> return true;
> }
>
> -static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size,
> +static int atomic_pool_expand(struct dma_gen_pool *dma_pool, size_t pool_size,
> gfp_t gfp)
> {
> unsigned int order;
> @@ -113,12 +119,15 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size,
> * Memory in the atomic DMA pools must be unencrypted, the pools do not
> * shrink so no re-encryption occurs in dma_direct_free().
> */
> - ret = set_memory_decrypted((unsigned long)page_to_virt(page),
> + if (dma_pool->unencrypted) {
> + ret = set_memory_decrypted((unsigned long)page_to_virt(page),
> 1 << order);
> - if (ret)
> - goto remove_mapping;
> - ret = gen_pool_add_virt(pool, (unsigned long)addr, page_to_phys(page),
> - pool_size, NUMA_NO_NODE);
> + if (ret)
> + goto remove_mapping;
> + }
> +
> + ret = gen_pool_add_virt(dma_pool->pool, (unsigned long)addr,
> + page_to_phys(page), pool_size, NUMA_NO_NODE);
> if (ret)
> goto encrypt_mapping;
>
> @@ -126,11 +135,15 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size,
> return 0;
>
> encrypt_mapping:
> - ret = set_memory_encrypted((unsigned long)page_to_virt(page),
> - 1 << order);
> - if (WARN_ON_ONCE(ret)) {
> - /* Decrypt succeeded but encrypt failed, purposely leak */
> - goto out;
> + if (dma_pool->unencrypted) {
> + int rc;
> +
> + rc = set_memory_encrypted((unsigned long)page_to_virt(page),
> + 1 << order);
> + if (WARN_ON_ONCE(rc)) {
> + /* Decrypt succeeded but encrypt failed, purposely leak */
> + goto out;
> + }
> }
> remove_mapping:
> #ifdef CONFIG_DMA_DIRECT_REMAP
> @@ -142,46 +155,52 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size,
> return ret;
> }
>
> -static void atomic_pool_resize(struct gen_pool *pool, gfp_t gfp)
> +static void atomic_pool_resize(struct dma_gen_pool *dma_pool, gfp_t gfp)
> {
> - if (pool && gen_pool_avail(pool) < atomic_pool_size)
> - atomic_pool_expand(pool, gen_pool_size(pool), gfp);
> + if (dma_pool->pool && gen_pool_avail(dma_pool->pool) < atomic_pool_size)
> + atomic_pool_expand(dma_pool, gen_pool_size(dma_pool->pool), gfp);
> }
>
> static void atomic_pool_work_fn(struct work_struct *work)
> {
> if (IS_ENABLED(CONFIG_ZONE_DMA))
> - atomic_pool_resize(atomic_pool_dma,
> + atomic_pool_resize(&atomic_pool_dma,
> GFP_KERNEL | GFP_DMA);
> if (IS_ENABLED(CONFIG_ZONE_DMA32))
> - atomic_pool_resize(atomic_pool_dma32,
> + atomic_pool_resize(&atomic_pool_dma32,
> GFP_KERNEL | GFP_DMA32);
> - atomic_pool_resize(atomic_pool_kernel, GFP_KERNEL);
> + atomic_pool_resize(&atomic_pool_kernel, GFP_KERNEL);
> }
>
> -static __init struct gen_pool *__dma_atomic_pool_init(size_t pool_size,
> - gfp_t gfp)
> +static __init struct dma_gen_pool *__dma_atomic_pool_init(struct dma_gen_pool *dma_pool,
> + size_t pool_size, gfp_t gfp)
> {
> - struct gen_pool *pool;
> int ret;
>
> - pool = gen_pool_create(PAGE_SHIFT, NUMA_NO_NODE);
> - if (!pool)
> + dma_pool->pool = gen_pool_create(PAGE_SHIFT, NUMA_NO_NODE);
> + if (!dma_pool->pool)
> return NULL;
>
> - gen_pool_set_algo(pool, gen_pool_first_fit_order_align, NULL);
> + gen_pool_set_algo(dma_pool->pool, gen_pool_first_fit_order_align, NULL);
> +
> + /* if platform is using memory encryption atomic pools are by default decrypted. */
> + if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
> + dma_pool->unencrypted = true;
> + else
> + dma_pool->unencrypted = false;
>
> - ret = atomic_pool_expand(pool, pool_size, gfp);
> + ret = atomic_pool_expand(dma_pool, pool_size, gfp);
> if (ret) {
> - gen_pool_destroy(pool);
> + gen_pool_destroy(dma_pool->pool);
> + dma_pool->pool = NULL;
> pr_err("DMA: failed to allocate %zu KiB %pGg pool for atomic allocation\n",
> pool_size >> 10, &gfp);
> return NULL;
> }
>
> pr_info("DMA: preallocated %zu KiB %pGg pool for atomic allocations\n",
> - gen_pool_size(pool) >> 10, &gfp);
> - return pool;
> + gen_pool_size(dma_pool->pool) >> 10, &gfp);
> + return dma_pool;
> }
>
> #ifdef CONFIG_ZONE_DMA32
> @@ -207,21 +226,22 @@ static int __init dma_atomic_pool_init(void)
>
> /* All memory might be in the DMA zone(s) to begin with */
> if (has_managed_zone(ZONE_NORMAL)) {
> - atomic_pool_kernel = __dma_atomic_pool_init(atomic_pool_size,
> - GFP_KERNEL);
> - if (!atomic_pool_kernel)
> + __dma_atomic_pool_init(&atomic_pool_kernel, atomic_pool_size, GFP_KERNEL);
> + if (!atomic_pool_kernel.pool)
> ret = -ENOMEM;
> }
> +
> if (has_managed_dma()) {
> - atomic_pool_dma = __dma_atomic_pool_init(atomic_pool_size,
> - GFP_KERNEL | GFP_DMA);
> - if (!atomic_pool_dma)
> + __dma_atomic_pool_init(&atomic_pool_dma, atomic_pool_size,
> + GFP_KERNEL | GFP_DMA);
> + if (!atomic_pool_dma.pool)
> ret = -ENOMEM;
> }
> +
> if (has_managed_dma32) {
> - atomic_pool_dma32 = __dma_atomic_pool_init(atomic_pool_size,
> - GFP_KERNEL | GFP_DMA32);
> - if (!atomic_pool_dma32)
> + __dma_atomic_pool_init(&atomic_pool_dma32, atomic_pool_size,
> + GFP_KERNEL | GFP_DMA32);
> + if (!atomic_pool_dma32.pool)
> ret = -ENOMEM;
> }
>
> @@ -230,19 +250,44 @@ static int __init dma_atomic_pool_init(void)
> }
> postcore_initcall(dma_atomic_pool_init);
>
> -static inline struct gen_pool *dma_guess_pool(struct gen_pool *prev, gfp_t gfp)
> +static inline struct dma_gen_pool *__dma_guess_pool(struct dma_gen_pool *first,
> + struct dma_gen_pool *second, struct dma_gen_pool *third)
> +{
> + if (first->pool)
> + return first;
> + if (second && second->pool)
> + return second;
> + if (third && third->pool)
> + return third;
> + return NULL;
> +}
> +
> +static inline struct dma_gen_pool *dma_guess_pool(struct dma_gen_pool *prev,
> + gfp_t gfp)
> {
> - if (prev == NULL) {
> + if (!prev) {
> if (gfp & GFP_DMA)
> - return atomic_pool_dma ?: atomic_pool_dma32 ?: atomic_pool_kernel;
> + return __dma_guess_pool(&atomic_pool_dma,
> + &atomic_pool_dma32,
> + &atomic_pool_kernel);
> +
> if (gfp & GFP_DMA32)
> - return atomic_pool_dma32 ?: atomic_pool_dma ?: atomic_pool_kernel;
> - return atomic_pool_kernel ?: atomic_pool_dma32 ?: atomic_pool_dma;
> + return __dma_guess_pool(&atomic_pool_dma32,
> + &atomic_pool_dma,
> + &atomic_pool_kernel);
> +
> + return __dma_guess_pool(&atomic_pool_kernel,
> + &atomic_pool_dma32,
> + &atomic_pool_dma);
> }
> - if (prev == atomic_pool_kernel)
> - return atomic_pool_dma32 ? atomic_pool_dma32 : atomic_pool_dma;
> - if (prev == atomic_pool_dma32)
> - return atomic_pool_dma;
> +
> + if (prev == &atomic_pool_kernel)
> + return __dma_guess_pool(&atomic_pool_dma32,
> + &atomic_pool_dma, NULL);
> +
> + if (prev == &atomic_pool_dma32)
> + return __dma_guess_pool(&atomic_pool_dma, NULL, NULL);
> +
> return NULL;
> }
>
> @@ -272,16 +317,20 @@ static struct page *__dma_alloc_from_pool(struct device *dev, size_t size,
> }
>
> struct page *dma_alloc_from_pool(struct device *dev, size_t size,
> - void **cpu_addr, gfp_t gfp,
> + void **cpu_addr, gfp_t gfp, unsigned long attrs,
> bool (*phys_addr_ok)(struct device *, phys_addr_t, size_t))
> {
> - struct gen_pool *pool = NULL;
> + struct dma_gen_pool *dma_pool = NULL;
> struct page *page;
> bool pool_found = false;
>
> - while ((pool = dma_guess_pool(pool, gfp))) {
> + while ((dma_pool = dma_guess_pool(dma_pool, gfp))) {
> +
> + if (dma_pool->unencrypted != !!(attrs & DMA_ATTR_CC_SHARED))
> + continue;
> +
> pool_found = true;
> - page = __dma_alloc_from_pool(dev, size, pool, cpu_addr,
> + page = __dma_alloc_from_pool(dev, size, dma_pool->pool, cpu_addr,
> phys_addr_ok);
> if (page)
> return page;
> @@ -296,12 +345,14 @@ struct page *dma_alloc_from_pool(struct device *dev, size_t size,
>
> bool dma_free_from_pool(struct device *dev, void *start, size_t size)
> {
> - struct gen_pool *pool = NULL;
> + struct dma_gen_pool *dma_pool = NULL;
> +
> + while ((dma_pool = dma_guess_pool(dma_pool, 0))) {
>
> - while ((pool = dma_guess_pool(pool, 0))) {
> - if (!gen_pool_has_addr(pool, (unsigned long)start, size))
> + if (!gen_pool_has_addr(dma_pool->pool, (unsigned long)start, size))
v3 of this just crashed here with dma_pool!=NULL but dma_pool->pool==NULL. continuing debugging... Thanks,
> continue;
> - gen_pool_free(pool, (unsigned long)start, size);
> +
> + gen_pool_free(dma_pool->pool, (unsigned long)start, size);
> return true;
> }
>
> diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
> index 1abd3e6146f4..ab4eccbaa076 100644
> --- a/kernel/dma/swiotlb.c
> +++ b/kernel/dma/swiotlb.c
> @@ -612,6 +612,7 @@ static struct page *swiotlb_alloc_tlb(struct device *dev, size_t bytes,
> u64 phys_limit, gfp_t gfp)
> {
> struct page *page;
> + unsigned long attrs = 0;
>
> /*
> * Allocate from the atomic pools if memory is encrypted and
> @@ -623,8 +624,12 @@ static struct page *swiotlb_alloc_tlb(struct device *dev, size_t bytes,
> if (!IS_ENABLED(CONFIG_DMA_COHERENT_POOL))
> return NULL;
>
> + /* swiotlb considered decrypted by default */
> + if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
> + attrs = DMA_ATTR_CC_SHARED;
> +
> return dma_alloc_from_pool(dev, bytes, &vaddr, gfp,
> - dma_coherent_ok);
> + attrs, dma_coherent_ok);
> }
>
> gfp &= ~GFP_ZONEMASK;
--
Alexey
^ permalink raw reply
* Re: [PATCH v3 40/41] x86/tsc: Add standalone helper for getting CPU frequency from CPUID
From: Paolo Bonzini @ 2026-05-16 7:42 UTC (permalink / raw)
To: Sean Christopherson, Kiryl Shutsemau, K. Y. Srinivasan,
Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Ajay Kaher,
Alexey Makhalov, Jan Kiszka, Dave Hansen, Andy Lutomirski,
Peter Zijlstra, Juergen Gross, Daniel Lezcano, Thomas Gleixner,
John Stultz
Cc: Rick Edgecombe, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, x86, linux-coco, kvm, linux-hyperv, virtualization,
linux-kernel, xen-devel, Michael Kelley, Tom Lendacky,
Nikunj A Dadhania, Thomas Gleixner, David Woodhouse
In-Reply-To: <20260515191942.1892718-41-seanjc@google.com>
On 5/15/26 21:19, Sean Christopherson wrote:
> Extract the guts of cpu_khz_from_cpuid() to a standalone helper that
> doesn't restrict the usage to Intel CPUs. This will allow sharing the
> core logic with kvmclock, as (a) CPUID.0x16 may be enumerated alongside
> kvmclock, and (b) KVM generally doesn't restrict CPUID based on vendor.
Even for native there's no real reason to restrict to Intel, I think.
native_calibrate_tsc() only limits itself because historically (prior to
commit 604dc9170f24, "x86/tsc: Use CPUID.0x16 to calculate missing
crystal frequency", 2019-05-09) it used a hardcoded table of crystal
frequencies.
Of course paranoia applies, but for virtualization, if the leaf exists
there is no reason not to trust it.
Thanks,
Paolo
> No functional change intended.
>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
> arch/x86/include/asm/tsc.h | 1 +
> arch/x86/kernel/tsc.c | 37 +++++++++++++++++++++++--------------
> 2 files changed, 24 insertions(+), 14 deletions(-)
>
> diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
> index f458be688512..c145f5707b52 100644
> --- a/arch/x86/include/asm/tsc.h
> +++ b/arch/x86/include/asm/tsc.h
> @@ -91,6 +91,7 @@ struct cpuid_tsc_info {
> };
> extern int cpuid_get_tsc_info(struct cpuid_tsc_info *info);
> extern int cpuid_get_tsc_freq(struct cpuid_tsc_info *info);
> +extern int cpuid_get_cpu_freq(unsigned int *cpu_khz);
>
> extern void tsc_early_init(void);
> extern void tsc_init(void);
> diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
> index 1b569954ae5e..745fa2052c74 100644
> --- a/arch/x86/kernel/tsc.c
> +++ b/arch/x86/kernel/tsc.c
> @@ -719,6 +719,24 @@ int cpuid_get_tsc_freq(struct cpuid_tsc_info *info)
> return 0;
> }
>
> +int cpuid_get_cpu_freq(unsigned int *cpu_khz)
> +{
> + unsigned int eax_base_mhz, ebx, ecx, edx;
> +
> + *cpu_khz = 0;
> +
> + if (boot_cpu_data.cpuid_level < CPUID_LEAF_FREQ)
> + return -ENOENT;
> +
> + cpuid(CPUID_LEAF_FREQ, &eax_base_mhz, &ebx, &ecx, &edx);
> +
> + if (!eax_base_mhz)
> + return -ENOENT;
> +
> + *cpu_khz = eax_base_mhz * 1000;
> + return 0;
> +}
> +
> /**
> * native_calibrate_tsc - determine TSC frequency
> * Determine TSC frequency via CPUID, else return 0.
> @@ -754,13 +772,8 @@ unsigned long native_calibrate_tsc(void)
> * clock, but we can easily calculate it to a high degree of accuracy
> * by considering the crystal ratio and the CPU speed.
> */
> - if (!info.crystal_khz && boot_cpu_data.cpuid_level >= CPUID_LEAF_FREQ) {
> - unsigned int eax_base_mhz, ebx, ecx, edx;
> -
> - cpuid(CPUID_LEAF_FREQ, &eax_base_mhz, &ebx, &ecx, &edx);
> - info.crystal_khz = eax_base_mhz * 1000 *
> - info.denominator / info.numerator;
> - }
> + if (!info.crystal_khz && !cpuid_get_cpu_freq(&cpu_khz))
> + info.crystal_khz = cpu_khz * info.denominator / info.numerator;
>
> if (!info.crystal_khz)
> return 0;
> @@ -787,19 +800,15 @@ unsigned long native_calibrate_tsc(void)
>
> static unsigned long cpu_khz_from_cpuid(void)
> {
> - unsigned int eax_base_mhz, ebx_max_mhz, ecx_bus_mhz, edx;
> + unsigned int cpu_khz;
>
> if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
> return 0;
>
> - if (boot_cpu_data.cpuid_level < CPUID_LEAF_FREQ)
> + if (cpuid_get_cpu_freq(&cpu_khz))
> return 0;
>
> - eax_base_mhz = ebx_max_mhz = ecx_bus_mhz = edx = 0;
> -
> - cpuid(CPUID_LEAF_FREQ, &eax_base_mhz, &ebx_max_mhz, &ecx_bus_mhz, &edx);
> -
> - return eax_base_mhz * 1000;
> + return cpu_khz;
> }
>
> /*
^ permalink raw reply
* Re: [PATCH v4 04/13] dma: swiotlb: track pool encryption state and honor DMA_ATTR_CC_SHARED
From: Jason Gunthorpe @ 2026-05-15 22:51 UTC (permalink / raw)
To: Mostafa Saleh
Cc: Aneesh Kumar K.V (Arm), iommu, linux-arm-kernel, linux-kernel,
linux-coco, Robin Murphy, Marek Szyprowski, Will Deacon,
Marc Zyngier, Steven Price, Suzuki K Poulose, Catalin Marinas,
Jiri Pirko, Petr Tesarik, Alexey Kardashevskiy, Dan Williams,
Xu Yilun, linuxppc-dev, linux-s390, Madhavan Srinivasan,
Michael Ellerman, Nicholas Piggin, Christophe Leroy (CS GROUP),
Alexander Gordeev, Gerald Schaefer, Heiko Carstens, Vasily Gorbik,
Christian Borntraeger, Sven Schnelle, x86
In-Reply-To: <agXfm3mS_M3fvRrN@google.com>
On Thu, May 14, 2026 at 02:43:39PM +0000, Mostafa Saleh wrote:
> > That's a somewhat different problem, we have the dev->trusted stuff
> > that is supposed to deal with this kind of security. We need it for
> > IOMMU based systems too, eg hot plug thunderbolt should have it.
>
> I see that it is used only for dma-iommu and for PCI devices.
> However, I think that should be a problem with other CCA solutions
> with emulated devices as they are untrusted. As I'd expect they
> would have virtio devices.
Yes, any security solution with an out of TCB device should be using either
memory encryption so the kernel already bounces or this trusted stuff
and a force strict dma-iommu so the dma layer is careful.
This is more policy from userspace what devices they want in or out of
their TCB. Like you make accept the device into T=1 but then still
want to keep it out of your TCB with the vIOMMU, I can see good
arguments for something like that.
> > > While we can debate the aesthetics of the setup , this is
> > > the exisitng behaviour for Linux, which existed for years
> > > and pKVM relies on and is used extensively.
> > > And, this patch alters that long-standing logic and introduces
> > > a functional regression.
> >
> > Yeah, Aneesh needs to do something here, I'm pointing out it is
> > entirely seperate thing from the CC path we are working on which is
> > decoupling CC from reylying on force swiotlb.
>
> I am looking into converting pKVM to use the CC stuff, I replied with
> a patch to Aneesh in this thread. However, I need to do more testing
> and make sure there are not any unwanted consequences.
Yeah, it is a nice patch and I think it will help reduce the
complexity if it aligns to CCA type stuff.
> > In a pkvm world it should be the same, the S2 table for the SMMU will
> > control what the device can access, and if the SMMU points to a
> > "private" or "shared" page is not something the device needs to know
> > or care about.
>
> I see that's because dma-iommu chooses the attrs for iommu_map().
Long term the DMA API path through the dma-iommu will pass the
ATTR_CC_SHARED through to iommu_map so when the arch requires a
different IOPTE it can construct it.
> In pKVM, dma_addr_t and IOPTE are the same for private and shared,
> so nothing differs in that case.
Yes, so you don't have to worry.
> We don’t expect pass-through devices to interact with shared
> memory (T=0) at the moment.
> However, I can see use cases for that, where the host and the guest
> collaborate with device passthrough and require zero copy.
Once you add the CC patch it becomes immediately possible though
because the user can allocate a CC shared DMA HEAP and feed that all
over the place.
> One other interesting case for device-passthrough is non-coherent
> devices which then require private pools for bouncing.
Why does shared/private matter for bouncing? Why do you need to bounce
at all? Do cmo's not work in pkvm guests?
Jason
^ permalink raw reply
* Re: [PATCH v9 11/23] x86/virt/seamldr: Allocate and populate a module update request
From: Dave Hansen @ 2026-05-15 19:43 UTC (permalink / raw)
To: Chao Gao, kvm, linux-coco, linux-kernel
Cc: binbin.wu, dave.hansen, djbw, ira.weiny, kai.huang, kas,
nik.borisov, paulmck, pbonzini, reinette.chatre, rick.p.edgecombe,
sagis, seanjc, tony.lindgren, vannapurve, vishal.l.verma,
yilun.xu, xiaoyao.li, yan.y.zhao, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, x86, H. Peter Anvin
In-Reply-To: <20260513151045.1420990-12-chao.gao@intel.com>
On 5/13/26 08:09, Chao Gao wrote:
> int seamldr_install_module(const u8 *data, u32 size)
> {
Can we please do better than 'data' and 'size'? Even s/size/data_len/ is
better. This goes for the *entire* series.
Why not call it 'tdx_image' or '_tdx_image' or '_module_image'?
Second, I'm losing track of where these are in the process. Comments on
functions help. Here's how...
> + struct seamldr_params *params;
> + int ret;
> +
> + params = kzalloc_obj(*params);
> + if (!params)
> + return -ENOMEM;
init_seamldr_params() should have a comment like this:
/*
* @data points to a vmalloc()'d 'struct tdx_image'. Transform
* it into @params which is the P-SEAMLDR ABI format.
*/
Then this site can say:
/* Populate 'params' from 'data' */
To at least show the flow of the code. Gunk flows from data=>params.
It's *NOT* otherwise obvious what's goign on here.
Please go through the whole series and make sure you're doing stuff like
this. Comment functions. Say what they are doing at a high level.
Comment their call sites when what they are doing is not clear.
> + ret = init_seamldr_params(params, data, size);
> + if (ret)
> + goto out;
> +
> /* TODO: Update TDX module here */
> - return 0;
> +out:
> + kfree(params);
> + return ret;
> }
^ permalink raw reply
* [PATCH v3 41/41] x86/kvmclock: Get CPU base frequency from CPUID when it's available
From: Sean Christopherson @ 2026-05-15 19:19 UTC (permalink / raw)
To: Kiryl Shutsemau, Paolo Bonzini, Sean Christopherson,
K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
Ajay Kaher, Alexey Makhalov, Jan Kiszka, Dave Hansen,
Andy Lutomirski, Peter Zijlstra, Juergen Gross, Daniel Lezcano,
Thomas Gleixner, John Stultz
Cc: Rick Edgecombe, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, x86, linux-coco, kvm, linux-hyperv, virtualization,
linux-kernel, xen-devel, Michael Kelley, Tom Lendacky,
Nikunj A Dadhania, Thomas Gleixner, David Woodhouse
In-Reply-To: <20260515191942.1892718-1-seanjc@google.com>
If CPUID.0x16 is present and valid, use the CPU frequency provided by
CPUID instead of assuming that the virtual CPU runs at the same
frequency as TSC and/or kvmclock. Back before constant TSCs were a
thing, treating the TSC and CPU frequencies as one and the same was
somewhat reasonable, but now it's nonsensical, especially if the
hypervisor explicitly enumerates the CPU frequency.
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/kernel/kvmclock.c | 16 +++++++++++++++-
1 file changed, 15 insertions(+), 1 deletion(-)
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 62c8ea2e6769..7607920ae386 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -190,6 +190,20 @@ void kvmclock_cpu_action(enum kvm_guest_cpu_action action)
}
}
+static unsigned long kvm_get_cpu_khz(void)
+{
+ unsigned int cpu_khz;
+
+ /*
+ * Prefer CPUID over kvmclock when possible, as the base CPU frequency
+ * isn't necessarily the same as the kvmlock "TSC" frequency.
+ */
+ if (!cpuid_get_cpu_freq(&cpu_khz))
+ return cpu_khz;
+
+ return pvclock_tsc_khz(this_cpu_pvti());
+}
+
/*
* If we don't do that, there is the possibility that the guest
* will calibrate under heavy load - thus, getting a lower lpj -
@@ -434,7 +448,7 @@ void __init kvmclock_init(void)
kvm_sched_clock_init(stable);
}
- tsc_register_calibration_routines(kvm_get_tsc_khz, kvm_get_tsc_khz,
+ tsc_register_calibration_routines(kvm_get_tsc_khz, kvm_get_cpu_khz,
tsc_properties);
x86_platform.get_wallclock = kvm_get_wallclock;
--
2.54.0.563.g4f69b47b94-goog
^ permalink raw reply related
* [PATCH v3 40/41] x86/tsc: Add standalone helper for getting CPU frequency from CPUID
From: Sean Christopherson @ 2026-05-15 19:19 UTC (permalink / raw)
To: Kiryl Shutsemau, Paolo Bonzini, Sean Christopherson,
K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
Ajay Kaher, Alexey Makhalov, Jan Kiszka, Dave Hansen,
Andy Lutomirski, Peter Zijlstra, Juergen Gross, Daniel Lezcano,
Thomas Gleixner, John Stultz
Cc: Rick Edgecombe, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, x86, linux-coco, kvm, linux-hyperv, virtualization,
linux-kernel, xen-devel, Michael Kelley, Tom Lendacky,
Nikunj A Dadhania, Thomas Gleixner, David Woodhouse
In-Reply-To: <20260515191942.1892718-1-seanjc@google.com>
Extract the guts of cpu_khz_from_cpuid() to a standalone helper that
doesn't restrict the usage to Intel CPUs. This will allow sharing the
core logic with kvmclock, as (a) CPUID.0x16 may be enumerated alongside
kvmclock, and (b) KVM generally doesn't restrict CPUID based on vendor.
No functional change intended.
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/include/asm/tsc.h | 1 +
arch/x86/kernel/tsc.c | 37 +++++++++++++++++++++++--------------
2 files changed, 24 insertions(+), 14 deletions(-)
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index f458be688512..c145f5707b52 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -91,6 +91,7 @@ struct cpuid_tsc_info {
};
extern int cpuid_get_tsc_info(struct cpuid_tsc_info *info);
extern int cpuid_get_tsc_freq(struct cpuid_tsc_info *info);
+extern int cpuid_get_cpu_freq(unsigned int *cpu_khz);
extern void tsc_early_init(void);
extern void tsc_init(void);
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 1b569954ae5e..745fa2052c74 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -719,6 +719,24 @@ int cpuid_get_tsc_freq(struct cpuid_tsc_info *info)
return 0;
}
+int cpuid_get_cpu_freq(unsigned int *cpu_khz)
+{
+ unsigned int eax_base_mhz, ebx, ecx, edx;
+
+ *cpu_khz = 0;
+
+ if (boot_cpu_data.cpuid_level < CPUID_LEAF_FREQ)
+ return -ENOENT;
+
+ cpuid(CPUID_LEAF_FREQ, &eax_base_mhz, &ebx, &ecx, &edx);
+
+ if (!eax_base_mhz)
+ return -ENOENT;
+
+ *cpu_khz = eax_base_mhz * 1000;
+ return 0;
+}
+
/**
* native_calibrate_tsc - determine TSC frequency
* Determine TSC frequency via CPUID, else return 0.
@@ -754,13 +772,8 @@ unsigned long native_calibrate_tsc(void)
* clock, but we can easily calculate it to a high degree of accuracy
* by considering the crystal ratio and the CPU speed.
*/
- if (!info.crystal_khz && boot_cpu_data.cpuid_level >= CPUID_LEAF_FREQ) {
- unsigned int eax_base_mhz, ebx, ecx, edx;
-
- cpuid(CPUID_LEAF_FREQ, &eax_base_mhz, &ebx, &ecx, &edx);
- info.crystal_khz = eax_base_mhz * 1000 *
- info.denominator / info.numerator;
- }
+ if (!info.crystal_khz && !cpuid_get_cpu_freq(&cpu_khz))
+ info.crystal_khz = cpu_khz * info.denominator / info.numerator;
if (!info.crystal_khz)
return 0;
@@ -787,19 +800,15 @@ unsigned long native_calibrate_tsc(void)
static unsigned long cpu_khz_from_cpuid(void)
{
- unsigned int eax_base_mhz, ebx_max_mhz, ecx_bus_mhz, edx;
+ unsigned int cpu_khz;
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
return 0;
- if (boot_cpu_data.cpuid_level < CPUID_LEAF_FREQ)
+ if (cpuid_get_cpu_freq(&cpu_khz))
return 0;
- eax_base_mhz = ebx_max_mhz = ecx_bus_mhz = edx = 0;
-
- cpuid(CPUID_LEAF_FREQ, &eax_base_mhz, &ebx_max_mhz, &ecx_bus_mhz, &edx);
-
- return eax_base_mhz * 1000;
+ return cpu_khz;
}
/*
--
2.54.0.563.g4f69b47b94-goog
^ permalink raw reply related
* [PATCH v3 39/41] x86/paravirt: Move using_native_sched_clock() stub into timer.h
From: Sean Christopherson @ 2026-05-15 19:19 UTC (permalink / raw)
To: Kiryl Shutsemau, Paolo Bonzini, Sean Christopherson,
K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
Ajay Kaher, Alexey Makhalov, Jan Kiszka, Dave Hansen,
Andy Lutomirski, Peter Zijlstra, Juergen Gross, Daniel Lezcano,
Thomas Gleixner, John Stultz
Cc: Rick Edgecombe, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, x86, linux-coco, kvm, linux-hyperv, virtualization,
linux-kernel, xen-devel, Michael Kelley, Tom Lendacky,
Nikunj A Dadhania, Thomas Gleixner, David Woodhouse
In-Reply-To: <20260515191942.1892718-1-seanjc@google.com>
Now that timer.h ended up with CONFIG_PARAVIRT #ifdeffery anyways, move the
PARAVIRT=n using_native_sched_clock() stub into timer.h as a "free"
optimization.
No functional change intended.
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/include/asm/timer.h | 5 +++--
arch/x86/kernel/tsc.c | 2 --
2 files changed, 3 insertions(+), 4 deletions(-)
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index ab1271bd9c3b..d8cb9c84f2c7 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -11,9 +11,9 @@ extern void recalibrate_cpu_khz(void);
extern int no_timer_check;
-extern bool using_native_sched_clock(void);
-
#ifdef CONFIG_PARAVIRT
+extern bool using_native_sched_clock(void);
+
int __init __paravirt_set_sched_clock(u64 (*func)(void), bool stable,
void (*save)(void), void (*restore)(void),
void (*start_secondary));
@@ -27,6 +27,7 @@ static __always_inline void paravirt_set_sched_clock(u64 (*func)(void),
void paravirt_sched_clock_start_secondary(void);
#else
static inline void paravirt_sched_clock_start_secondary(void) { }
+static inline bool using_native_sched_clock(void) { return true; }
#endif
/*
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index f78e86494dec..1b569954ae5e 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -316,8 +316,6 @@ int __init __paravirt_set_sched_clock(u64 (*func)(void), bool stable,
}
#else
u64 sched_clock_noinstr(void) __attribute__((alias("native_sched_clock")));
-
-bool using_native_sched_clock(void) { return true; }
#endif
notrace u64 sched_clock(void)
--
2.54.0.563.g4f69b47b94-goog
^ permalink raw reply related
* [PATCH v3 38/41] x86/paravirt: kvmclock: Setup kvmclock early iff it's sched_clock
From: Sean Christopherson @ 2026-05-15 19:19 UTC (permalink / raw)
To: Kiryl Shutsemau, Paolo Bonzini, Sean Christopherson,
K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
Ajay Kaher, Alexey Makhalov, Jan Kiszka, Dave Hansen,
Andy Lutomirski, Peter Zijlstra, Juergen Gross, Daniel Lezcano,
Thomas Gleixner, John Stultz
Cc: Rick Edgecombe, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, x86, linux-coco, kvm, linux-hyperv, virtualization,
linux-kernel, xen-devel, Michael Kelley, Tom Lendacky,
Nikunj A Dadhania, Thomas Gleixner, David Woodhouse
In-Reply-To: <20260515191942.1892718-1-seanjc@google.com>
Rework the seemingly generic x86_cpuinit_ops.early_percpu_clock_init hook
into a dedicated PV sched_clock hook, as the only reason the hook exists
is to allow kvmclock to enable its PV clock on secondary CPUs before the
kernel tries to reference sched_clock, e.g. when grabbing a timestamp for
printk.
Rearranging the hook doesn't exactly reduce complexity; arguably it does
the opposite. But as-is, it's practically impossible to understand *why*
kvmclock needs to do early configuration.
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/include/asm/timer.h | 8 ++++++--
arch/x86/include/asm/x86_init.h | 2 --
arch/x86/kernel/kvmclock.c | 13 ++++++-------
arch/x86/kernel/smpboot.c | 3 ++-
arch/x86/kernel/tsc.c | 16 +++++++++++++++-
arch/x86/kernel/x86_init.c | 1 -
6 files changed, 29 insertions(+), 14 deletions(-)
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index ca5c95d48c03..ab1271bd9c3b 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -15,14 +15,18 @@ extern bool using_native_sched_clock(void);
#ifdef CONFIG_PARAVIRT
int __init __paravirt_set_sched_clock(u64 (*func)(void), bool stable,
- void (*save)(void), void (*restore)(void));
+ void (*save)(void), void (*restore)(void),
+ void (*start_secondary));
static __always_inline void paravirt_set_sched_clock(u64 (*func)(void),
void (*save)(void),
void (*restore)(void))
{
- (void)__paravirt_set_sched_clock(func, true, save, restore);
+ (void)__paravirt_set_sched_clock(func, true, save, restore, NULL);
}
+void paravirt_sched_clock_start_secondary(void);
+#else
+static inline void paravirt_sched_clock_start_secondary(void) { }
#endif
/*
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 6c8a6ead84f6..d1b3f18ea41f 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -187,13 +187,11 @@ struct x86_init_ops {
/**
* struct x86_cpuinit_ops - platform specific cpu hotplug setups
* @setup_percpu_clockev: set up the per cpu clock event device
- * @early_percpu_clock_init: early init of the per cpu clock event device
* @fixup_cpu_id: fixup function for cpuinfo_x86::topo.pkg_id
* @parallel_bringup: Parallel bringup control
*/
struct x86_cpuinit_ops {
void (*setup_percpu_clockev)(void);
- void (*early_percpu_clock_init)(void);
void (*fixup_cpu_id)(struct cpuinfo_x86 *c, int node);
bool parallel_bringup;
};
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 0578bc448b1b..62c8ea2e6769 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -127,12 +127,13 @@ static void kvm_save_sched_clock_state(void)
kvmclock_disable();
}
-#ifdef CONFIG_SMP
-static void kvm_setup_secondary_clock(void)
+static void kvm_setup_secondary_sched_clock(void)
{
+ if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_SMP)))
+ return;
+
kvm_register_clock("secondary cpu, sched_clock setup");
}
-#endif
static void kvm_restore_sched_clock_state(void)
{
@@ -352,7 +353,8 @@ static __init void kvm_sched_clock_init(bool stable)
{
if (__paravirt_set_sched_clock(kvm_sched_clock_read, stable,
kvm_save_sched_clock_state,
- kvm_restore_sched_clock_state))
+ kvm_restore_sched_clock_state,
+ kvm_setup_secondary_sched_clock))
return;
kvm_sched_clock_offset = kvm_clock_read();
@@ -437,9 +439,6 @@ void __init kvmclock_init(void)
x86_platform.get_wallclock = kvm_get_wallclock;
x86_platform.set_wallclock = kvm_set_wallclock;
-#ifdef CONFIG_SMP
- x86_cpuinit.early_percpu_clock_init = kvm_setup_secondary_clock;
-#endif
kvm_get_preset_lpj();
clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 294a8ea60298..318ae70e5e7b 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -78,6 +78,7 @@
#include <asm/io_apic.h>
#include <asm/fpu/api.h>
#include <asm/setup.h>
+#include <asm/timer.h>
#include <asm/uv/uv.h>
#include <asm/microcode.h>
#include <asm/i8259.h>
@@ -275,7 +276,7 @@ static void notrace __noendbr start_secondary(void *unused)
cpu_init();
fpu__init_cpu();
rcutree_report_cpu_starting(raw_smp_processor_id());
- x86_cpuinit.early_percpu_clock_init();
+ paravirt_sched_clock_start_secondary();
ap_starting();
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 7a261214fa3e..f78e86494dec 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -280,8 +280,19 @@ bool using_native_sched_clock(void)
return static_call_query(pv_sched_clock) == native_sched_clock;
}
+#ifdef CONFIG_SMP
+static void (*pv_sched_clock_start_secondary)(void) __ro_after_init;
+
+void paravirt_sched_clock_start_secondary(void)
+{
+ if (pv_sched_clock_start_secondary)
+ pv_sched_clock_start_secondary();
+}
+#endif
+
int __init __paravirt_set_sched_clock(u64 (*func)(void), bool stable,
- void (*save)(void), void (*restore)(void))
+ void (*save)(void), void (*restore)(void),
+ void (*start_secondary))
{
/*
* Don't replace TSC with a PV clock when running as a CoCo guest and
@@ -298,6 +309,9 @@ int __init __paravirt_set_sched_clock(u64 (*func)(void), bool stable,
static_call_update(pv_sched_clock, func);
x86_platform.save_sched_clock_state = save;
x86_platform.restore_sched_clock_state = restore;
+#ifdef CONFIG_SMP
+ pv_sched_clock_start_secondary = start_secondary;
+#endif
return 0;
}
#else
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index ebefb77c37bb..cbb5ee613ed5 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -128,7 +128,6 @@ struct x86_init_ops x86_init __initdata = {
};
struct x86_cpuinit_ops x86_cpuinit = {
- .early_percpu_clock_init = x86_init_noop,
.setup_percpu_clockev = setup_secondary_APIC_clock,
.parallel_bringup = true,
};
--
2.54.0.563.g4f69b47b94-goog
^ permalink raw reply related
* [PATCH v3 37/41] x86/kvmclock: Use TSC for sched_clock if it's constant and non-stop
From: Sean Christopherson @ 2026-05-15 19:19 UTC (permalink / raw)
To: Kiryl Shutsemau, Paolo Bonzini, Sean Christopherson,
K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
Ajay Kaher, Alexey Makhalov, Jan Kiszka, Dave Hansen,
Andy Lutomirski, Peter Zijlstra, Juergen Gross, Daniel Lezcano,
Thomas Gleixner, John Stultz
Cc: Rick Edgecombe, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, x86, linux-coco, kvm, linux-hyperv, virtualization,
linux-kernel, xen-devel, Michael Kelley, Tom Lendacky,
Nikunj A Dadhania, Thomas Gleixner, David Woodhouse
In-Reply-To: <20260515191942.1892718-1-seanjc@google.com>
Prefer the TSC over kvmclock for sched_clock if the TSC is constant,
nonstop, and not marked unstable via command line. I.e. use the same
criteria as tweaking the clocksource rating so that TSC is preferred over
kvmclock. Per the below comment from native_sched_clock(), sched_clock
is more tolerant of slop than clocksource; using TSC for clocksource but
not sched_clock makes little to no sense, especially now that KVM CoCo
guests with a trusted TSC use TSC, not kvmclock.
/*
* Fall back to jiffies if there's no TSC available:
* ( But note that we still use it if the TSC is marked
* unstable. We do this because unlike Time Of Day,
* the scheduler clock tolerates small errors and it's
* very important for it to be as fast as the platform
* can achieve it. )
*/
The only advantage of using kvmclock is that doing so allows for early
and common detection of PVCLOCK_GUEST_STOPPED, but that code has been
broken for over two years with nary a complaint, i.e. it can't be
_that_ valuable. And as above, certain types of KVM guests are losing
the functionality regardless, i.e. acknowledging PVCLOCK_GUEST_STOPPED
needs to be decoupled from sched_clock() no matter what.
Link: https://lore.kernel.org/all/Z4hDK27OV7wK572A@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/kernel/kvmclock.c | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index abcc5b36ea1d..0578bc448b1b 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -416,22 +416,22 @@ void __init kvmclock_init(void)
}
/*
- * X86_FEATURE_NONSTOP_TSC is TSC runs at constant rate
- * with P/T states and does not stop in deep C-states.
- *
- * Invariant TSC exposed by host means kvmclock is not necessary:
- * can use TSC as clocksource.
- *
+ * If the TSC counts at a constant frequency across P/T states, counts
+ * in deep C-states, and the TSC hasn't been marked unstable, prefer
+ * the TSC over kvmclock for sched_clock and drop kvmclock's rating so
+ * that TSC is chosen as the clocksource. Note, the TSC unstable check
+ * exists purely to honor the TSC being marked unstable via command
+ * line, any runtime detection of an unstable will happen after this.
*/
if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
boot_cpu_has(X86_FEATURE_NONSTOP_TSC) &&
!check_tsc_unstable()) {
kvm_clock.rating = 299;
tsc_properties = TSC_FREQ_KNOWN_AND_RELIABLE;
+ } else {
+ kvm_sched_clock_init(stable);
}
- kvm_sched_clock_init(stable);
-
tsc_register_calibration_routines(kvm_get_tsc_khz, kvm_get_tsc_khz,
tsc_properties);
--
2.54.0.563.g4f69b47b94-goog
^ permalink raw reply related
* [PATCH v3 36/41] x86/kvmclock: Get local APIC bus frequency from PV CPUID Timing Info
From: Sean Christopherson @ 2026-05-15 19:19 UTC (permalink / raw)
To: Kiryl Shutsemau, Paolo Bonzini, Sean Christopherson,
K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
Ajay Kaher, Alexey Makhalov, Jan Kiszka, Dave Hansen,
Andy Lutomirski, Peter Zijlstra, Juergen Gross, Daniel Lezcano,
Thomas Gleixner, John Stultz
Cc: Rick Edgecombe, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, x86, linux-coco, kvm, linux-hyperv, virtualization,
linux-kernel, xen-devel, Michael Kelley, Tom Lendacky,
Nikunj A Dadhania, Thomas Gleixner, David Woodhouse
In-Reply-To: <20260515191942.1892718-1-seanjc@google.com>
When running as a KVM guest with kvmclock support enabled, stuff the APIC
timer period/frequency with the local APIC bus frequency reported in
CPUID.0x40000010.EBX instead of trying to calibrate/guess the frequency.
See Documentation/virt/kvm/x86/cpuid.rst for details.
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/include/asm/kvm_para.h | 1 +
arch/x86/kernel/kvm.c | 19 ++++++++++++++++---
arch/x86/kernel/kvmclock.c | 13 +++++++++++--
3 files changed, 28 insertions(+), 5 deletions(-)
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 3f7f558b5b24..381d029b72e7 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -130,6 +130,7 @@ void kvmclock_init(void);
void kvmclock_cpu_action(enum kvm_guest_cpu_action action);
bool kvm_para_available(void);
unsigned int kvm_para_tsc_khz(void);
+unsigned int kvm_para_apic_bus_khz(void);
unsigned int kvm_arch_para_features(void);
unsigned int kvm_arch_para_hints(void);
void kvm_async_pf_task_wait_schedule(u32 token);
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 5cd92a0b156a..bfe36e361b3c 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -918,12 +918,25 @@ bool kvm_para_available(void)
}
EXPORT_SYMBOL_GPL(kvm_para_available);
-unsigned int kvm_para_tsc_khz(void)
+static bool kvm_cpuid_has_timing_info(void)
{
u32 base = kvm_cpuid_base();
- if (cpuid_eax(base) >= (base | KVM_CPUID_TIMING_INFO))
- return cpuid_eax(base | KVM_CPUID_TIMING_INFO);
+ return cpuid_eax(base) >= (base | KVM_CPUID_TIMING_INFO);
+}
+
+unsigned int kvm_para_tsc_khz(void)
+{
+ if (kvm_cpuid_has_timing_info())
+ return cpuid_eax(kvm_cpuid_base() | KVM_CPUID_TIMING_INFO);
+
+ return 0;
+}
+
+unsigned int kvm_para_apic_bus_khz(void)
+{
+ if (kvm_cpuid_has_timing_info())
+ return cpuid_ebx(kvm_cpuid_base() | KVM_CPUID_TIMING_INFO);
return 0;
}
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 5ceba4f3836c..abcc5b36ea1d 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -200,10 +200,19 @@ void kvmclock_cpu_action(enum kvm_guest_cpu_action action)
*/
static unsigned long kvm_get_tsc_khz(void)
{
+#ifdef CONFIG_X86_LOCAL_APIC
+ u32 apic_khz = kvm_para_apic_bus_khz();
+
/*
- * If KVM advertises the frequency directly in CPUID, use that
- * instead of reverse-calculating it from the KVM clock data.
+ * Use the TSC frequency from KVM's (and other hypervisors') PV CPUID
+ * leaf when available, instead of reverse-calculating it from the KVM
+ * clock data. As a bonus, the CPUID leaf also includes the local APIC
+ * bus/timer frequency.
*/
+ if (apic_khz)
+ lapic_timer_period = apic_khz;
+#endif
+
return kvm_para_tsc_khz() ? : pvclock_tsc_khz(this_cpu_pvti());
}
--
2.54.0.563.g4f69b47b94-goog
^ permalink raw reply related
* [PATCH v3 35/41] x86/kvmclock: Obtain TSC frequency from CPUID if present
From: Sean Christopherson @ 2026-05-15 19:19 UTC (permalink / raw)
To: Kiryl Shutsemau, Paolo Bonzini, Sean Christopherson,
K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
Ajay Kaher, Alexey Makhalov, Jan Kiszka, Dave Hansen,
Andy Lutomirski, Peter Zijlstra, Juergen Gross, Daniel Lezcano,
Thomas Gleixner, John Stultz
Cc: Rick Edgecombe, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, x86, linux-coco, kvm, linux-hyperv, virtualization,
linux-kernel, xen-devel, Michael Kelley, Tom Lendacky,
Nikunj A Dadhania, Thomas Gleixner, David Woodhouse
In-Reply-To: <20260515191942.1892718-1-seanjc@google.com>
From: David Woodhouse <dwmw@amazon.co.uk>
In https://lkml.org/lkml/2008/10/1/246 a proposal was made for generic
CPUID conventions across hypervisors. It was mostly shot down in flames,
but the leaf at 0x40000010 containing timing information didn't die.
It's used by XNU and FreeBSD guests under all hypervisors¹² to determine
the TSC frequency, and also exposed by the EC2 Nitro hypervisor (as
well as, presumably, VMware). FreeBSD's Bhyve is probably just about
to start exposing it too.
Use it under KVM to obtain the TSC frequency more accurately, instead
of reverse-calculating the frequency from the mul/shift values in the
KVM clock.
Before:
[ 0.000020] tsc: Detected 2900.014 MHz processor
After:
[ 0.000020] tsc: Detected 2900.015 MHz processor
$ cpuid -1 -l 0x40000010
CPU:
hypervisor generic timing information (0x40000010):
TSC frequency (Hz) = 2900015
bus frequency (Hz) = 1000000
¹ https://github.com/apple/darwin-xnu/blob/main/osfmk/i386/cpuid.c
² https://github.com/freebsd/freebsd-src/commit/4a432614f68
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/include/asm/kvm_para.h | 1 +
arch/x86/kernel/kvm.c | 10 ++++++++++
arch/x86/kernel/kvmclock.c | 6 +++++-
3 files changed, 16 insertions(+), 1 deletion(-)
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 17053d2bf270..3f7f558b5b24 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -129,6 +129,7 @@ enum kvm_guest_cpu_action {
void kvmclock_init(void);
void kvmclock_cpu_action(enum kvm_guest_cpu_action action);
bool kvm_para_available(void);
+unsigned int kvm_para_tsc_khz(void);
unsigned int kvm_arch_para_features(void);
unsigned int kvm_arch_para_hints(void);
void kvm_async_pf_task_wait_schedule(u32 token);
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 65c787b1ea03..5cd92a0b156a 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -918,6 +918,16 @@ bool kvm_para_available(void)
}
EXPORT_SYMBOL_GPL(kvm_para_available);
+unsigned int kvm_para_tsc_khz(void)
+{
+ u32 base = kvm_cpuid_base();
+
+ if (cpuid_eax(base) >= (base | KVM_CPUID_TIMING_INFO))
+ return cpuid_eax(base | KVM_CPUID_TIMING_INFO);
+
+ return 0;
+}
+
unsigned int kvm_arch_para_features(void)
{
return cpuid_eax(kvm_cpuid_base() | KVM_CPUID_FEATURES);
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 47f7df1e81a0..5ceba4f3836c 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -200,7 +200,11 @@ void kvmclock_cpu_action(enum kvm_guest_cpu_action action)
*/
static unsigned long kvm_get_tsc_khz(void)
{
- return pvclock_tsc_khz(this_cpu_pvti());
+ /*
+ * If KVM advertises the frequency directly in CPUID, use that
+ * instead of reverse-calculating it from the KVM clock data.
+ */
+ return kvm_para_tsc_khz() ? : pvclock_tsc_khz(this_cpu_pvti());
}
static void __init kvm_get_preset_lpj(void)
--
2.54.0.563.g4f69b47b94-goog
^ permalink raw reply related
* [PATCH v3 34/41] KVM: x86: Officially define CPUID 0x40000010 as PV Timing Info (TSC and Bus)
From: Sean Christopherson @ 2026-05-15 19:19 UTC (permalink / raw)
To: Kiryl Shutsemau, Paolo Bonzini, Sean Christopherson,
K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
Ajay Kaher, Alexey Makhalov, Jan Kiszka, Dave Hansen,
Andy Lutomirski, Peter Zijlstra, Juergen Gross, Daniel Lezcano,
Thomas Gleixner, John Stultz
Cc: Rick Edgecombe, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, x86, linux-coco, kvm, linux-hyperv, virtualization,
linux-kernel, xen-devel, Michael Kelley, Tom Lendacky,
Nikunj A Dadhania, Thomas Gleixner, David Woodhouse
In-Reply-To: <20260515191942.1892718-1-seanjc@google.com>
From: David Woodhouse <dwmw@amazon.co.uk>
Formally define and document CPUID 0x40000010 as providing TSC and local
APIC bus frequency information for KVM's PV CPUID range. Way back in
2008, VMware proposed (https://lkml.org/lkml/2008/10/1/246) carving out a
range of CPUID leaves for use by hypervisors. While the broader proposal
from VMware was mostly shot down in flames, use of CPUID 0x40000010 to
provide TSC and local APIC bus frequency information survived and made it's
way into multiple guest operating systems.
XNU unconditionally assumes CPUID 0x40000010 contains the frequency
information, if it's present on any hypervisor:
https://github.com/apple/darwin-xnu/blob/main/osfmk/i386/cpuid.c
As does FreeBSD:
https://github.com/freebsd/freebsd-src/commit/4a432614f68
More importantly, QEMU (the de facto "reference" VMM for KVM) has
conditionally provided timing information in CPUID 0x40000010 for almost
9 years, since commit 9954a1582e ("x86-KVM: Supply TSC and APIC clock
rates to guest like VMWare").
So at this point it would be daft for KVM (or any hypervisor) to expose
0x40000010 for any *other* content. Officially carve out and define the
CPUID leaf so that Linux-as-a-guest can follow suit and pull TSC and Local
APIC Bus frequency information from CPUID.
Defer providing userspace with the necessary information needed to
precisely and accurately enumerate the _actual_ configured TSC frequency
to the guest (that exact information, along with the scaled ratio, isn't
exposed to userspace). As evidenced by QEMU, providing CPUID 0x40000010
without help from KVM is entirely possible, just not ideal.
Link: https://lore.kernel.org/all/ea0d7f43d910cee9600b254e303f468722fa355b.camel@infradead.org
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
[sean: drop KVM filling of CPUID, add documentation, massage changelog]
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
Documentation/virt/kvm/x86/cpuid.rst | 12 ++++++++++++
arch/x86/include/uapi/asm/kvm_para.h | 11 +++++++++++
2 files changed, 23 insertions(+)
diff --git a/Documentation/virt/kvm/x86/cpuid.rst b/Documentation/virt/kvm/x86/cpuid.rst
index bda3e3e737d7..f02e395cfa9b 100644
--- a/Documentation/virt/kvm/x86/cpuid.rst
+++ b/Documentation/virt/kvm/x86/cpuid.rst
@@ -122,3 +122,15 @@ KVM_HINTS_REALTIME 0 guest checks this feature bit to
preempted for an unlimited time
allowing optimizations
================== ============ =================================
+
+function: KVM_CPUID_TIMING_INFO (0x40000010)
+
+returns::
+
+ eax = (Virtual) TSC frequency in kHz
+ ebx = (Virtual) Bus (local APIC timer) frequency in kHz
+ ecx = 0 (Reserved)
+ edx = 0 (Reserved)
+
+Note, KVM only defines the semantics of KVM_CPUID_TIMING_INFO; KVM does NOT
+advertise support via KVM_GET_SUPPORTED_CPUID.
\ No newline at end of file
diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
index a1efa7907a0b..c3a384711f3a 100644
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -44,6 +44,17 @@
*/
#define KVM_FEATURE_CLOCKSOURCE_STABLE_BIT 24
+/*
+ * The timing information leaf provides TSC and local APIC timer frequency
+ * information to the guest. Note, userspace is responsible for filling the
+ * leaf with the correct information.
+ *
+ * # EAX: (Virtual) TSC frequency in kHz.
+ * # EBX: (Virtual) Bus (local APIC timer) frequency in kHz.
+ * # ECX, EDX: Reserved (must be zero).
+ */
+#define KVM_CPUID_TIMING_INFO 0x40000010
+
#define MSR_KVM_WALL_CLOCK 0x11
#define MSR_KVM_SYSTEM_TIME 0x12
--
2.54.0.563.g4f69b47b94-goog
^ permalink raw reply related
* [PATCH v3 33/41] x86/kvmclock: Mark TSC as reliable when it's constant and nonstop
From: Sean Christopherson @ 2026-05-15 19:19 UTC (permalink / raw)
To: Kiryl Shutsemau, Paolo Bonzini, Sean Christopherson,
K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
Ajay Kaher, Alexey Makhalov, Jan Kiszka, Dave Hansen,
Andy Lutomirski, Peter Zijlstra, Juergen Gross, Daniel Lezcano,
Thomas Gleixner, John Stultz
Cc: Rick Edgecombe, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, x86, linux-coco, kvm, linux-hyperv, virtualization,
linux-kernel, xen-devel, Michael Kelley, Tom Lendacky,
Nikunj A Dadhania, Thomas Gleixner, David Woodhouse
In-Reply-To: <20260515191942.1892718-1-seanjc@google.com>
Mark the TSC as reliable if the hypervisor (KVM) has enumerated the TSC
as constant and nonstop, and the admin hasn't explicitly marked the TSC
as unstable. Like most (all?) virtualization setups, any secondary
clocksource that's used as a watchdog is guaranteed to be less reliable
than a constant, nonstop TSC, as all clocksources the kernel uses as a
watchdog are all but guaranteed to be emulated when running as a KVM
guest. I.e. any observed discrepancies between the TSC and watchdog will
be due to jitter in the watchdog.
This is especially true for KVM, as the watchdog clocksource is usually
emulated in host userspace, i.e. reading the clock incurs a roundtrip
cost of thousands of cycles.
Marking the TSC reliable addresses a flaw where the TSC will occasionally
be marked unstable if the host is under moderate/heavy load.
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/kernel/kvmclock.c | 29 ++++++++++++++++-------------
1 file changed, 16 insertions(+), 13 deletions(-)
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index b6b2018c51db..47f7df1e81a0 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -363,6 +363,7 @@ static __init void kvm_sched_clock_init(bool stable)
void __init kvmclock_init(void)
{
+ enum tsc_properties tsc_properties = TSC_FREQUENCY_KNOWN;
bool stable = false;
if (!kvm_para_available() || !kvmclock)
@@ -401,18 +402,6 @@ void __init kvmclock_init(void)
PVCLOCK_TSC_STABLE_BIT;
}
- kvm_sched_clock_init(stable);
-
- tsc_register_calibration_routines(kvm_get_tsc_khz, kvm_get_tsc_khz,
- TSC_FREQUENCY_KNOWN);
-
- x86_platform.get_wallclock = kvm_get_wallclock;
- x86_platform.set_wallclock = kvm_set_wallclock;
-#ifdef CONFIG_SMP
- x86_cpuinit.early_percpu_clock_init = kvm_setup_secondary_clock;
-#endif
- kvm_get_preset_lpj();
-
/*
* X86_FEATURE_NONSTOP_TSC is TSC runs at constant rate
* with P/T states and does not stop in deep C-states.
@@ -423,8 +412,22 @@ void __init kvmclock_init(void)
*/
if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
boot_cpu_has(X86_FEATURE_NONSTOP_TSC) &&
- !check_tsc_unstable())
+ !check_tsc_unstable()) {
kvm_clock.rating = 299;
+ tsc_properties = TSC_FREQ_KNOWN_AND_RELIABLE;
+ }
+
+ kvm_sched_clock_init(stable);
+
+ tsc_register_calibration_routines(kvm_get_tsc_khz, kvm_get_tsc_khz,
+ tsc_properties);
+
+ x86_platform.get_wallclock = kvm_get_wallclock;
+ x86_platform.set_wallclock = kvm_set_wallclock;
+#ifdef CONFIG_SMP
+ x86_cpuinit.early_percpu_clock_init = kvm_setup_secondary_clock;
+#endif
+ kvm_get_preset_lpj();
clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);
pv_info.name = "KVM";
--
2.54.0.563.g4f69b47b94-goog
^ permalink raw reply related
* [PATCH v3 32/41] x86/tsc: Rejects attempts to override TSC calibration with lesser routine
From: Sean Christopherson @ 2026-05-15 19:19 UTC (permalink / raw)
To: Kiryl Shutsemau, Paolo Bonzini, Sean Christopherson,
K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
Ajay Kaher, Alexey Makhalov, Jan Kiszka, Dave Hansen,
Andy Lutomirski, Peter Zijlstra, Juergen Gross, Daniel Lezcano,
Thomas Gleixner, John Stultz
Cc: Rick Edgecombe, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, x86, linux-coco, kvm, linux-hyperv, virtualization,
linux-kernel, xen-devel, Michael Kelley, Tom Lendacky,
Nikunj A Dadhania, Thomas Gleixner, David Woodhouse
In-Reply-To: <20260515191942.1892718-1-seanjc@google.com>
When registering a TSC frequency calibration routine, sanity check that
the incoming routine is as robust as the outgoing routine, and reject the
incoming routine if the sanity check fails.
Because native calibration routines only mark the TSC frequency as known
and reliable when they actually run, the effective progression of
capabilities is: None (native) => Known and maybe Reliable (PV) =>
Known and Reliable (CoCo). Violating that progression for a PV override
is relatively benign, but messing up the progression when CoCo is
involved is more problematic, as it likely means a trusted source of
information (hardware/firmware) is being discarded in favor of a less
trusted source (hypervisor).
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/kernel/tsc.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 98bef1d06fa9..7a261214fa3e 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1319,8 +1319,13 @@ void tsc_register_calibration_routines(unsigned long (*calibrate_tsc)(void),
if (properties & TSC_FREQUENCY_KNOWN)
setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
+ else if (WARN_ON(boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)))
+ return;
+
if (properties & TSC_RELIABLE)
setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
+ else if (WARN_ON(boot_cpu_has(X86_FEATURE_TSC_RELIABLE)))
+ return;
x86_platform.calibrate_tsc = calibrate_tsc;
if (calibrate_cpu)
--
2.54.0.563.g4f69b47b94-goog
^ permalink raw reply related
* [PATCH v3 31/41] x86/tsc: Pass KNOWN_FREQ and RELIABLE as params to registration
From: Sean Christopherson @ 2026-05-15 19:19 UTC (permalink / raw)
To: Kiryl Shutsemau, Paolo Bonzini, Sean Christopherson,
K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
Ajay Kaher, Alexey Makhalov, Jan Kiszka, Dave Hansen,
Andy Lutomirski, Peter Zijlstra, Juergen Gross, Daniel Lezcano,
Thomas Gleixner, John Stultz
Cc: Rick Edgecombe, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, x86, linux-coco, kvm, linux-hyperv, virtualization,
linux-kernel, xen-devel, Michael Kelley, Tom Lendacky,
Nikunj A Dadhania, Thomas Gleixner, David Woodhouse
In-Reply-To: <20260515191942.1892718-1-seanjc@google.com>
Add a "tsc_properties" set of flags and use it to annotate whether the
TSC operates at a known and/or reliable frequency when registering a
paravirtual TSC calibration routine. Currently, each PV flow manually
sets the associated feature flags, but often in haphazard fashion that
makes it difficult for unfamiliar readers to see the properties of the
TSC when running under a particular hypervisor.
The other, bigger issue with manually setting the feature flags is that
it decouples the flags from the calibration routine. E.g. in theory, PV
code could mark the TSC as having a known frequency, but then have its
PV calibration discarded in favor of a method that doesn't use that known
frequency. Passing the TSC properties along with the calibration routine
will allow adding sanity checks to guard against replacing a "better"
calibration routine with a "worse" routine.
As a bonus, the flags also give developers working on new PV code a heads
up that they should at least mark the TSC as having a known frequency.
Reviewed-by: Michael Kelley <mhklinux@outlook.com>
Tested-by: Michael Kelley <mhklinux@outlook.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/coco/sev/core.c | 6 ++----
arch/x86/coco/tdx/tdx.c | 7 ++-----
arch/x86/include/asm/tsc.h | 8 +++++++-
arch/x86/kernel/cpu/acrn.c | 4 ++--
arch/x86/kernel/cpu/mshyperv.c | 10 +++++++---
arch/x86/kernel/cpu/vmware.c | 7 ++++---
arch/x86/kernel/jailhouse.c | 4 ++--
arch/x86/kernel/kvmclock.c | 4 ++--
arch/x86/kernel/tsc.c | 8 +++++++-
arch/x86/xen/time.c | 4 ++--
10 files changed, 37 insertions(+), 25 deletions(-)
diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c
index 39fb50697542..822a2a0f1a2f 100644
--- a/arch/x86/coco/sev/core.c
+++ b/arch/x86/coco/sev/core.c
@@ -2037,9 +2037,6 @@ void __init snp_secure_tsc_init(void)
secrets = (__force struct snp_secrets_page *)mem;
- setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
- setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
-
rdmsrq(MSR_AMD64_GUEST_TSC_FREQ, tsc_freq_mhz);
/* Extract the GUEST TSC MHZ from BIT[17:0], rest is reserved space */
@@ -2048,7 +2045,8 @@ void __init snp_secure_tsc_init(void)
snp_tsc_freq_khz = SNP_SCALE_TSC_FREQ(tsc_freq_mhz * 1000, secrets->tsc_factor);
tsc_register_calibration_routines(securetsc_get_tsc_khz,
- securetsc_get_tsc_khz);
+ securetsc_get_tsc_khz,
+ TSC_FREQ_KNOWN_AND_RELIABLE);
early_memunmap(mem, PAGE_SIZE);
}
diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index 26890cea790b..7050f3ee6593 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -1208,14 +1208,11 @@ static unsigned long tdx_get_tsc_khz(void)
void __init tdx_tsc_init(void)
{
- /* TSC is the only reliable clock in TDX guest */
- setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
- setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
-
/*
* Override the PV calibration routines (if set) with more trustworthy
* CPUID-based calibration. The TDX module emulates CPUID, whereas any
* PV information is provided by the hypervisor.
*/
- tsc_register_calibration_routines(tdx_get_tsc_khz, NULL);
+ tsc_register_calibration_routines(tdx_get_tsc_khz, NULL,
+ TSC_FREQ_KNOWN_AND_RELIABLE);
}
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index bae709f5f44d..f458be688512 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -95,8 +95,14 @@ extern int cpuid_get_tsc_freq(struct cpuid_tsc_info *info);
extern void tsc_early_init(void);
extern void tsc_init(void);
#if defined(CONFIG_HYPERVISOR_GUEST) || defined(CONFIG_AMD_MEM_ENCRYPT)
+enum tsc_properties {
+ TSC_FREQUENCY_KNOWN = BIT(0),
+ TSC_RELIABLE = BIT(1),
+ TSC_FREQ_KNOWN_AND_RELIABLE = TSC_FREQUENCY_KNOWN | TSC_RELIABLE,
+};
extern void tsc_register_calibration_routines(unsigned long (*calibrate_tsc)(void),
- unsigned long (*calibrate_cpu)(void));
+ unsigned long (*calibrate_cpu)(void),
+ enum tsc_properties properties);
#endif
extern void mark_tsc_unstable(char *reason);
extern int unsynchronized_tsc(void);
diff --git a/arch/x86/kernel/cpu/acrn.c b/arch/x86/kernel/cpu/acrn.c
index 2da3de4d470e..4f2f4f7ec334 100644
--- a/arch/x86/kernel/cpu/acrn.c
+++ b/arch/x86/kernel/cpu/acrn.c
@@ -29,9 +29,9 @@ static void __init acrn_init_platform(void)
/* Install system interrupt handler for ACRN hypervisor callback */
sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_acrn_hv_callback);
- setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
tsc_register_calibration_routines(acrn_get_tsc_khz,
- acrn_get_tsc_khz);
+ acrn_get_tsc_khz,
+ TSC_FREQUENCY_KNOWN);
}
static bool acrn_x2apic_available(void)
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 5ca139ae50b4..734d79c10ae5 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -516,8 +516,13 @@ static void __init ms_hyperv_init_platform(void)
if (ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS &&
ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) {
- tsc_register_calibration_routines(hv_get_tsc_khz, hv_get_tsc_khz);
- setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
+ enum tsc_properties tsc_properties = TSC_FREQUENCY_KNOWN;
+
+ if (ms_hyperv.features & HV_ACCESS_TSC_INVARIANT)
+ tsc_properties = TSC_FREQ_KNOWN_AND_RELIABLE;
+
+ tsc_register_calibration_routines(hv_get_tsc_khz, hv_get_tsc_khz,
+ tsc_properties);
}
if (ms_hyperv.priv_high & HV_ISOLATION) {
@@ -629,7 +634,6 @@ static void __init ms_hyperv_init_platform(void)
* is called.
*/
wrmsrq(HV_X64_MSR_TSC_INVARIANT_CONTROL, HV_EXPOSE_INVARIANT_TSC);
- setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
}
/*
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 968de002975f..c19fa4471800 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -388,10 +388,10 @@ static void __init vmware_paravirt_ops_setup(void)
*/
static void __init vmware_set_capabilities(void)
{
+ /* TSC is non-stop and reliable even if the frequency isn't known. */
setup_force_cpu_cap(X86_FEATURE_CONSTANT_TSC);
setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
- if (vmware_tsc_khz)
- setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
+
if (vmware_hypercall_mode == CPUID_VMWARE_FEATURES_ECX_VMCALL)
setup_force_cpu_cap(X86_FEATURE_VMCALL);
else if (vmware_hypercall_mode == CPUID_VMWARE_FEATURES_ECX_VMMCALL)
@@ -420,7 +420,8 @@ static void __init vmware_platform_setup(void)
vmware_tsc_khz = tsc_khz;
tsc_register_calibration_routines(vmware_get_tsc_khz,
- vmware_get_tsc_khz);
+ vmware_get_tsc_khz,
+ TSC_FREQ_KNOWN_AND_RELIABLE);
#ifdef CONFIG_X86_LOCAL_APIC
/* Skip lapic calibration since we know the bus frequency. */
diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c
index db8f31fdb480..1bdc9ab321e0 100644
--- a/arch/x86/kernel/jailhouse.c
+++ b/arch/x86/kernel/jailhouse.c
@@ -219,7 +219,8 @@ static void __init jailhouse_init_platform(void)
machine_ops.emergency_restart = jailhouse_no_restart;
- tsc_register_calibration_routines(jailhouse_get_tsc, jailhouse_get_tsc);
+ tsc_register_calibration_routines(jailhouse_get_tsc, jailhouse_get_tsc,
+ TSC_FREQUENCY_KNOWN);
while (pa_data) {
mapping = early_memremap(pa_data, sizeof(header));
@@ -257,7 +258,6 @@ static void __init jailhouse_init_platform(void)
pr_debug("Jailhouse: PM-Timer IO Port: %#x\n", pmtmr_ioport);
precalibrated_tsc_khz = setup_data.v1.tsc_khz;
- setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
pci_probe = 0;
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 9b3d1ed1a96d..b6b2018c51db 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -200,7 +200,6 @@ void kvmclock_cpu_action(enum kvm_guest_cpu_action action)
*/
static unsigned long kvm_get_tsc_khz(void)
{
- setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
return pvclock_tsc_khz(this_cpu_pvti());
}
@@ -404,7 +403,8 @@ void __init kvmclock_init(void)
kvm_sched_clock_init(stable);
- tsc_register_calibration_routines(kvm_get_tsc_khz, kvm_get_tsc_khz);
+ tsc_register_calibration_routines(kvm_get_tsc_khz, kvm_get_tsc_khz,
+ TSC_FREQUENCY_KNOWN);
x86_platform.get_wallclock = kvm_get_wallclock;
x86_platform.set_wallclock = kvm_set_wallclock;
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index ac4abfec1f05..98bef1d06fa9 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1311,11 +1311,17 @@ static void __init check_system_tsc_reliable(void)
*/
#if defined(CONFIG_HYPERVISOR_GUEST) || defined(CONFIG_AMD_MEM_ENCRYPT)
void tsc_register_calibration_routines(unsigned long (*calibrate_tsc)(void),
- unsigned long (*calibrate_cpu)(void))
+ unsigned long (*calibrate_cpu)(void),
+ enum tsc_properties properties)
{
if (WARN_ON_ONCE(!calibrate_tsc))
return;
+ if (properties & TSC_FREQUENCY_KNOWN)
+ setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
+ if (properties & TSC_RELIABLE)
+ setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
+
x86_platform.calibrate_tsc = calibrate_tsc;
if (calibrate_cpu)
x86_platform.calibrate_cpu = calibrate_cpu;
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index f087bb76457d..c04548641558 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -43,7 +43,6 @@ static unsigned long xen_tsc_khz(void)
struct pvclock_vcpu_time_info *info =
&HYPERVISOR_shared_info->vcpu_info[0].time;
- setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
return pvclock_tsc_khz(info);
}
@@ -574,7 +573,8 @@ static void __init xen_init_time_common(void)
*/
paravirt_set_sched_clock(xen_sched_clock, NULL, NULL);
- tsc_register_calibration_routines(xen_tsc_khz, NULL);
+ tsc_register_calibration_routines(xen_tsc_khz, NULL,
+ TSC_FREQUENCY_KNOWN);
x86_platform.get_wallclock = xen_get_wallclock;
}
--
2.54.0.563.g4f69b47b94-goog
^ permalink raw reply related
* [PATCH v3 30/41] x86/paravirt: Don't use a PV sched_clock in CoCo guests with trusted TSC
From: Sean Christopherson @ 2026-05-15 19:19 UTC (permalink / raw)
To: Kiryl Shutsemau, Paolo Bonzini, Sean Christopherson,
K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
Ajay Kaher, Alexey Makhalov, Jan Kiszka, Dave Hansen,
Andy Lutomirski, Peter Zijlstra, Juergen Gross, Daniel Lezcano,
Thomas Gleixner, John Stultz
Cc: Rick Edgecombe, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, x86, linux-coco, kvm, linux-hyperv, virtualization,
linux-kernel, xen-devel, Michael Kelley, Tom Lendacky,
Nikunj A Dadhania, Thomas Gleixner, David Woodhouse
In-Reply-To: <20260515191942.1892718-1-seanjc@google.com>
Silently ignore attempts to switch to a paravirt sched_clock when running
as a CoCo guest with trusted TSC. In hand-wavy theory, a misbehaving
hypervisor could attack the guest by manipulating the PV clock to affect
guest scheduling in some weird and/or predictable way. More importantly,
reading TSC on such platforms is faster than any PV clock, and sched_clock
is all about speed.
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/kernel/tsc.c | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 3c15fc10e501..ac4abfec1f05 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -283,6 +283,15 @@ bool using_native_sched_clock(void)
int __init __paravirt_set_sched_clock(u64 (*func)(void), bool stable,
void (*save)(void), void (*restore)(void))
{
+ /*
+ * Don't replace TSC with a PV clock when running as a CoCo guest and
+ * the TSC is secure/trusted; PV clocks are emulated by the hypervisor,
+ * which isn't in the guest's TCB.
+ */
+ if (cc_platform_has(CC_ATTR_GUEST_SNP_SECURE_TSC) ||
+ boot_cpu_has(X86_FEATURE_TDX_GUEST))
+ return -EPERM;
+
if (!stable)
clear_sched_clock_stable();
--
2.54.0.563.g4f69b47b94-goog
^ permalink raw reply related
* [PATCH v3 29/41] x86/paravirt: Plumb a return code into __paravirt_set_sched_clock()
From: Sean Christopherson @ 2026-05-15 19:19 UTC (permalink / raw)
To: Kiryl Shutsemau, Paolo Bonzini, Sean Christopherson,
K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
Ajay Kaher, Alexey Makhalov, Jan Kiszka, Dave Hansen,
Andy Lutomirski, Peter Zijlstra, Juergen Gross, Daniel Lezcano,
Thomas Gleixner, John Stultz
Cc: Rick Edgecombe, Vitaly Kuznetsov,
Broadcom internal kernel review list, Boris Ostrovsky,
Stephen Boyd, x86, linux-coco, kvm, linux-hyperv, virtualization,
linux-kernel, xen-devel, Michael Kelley, Tom Lendacky,
Nikunj A Dadhania, Thomas Gleixner, David Woodhouse
In-Reply-To: <20260515191942.1892718-1-seanjc@google.com>
Add a return code to __paravirt_set_sched_clock() so that the kernel can
reject attempts to use a PV sched_clock without breaking the caller. E.g.
when running as a CoCo VM with a secure TSC, using a PV clock is generally
undesirable.
Note, kvmclock is the only PV clock that does anything "extra" beyond
simply registering itself as sched_clock, i.e. is the only caller that
needs to check the new return value.
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/include/asm/timer.h | 6 +++---
arch/x86/kernel/kvmclock.c | 8 +++++---
arch/x86/kernel/tsc.c | 5 +++--
3 files changed, 11 insertions(+), 8 deletions(-)
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index 96ae7feac47c..ca5c95d48c03 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -14,14 +14,14 @@ extern int no_timer_check;
extern bool using_native_sched_clock(void);
#ifdef CONFIG_PARAVIRT
-void __init __paravirt_set_sched_clock(u64 (*func)(void), bool stable,
- void (*save)(void), void (*restore)(void));
+int __init __paravirt_set_sched_clock(u64 (*func)(void), bool stable,
+ void (*save)(void), void (*restore)(void));
static __always_inline void paravirt_set_sched_clock(u64 (*func)(void),
void (*save)(void),
void (*restore)(void))
{
- __paravirt_set_sched_clock(func, true, save, restore);
+ (void)__paravirt_set_sched_clock(func, true, save, restore);
}
#endif
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index d3bb281c0805..9b3d1ed1a96d 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -338,10 +338,12 @@ static int kvmclock_setup_percpu(unsigned int cpu)
static __init void kvm_sched_clock_init(bool stable)
{
+ if (__paravirt_set_sched_clock(kvm_sched_clock_read, stable,
+ kvm_save_sched_clock_state,
+ kvm_restore_sched_clock_state))
+ return;
+
kvm_sched_clock_offset = kvm_clock_read();
- __paravirt_set_sched_clock(kvm_sched_clock_read, stable,
- kvm_save_sched_clock_state,
- kvm_restore_sched_clock_state);
kvmclock_is_sched_clock = true;
/*
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 4a48b8ba5bea..3c15fc10e501 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -280,8 +280,8 @@ bool using_native_sched_clock(void)
return static_call_query(pv_sched_clock) == native_sched_clock;
}
-void __init __paravirt_set_sched_clock(u64 (*func)(void), bool stable,
- void (*save)(void), void (*restore)(void))
+int __init __paravirt_set_sched_clock(u64 (*func)(void), bool stable,
+ void (*save)(void), void (*restore)(void))
{
if (!stable)
clear_sched_clock_stable();
@@ -289,6 +289,7 @@ void __init __paravirt_set_sched_clock(u64 (*func)(void), bool stable,
static_call_update(pv_sched_clock, func);
x86_platform.save_sched_clock_state = save;
x86_platform.restore_sched_clock_state = restore;
+ return 0;
}
#else
u64 sched_clock_noinstr(void) __attribute__((alias("native_sched_clock")));
--
2.54.0.563.g4f69b47b94-goog
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox