Linux Confidential Computing Development
 help / color / mirror / Atom feed
* Re: [PATCH v14 19/44] arm64: RMI: Allocate/free RECs to match vCPUs
From: Wei-Lin Chang @ 2026-05-26 22:39 UTC (permalink / raw)
  To: Steven Price, kvm, kvmarm
  Cc: Catalin Marinas, Marc Zyngier, Will Deacon, James Morse,
	Oliver Upton, Suzuki K Poulose, Zenghui Yu, linux-arm-kernel,
	linux-kernel, Joey Gouly, Alexandru Elisei, Christoffer Dall,
	Fuad Tabba, linux-coco, Ganapatrao Kulkarni, Gavin Shan,
	Shanker Donthineni, Alper Gun, Aneesh Kumar K . V, Emi Kisanuki,
	Vishal Annapurve, Lorenzo.Pieralisi2
In-Reply-To: <20260513131757.116630-20-steven.price@arm.com>

Hi,

On Wed, May 13, 2026 at 02:17:27PM +0100, Steven Price wrote:
> The RMM maintains a data structure known as the Realm Execution Context
> (or REC). It is similar to struct kvm_vcpu and tracks the state of the
> virtual CPUs. KVM must delegate memory and request the structures are
> created when vCPUs are created, and suitably tear down on destruction.
> 
> RECs may require additional pages (e.g. for storing larger register
> state for SVE). The RMM can request extra pages for this purpose using
> the Stateful RMI Operations (SRO) functionality to request pages during
> REC creation. These pages are then passed back to the host from the RMM
> ('reclaimed') when the REC is destroyed. The kernel tracking object
> (struct rmi_sro_state) is stored in the realm_rec structure to avoid
> memory allocation during the destruction path.
> 
> Note that only some of the register state for the REC can be set by KVM, the
> rest is defined by the RMM (zeroed). The register state then cannot be
> changed by KVM after the REC is created (except when the guest
> explicitly requests this e.g. by performing a PSCI call).
> 
> Signed-off-by: Steven Price <steven.price@arm.com>
> ---
> Changes since v13:
>  * Support SRO for REC creation/destruction instead of auxiliary
>    granules.
> Changes since v12:
>  * Use the new range-based delegation RMI.
> Changes since v11:
>  * Remove the KVM_ARM_VCPU_REC feature. User space no longer needs to
>    configure each VCPU separately, RECs are created on the first VCPU
>    run of the guest.
> Changes since v9:
>  * Size the aux_pages array according to the PAGE_SIZE of the host.
> Changes since v7:
>  * Add comment explaining the aux_pages array.
>  * Rename "undeleted_failed" variable to "should_free" to avoid a
>    confusing double negative.
> Changes since v6:
>  * Avoid reporting the KVM_ARM_VCPU_REC feature if the guest isn't a
>    realm guest.
>  * Support host page size being larger than RMM's granule size when
>    allocating/freeing aux granules.
> Changes since v5:
>  * Separate the concept of vcpu_is_rec() and
>    kvm_arm_vcpu_rec_finalized() by using the KVM_ARM_VCPU_REC feature as
>    the indication that the VCPU is a REC.
> Changes since v2:
>  * Free rec->run earlier in kvm_destroy_realm() and adapt to previous patches.
> ---
>  arch/arm64/include/asm/kvm_emulate.h |   2 +-
>  arch/arm64/include/asm/kvm_host.h    |   3 +
>  arch/arm64/include/asm/kvm_rmi.h     |  17 +++++
>  arch/arm64/kvm/arm.c                 |   6 ++
>  arch/arm64/kvm/reset.c               |   1 +
>  arch/arm64/kvm/rmi.c                 | 105 +++++++++++++++++++++++++++
>  6 files changed, 133 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
> index 82fd777bd9bb..2e69fe494716 100644
> --- a/arch/arm64/include/asm/kvm_emulate.h
> +++ b/arch/arm64/include/asm/kvm_emulate.h
> @@ -714,7 +714,7 @@ static inline bool kvm_realm_is_created(struct kvm *kvm)
>  
>  static inline bool vcpu_is_rec(const struct kvm_vcpu *vcpu)
>  {
> -	return false;
> +	return kvm_is_realm(vcpu->kvm);
>  }
>  
>  #endif /* __ARM64_KVM_EMULATE_H__ */
> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> index 3512696ed506..39b5de03d0fe 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -969,6 +969,9 @@ struct kvm_vcpu_arch {
>  
>  	/* Hyp-readable copy of kvm_vcpu::pid */
>  	pid_t pid;
> +
> +	/* Realm meta data */
> +	struct realm_rec rec;
>  };
>  
>  /*
> diff --git a/arch/arm64/include/asm/kvm_rmi.h b/arch/arm64/include/asm/kvm_rmi.h
> index 8bd743093ccf..d99bf4fc3c39 100644
> --- a/arch/arm64/include/asm/kvm_rmi.h
> +++ b/arch/arm64/include/asm/kvm_rmi.h
> @@ -59,6 +59,22 @@ struct realm {
>  	unsigned int ia_bits;
>  };
>  
> +/**
> + * struct realm_rec - Additional per VCPU data for a Realm
> + *
> + * @mpidr: MPIDR (Multiprocessor Affinity Register) value to identify this VCPU
> + * @rec_page: Kernel VA of the RMM's private page for this REC
> + * @aux_pages: Additional pages private to the RMM for this REC
> + * @run: Kernel VA of the RmiRecRun structure shared with the RMM
> + * @sro: A preallocated SRO state context
> + */
> +struct realm_rec {
> +	unsigned long mpidr;
> +	void *rec_page;
> +	struct rec_run *run;
> +	struct rmi_sro_state *sro;
> +};
> +
>  void kvm_init_rmi(void);
>  u32 kvm_realm_ipa_limit(void);
>  
> @@ -66,6 +82,7 @@ int kvm_init_realm(struct kvm *kvm);
>  int kvm_activate_realm(struct kvm *kvm);
>  void kvm_destroy_realm(struct kvm *kvm);
>  void kvm_realm_destroy_rtts(struct kvm *kvm);
> +void kvm_destroy_rec(struct kvm_vcpu *vcpu);
>  
>  static inline bool kvm_realm_is_private_address(struct realm *realm,
>  						unsigned long addr)
> diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
> index eb2b61fe1f0a..93d34762db91 100644
> --- a/arch/arm64/kvm/arm.c
> +++ b/arch/arm64/kvm/arm.c
> @@ -586,6 +586,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
>  	/* Force users to call KVM_ARM_VCPU_INIT */
>  	vcpu_clear_flag(vcpu, VCPU_INITIALIZED);
>  
> +	vcpu->arch.rec.mpidr = INVALID_HWID;
> +
>  	vcpu->arch.mmu_page_cache.gfp_zero = __GFP_ZERO;
>  
>  	/* Set up the timer */
> @@ -1651,6 +1653,10 @@ static int kvm_vcpu_init_check_features(struct kvm_vcpu *vcpu,
>  	if (test_bit(KVM_ARM_VCPU_HAS_EL2, &features))
>  		return -EINVAL;
>  
> +	/* Realms are incompatible with AArch32 */
> +	if (vcpu_is_rec(vcpu))
> +		return -EINVAL;
> +
>  	return 0;
>  }
>  
> diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
> index b963fd975aac..c18cdca7d125 100644
> --- a/arch/arm64/kvm/reset.c
> +++ b/arch/arm64/kvm/reset.c
> @@ -161,6 +161,7 @@ void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu)
>  	free_page((unsigned long)vcpu->arch.ctxt.vncr_array);
>  	kfree(vcpu->arch.vncr_tlb);
>  	kfree(vcpu->arch.ccsidr);
> +	kvm_destroy_rec(vcpu);
>  }
>  
>  static void kvm_vcpu_reset_sve(struct kvm_vcpu *vcpu)
> diff --git a/arch/arm64/kvm/rmi.c b/arch/arm64/kvm/rmi.c
> index 849111817af7..353a5ca45e78 100644
> --- a/arch/arm64/kvm/rmi.c
> +++ b/arch/arm64/kvm/rmi.c
> @@ -173,9 +173,108 @@ static int realm_ensure_created(struct kvm *kvm)
>  	return -ENXIO;
>  }
>  
> +static int kvm_create_rec(struct kvm_vcpu *vcpu)
> +{
> +	struct user_pt_regs *vcpu_regs = vcpu_gp_regs(vcpu);
> +	unsigned long mpidr = kvm_vcpu_get_mpidr_aff(vcpu);
> +	struct realm *realm = &vcpu->kvm->arch.realm;
> +	struct realm_rec *rec = &vcpu->arch.rec;
> +	unsigned long rec_page_phys;
> +	struct rec_params *params;
> +	int r, i;
> +
> +	if (rec->run)
> +		return -EBUSY;
> +
> +	/*
> +	 * The RMM will report PSCI v1.0 to Realms and the KVM_ARM_VCPU_PSCI_0_2
> +	 * flag covers v0.2 and onwards.
> +	 */
> +	if (!vcpu_has_feature(vcpu, KVM_ARM_VCPU_PSCI_0_2))
> +		return -EINVAL;
> +
> +	BUILD_BUG_ON(sizeof(*params) > PAGE_SIZE);
> +	BUILD_BUG_ON(sizeof(*rec->run) > PAGE_SIZE);
> +
> +	params = (struct rec_params *)get_zeroed_page(GFP_KERNEL);
> +	rec->rec_page = (void *)__get_free_page(GFP_KERNEL);
> +	rec->run = (void *)get_zeroed_page(GFP_KERNEL);

Should this be cast to (struct rec_run *) ?

> +	rec->sro = kmalloc_obj(*rec->sro);
> +	if (!params || !rec->rec_page || !rec->run || !rec->sro) {
> +		r = -ENOMEM;
> +		goto out_free_pages;
> +	}
> +
> +	for (i = 0; i < ARRAY_SIZE(params->gprs); i++)
> +		params->gprs[i] = vcpu_regs->regs[i];
> +
> +	params->pc = vcpu_regs->pc;
> +
> +	if (vcpu->vcpu_id == 0)
> +		params->flags |= REC_PARAMS_FLAG_RUNNABLE;
> +
> +	rec_page_phys = virt_to_phys(rec->rec_page);
> +
> +	if (rmi_delegate_page(rec_page_phys)) {
> +		r = -ENXIO;
> +		goto out_free_pages;
> +	}
> +
> +	params->mpidr = mpidr;
> +
> +	if (rmi_rec_create(virt_to_phys(realm->rd), rec_page_phys,
> +			   virt_to_phys(params), rec->sro)) {
> +		r = -ENXIO;
> +		goto out_undelegate_rmm_rec;
> +	}
> +
> +	rec->mpidr = mpidr;
> +
> +	free_page((unsigned long)params);
> +	return 0;
> +
> +out_undelegate_rmm_rec:
> +	if (WARN_ON(rmi_undelegate_page(rec_page_phys)))
> +		rec->rec_page = NULL;
> +out_free_pages:
> +	free_page((unsigned long)rec->run);
> +	free_page((unsigned long)rec->rec_page);
> +	free_page((unsigned long)params);
> +	kfree(rec->sro);
> +	rec->run = NULL;
> +	return r;
> +}
> +

[...]

Thanks,
Wei-Lin Chang

^ permalink raw reply

* Re: [PATCH v14 17/44] arm64: RMI: RTT tear down
From: Wei-Lin Chang @ 2026-05-26 22:32 UTC (permalink / raw)
  To: Steven Price, kvm, kvmarm
  Cc: Catalin Marinas, Marc Zyngier, Will Deacon, James Morse,
	Oliver Upton, Suzuki K Poulose, Zenghui Yu, linux-arm-kernel,
	linux-kernel, Joey Gouly, Alexandru Elisei, Christoffer Dall,
	Fuad Tabba, linux-coco, Ganapatrao Kulkarni, Gavin Shan,
	Shanker Donthineni, Alper Gun, Aneesh Kumar K . V, Emi Kisanuki,
	Vishal Annapurve, Lorenzo.Pieralisi2
In-Reply-To: <20260513131757.116630-18-steven.price@arm.com>

Hi,

On Wed, May 13, 2026 at 02:17:25PM +0100, Steven Price wrote:
> The RMM owns the stage 2 page tables for a realm, and KVM must request
> that the RMM creates/destroys entries as necessary. The physical pages
> to store the page tables are delegated to the realm as required, and can
> be undelegated when no longer used.
> 
> Creating new RTTs is the easy part, tearing down is a little more
> tricky. The result of realm_rtt_destroy() can be used to effectively
> walk the tree and destroy the entries (undelegating pages that were
> given to the realm).
> 
> Signed-off-by: Steven Price <steven.price@arm.com>
> ---
> Changes since v13:
>  * Avoid the double call of kvm_free_stage2_pgd() by splitting the work
>    across that and a new function kvm_realm_uninit_stage2() which is
>    only called for realm guests.
> Changes since v12:
>  * Simplify some functions now we know RMM page size is the same as the
>    host's.
> Changes since v11:
>  * Moved some code from earlier in the series to this one so that it's
>    added when it's first used.
> Changes since v10:
>  * RME->RMI rename.
>  * Some code to handle freeing stage 2 PGD moved into this patch where
>    it belongs.
> Changes since v9:
>  * Add a comment clarifying that root level RTTs are not destroyed until
>    after the RD is destroyed.
> Changes since v8:
>  * Introduce free_rtt() wrapper which calls free_delegated_granule()
>    followed by kvm_account_pgtable_pages(). This makes it clear where an
>    RTT is being freed rather than just a delegated granule.
> Changes since v6:
>  * Move rme_rtt_level_mapsize() and supporting defines from kvm_rme.h
>    into rme.c as they are only used in that file.
> Changes since v5:
>  * Rename some RME_xxx defines to do with page sizes as RMM_xxx - they are
>    a property of the RMM specification not the RME architecture.
> Changes since v2:
>  * Moved {alloc,free}_delegated_page() and ensure_spare_page() to a
>    later patch when they are actually used.
>  * Some simplifications now rmi_xxx() functions allow NULL as an output
>    parameter.
>  * Improved comments and code layout.
> ---
>  arch/arm64/include/asm/kvm_rmi.h |   7 ++
>  arch/arm64/kvm/mmu.c             |  21 ++++-
>  arch/arm64/kvm/rmi.c             | 148 +++++++++++++++++++++++++++++++
>  3 files changed, 174 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/kvm_rmi.h b/arch/arm64/include/asm/kvm_rmi.h
> index 9de34983ee52..06ba0d4745c6 100644
> --- a/arch/arm64/include/asm/kvm_rmi.h
> +++ b/arch/arm64/include/asm/kvm_rmi.h
> @@ -64,5 +64,12 @@ u32 kvm_realm_ipa_limit(void);
>  
>  int kvm_init_realm(struct kvm *kvm);
>  void kvm_destroy_realm(struct kvm *kvm);
> +void kvm_realm_destroy_rtts(struct kvm *kvm);
> +
> +static inline bool kvm_realm_is_private_address(struct realm *realm,
> +						unsigned long addr)
> +{
> +	return !(addr & BIT(realm->ia_bits - 1));
> +}
>  
>  #endif /* __ASM_KVM_RMI_H */
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index ba8286472286..eb56d4e7f21a 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -1024,9 +1024,26 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t
>  	return err;
>  }
>  
> +static void kvm_realm_uninit_stage2(struct kvm_s2_mmu *mmu)
> +{
> +	struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
> +	struct realm *realm = &kvm->arch.realm;
> +
> +	if (kvm_realm_state(kvm) != REALM_STATE_ACTIVE)
> +		return;
> +
> +	write_lock(&kvm->mmu_lock);
> +	kvm_stage2_unmap_range(mmu, 0, BIT(realm->ia_bits - 1), true);
> +	write_unlock(&kvm->mmu_lock);
> +	kvm_realm_destroy_rtts(kvm);
> +}
> +
>  void kvm_uninit_stage2_mmu(struct kvm *kvm)
>  {
> -	kvm_free_stage2_pgd(&kvm->arch.mmu);
> +	if (kvm_is_realm(kvm))
> +		kvm_realm_uninit_stage2(&kvm->arch.mmu);
> +	else
> +		kvm_free_stage2_pgd(&kvm->arch.mmu);
>  	kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache);
>  }
>  
> @@ -1103,7 +1120,7 @@ void stage2_unmap_vm(struct kvm *kvm)
>  void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
>  {
>  	struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
> -	struct kvm_pgtable *pgt = NULL;
> +	struct kvm_pgtable *pgt;
>  
>  	write_lock(&kvm->mmu_lock);
>  	pgt = mmu->pgt;
> diff --git a/arch/arm64/kvm/rmi.c b/arch/arm64/kvm/rmi.c
> index f51ec667445e..5b00ccca4af3 100644
> --- a/arch/arm64/kvm/rmi.c
> +++ b/arch/arm64/kvm/rmi.c
> @@ -11,6 +11,14 @@
>  #include <asm/rmi_cmds.h>
>  #include <asm/virt.h>
>  
> +static inline unsigned long rmi_rtt_level_mapsize(int level)
> +{
> +	if (WARN_ON(level > KVM_PGTABLE_LAST_LEVEL))
> +		return PAGE_SIZE;
> +
> +	return (1UL << ARM64_HW_PGTABLE_LEVEL_SHIFT(level));
> +}
> +
>  static bool rmi_has_feature(unsigned long feature)
>  {
>  	return !!u64_get_bits(rmm_feat_reg0, feature);
> @@ -21,6 +29,144 @@ u32 kvm_realm_ipa_limit(void)
>  	return u64_get_bits(rmm_feat_reg0, RMI_FEATURE_REGISTER_0_S2SZ);
>  }
>  
> +static int get_start_level(struct realm *realm)
> +{
> +	return 4 - stage2_pgtable_levels(realm->ia_bits);
> +}
> +
> +static void free_rtt(phys_addr_t phys)
> +{
> +	if (free_delegated_page(phys))
> +		return;
> +
> +	kvm_account_pgtable_pages(phys_to_virt(phys), -1);
> +}
> +
> +/*
> + * realm_rtt_destroy - Destroy an RTT at @level for @addr.
> + *
> + * Returns - Result of the RMI_RTT_DESTROY call, and:
> + * @rtt_granule:	RTT granule, if the RTT was destroyed.
> + * @next_addr:		IPA corresponding to the next possible valid entry we
> + *			can target
> + */
> +static int realm_rtt_destroy(struct realm *realm, unsigned long addr,
> +			     int level, phys_addr_t *rtt_granule,
> +			     unsigned long *next_addr)
> +{
> +	unsigned long out_rtt;
> +	int ret;
> +
> +	ret = rmi_rtt_destroy(virt_to_phys(realm->rd), addr, level,
> +			      &out_rtt, next_addr);
> +
> +	*rtt_granule = out_rtt;
> +
> +	return ret;
> +}

Looks like out_rtt can be simplified out.

[...]

Thanks,
Wei-Lin Chang

^ permalink raw reply

* Re: [PATCH v14 17/44] arm64: RMI: RTT tear down
From: Wei-Lin Chang @ 2026-05-26 22:27 UTC (permalink / raw)
  To: Steven Price, kvm, kvmarm
  Cc: Catalin Marinas, Marc Zyngier, Will Deacon, James Morse,
	Oliver Upton, Suzuki K Poulose, Zenghui Yu, linux-arm-kernel,
	linux-kernel, Joey Gouly, Alexandru Elisei, Christoffer Dall,
	Fuad Tabba, linux-coco, Ganapatrao Kulkarni, Gavin Shan,
	Shanker Donthineni, Alper Gun, Aneesh Kumar K . V, Emi Kisanuki,
	Vishal Annapurve, Lorenzo.Pieralisi2
In-Reply-To: <20260513131757.116630-18-steven.price@arm.com>

Hi,

On Wed, May 13, 2026 at 02:17:25PM +0100, Steven Price wrote:
> The RMM owns the stage 2 page tables for a realm, and KVM must request
> that the RMM creates/destroys entries as necessary. The physical pages
> to store the page tables are delegated to the realm as required, and can
> be undelegated when no longer used.
> 
> Creating new RTTs is the easy part, tearing down is a little more
> tricky. The result of realm_rtt_destroy() can be used to effectively
> walk the tree and destroy the entries (undelegating pages that were
> given to the realm).
> 
> Signed-off-by: Steven Price <steven.price@arm.com>
> ---
> Changes since v13:
>  * Avoid the double call of kvm_free_stage2_pgd() by splitting the work
>    across that and a new function kvm_realm_uninit_stage2() which is
>    only called for realm guests.
> Changes since v12:
>  * Simplify some functions now we know RMM page size is the same as the
>    host's.
> Changes since v11:
>  * Moved some code from earlier in the series to this one so that it's
>    added when it's first used.
> Changes since v10:
>  * RME->RMI rename.
>  * Some code to handle freeing stage 2 PGD moved into this patch where
>    it belongs.
> Changes since v9:
>  * Add a comment clarifying that root level RTTs are not destroyed until
>    after the RD is destroyed.
> Changes since v8:
>  * Introduce free_rtt() wrapper which calls free_delegated_granule()
>    followed by kvm_account_pgtable_pages(). This makes it clear where an
>    RTT is being freed rather than just a delegated granule.
> Changes since v6:
>  * Move rme_rtt_level_mapsize() and supporting defines from kvm_rme.h
>    into rme.c as they are only used in that file.
> Changes since v5:
>  * Rename some RME_xxx defines to do with page sizes as RMM_xxx - they are
>    a property of the RMM specification not the RME architecture.
> Changes since v2:
>  * Moved {alloc,free}_delegated_page() and ensure_spare_page() to a
>    later patch when they are actually used.
>  * Some simplifications now rmi_xxx() functions allow NULL as an output
>    parameter.
>  * Improved comments and code layout.
> ---
>  arch/arm64/include/asm/kvm_rmi.h |   7 ++
>  arch/arm64/kvm/mmu.c             |  21 ++++-
>  arch/arm64/kvm/rmi.c             | 148 +++++++++++++++++++++++++++++++
>  3 files changed, 174 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/kvm_rmi.h b/arch/arm64/include/asm/kvm_rmi.h
> index 9de34983ee52..06ba0d4745c6 100644
> --- a/arch/arm64/include/asm/kvm_rmi.h
> +++ b/arch/arm64/include/asm/kvm_rmi.h
> @@ -64,5 +64,12 @@ u32 kvm_realm_ipa_limit(void);
>  
>  int kvm_init_realm(struct kvm *kvm);
>  void kvm_destroy_realm(struct kvm *kvm);
> +void kvm_realm_destroy_rtts(struct kvm *kvm);
> +
> +static inline bool kvm_realm_is_private_address(struct realm *realm,
> +						unsigned long addr)
> +{
> +	return !(addr & BIT(realm->ia_bits - 1));
> +}
>  
>  #endif /* __ASM_KVM_RMI_H */
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index ba8286472286..eb56d4e7f21a 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -1024,9 +1024,26 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t
>  	return err;
>  }
>  
> +static void kvm_realm_uninit_stage2(struct kvm_s2_mmu *mmu)
> +{
> +	struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
> +	struct realm *realm = &kvm->arch.realm;
> +
> +	if (kvm_realm_state(kvm) != REALM_STATE_ACTIVE)
> +		return;
> +
> +	write_lock(&kvm->mmu_lock);
> +	kvm_stage2_unmap_range(mmu, 0, BIT(realm->ia_bits - 1), true);
> +	write_unlock(&kvm->mmu_lock);
> +	kvm_realm_destroy_rtts(kvm);
> +}
> +
>  void kvm_uninit_stage2_mmu(struct kvm *kvm)
>  {
> -	kvm_free_stage2_pgd(&kvm->arch.mmu);
> +	if (kvm_is_realm(kvm))
> +		kvm_realm_uninit_stage2(&kvm->arch.mmu);
> +	else
> +		kvm_free_stage2_pgd(&kvm->arch.mmu);
>  	kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache);
>  }
>  
> @@ -1103,7 +1120,7 @@ void stage2_unmap_vm(struct kvm *kvm)
>  void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
>  {
>  	struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
> -	struct kvm_pgtable *pgt = NULL;
> +	struct kvm_pgtable *pgt;

Is this included by accident?

>  
>  	write_lock(&kvm->mmu_lock);
>  	pgt = mmu->pgt;

[...]

Thanks,
Wei-Lin Chang

^ permalink raw reply

* Re: [PATCH v14 13/44] arm64: RMI: Define the user ABI
From: Wei-Lin Chang @ 2026-05-26 22:17 UTC (permalink / raw)
  To: Steven Price, kvm, kvmarm
  Cc: Catalin Marinas, Marc Zyngier, Will Deacon, James Morse,
	Oliver Upton, Suzuki K Poulose, Zenghui Yu, linux-arm-kernel,
	linux-kernel, Joey Gouly, Alexandru Elisei, Christoffer Dall,
	Fuad Tabba, linux-coco, Ganapatrao Kulkarni, Gavin Shan,
	Shanker Donthineni, Alper Gun, Aneesh Kumar K . V, Emi Kisanuki,
	Vishal Annapurve, Lorenzo.Pieralisi2
In-Reply-To: <20260513131757.116630-14-steven.price@arm.com>

On Wed, May 13, 2026 at 02:17:21PM +0100, Steven Price wrote:
> There is one CAP which identifies the presence of CCA, and one ioctl.
> The ioctl is used to populate memory during creation of the realm as
> this requires the RMM to copy data from an unprotected address to the
> protected memory - CCA does not support memory conversion where the
> memory contents are preserved as this is incompatible with memory
> encryption.

Nit:
I believe spelling out the CAP and ioctl names can improve the commit
message. Also "memory conversion" is a little vague, maybe

... CCA does not support shared <-> private memory conversion where ...

would make this clearer?

Thanks,
Wei-Lin Chang

> 
> Signed-off-by: Steven Price <steven.price@arm.com>
> ---
> Changes since v13:
>  * KVM_ARM_VCPU_RMI_PSCI_COMPLETE removed.
>  * KVM_ARM_RMI_POPULATE documentation updated to reflect that the
>    structure is written by the kernel.
>  * CAP number bumped.
> Changes since v12:
>  * Change KVM_ARM_RMI_POPULATE to update the structure with the amount
>    that has been progressed rather than return the number of bytes
>    populated.
>  * Describe the flag KVM_ARM_RMI_POPULATE_FLAGS_MEASURE.
>  * CAP number is bumped.
>  * NOTE: The PSCI ioctl may be removed in a future spec release.
> Changes since v11:
>  * Completely reworked to be more implicit. Rather than having explicit
>    CAP operations to progress the realm construction these operations
>    are done when needed (on populating and on first vCPU run).
>  * Populate and PSCI complete are promoted to proper ioctls.
> Changes since v10:
>  * Rename symbols from RME to RMI.
> Changes since v9:
>  * Improvements to documentation.
>  * Bump the magic number for KVM_CAP_ARM_RME to avoid conflicts.
> Changes since v8:
>  * Minor improvements to documentation following review.
>  * Bump the magic numbers to avoid conflicts.
> Changes since v7:
>  * Add documentation of new ioctls
>  * Bump the magic numbers to avoid conflicts
> Changes since v6:
>  * Rename some of the symbols to make their usage clearer and avoid
>    repetition.
> Changes from v5:
>  * Actually expose the new VCPU capability (KVM_ARM_VCPU_REC) by bumping
>    KVM_VCPU_MAX_FEATURES - note this also exposes KVM_ARM_VCPU_HAS_EL2!
> ---
>  Documentation/virt/kvm/api.rst | 40 ++++++++++++++++++++++++++++++++++
>  include/uapi/linux/kvm.h       | 13 +++++++++++
>  2 files changed, 53 insertions(+)
> 
> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
> index 52bbbb553ce1..ca68aae7faa2 100644
> --- a/Documentation/virt/kvm/api.rst
> +++ b/Documentation/virt/kvm/api.rst
> @@ -6553,6 +6553,37 @@ KVM_S390_KEYOP_SSKE
>    Sets the storage key for the guest address ``guest_addr`` to the key
>    specified in ``key``, returning the previous value in ``key``.
>  
> +4.145 KVM_ARM_RMI_POPULATE
> +--------------------------
> +
> +:Capability: KVM_CAP_ARM_RMI
> +:Architectures: arm64
> +:Type: vm ioctl
> +:Parameters: struct kvm_arm_rmi_populate (in/out)
> +:Returns: 0 on success, < 0 on error
> +
> +::
> +
> +  struct kvm_arm_rmi_populate {
> +	__u64 base;
> +	__u64 size;
> +	__u64 source_uaddr;
> +	__u32 flags;
> +	__u32 reserved;
> +  };
> +
> +Populate a region of protected address space by copying the data from the
> +(non-protected) user space pointer provided into a protected region (backed by
> +guestmem_fd). It implicitly sets the destination region to RIPAS RAM. This is
> +only valid before any VCPUs have been run. The ioctl might not populate the
> +entire region and in this case the kernel updates the fields `base`, `size` and
> +`source_uaddr`. User space may have to repeatedly call it until `size` is 0 to
> +populate the entire region.
> +
> +`flags` can be set to `KVM_ARM_RMI_POPULATE_FLAGS_MEASURE` to request that the
> +populated data is hashed and added to the guest's Realm Initial Measurement
> +(RIM).
> +
>  .. _kvm_run:
>  
>  5. The kvm_run structure
> @@ -8904,6 +8935,15 @@ helpful if user space wants to emulate instructions which are not
>  This capability can be enabled dynamically even if VCPUs were already
>  created and are running.
>  
> +7.47 KVM_CAP_ARM_RMI
> +--------------------
> +
> +:Architectures: arm64
> +:Target: VM
> +:Parameters: None
> +
> +This capability indicates that support for CCA realms is available.
> +
>  8. Other capabilities.
>  ======================
>  
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 6c8afa2047bf..b8cff0938041 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -996,6 +996,7 @@ struct kvm_enable_cap {
>  #define KVM_CAP_S390_USER_OPEREXEC 246
>  #define KVM_CAP_S390_KEYOP 247
>  #define KVM_CAP_S390_VSIE_ESAMODE 248
> +#define KVM_CAP_ARM_RMI 249
>  
>  struct kvm_irq_routing_irqchip {
>  	__u32 irqchip;
> @@ -1669,4 +1670,16 @@ struct kvm_pre_fault_memory {
>  	__u64 padding[5];
>  };
>  
> +/* Available with KVM_CAP_ARM_RMI, only for VMs with KVM_VM_TYPE_ARM_REALM */
> +#define KVM_ARM_RMI_POPULATE	_IOWR(KVMIO, 0xd7, struct kvm_arm_rmi_populate)
> +#define KVM_ARM_RMI_POPULATE_FLAGS_MEASURE	(1 << 0)
> +
> +struct kvm_arm_rmi_populate {
> +	__u64 base;
> +	__u64 size;
> +	__u64 source_uaddr;
> +	__u32 flags;
> +	__u32 reserved;
> +};
> +
>  #endif /* __LINUX_KVM_H */
> -- 
> 2.43.0
> 

^ permalink raw reply

* Re: [PATCH v2 0/4] struct page to PFN conversion for TDX guest private memory
From: Dave Hansen @ 2026-05-26 20:00 UTC (permalink / raw)
  To: Sean Christopherson, Yan Zhao
  Cc: dave.hansen, pbonzini, tglx, mingo, bp, kas, x86, linux-kernel,
	kvm, linux-coco, kai.huang, rick.p.edgecombe, yilun.xu,
	vannapurve, ackerleytng, sagis, binbin.wu, xiaoyao.li,
	isaku.yamahata
In-Reply-To: <ahX2dcHAQgwuiJBC@google.com>

On 5/26/26 12:37, Sean Christopherson wrote:
>> v2 is based on v7.1.0-rc1 + Sean's 4 cleanup patches (see details in
>> section "Base" below). The purpose is to get Dave's Ack, so Sean can take
>> it from the KVM x86 tree. The full stack of v2 is available at [14].
> Dave, any concerns?

These look fine to me. They make the code marginally cleaner and the
changelogs are much better at describing the problem now.

Going to Linus via the KVM route is fine with me:

Acked-by: Dave Hansen <dave.hansen@linux.intel.com>

^ permalink raw reply

* Re: [PATCH v2 0/4] struct page to PFN conversion for TDX guest private memory
From: Sean Christopherson @ 2026-05-26 19:37 UTC (permalink / raw)
  To: Yan Zhao
  Cc: dave.hansen, pbonzini, tglx, mingo, bp, kas, x86, linux-kernel,
	kvm, linux-coco, kai.huang, rick.p.edgecombe, yilun.xu,
	vannapurve, ackerleytng, sagis, binbin.wu, xiaoyao.li,
	isaku.yamahata
In-Reply-To: <20260430014852.24183-1-yan.y.zhao@intel.com>

On Thu, Apr 30, 2026, Yan Zhao wrote:
> Hi
> 
> This is v2 of the struct page to PFN conversion series, which converts TDX
> guest private memory mapping/unmapping APIs from taking struct page to
> taking PFN as input.
> 
> v2 is based on v7.1.0-rc1 + Sean's 4 cleanup patches (see details in
> section "Base" below). The purpose is to get Dave's Ack, so Sean can take
> it from the KVM x86 tree. The full stack of v2 is available at [14].

Dave, any concerns?

I'd like to get these into the KVM x86 tree sooner rather than later, so that we at
least have a fighting chance of landing the S-EPT cleanup (prep work for D-PAMT)
in 7.2.

^ permalink raw reply

* Re: [PATCH v2 0/5] guest_memfd fixes for bind and populate
From: Sean Christopherson @ 2026-05-26 16:55 UTC (permalink / raw)
  To: Ackerley Tng
  Cc: Paolo Bonzini, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H. Peter Anvin, Kiryl Shutsemau, Rick Edgecombe,
	Vishal Annapurve, Yan Zhao, Michael Roth, Isaku Yamahata,
	Chao Peng, Xiaoyao Li, Zongyao Chen, kvm, linux-kernel,
	linux-coco, Yu Zhang, Fuad Tabba
In-Reply-To: <20260522-fix-sev-gmem-post-populate-v2-0-3f196bfad5a1@google.com>

On Fri, May 22, 2026, Ackerley Tng wrote:
> This series is a group of fixes for the bind and populate flows for
> guest_memfd, and fixes some issues reported by Sashiko after reviewing the
> guest_memfd in-place conversions series [1] and another fixup series Sean
> posted [3].

In the future, please don't bundle unrelated changes.  The SNP-specific changes
are related and should be a series, but the signed integer thing and the lack of
error handling on xa_store_range() are completely unrelated, aside from the fact
that Sashiko kept complaining about pre-existing issues.

I totally understand why you bundled these together, but that obviously didn't
stop Sashiko from complaining about pre-existing issues, over and over.

Unnecessary bundling can lead to exactly what's happening here: the three SNP
changes are ready to go, but the two unrelated guest_memfd changes need new
versions.  Which isn't hard to deal with, but it's extra friction that is easily
avoided.

I'll apply the SNP changes, and send a new version of the signed vs. unsigned
issue.  Please send a new version of the xa_store_range() error handling (or
prove that I'm wrong).

Thanks!

^ permalink raw reply

* Re: [PATCH v2 5/5] KVM: SNP: Mark source page dirty in sev_gmem_post_populate
From: Sean Christopherson @ 2026-05-26 16:47 UTC (permalink / raw)
  To: Ackerley Tng
  Cc: Paolo Bonzini, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H. Peter Anvin, Kiryl Shutsemau, Rick Edgecombe,
	Vishal Annapurve, Yan Zhao, Michael Roth, Isaku Yamahata,
	Chao Peng, Xiaoyao Li, Zongyao Chen, kvm, linux-kernel,
	linux-coco, Yu Zhang, Fuad Tabba
In-Reply-To: <20260522-fix-sev-gmem-post-populate-v2-5-3f196bfad5a1@google.com>

On Fri, May 22, 2026, Ackerley Tng wrote:
> Mark the folio as dirty after copying data into the source page in
> sev_gmem_post_populate. After the memcpy, failing to mark the page dirty
> can lead to the memory management subsystem discarding the changes if the
> page is reclaimed or otherwise processed by the swap subsystem.
> 
> Fixes: 2a62345b3052 ("KVM: guest_memfd: GUP source pages prior to populating guest memory")
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>
> ---
>  arch/x86/kvm/svm/sev.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
> index dbf75326a40f4..1a361f08c7a3d 100644
> --- a/arch/x86/kvm/svm/sev.c
> +++ b/arch/x86/kvm/svm/sev.c
> @@ -2395,6 +2395,7 @@ static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
>  		void *dst_vaddr = kmap_local_pfn(pfn);
>  
>  		memcpy(src_vaddr, dst_vaddr, PAGE_SIZE);
> +		folio_mark_dirty(page_folio(src_page));

I'd rather use set_page_dirty().  I'll fixup when applying, unless someone objects.

>  		kunmap_local(dst_vaddr);
>  		kunmap_local(src_vaddr);
> 
> -- 
> 2.54.0.794.g4f17f83d09-goog
> 

^ permalink raw reply

* Re: [PATCH v6 06/11] x86/virt/tdx: Optimize tdx_pamt_get/put()
From: Edgecombe, Rick P @ 2026-05-26 16:42 UTC (permalink / raw)
  To: Gao, Chao
  Cc: kvm@vger.kernel.org, linux-coco@lists.linux.dev, Huang, Kai,
	Hansen, Dave, Zhao, Yan Y, kas@kernel.org, seanjc@google.com,
	mingo@redhat.com, linux-kernel@vger.kernel.org,
	pbonzini@redhat.com, nik.borisov@suse.com,
	linux-doc@vger.kernel.org, hpa@zytor.com, tglx@kernel.org,
	Annapurve, Vishal, bp@alien8.de, kirill.shutemov@linux.intel.com,
	x86@kernel.org
In-Reply-To: <ahVghgNAe4JrmlQH@intel.com>

On Tue, 2026-05-26 at 16:57 +0800, Chao Gao wrote:
> > -	scoped_guard(spinlock, &pamt_lock) {
> 
> This converts the scoped_guard() added by the previous patch to
> explicit lock/unlock and goto. It would reduce code churn if the
> previous patch used that form directly.

Yea, it's a good point. I actually debated doing it, but decided not to because
the scoped version is cleaner for the non-optimized version. But for
reviewability, never doing the scoped version is probably better.

^ permalink raw reply

* Re: [PATCH v2 3/5] KVM: guest_memfd: Handle errors from xa_store_range() when binding
From: Sean Christopherson @ 2026-05-26 16:39 UTC (permalink / raw)
  To: Ackerley Tng
  Cc: Paolo Bonzini, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H. Peter Anvin, Kiryl Shutsemau, Rick Edgecombe,
	Vishal Annapurve, Yan Zhao, Michael Roth, Isaku Yamahata,
	Chao Peng, Xiaoyao Li, Zongyao Chen, kvm, linux-kernel,
	linux-coco, Yu Zhang, Fuad Tabba
In-Reply-To: <20260522-fix-sev-gmem-post-populate-v2-3-3f196bfad5a1@google.com>

On Fri, May 22, 2026, Ackerley Tng wrote:
> Unhandled errors from xa_store_range() means kvm_gmem_bind() might falsely
> reporting success, leading to false assumptions in guest_memfd's lifecycle
> later.
> 
> On error, restore the unbound state and return the error to userspace.
> 
> Fixes: a7800aa80ea4d ("KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory")
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>
> ---
>  virt/kvm/guest_memfd.c | 11 +++++++++--
>  1 file changed, 9 insertions(+), 2 deletions(-)
> 
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index d203135969d13..5b4911ffa208a 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -648,6 +648,7 @@ int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
>  	struct inode *inode;
>  	struct file *file;
>  	int r = -EINVAL;
> +	void *result;

I would rather go with "xr".  "result" is too generic, e.g. begs the question of
"result of what?"

Actually, I don't think we even need an intermediate variable.

>  	BUILD_BUG_ON(sizeof(gfn_t) != sizeof(slot->gmem.pgoff));
>  
> @@ -688,7 +689,14 @@ int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
>  	if (kvm_gmem_supports_mmap(inode))
>  		slot->flags |= KVM_MEMSLOT_GMEM_ONLY;
>  
> -	xa_store_range(&f->bindings, start, end - 1, slot, GFP_KERNEL);
> +	result = xa_store_range(&f->bindings, start, end - 1, slot, GFP_KERNEL);
> +	if (xa_is_err(result)) {
> +		r = xa_err(result);
> +		xa_store_range(&f->bindings, start, end - 1, NULL, GFP_KERNEL);

I'm not convinced this is necessary.  Sashiko "asked" the question:

 : If xa_store_range() fails midway through storing a large range (for example,
 : returning -ENOMEM), does it leave the already-processed entries in the
 : f->bindings XArray?
 : 
 : When this error is propagated back, the caller __kvm_set_memory_region()
 : will abort the operation and free the memslot without calling
 : kvm_gmem_unbind().
 : 
 : Since the partial XArray updates aren't rolled back here, could this leave
 : dangling pointers to the freed memslot in f->bindings? If so, when the file
 : is eventually closed, kvm_gmem_release() might iterate over these dangling
 : pointers and write to slot->gmem.file, resulting in a use-after-free.

but I think Sashiko is hallucinating.

If @entry is non-NULL, xa_store_range() pre-creates the entire range, before
storing anything into the range:

		if (entry) {
			unsigned int order = BITS_PER_LONG;
			if (last + 1)
				order = __ffs(last + 1);
			xas_set_order(&xas, last, order);
			xas_create(&xas, true);
			if (xas_error(&xas))
				goto unlock;
		}

Yes, the API handles failure on the subsequent xas_store(), but I can't imagine
that failure is actually possible, barring garbage input from KVM:

		do {
			xas_set_range(&xas, first, last);
			xas_store(&xas, entry);
			if (xas_error(&xas))
				goto unlock;
			first += xas_size(&xas);
		} while (first <= last);

Purely from a design perspective, providing an API that can fail partway through
under normal operation, with no indication of where failure occurred (AFAICT),
would be awful.

> +	} else {
> +		r = 0;
> +	}
> +
>  	filemap_invalidate_unlock(inode->i_mapping);
>  
>  	/*
> @@ -696,7 +704,6 @@ int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
>  	 * not the other way 'round.  Active bindings are invalidated if the
>  	 * file is closed before memslots are destroyed.
>  	 */
> -	r = 0;

All in all, unless someone proves with a test that I'm wrong, just this?

diff --git virt/kvm/guest_memfd.c virt/kvm/guest_memfd.c
index 0c923fd603fd..c0f5b9565be2 100644
--- virt/kvm/guest_memfd.c
+++ virt/kvm/guest_memfd.c
@@ -688,7 +688,7 @@ int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
        if (kvm_gmem_supports_mmap(inode))
                slot->flags |= KVM_MEMSLOT_GMEM_ONLY;
 
-       xa_store_range(&f->bindings, start, end - 1, slot, GFP_KERNEL);
+       r = xa_err(xa_store_range(&f->bindings, start, end - 1, slot, GFP_KERNEL));
        filemap_invalidate_unlock(inode->i_mapping);
 
        /*
@@ -696,7 +696,6 @@ int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
         * not the other way 'round.  Active bindings are invalidated if the
         * file is closed before memslots are destroyed.
         */
-       r = 0;
 err:
        fput(file);
        return r;

^ permalink raw reply related

* Re: [PATCH v2 1/5] KVM: guest_memfd: Use write permissions when GUP-ing source pages
From: Sean Christopherson @ 2026-05-26 16:13 UTC (permalink / raw)
  To: Ackerley Tng
  Cc: Paolo Bonzini, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H. Peter Anvin, Kiryl Shutsemau, Rick Edgecombe,
	Vishal Annapurve, Yan Zhao, Michael Roth, Isaku Yamahata,
	Chao Peng, Xiaoyao Li, Zongyao Chen, kvm, linux-kernel,
	linux-coco, Yu Zhang, Fuad Tabba
In-Reply-To: <20260522-fix-sev-gmem-post-populate-v2-1-3f196bfad5a1@google.com>

The shortlog is misleading, bordering on outright wrong.  I think most people
would read it as "ALWAYS Use write permissions when GUP-ing source pages".  I
also think it should be scoped to:

  KVM: SEV:

because this only affects SNP, and IMO is an SNP bug, not a guest_memfd bug.  E.g.

  KVM: SEV: Pin source page for write when adding CPUID data for SNP guest

On Fri, May 22, 2026, Ackerley Tng wrote:
> From: Sean Christopherson <seanjc@google.com>
> 
> sev_gmem_post_populate() may write to the source page if there was an error

Avoid referencing function names in changelogs when possible.  Unless the reader
is already familiar with the code, the name is meaningless.  The purpose of the
changelog is to complement the literal patch, not to provide a play-by-play
description.

> while performing SNP_LAUNCH_UPDATE.
> 
> Since GUP requested only reads, there is a chance sev_gmem_post_populate()
> could be writing to some read-only page.
> 
> sev_gmem_post_populate() will only ever write the source page if the type
> of page being LAUNCH_UPDATEd is a CPUID page. Hence, request a writable
> page only when loading the CPUID page.
> 
> Since TDX never writes to the source page, always pass false to
> kvm_gmem_populate().

Describe changes in human-friendly, conversational language.  And in a way that
doesn't require looking at the patch to understand the changelog: "pass false"
is meaningless without looking at the code to see what flag was added (or exists).

> With this, even if a read-only mapping or the global zero page was provided
> as the source page, GUP will do a copy-on-write, making it writable before
> the write happens in gvm_post_populate.

Objection, speculation.  If the mapping is truly read-only, i.e. doesn't allow
writes at all, then GUP will fail.  This is all superfluous information though;
"read-only" is a pretty ubiquitous concept, there's no need to explain it in
gory detail.


I'll rewrite to this when applying:

---
When populating a guest_memfd instance with the initial CPUID data for an
SNP guest, acquire a writable pin on the source page as KVM will write back
the "correct" CPUID information if the userspace provided data is rejected
by trusted firmware.  Because KVM writes to the source page using a kernel
mapping, pinning for read could result in KVM clobbering read-only memory.

Note, well-behaved VMMs are unlikely to be affected, as CPUID information
is almost always dynamically generated by userspace, i.e. it's unlikely for
the CPUID information to be backed by a read-only mapping.
---

> Fixes: 2a62345b30529 ("KVM: guest_memfd: GUP source pages prior to populating guest memory")
> Signed-off-by: Sean Christopherson <seanjc@google.com>

Cc: stable@vger.kernel.org

> Signed-off-by: Ackerley Tng <ackerleytng@google.com>
> ---


^ permalink raw reply

* Re: [RFC PATCH 15/15] x86/virt/tdx: Enable TDX Quoting extension
From: Xu Yilun @ 2026-05-26 15:45 UTC (permalink / raw)
  To: Xiaoyao Li
  Cc: Tony Lindgren, kas, djbw, rick.p.edgecombe, x86, peter.fang,
	linux-coco, linux-kernel, kvm, sohil.mehta, yilun.xu, baolu.lu,
	zhenzhong.duan
In-Reply-To: <892508b2-6c61-4db2-a12f-902f62385e71@intel.com>

On Mon, May 25, 2026 at 06:51:27PM +0800, Xiaoyao Li wrote:
> On 5/25/2026 1:17 PM, Tony Lindgren wrote:
> > On Fri, May 22, 2026 at 11:41:28AM +0800, Xu Yilun wrote:
> > > From: Peter Fang <peter.fang@intel.com>
> > > 
> > > TDX Module updates global metadata when add-on features are enabled.
> > > Host should update the cached tdx_sysinfo to reflect these changes.
> > 
> > This should be made clearer IMO. How about mention that get_tdx_sys_info()
> > needs to get called again to reload the TDX module global metadata?
> 
> Ah ha! This patch answers my comment to patch 1:
> https://lore.kernel.org/all/956fa1e6-2920-4b2e-8037-d4b9d812ae53@intel.com/
> 
> sysinfo_ext->memory_pool_required_pages and sysinfo_ext->ext_required will
> be updated after extensions are enabled by TDH.SYS.CONFIG.
> 
> Patch 06 in this series already reads the tdx_sys_info_quote out of
> get_tdx_sys_info(), which mean get_tdx_sys_info() doesn't ensure all the
> global metadata will be update again.
> 
> So how about move the read of memory_pool_required_pages and ext_required
> out of get_tdx_sys_info() and put them after TDH.SYS.CONFIG, so that we
> don't need call get_tdx_sys_info() again?

Yes, I'm good with it. I hesitated to move them out in case we need some
central control on global data. But now I see there is already a
precedent:

https://lore.kernel.org/kvm/20260520133909.409394-22-chao.gao@intel.com/

Once we've agreed on moving add-on data reading out of get_tdx_sys_info(),
we don't have to read them after TDH.SYS.CONFIG; we can read them when they
are really needed. How about the following, which makes the Extension part
in this series self-contained?

----8<----

diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index 86e5b7ad19b3..b729c1f5ab9e 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -1536,6 +1536,10 @@ static __init int init_tdx_ext(void)
        if (!(tdx_sysinfo.features.tdx_features0 & TDX_FEATURES0_EXT))
                return 0;

+       ret = get_tdx_sys_info_ext(&tdx_sysinfo.ext);
+       if (ret)
+               return ret;
+
        /* No feature requires TDX Module Extensions. */
        if (!tdx_sysinfo.ext.ext_required)
                return 0;
diff --git a/arch/x86/virt/vmx/tdx/tdx_global_metadata.c b/arch/x86/virt/vmx/tdx/tdx_global_metadata.c
index f9cc2dd02caf..e7d9e0c4b604 100644
--- a/arch/x86/virt/vmx/tdx/tdx_global_metadata.c
+++ b/arch/x86/virt/vmx/tdx/tdx_global_metadata.c
@@ -140,8 +140,5 @@ static __init int get_tdx_sys_info(struct tdx_sys_info *sysinfo)
        ret = ret ?: get_tdx_sys_info_td_ctrl(&sysinfo->td_ctrl);
        ret = ret ?: get_tdx_sys_info_td_conf(&sysinfo->td_conf);

-       if (sysinfo->features.tdx_features0 & TDX_FEATURES0_EXT)
-               ret = ret ?: get_tdx_sys_info_ext(&sysinfo->ext);
-
        return ret;
 }

^ permalink raw reply related

* Re: [PATCH v2 4/5] KVM: SNP: Fix kunmap_local() unmapping order
From: Sean Christopherson @ 2026-05-26 15:55 UTC (permalink / raw)
  To: Ackerley Tng
  Cc: Paolo Bonzini, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H. Peter Anvin, Kiryl Shutsemau, Rick Edgecombe,
	Vishal Annapurve, Yan Zhao, Michael Roth, Isaku Yamahata,
	Chao Peng, Xiaoyao Li, Zongyao Chen, kvm, linux-kernel,
	linux-coco, Yu Zhang, Fuad Tabba
In-Reply-To: <20260522-fix-sev-gmem-post-populate-v2-4-3f196bfad5a1@google.com>

Similar comment on the shortlog as patch two.  "Fix the order" tells the reader
nothing useful, other than the author of the patch thought there was a bug.

  KVM: SEV: Unmap local kmaps in LIFO order, per highmem requirements

No need for a new version, I'll massage when applying.

On Fri, May 22, 2026, Ackerley Tng wrote:
> Mappings created with kmap_local_page() or kmap_local_pfn() must be
> unmapped in the reverse order they were acquired, following a LIFO
> (last-in, first-out) stack-based approach.
> 
> In sev_gmem_post_populate(), src_vaddr is mapped first and dst_vaddr is
> mapped second. The current code incorrectly calls kunmap_local() for
> src_vaddr before dst_vaddr.
> 
> Swap the kunmap_local() calls to ensure the mappings are released in the
> correct order.

It's worth calling out that this is completely benign since SNP is 64-bit only.

^ permalink raw reply

* Re: [PATCH v2 2/5] KVM: guest_memfd: Fix possible signed integer overflow
From: Sean Christopherson @ 2026-05-26 15:53 UTC (permalink / raw)
  To: Ackerley Tng
  Cc: Paolo Bonzini, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H. Peter Anvin, Kiryl Shutsemau, Rick Edgecombe,
	Vishal Annapurve, Yan Zhao, Michael Roth, Isaku Yamahata,
	Chao Peng, Xiaoyao Li, Zongyao Chen, kvm, linux-kernel,
	linux-coco, Yu Zhang, Fuad Tabba
In-Reply-To: <20260522-fix-sev-gmem-post-populate-v2-2-3f196bfad5a1@google.com>

For shortlogs (and changelogs), when possible, describe the _change_ itself, not
its impact.  Sometimes "Fix xyz" is the best shortlog, e.g. when fixing build
failures, but here, I would go with:

  KVM: guest_memfd: Treat memslot binding offset+size as unsigned values

for two reasons.  First, it provides a lot more context for future readers, versus 
"Fix possible signed integer overflow" which doesn't even capture what flow is
affected, how the overflow is being fixed, etc.  Second, if the fix is wrong,
incomplete, etc., we don't end up with a follow-up patch that starts with "Really
fix ...".

Oh, actually, three reasons.  This doesn't only affect the overflow check.  The
check on a negative offset is flawed, as it means KVM would incorrectly reject
bindings with (comically) large offsets.

LOL, four.  There is no bug.  The size of the memslot is ((1UL << 31) - 1)
pages, i.e. 0x7FF_FFFFF000:

	if (id < KVM_USER_MEM_SLOTS &&
	    (mem->memory_size >> PAGE_SHIFT) > KVM_MEM_MAX_NR_PAGES)
		return -EINVAL;

and so "loff_t size" can never be negative.

As for the offset, the negative check is intentional, because KVM_CREATE_GUEST_MEMFD
takes loff_t for the size, and so an offset that is negative would also be larger
than the size of the file.

I still think it's worth taking unsigned values, because teasing out all of that
information wasn't exactly easy.

On Fri, May 22, 2026, Ackerley Tng wrote:
> From: Sean Christopherson <seanjc@google.com>
> 
> The caller, kvm_set_memory_region(), checks for an overflow in an unsigned
> u64 guest_memfd_offset. When guest_memfd_offset is passed to kvm_gmem_bind,
> it is cast into a signed 64-bit integer.
> 
> Hence, a large 64-bit offset could result in a negative loff_t, which could
> result in the overflow checks failing.
> 
> Make kvm_gmem_bind() take u64 instead of loff_t to consistently deal with
> unsigned values to avoid this issue.
> 
> Fixes: a7800aa80ea4d ("KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory")
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> [Use size_t for size instead of u64]

Why?  Oh, right, because kvm_memory_slot.npages is an "unsigned long".  The
discrepancy between a u64 for the offset and a size_t for the size is confusing,
as they are both conceptually in the same "domain".

Rather than u64 and size_t, we should use pgoff_t, which is what KVM already uses
as the storage for kvm_memory_slot.gmem.pgoff.

I'll send a new version as a standalone patch.

^ permalink raw reply

* SVSM Development Call May 27th, 2026
From: Stefano Garzarella @ 2026-05-26 15:46 UTC (permalink / raw)
  To: coconut-svsm, linux-coco

Hi,

Here is the call for agenda items for this week's SVSM development
call. Please send any agenda items you have in mind as a reply to this
email or raise them in the meeting.

We will use the LF Zoom instance. Details of the meeting can be found
in our governance repository at:

        https://github.com/coconut-svsm/governance

The link to the COCONUT-SVSM calendar is:

        https://zoom-lfx.platform.linuxfoundation.org/meetings/coconut-svsm?view=week

The meeting will be recorded and the recording eventually published.

Regards,
Stefano


^ permalink raw reply

* Re: [PATCH v5 10/20] dma-direct: make dma_direct_map_phys() honor DMA_ATTR_CC_SHARED
From: Jason Gunthorpe @ 2026-05-26 15:39 UTC (permalink / raw)
  To: Michael Kelley
  Cc: Aneesh Kumar K.V (Arm), iommu@lists.linux.dev,
	linux-arm-kernel@lists.infradead.org,
	linux-kernel@vger.kernel.org, linux-coco@lists.linux.dev,
	Robin Murphy, Marek Szyprowski, Will Deacon, Marc Zyngier,
	Steven Price, Suzuki K Poulose, Catalin Marinas, Jiri Pirko,
	Mostafa Saleh, Petr Tesarik, Alexey Kardashevskiy, Dan Williams,
	Xu Yilun, linuxppc-dev@lists.ozlabs.org,
	linux-s390@vger.kernel.org, Madhavan Srinivasan, Michael Ellerman,
	Nicholas Piggin, Christophe Leroy (CS GROUP), Alexander Gordeev,
	Gerald Schaefer, Heiko Carstens, Vasily Gorbik,
	Christian Borntraeger, Sven Schnelle, x86@kernel.org, Jiri Pirko
In-Reply-To: <SN6PR02MB41574064D14D4A2734222C51D40B2@SN6PR02MB4157.namprd02.prod.outlook.com>

On Tue, May 26, 2026 at 02:56:57AM +0000, Michael Kelley wrote:

> With this patch removing SWIOTLB_FORCE from four places in
> kernel code, there are no remaining places where it is set.
> The test of SWIOTLB_FORCE could be removed from
> swiotlb_init_remap(), and its definition could be deleted
> from include/linux/swiotlb.h.

That's great! I think it shows this is the right approach!

Jason

^ permalink raw reply

* Re: [PATCH v5 1/5] vfio: cache KVM VM file references instead of raw struct kvm pointers
From: Anthony Krowiak @ 2026-05-26 10:52 UTC (permalink / raw)
  To: Aneesh Kumar K.V (Arm), linux-coco, iommu, linux-kernel, kvm
  Cc: Alexey Kardashevskiy, Bjorn Helgaas, Dan Williams,
	Jason Gunthorpe, Joerg Roedel, Jonathan Cameron, Kevin Tian,
	Nicolin Chen, Samuel Ortiz, Steven Price, Suzuki K Poulose,
	Will Deacon, Xu Yilun, Shameer Kolothum, Paolo Bonzini,
	Halil Pasic, Jason Herne, Harald Freudenberger, Holger Dengler,
	Heiko Carstens, Vasily Gorbik, Alexander Gordeev,
	Christian Borntraeger, Sven Schnelle, Alex Williamson,
	Matthew Rosato, Farhan Ali, Eric Farman, linux-s390
In-Reply-To: <20260525154816.1029642-2-aneesh.kumar@kernel.org>



On 5/25/26 11:48 AM, Aneesh Kumar K.V (Arm) wrote:
> VFIO currently records struct kvm pointers on vfio_group, vfio_device_file
> and the opened vfio_device. Switch VFIO to track the VM's struct file
> instead, so VFIO and iommufd can use normal file references for VM lifetime
> instead of depending on KVM's internal struct kvm refcounting.
>
> KVM_CREATE_DEVICE binds the KVM VM lifetime to the KVM device fd lifetime.
> For KVM_DEV_TYPE_VFIO, the KVM VFIO device fd also takes references to each
> VFIO file added through KVM_DEV_VFIO_FILE_ADD. The KVM VFIO device fd
> therefore owns both the internal KVM reference and the VFIO file references
> in kvf->file.
>
> KVM_DEV_VFIO_FILE_ADD further installs the VM file association into the
> VFIO file. VFIO converts the struct kvm pointer to a VM file reference with
> get_file_active(&kvm->_file), because the KVM device fd can keep struct kvm
> alive after the original VM fd is already in final release.
>
> The association intentionally pins the VM file until KVM_DEV_VFIO_FILE_DEL
> or until the KVM VFIO device fd is released. This gives VFIO/iommufd a
> stable VM file reference source without taking a dependency on KVM's struct
> kvm lifetime. The KVM VFIO device release path clears the VFIO-side
> association before dropping its VFIO file references.
>
> When a VFIO device is opened or bound, VFIO takes an additional reference
> from the associated VM file and stores it in vfio_device::kvm_file for
> driver and iommufd use. That open-time reference is released from
> vfio_device_put_kvm() when the VFIO device is closed or unbound.
>
> This gives the ownership model:
>
>    - KVM device fd pins struct kvm through kvm->users_count
>    - KVM VFIO device fd pins VFIO files through kvf->file
>    - VFIO group/device-file state pins the VM file while associated with KVM
>    - vfio_device::kvm_file pins the VM file during active VFIO device use
>
> Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>

Acked-by: Anthony Krowiak <akrowiak@linux.ibm.com>

> ---
>   drivers/s390/crypto/vfio_ap_ops.c |  5 +-
>   drivers/vfio/device_cdev.c        | 10 ++--
>   drivers/vfio/group.c              | 14 +++---
>   drivers/vfio/pci/vfio_pci_zdev.c  |  7 +--
>   drivers/vfio/vfio.h               | 16 ++++--
>   drivers/vfio/vfio_main.c          | 81 ++++++++++++++++---------------
>   include/linux/kvm_host.h          |  3 ++
>   include/linux/vfio.h              | 17 ++++++-
>   virt/kvm/kvm_main.c               |  2 +
>   9 files changed, 91 insertions(+), 64 deletions(-)
>
> diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c
> index 44b3a1dcc1b3..05996a8fd860 100644
> --- a/drivers/s390/crypto/vfio_ap_ops.c
> +++ b/drivers/s390/crypto/vfio_ap_ops.c
> @@ -2054,11 +2054,12 @@ static int vfio_ap_mdev_open_device(struct vfio_device *vdev)
>   {
>   	struct ap_matrix_mdev *matrix_mdev =
>   		container_of(vdev, struct ap_matrix_mdev, vdev);
> +	struct kvm *kvm = vfio_device_get_kvm(vdev);
>   
> -	if (!vdev->kvm)
> +	if (!kvm)
>   		return -EINVAL;
>   
> -	return vfio_ap_mdev_set_kvm(matrix_mdev, vdev->kvm);
> +	return vfio_ap_mdev_set_kvm(matrix_mdev, kvm);
>   }
>   
>   static void vfio_ap_mdev_close_device(struct vfio_device *vdev)
> diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c
> index 54abf312cf04..ca75ab8eb7bd 100644
> --- a/drivers/vfio/device_cdev.c
> +++ b/drivers/vfio/device_cdev.c
> @@ -56,7 +56,7 @@ int vfio_device_fops_cdev_open(struct inode *inode, struct file *filep)
>   static void vfio_df_get_kvm_safe(struct vfio_device_file *df)
>   {
>   	spin_lock(&df->kvm_ref_lock);
> -	vfio_device_get_kvm_safe(df->device, df->kvm);
> +	vfio_device_get_kvm_safe(df->device, df->kvm_file);
>   	spin_unlock(&df->kvm_ref_lock);
>   }
>   
> @@ -133,10 +133,10 @@ long vfio_df_ioctl_bind_iommufd(struct vfio_device_file *df,
>   	}
>   
>   	/*
> -	 * Before the device open, get the KVM pointer currently
> -	 * associated with the device file (if there is) and obtain
> -	 * a reference.  This reference is held until device closed.
> -	 * Save the pointer in the device for use by drivers.
> +	 * Before the device open, get the VM struct file currently
> +	 * associated with the device file (if there is one) and obtain a
> +	 * reference. This reference is held until the device is closed.
> +	 * Save the file in the device for use by drivers.
>   	 */
>   	vfio_df_get_kvm_safe(df);
>   
> diff --git a/drivers/vfio/group.c b/drivers/vfio/group.c
> index b2299e5bc6df..8950cfb9405d 100644
> --- a/drivers/vfio/group.c
> +++ b/drivers/vfio/group.c
> @@ -163,7 +163,7 @@ static int vfio_group_ioctl_set_container(struct vfio_group *group,
>   static void vfio_device_group_get_kvm_safe(struct vfio_device *device)
>   {
>   	spin_lock(&device->group->kvm_ref_lock);
> -	vfio_device_get_kvm_safe(device, device->group->kvm);
> +	vfio_device_get_kvm_safe(device, device->group->kvm_file);
>   	spin_unlock(&device->group->kvm_ref_lock);
>   }
>   
> @@ -181,10 +181,10 @@ static int vfio_df_group_open(struct vfio_device_file *df)
>   	mutex_lock(&device->dev_set->lock);
>   
>   	/*
> -	 * Before the first device open, get the KVM pointer currently
> -	 * associated with the group (if there is one) and obtain a reference
> -	 * now that will be held until the open_count reaches 0 again.  Save
> -	 * the pointer in the device for use by drivers.
> +	 * Before the first device open, get the VM struct file currently
> +	 * associated with the group (if there is one) and obtain a
> +	 * reference now that will be held until the open_count reaches 0
> +	 * again. Save the file in the device for use by drivers.
>   	 */
>   	if (device->open_count == 0)
>   		vfio_device_group_get_kvm_safe(device);
> @@ -862,9 +862,7 @@ bool vfio_group_enforced_coherent(struct vfio_group *group)
>   
>   void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm)
>   {
> -	spin_lock(&group->kvm_ref_lock);
> -	group->kvm = kvm;
> -	spin_unlock(&group->kvm_ref_lock);
> +	vfio_kvm_file_replace(&group->kvm_file, &group->kvm_ref_lock, kvm);
>   }
>   
>   /**
> diff --git a/drivers/vfio/pci/vfio_pci_zdev.c b/drivers/vfio/pci/vfio_pci_zdev.c
> index 0990fdb146b7..a9d8e6aa3839 100644
> --- a/drivers/vfio/pci/vfio_pci_zdev.c
> +++ b/drivers/vfio/pci/vfio_pci_zdev.c
> @@ -144,15 +144,16 @@ int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev,
>   int vfio_pci_zdev_open_device(struct vfio_pci_core_device *vdev)
>   {
>   	struct zpci_dev *zdev = to_zpci(vdev->pdev);
> +	struct kvm *kvm = vfio_device_get_kvm(&vdev->vdev);
>   
>   	if (!zdev)
>   		return -ENODEV;
>   
> -	if (!vdev->vdev.kvm)
> +	if (!kvm)
>   		return 0;
>   
>   	if (zpci_kvm_hook.kvm_register)
> -		return zpci_kvm_hook.kvm_register(zdev, vdev->vdev.kvm);
> +		return zpci_kvm_hook.kvm_register(zdev, kvm);
>   
>   	return -ENOENT;
>   }
> @@ -161,7 +162,7 @@ void vfio_pci_zdev_close_device(struct vfio_pci_core_device *vdev)
>   {
>   	struct zpci_dev *zdev = to_zpci(vdev->pdev);
>   
> -	if (!zdev || !vdev->vdev.kvm)
> +	if (!zdev || !vfio_device_get_kvm(&vdev->vdev))
>   		return;
>   
>   	if (zpci_kvm_hook.kvm_unregister)
> diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h
> index e4b72e79b7e3..41032104eb36 100644
> --- a/drivers/vfio/vfio.h
> +++ b/drivers/vfio/vfio.h
> @@ -22,8 +22,8 @@ struct vfio_device_file {
>   
>   	u8 access_granted;
>   	u32 devid; /* only valid when iommufd is valid */
> -	spinlock_t kvm_ref_lock; /* protect kvm field */
> -	struct kvm *kvm;
> +	spinlock_t kvm_ref_lock; /* protect kvm_file */
> +	struct file *kvm_file;
>   	struct iommufd_ctx *iommufd; /* protected by struct vfio_device_set::lock */
>   };
>   
> @@ -88,7 +88,7 @@ struct vfio_group {
>   #endif
>   	enum vfio_group_type		type;
>   	struct mutex			group_lock;
> -	struct kvm			*kvm;
> +	struct file			*kvm_file;
>   	struct file			*opened_file;
>   	struct iommufd_ctx		*iommufd;
>   	spinlock_t			kvm_ref_lock;
> @@ -434,11 +434,17 @@ static inline void vfio_virqfd_exit(void)
>   #endif
>   
>   #if IS_ENABLED(CONFIG_KVM)
> -void vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm);
> +void vfio_kvm_file_replace(struct file **dst, spinlock_t *lock, struct kvm *kvm);
> +void vfio_device_get_kvm_safe(struct vfio_device *device, struct file *kvm_file);
>   void vfio_device_put_kvm(struct vfio_device *device);
>   #else
> +static inline void vfio_kvm_file_replace(struct file **dst,
> +		spinlock_t *lock, struct kvm *kvm)
> +{
> +}
> +
>   static inline void vfio_device_get_kvm_safe(struct vfio_device *device,
> -					    struct kvm *kvm)
> +					    struct file *kvm_file)
>   {
>   }
>   
> diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
> index 6222376ab6ab..88c85a7b98c0 100644
> --- a/drivers/vfio/vfio_main.c
> +++ b/drivers/vfio/vfio_main.c
> @@ -442,55 +442,61 @@ void vfio_unregister_group_dev(struct vfio_device *device)
>   EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
>   
>   #if IS_ENABLED(CONFIG_KVM)
> -void vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm)
> +void vfio_kvm_file_replace(struct file **dst, spinlock_t *lock, struct kvm *kvm)
>   {
> -	void (*pfn)(struct kvm *kvm);
> -	bool (*fn)(struct kvm *kvm);
> -	bool ret;
> +	struct file *old_kvm_file, *new_kvm_file = NULL;
>   
> -	lockdep_assert_held(&device->dev_set->lock);
> +	/*
> +	 * @kvm can outlive the VM fd and its final __fput(). Only take a
> +	 * new reference if the VM file is still active.
> +	 */
> +	if (kvm)
> +		new_kvm_file = get_file_active(&kvm->_file);
>   
> -	if (!kvm)
> -		return;
> +	spin_lock(lock);
> +	old_kvm_file = *dst;
> +	*dst = new_kvm_file;
> +	spin_unlock(lock);
>   
> -	pfn = symbol_get(kvm_put_kvm);
> -	if (WARN_ON(!pfn))
> -		return;
> +	if (old_kvm_file)
> +		fput(old_kvm_file);
> +}
>   
> -	fn = symbol_get(kvm_get_kvm_safe);
> -	if (WARN_ON(!fn)) {
> -		symbol_put(kvm_put_kvm);
> -		return;
> -	}
> +void vfio_device_get_kvm_safe(struct vfio_device *device, struct file *kvm_file)
> +{
> +	lockdep_assert_held(&device->dev_set->lock);
>   
> -	ret = fn(kvm);
> -	symbol_put(kvm_get_kvm_safe);
> -	if (!ret) {
> -		symbol_put(kvm_put_kvm);
> -		return;
> -	}
> +	/*
> +	 * Take a VM file reference if the KVM fd is still active.
> +	 */
> +	if (kvm_file)
> +		kvm_file = get_file(kvm_file);
>   
> -	device->put_kvm = pfn;
> -	device->kvm = kvm;
> +	device->kvm_file = kvm_file;
>   }
>   
>   void vfio_device_put_kvm(struct vfio_device *device)
>   {
> +	struct file *kvm_file;
> +
>   	lockdep_assert_held(&device->dev_set->lock);
>   
> -	if (!device->kvm)
> +	kvm_file = device->kvm_file;
> +	if (!kvm_file)
>   		return;
>   
> -	if (WARN_ON(!device->put_kvm))
> -		goto clear;
> +	device->kvm_file = NULL;
> +	fput(kvm_file);
> +}
>   
> -	device->put_kvm(device->kvm);
> -	device->put_kvm = NULL;
> -	symbol_put(kvm_put_kvm);
> +struct kvm *vfio_device_get_kvm(struct vfio_device *device)
> +{
> +	if (!device->kvm_file)
> +		return NULL;
>   
> -clear:
> -	device->kvm = NULL;
> +	return device->kvm_file->private_data;
>   }
> +EXPORT_SYMBOL_GPL(vfio_device_get_kvm);
>   #endif
>   
>   /* true if the vfio_device has open_device() called but not close_device() */
> @@ -1518,13 +1524,10 @@ static void vfio_device_file_set_kvm(struct file *file, struct kvm *kvm)
>   	struct vfio_device_file *df = file->private_data;
>   
>   	/*
> -	 * The kvm is first recorded in the vfio_device_file, and will
> -	 * be propagated to vfio_device::kvm when the file is bound to
> -	 * iommufd successfully in the vfio device cdev path.
> +	 * Cache the VM file reference associated with this VFIO file so it
> +	 * can be pinned into vfio_device while the device is open.
>   	 */
> -	spin_lock(&df->kvm_ref_lock);
> -	df->kvm = kvm;
> -	spin_unlock(&df->kvm_ref_lock);
> +	vfio_kvm_file_replace(&df->kvm_file, &df->kvm_ref_lock, kvm);
>   }
>   
>   /**
> @@ -1532,8 +1535,8 @@ static void vfio_device_file_set_kvm(struct file *file, struct kvm *kvm)
>    * @file: VFIO group file or VFIO device file
>    * @kvm: KVM to link
>    *
> - * When a VFIO device is first opened the KVM will be available in
> - * device->kvm if one was associated with the file.
> + * When a VFIO device is first opened, VFIO caches a VM file reference if
> + * one was associated with the file.
>    */
>   void vfio_file_set_kvm(struct file *file, struct kvm *kvm)
>   {
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 4c14aee1fb06..31afac5fb0ea 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -45,6 +45,8 @@
>   #include <asm/kvm_host.h>
>   #include <linux/kvm_dirty_ring.h>
>   
> +struct file;
> +
>   #ifndef KVM_MAX_VCPU_IDS
>   #define KVM_MAX_VCPU_IDS KVM_MAX_VCPUS
>   #endif
> @@ -861,6 +863,7 @@ struct kvm {
>   	struct srcu_struct srcu;
>   	struct srcu_struct irq_srcu;
>   	pid_t userspace_pid;
> +	struct file __rcu *_file;
>   	bool override_halt_poll_ns;
>   	unsigned int max_halt_poll_ns;
>   	u32 dirty_ring_size;
> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> index 31b826efba00..bca1d00f7845 100644
> --- a/include/linux/vfio.h
> +++ b/include/linux/vfio.h
> @@ -22,8 +22,22 @@ struct kvm;
>   struct iommufd_ctx;
>   struct iommufd_device;
>   struct iommufd_access;
> +struct vfio_device;
>   struct vfio_info_cap;
>   
> +#if IS_ENABLED(CONFIG_KVM)
> +/*
> + * Return the KVM associated with @vdev's kvm_file. The returned pointer
> + * is valid only while VFIO device open holds the kvm_file reference.
> + */
> +struct kvm *vfio_device_get_kvm(struct vfio_device *vdev);
> +#else
> +static inline struct kvm *vfio_device_get_kvm(struct vfio_device *vdev)
> +{
> +	return NULL;
> +}
> +#endif
> +
>   /*
>    * VFIO devices can be placed in a set, this allows all devices to share this
>    * structure and the VFIO core will provide a lock that is held around
> @@ -54,7 +68,7 @@ struct vfio_device {
>   	struct list_head dev_set_list;
>   	unsigned int migration_flags;
>   	u8 precopy_info_v2;
> -	struct kvm *kvm;
> +	struct file *kvm_file;
>   
>   	/* Members below here are private, not for driver use */
>   	unsigned int index;
> @@ -66,7 +80,6 @@ struct vfio_device {
>   	unsigned int open_count;
>   	struct completion comp;
>   	struct iommufd_access *iommufd_access;
> -	void (*put_kvm)(struct kvm *kvm);
>   	struct inode *inode;
>   #if IS_ENABLED(CONFIG_IOMMUFD)
>   	struct iommufd_device *iommufd_device;
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 89489996fbc1..011819c5c47c 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -1351,6 +1351,7 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
>   
>   	kvm_irqfd_release(kvm);
>   
> +	RCU_INIT_POINTER(kvm->_file, NULL);
>   	kvm_put_kvm(kvm);
>   	return 0;
>   }
> @@ -5500,6 +5501,7 @@ static int kvm_dev_ioctl_create_vm(unsigned long type)
>   		r = PTR_ERR(file);
>   		goto put_kvm;
>   	}
> +	rcu_assign_pointer(kvm->_file, file);
>   
>   	/*
>   	 * Don't call kvm_put_kvm anymore at this point; file->f_op is


^ permalink raw reply

* Re: [RFC PATCH 15/15] x86/virt/tdx: Enable TDX Quoting extension
From: Tony Lindgren @ 2026-05-26  9:00 UTC (permalink / raw)
  To: Xiaoyao Li
  Cc: Xu Yilun, kas, djbw, rick.p.edgecombe, x86, peter.fang,
	linux-coco, linux-kernel, kvm, sohil.mehta, yilun.xu, baolu.lu,
	zhenzhong.duan
In-Reply-To: <892508b2-6c61-4db2-a12f-902f62385e71@intel.com>

On Mon, May 25, 2026 at 06:51:27PM +0800, Xiaoyao Li wrote:
> On 5/25/2026 1:17 PM, Tony Lindgren wrote:
> > On Fri, May 22, 2026 at 11:41:28AM +0800, Xu Yilun wrote:
> > > From: Peter Fang <peter.fang@intel.com>
> > > 
> > > TDX Module updates global metadata when add-on features are enabled.
> > > Host should update the cached tdx_sysinfo to reflect these changes.
> > 
> > This should be made clearer IMO. How about mentioning that get_tdx_sys_info()
> > needs to get called again to reload the TDX module global metadata?
> 
> Ah ha! This patch answers my comment to patch 1:
> https://lore.kernel.org/all/956fa1e6-2920-4b2e-8037-d4b9d812ae53@intel.com/
> 
> sysinfo_ext->memory_pool_required_pages and sysinfo_ext->ext_required will
> be updated after extensions are enabled by TDH.SYS.CONFIG.
> 
> Patch 06 in this series already reads the tdx_sys_info_quote out of
> get_tdx_sys_info(), which means get_tdx_sys_info() doesn't ensure all the
> global metadata will be updated again.
> 
> So how about moving the read of memory_pool_required_pages and ext_required
> out of get_tdx_sys_info() and putting them after TDH.SYS.CONFIG, so that we
> don't need to call get_tdx_sys_info() again?

Sounds like a good idea to me.

^ permalink raw reply

* Re: [PATCH v6 06/11] x86/virt/tdx: Optimize tdx_pamt_get/put()
From: Chao Gao @ 2026-05-26  8:57 UTC (permalink / raw)
  To: Rick Edgecombe
  Cc: bp, dave.hansen, hpa, kas, kvm, linux-coco, linux-doc,
	linux-kernel, mingo, nik.borisov, pbonzini, seanjc, tglx,
	vannapurve, x86, yan.y.zhao, kai.huang, Kirill A. Shutemov
In-Reply-To: <20260526023515.288829-7-rick.p.edgecombe@intel.com>

On Mon, May 25, 2026 at 07:35:10PM -0700, Rick Edgecombe wrote:
>@@ -2057,32 +2057,50 @@ static int tdx_pamt_get(kvm_pfn_t pfn)
> 	if (!tdx_supports_dynamic_pamt(&tdx_sysinfo))
> 		return 0;
> 
>+	pamt_refcount = tdx_find_pamt_refcount(pfn);
>+
>+	/*
>+	 * If the pamt page is already added (i.e. refcount >= 1),
>+	 * then just increment the refcount.
>+	 */
>+	if (atomic_inc_not_zero(pamt_refcount))
>+		return 0;
>+
> 	ret = alloc_pamt_array(pamt_pages);
> 	if (ret)
> 		return ret;
> 
>-	pamt_refcount = tdx_find_pamt_refcount(pfn);
>+	spin_lock(&pamt_lock);
> 
>-	scoped_guard(spinlock, &pamt_lock) {

This converts the scoped_guard() added by the previous patch to
explicit lock/unlock and goto. It would reduce code churn if the
previous patch used that form directly.

>-		/*
>-		 * If the pamt page is already added (i.e. refcount >= 1),
>-		 * then just increment the refcount.
>-		 */
>-		if (atomic_read(pamt_refcount)) {
>-			atomic_inc(pamt_refcount);
>-			goto out_free;
>-		}
>-
>-		/* Try to add the pamt page and take the refcount 0->1. */
>-		tdx_status = tdh_phymem_pamt_add(pfn, pamt_pages);
>-		if (WARN_ON_ONCE(tdx_status != TDX_SUCCESS)) {
>-			ret = -EIO;
>-			goto out_free;
>-		}
>-
>-		atomic_set(pamt_refcount, 1);
>+	/*
>+	 * Unlike tdx_pamt_put() which uses atomic_dec_and_lock() to
>+	 * atomically handle the 1->0 transition, the get side has no
>+	 * equivalent combined primitive for 0->1. Recheck under the
>+	 * lock since another get may have already done the 0->1
>+	 * transition after both saw atomic_inc_not_zero() fail.
>+	 */
>+	if (atomic_read(pamt_refcount)) {
>+		atomic_inc(pamt_refcount);
>+		spin_unlock(&pamt_lock);
>+		goto out_free;
> 	}
> 
>+	tdx_status = tdh_phymem_pamt_add(pfn, pamt_pages);
>+	if (tdx_status == TDX_SUCCESS) {
>+		/*
>+		 * The refcount is zero, and this locked path is the
>+		 * only way to increase it from 0->1.
>+		 */
>+		atomic_set(pamt_refcount, 1);
>+	} else {
>+		WARN_ON_ONCE(1);
>+		ret = -EIO;
>+		spin_unlock(&pamt_lock);
>+		goto out_free;
>+	}
>+
>+	spin_unlock(&pamt_lock);
>+
> 	return 0;
> out_free:
> 	free_pamt_array(pamt_pages);
>@@ -2104,32 +2122,34 @@ static void tdx_pamt_put(kvm_pfn_t pfn)
> 
> 	pamt_refcount = tdx_find_pamt_refcount(pfn);
> 
>-	scoped_guard(spinlock, &pamt_lock) {

Ditto

>+	/*
>+	 * If there is more than 1 reference on the pamt page, don't
>+	 * remove it yet. Just decrement the refcount.
>+	 */
>+	if (!atomic_dec_and_lock(pamt_refcount, &pamt_lock))
>+		return;
>+
>+	tdx_status = tdh_phymem_pamt_remove(pfn, pamt_pages);
>+
>+	/*
>+	 * Don't free pamt_pages as it could hold garbage when
>+	 * tdh_phymem_pamt_remove() fails.  Don't panic/BUG_ON(), as
>+	 * there is no risk of data corruption, but do yell loudly as
>+	 * failure indicates a kernel bug, memory is being leaked, and
>+	 * the dangling PAMT entry may cause future operations to fail.
>+	 */
>+	if (WARN_ON_ONCE(tdx_status != TDX_SUCCESS)) {
> 		/*
>-		 * If the there are more than 1 references on the pamt page,
>-		 * don't remove it yet. Just decrement the refcount.
>+		 * atomic_dec_and_lock() already decremented it to 0,
>+		 * but the PAMT entry still exists since REMOVE failed.
> 		 */
>-		if (atomic_read(pamt_refcount) > 1) {
>-			atomic_dec(pamt_refcount);
>-			return;
>-		}
>-
>-		/* Try to remove the pamt page and take the refcount 1->0. */
>-		tdx_status = tdh_phymem_pamt_remove(pfn, pamt_pages);
>-
>-		/*
>-		 * Don't free pamt_pages as it could hold garbage when
>-		 * tdh_phymem_pamt_remove() fails.  Don't panic/BUG_ON(), as
>-		 * there is no risk of data corruption, but do yell loudly as
>-		 * failure indicates a kernel bug, memory is being leaked, and
>-		 * the dangling PAMT entry may cause future operations to fail.
>-		 */
>-		if (WARN_ON_ONCE(tdx_status != TDX_SUCCESS))
>-			return;
>-
>-		atomic_set(pamt_refcount, 0);
>+		atomic_set(pamt_refcount, 1);
>+		spin_unlock(&pamt_lock);
>+		return;
> 	}
> 
>+	spin_unlock(&pamt_lock);
>+
> 	free_pamt_array(pamt_pages);
> }
> 
>-- 
>2.54.0
>

^ permalink raw reply

* RE: [PATCH v5 00/20] dma-mapping: Use DMA_ATTR_CC_SHARED through direct, pool and swiotlb paths
From: Michael Kelley @ 2026-05-26  4:30 UTC (permalink / raw)
  To: Aneesh Kumar K.V (Arm), iommu@lists.linux.dev,
	linux-arm-kernel@lists.infradead.org,
	linux-kernel@vger.kernel.org, linux-coco@lists.linux.dev
  Cc: Robin Murphy, Marek Szyprowski, Will Deacon, Marc Zyngier,
	Steven Price, Suzuki K Poulose, Catalin Marinas, Jiri Pirko,
	Jason Gunthorpe, Mostafa Saleh, Petr Tesarik,
	Alexey Kardashevskiy, Dan Williams, Xu Yilun,
	linuxppc-dev@lists.ozlabs.org, linux-s390@vger.kernel.org,
	Madhavan Srinivasan, Michael Ellerman, Nicholas Piggin,
	Christophe Leroy (CS GROUP), Alexander Gordeev, Gerald Schaefer,
	Heiko Carstens, Vasily Gorbik, Christian Borntraeger,
	Sven Schnelle, x86@kernel.org
In-Reply-To: <20260522042815.370873-1-aneesh.kumar@kernel.org>

From: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org> Sent: Thursday, May 21, 2026 9:28 PM
> 
> This series propagates DMA_ATTR_CC_SHARED through the dma-direct,
> dma-pool, and swiotlb paths so that encrypted and decrypted DMA buffers
> are handled consistently.
> 
> Today, the direct DMA path mostly relies on force_dma_unencrypted() for
> shared/decrypted buffer handling. This series consolidates the
> force_dma_unencrypted() checks in the top-level functions and ensures
> that the remaining DMA interfaces use DMA attributes to make the correct
> decisions.
> 
> The series:
> - moves swiotlb-backed allocations out of __dma_direct_alloc_pages(),
> - propagates DMA_ATTR_CC_SHARED through the dma-direct alloc/free
>   paths
> - teaches the atomic DMA pools to track encrypted versus decrypted
>   state
> - tracks swiotlb pool encryption state and enforces strict pool
>   selection
> - centralizes encrypted/decrypted pgprot handling in dma_pgprot() using
>   DMA attributes
> - passes DMA attributes down to dma_capable() so capability checks can
>   validate whether the selected DMA address encoding matches
>   DMA_ATTR_CC_SHARED
> - makes dma_direct_map_phys() choose the DMA address encoding from
>   DMA_ATTR_CC_SHARED and fall back to swiotlb when a shared DMA request
>   cannot use the direct mapping, which lets arm64 and x86 CCA guests stop
>   relying on SWIOTLB_FORCE for DMA mappings
> - use the selected swiotlb pool state to derive the returned DMA
>   address.

[snip]

> 
> 
> Aneesh Kumar K.V (Arm) (20):
>   [DO NOT MERGE] arm64/coco: Add pKVM as a CC platform
>   [DO NOT MERGE] s390: Expose protected virtualization through
>     cc_platform_has()
>   dma-direct: swiotlb: handle swiotlb alloc/free outside
>     __dma_direct_alloc_pages
>   dma-direct: use DMA_ATTR_CC_SHARED in alloc/free paths
>   dma-pool: track decrypted atomic pools and select them via attrs
>   dma: swiotlb: pass mapping attributes by reference
>   dma: swiotlb: track pool encryption state and honor DMA_ATTR_CC_SHARED
>   dma-mapping: make dma_pgprot() honor DMA_ATTR_CC_SHARED
>   dma-direct: pass attrs to dma_capable() for DMA_ATTR_CC_SHARED checks
>   dma-direct: make dma_direct_map_phys() honor DMA_ATTR_CC_SHARED
>   dma-direct: set decrypted flag for remapped DMA allocations
>   dma-direct: select DMA address encoding from DMA_ATTR_CC_SHARED
>   dma-pool: fix page leak in atomic_pool_expand() cleanup
>   dma-direct: rename ret to cpu_addr in alloc helpers
>   dma-direct: return struct page from dma_direct_alloc_from_pool()
>   iommu/dma: Check atomic pool allocation result directly
>   dma: swiotlb: free dynamic pools from process context
>   dma: swiotlb: handle set_memory_decrypted() failures
>   dma: free atomic pool pages by physical address
>   swiotlb: Preserve allocation virtual address for dynamic pools
> 
>  arch/arm64/include/asm/hypervisor.h           |   6 +
>  arch/arm64/include/asm/mem_encrypt.h          |   3 +-
>  arch/arm64/kernel/rsi.c                       |  12 -
>  arch/arm64/mm/init.c                          |  17 +-
>  arch/powerpc/platforms/pseries/svm.c          |   2 +-
>  arch/s390/Kconfig                             |   1 +
>  arch/s390/mm/init.c                           |  16 +-
>  arch/x86/kernel/amd_gart_64.c                 |  30 +-
>  arch/x86/kernel/pci-dma.c                     |   4 +-
>  drivers/iommu/dma-iommu.c                     |  15 +-
>  drivers/virt/coco/pkvm-guest/arm-pkvm-guest.c |   5 +
>  drivers/xen/swiotlb-xen.c                     |   8 +-
>  include/linux/dma-direct.h                    |  20 +-
>  include/linux/dma-map-ops.h                   |   3 +-
>  include/linux/swiotlb.h                       |  20 +-
>  kernel/dma/direct.c                           | 275 +++++++++++++-----
>  kernel/dma/direct.h                           |  47 +--
>  kernel/dma/mapping.c                          |  16 +-
>  kernel/dma/pool.c                             | 221 ++++++++++----
>  kernel/dma/swiotlb.c                          | 270 +++++++++++++----
>  20 files changed, 717 insertions(+), 274 deletions(-)
> 

I tested the series in a linux-next20260518 kernel, running in an
Azure VM on the Hyper-V hypervisor. The physical processor is Intel
XEON(R) PLATINUM 8573C with TDX memory encryption in use, so
this is a Linux CoCo VM. The VM has the usual VMBus synthetic disk
and network devices provided by Hyper-V, plus two PCI NVMe devices
that are directly assigned to the VM. I did basic smoke tests in the
VM, including reading and writing the NVMe devices. The swiotlb is
used as expected for DMA transfers to/from the synthetic and NVMe
devices. The NVMe driver does dma_alloc_coherent() to allocate
memory for control structures that must be decrypted. I did "unbind"
on the NVMe devices, and then rebound them so the dma allocations
would be freed and then reallocated. All looks good.

I'd like to try the same tests in a CoCo VM based on AMD SEV-SNP,
but I need to get quota for that VM size in Azure, and I don't know
how soon that can happen.

So as described above,

Tested-by: Michael Kelley <mhklinux@outlook.com>

^ permalink raw reply

* RE: [PATCH v5 10/20] dma-direct: make dma_direct_map_phys() honor DMA_ATTR_CC_SHARED
From: Michael Kelley @ 2026-05-26  2:56 UTC (permalink / raw)
  To: Aneesh Kumar K.V (Arm), iommu@lists.linux.dev,
	linux-arm-kernel@lists.infradead.org,
	linux-kernel@vger.kernel.org, linux-coco@lists.linux.dev
  Cc: Robin Murphy, Marek Szyprowski, Will Deacon, Marc Zyngier,
	Steven Price, Suzuki K Poulose, Catalin Marinas, Jiri Pirko,
	Jason Gunthorpe, Mostafa Saleh, Petr Tesarik,
	Alexey Kardashevskiy, Dan Williams, Xu Yilun,
	linuxppc-dev@lists.ozlabs.org, linux-s390@vger.kernel.org,
	Madhavan Srinivasan, Michael Ellerman, Nicholas Piggin,
	Christophe Leroy (CS GROUP), Alexander Gordeev, Gerald Schaefer,
	Heiko Carstens, Vasily Gorbik, Christian Borntraeger,
	Sven Schnelle, x86@kernel.org, Jiri Pirko
In-Reply-To: <20260522042815.370873-11-aneesh.kumar@kernel.org>

From: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org> Sent: Thursday, May 21, 2026 9:28 PM
> 
> Teach dma_direct_map_phys() to select the DMA address encoding based on
> DMA_ATTR_CC_SHARED.
> 
> Use phys_to_dma_unencrypted() for decrypted mappings and
> phys_to_dma_encrypted() otherwise. If a device requires unencrypted DMA
> but the source physical address is still encrypted, force the mapping
> through swiotlb so the DMA address and backing memory attributes remain
> consistent.
> 
> Update the arm64, x86, s390 and powerpc secure-guest setup to no longer
> use the SWIOTLB_FORCE option.
> 
> Tested-by: Jiri Pirko <jiri@nvidia.com>
> Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
> ---
> Changes from v3:
> * Handle DMA_ATTR_MMIO
> ---
>  arch/arm64/mm/init.c                 |  4 +--
>  arch/powerpc/platforms/pseries/svm.c |  2 +-
>  arch/s390/mm/init.c                  |  2 +-
>  arch/x86/kernel/pci-dma.c            |  4 +--
>  kernel/dma/direct.c                  |  4 ++-
>  kernel/dma/direct.h                  | 45 +++++++++++++++-------------
>  6 files changed, 31 insertions(+), 30 deletions(-)
> 
> diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
> index c1b223e7cc8e..a087ac5b15f7 100644
> --- a/arch/arm64/mm/init.c
> +++ b/arch/arm64/mm/init.c
> @@ -340,10 +340,8 @@ void __init arch_mm_preinit(void)
>  	unsigned int flags = SWIOTLB_VERBOSE;
>  	bool swiotlb = max_pfn > PFN_DOWN(arm64_dma_phys_limit);
> 
> -	if (is_realm_world()) {
> +	if (is_realm_world())
>  		swiotlb = true;
> -		flags |= SWIOTLB_FORCE;
> -	}
> 
>  	if (IS_ENABLED(CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC) && !swiotlb)
> {
>  		/*
> diff --git a/arch/powerpc/platforms/pseries/svm.c
> b/arch/powerpc/platforms/pseries/svm.c
> index 384c9dc1899a..7a403dbd35ee 100644
> --- a/arch/powerpc/platforms/pseries/svm.c
> +++ b/arch/powerpc/platforms/pseries/svm.c
> @@ -29,7 +29,7 @@ static int __init init_svm(void)
>  	 * need to use the SWIOTLB buffer for DMA even if dma_capable() says
>  	 * otherwise.
>  	 */
> -	ppc_swiotlb_flags |= SWIOTLB_ANY | SWIOTLB_FORCE;
> +	ppc_swiotlb_flags |= SWIOTLB_ANY;
> 
>  	/* Share the SWIOTLB buffer with the host. */
>  	swiotlb_update_mem_attributes();
> diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
> index ad3c6d92b801..581af1483c42 100644
> --- a/arch/s390/mm/init.c
> +++ b/arch/s390/mm/init.c
> @@ -163,7 +163,7 @@ static void __init pv_init(void)
>  	virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc);
> 
>  	/* make sure bounce buffers are shared */
> -	swiotlb_init(true, SWIOTLB_FORCE | SWIOTLB_VERBOSE);
> +	swiotlb_init(true, SWIOTLB_VERBOSE);
>  	swiotlb_update_mem_attributes();
>  }
> 
> diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
> index 6267363e0189..75cf8f6ae8cd 100644
> --- a/arch/x86/kernel/pci-dma.c
> +++ b/arch/x86/kernel/pci-dma.c
> @@ -59,10 +59,8 @@ static void __init pci_swiotlb_detect(void)
>  	 * bounce buffers as the hypervisor can't access arbitrary VM memory
>  	 * that is not explicitly shared with it.
>  	 */
> -	if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
> +	if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
>  		x86_swiotlb_enable = true;
> -		x86_swiotlb_flags |= SWIOTLB_FORCE;
> -	}

With this patch removing SWIOTLB_FORCE from four places in
kernel code, there are no remaining places where it is set.
The test of SWIOTLB_FORCE could be removed from
swiotlb_init_remap(), and its definition could be deleted
from include/linux/swiotlb.h.

Michael

^ permalink raw reply

* [PATCH v6 11/11] Documentation/x86: Add documentation for TDX's Dynamic PAMT
From: Rick Edgecombe @ 2026-05-26  2:35 UTC (permalink / raw)
  To: bp, dave.hansen, hpa, kas, kvm, linux-coco, linux-doc,
	linux-kernel, mingo, nik.borisov, pbonzini, seanjc, tglx,
	vannapurve, x86, chao.gao, yan.y.zhao, kai.huang
  Cc: rick.p.edgecombe, Kirill A. Shutemov
In-Reply-To: <20260526023515.288829-1-rick.p.edgecombe@intel.com>

From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>

Expand TDX documentation to include information on the Dynamic PAMT
feature.

The new section explains PAMT support in the TDX module and how Dynamic
PAMT affects kernel memory use.

Assisted-by: Sashiko:claude-opus-4-6 GitHub Copilot:claude-opus-4-6 Claude:claude-opus-4-7
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Co-developed-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
---
v6:
 - Add missing word (Binbin)
 - Use "::" instead of ":"
 - Make format of dmesg example accurate

v3:
 - Trim down docs to be about things that user cares about, instead
   of development history and other details like this.
---
 Documentation/arch/x86/tdx.rst | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/Documentation/arch/x86/tdx.rst b/Documentation/arch/x86/tdx.rst
index ff6b110291bc6..ce026a88b6f78 100644
--- a/Documentation/arch/x86/tdx.rst
+++ b/Documentation/arch/x86/tdx.rst
@@ -73,6 +73,28 @@ initialize::
 
   [..] virt/tdx: TDX-Module initialization failed ...
 
+Dynamic PAMT
+------------
+
+PAMT is memory that the TDX module needs to keep data about each page
+(think of it like struct page). It needs to be handed to the TDX module for
+its exclusive use. For normal PAMT, this is installed when the TDX module
+is first loaded and comes to about 0.4% of system memory.
+
+Dynamic PAMT is a TDX feature that allows the VMM to allocate part of the
+PAMT as needed (the parts for tracking 4KB pages). The PAMT for the other
+page sizes (1GB and 2MB) is still allocated statically at the time of
+TDX module initialization. This reduces the amount of memory that TDX
+uses while TDs are not in use.
+
+When Dynamic PAMT is in use, dmesg shows it like::
+
+  [..] virt/tdx: Enable Dynamic PAMT
+  [..] virt/tdx: 10092 KB allocated for PAMT
+  [..] virt/tdx: TDX-Module initialized
+
+Dynamic PAMT is enabled automatically if supported.
+
 TDX Interaction to Other Kernel Components
 ------------------------------------------
 
-- 
2.54.0


^ permalink raw reply related

* [PATCH v6 08/11] x86/tdx: Add APIs to support Dynamic PAMT ops from KVM's fault path
From: Rick Edgecombe @ 2026-05-26  2:35 UTC (permalink / raw)
  To: bp, dave.hansen, hpa, kas, kvm, linux-coco, linux-doc,
	linux-kernel, mingo, nik.borisov, pbonzini, seanjc, tglx,
	vannapurve, x86, chao.gao, yan.y.zhao, kai.huang
  Cc: rick.p.edgecombe
In-Reply-To: <20260526023515.288829-1-rick.p.edgecombe@intel.com>

When handling an EPT violation, KVM holds a spinlock while manipulating
the EPT. Before entering the spinlock it doesn't know how many EPT page
tables will need to be installed or whether a huge page will be used. For
this reason it allocates a worst case number of page tables that it might
need as part of servicing the EPT violation.

Under Dynamic PAMT these pre-allocated pages will potentially need to have
Dynamic PAMT backing pages installed for them. KVM already has helpers to
manage topping up page caches before taking the MMU lock, but they cannot be
passed from KVM to arch/x86 code.

The problem of how and when to install the DPAMT backing pages for the
pages given to the TDX module during the fault path has had a lot of
design attempts.
 - Extracting KVM's MMU caches requires too much inlined code added to
   headers.
 - A few varieties of installing Dynamic PAMT backing when allocating the
   S-EPT page tables. [0][1]
 - Using mempool_t to transfer the pages between KVM and arch/x86 doesn't
   work because the component is designed more around maintaining a
   pool of pages, rather than topping up a continually drained cache.

So don't do these as they all had various problems. Instead just create a
small simple data structure to use for handing a pre-allocated list of
pages between KVM and arch/x86 code. Model this on KVM's existing MMU
memory caches.

Add a tdx_pamt_cache arg to tdx_pamt_get() so it can draw pages from a
cache when needed. Not all DPAMT page installations happen under a
spinlock; installations for control pages, for example, do not. So have
tdx_pamt_get() maintain the existing behavior of allocating from the page
allocator when NULL is passed for the struct tdx_pamt_cache arg. This
prevents excess allocations in cases where they can be avoided.

Export the new helpers for KVM.

Assisted-by: GitHub Copilot:claude-opus-4-6 Claude:claude-opus-4-7
Co-developed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Link: https://lore.kernel.org/kvm/de05853257e9cc66998101943f78a4b7e6e3d741.camel@intel.com/ [0]
Link: https://lore.kernel.org/kvm/aYprxnSHKHUtk7pt@google.com/ [1]
---
v6:
 - Filled out log from Sean's series
---
 arch/x86/include/asm/tdx.h  | 17 ++++++++++
 arch/x86/virt/vmx/tdx/tdx.c | 65 +++++++++++++++++++++++++++++++++----
 2 files changed, 76 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index 74e75db5728c7..191da84bbf2a1 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -155,6 +155,23 @@ static inline bool tdx_supports_dynamic_pamt(const struct tdx_sys_info *sysinfo)
 	return false; /* To be enabled when kernel is ready */
 }
 
+/* Simple structure for pre-allocating Dynamic PAMT pages outside of locks. */
+struct tdx_pamt_cache {
+	struct list_head page_list;
+	int cnt;
+};
+
+static inline void tdx_init_pamt_cache(struct tdx_pamt_cache *cache)
+{
+	INIT_LIST_HEAD(&cache->page_list);
+	cache->cnt = 0;
+}
+
+void tdx_free_pamt_cache(struct tdx_pamt_cache *cache);
+int tdx_topup_pamt_cache(struct tdx_pamt_cache *cache, unsigned long npages);
+int tdx_pamt_get(kvm_pfn_t pfn, struct tdx_pamt_cache *cache);
+void tdx_pamt_put(kvm_pfn_t pfn);
+
 int tdx_guest_keyid_alloc(void);
 u32 tdx_get_nr_guest_keyids(void);
 void tdx_guest_keyid_free(unsigned int keyid);
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index c41c632a4cdf2..3544794fb092a 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -1971,12 +1971,33 @@ u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, kvm_pfn_t pfn)
 }
 EXPORT_SYMBOL_FOR_KVM(tdh_phymem_page_wbinvd_hkid);
 
-static int alloc_pamt_array(struct page **pamt_pages)
+static struct page *tdx_alloc_page_pamt_cache(struct tdx_pamt_cache *cache)
+{
+	struct page *page;
+
+	page = list_first_entry_or_null(&cache->page_list, struct page, lru);
+	if (page) {
+		list_del(&page->lru);
+		cache->cnt--;
+	}
+
+	return page;
+}
+
+static struct page *alloc_dpamt_page(struct tdx_pamt_cache *cache)
+{
+	if (cache)
+		return tdx_alloc_page_pamt_cache(cache);
+
+	return alloc_page(GFP_KERNEL_ACCOUNT);
+}
+
+static int alloc_pamt_array(struct page **pamt_pages, struct tdx_pamt_cache *cache)
 {
 	int i, j;
 
 	for (i = 0; i < TDX_DPAMT_ENTRY_PAGE_CNT; i++) {
-		pamt_pages[i] = alloc_page(GFP_KERNEL_ACCOUNT);
+		pamt_pages[i] = alloc_dpamt_page(cache);
 		if (!pamt_pages[i])
 			goto err;
 	}
@@ -2047,7 +2068,7 @@ static u64 tdh_phymem_pamt_remove(kvm_pfn_t pfn, struct page **pamt_pages)
 static DEFINE_SPINLOCK(pamt_lock);
 
 /* Bump PAMT refcount for the given page and allocate PAMT memory if needed */
-static int tdx_pamt_get(kvm_pfn_t pfn)
+int tdx_pamt_get(kvm_pfn_t pfn, struct tdx_pamt_cache *cache)
 {
 	struct page *pamt_pages[TDX_DPAMT_ENTRY_PAGE_CNT];
 	atomic_t *pamt_refcount;
@@ -2066,7 +2087,7 @@ static int tdx_pamt_get(kvm_pfn_t pfn)
 	if (atomic_inc_not_zero(pamt_refcount))
 		return 0;
 
-	ret = alloc_pamt_array(pamt_pages);
+	ret = alloc_pamt_array(pamt_pages, cache);
 	if (ret)
 		return ret;
 
@@ -2106,12 +2127,13 @@ static int tdx_pamt_get(kvm_pfn_t pfn)
 	free_pamt_array(pamt_pages);
 	return ret;
 }
+EXPORT_SYMBOL_FOR_KVM(tdx_pamt_get);
 
 /*
  * Drop PAMT refcount for the given page and free PAMT memory if it is no
  * longer needed.
  */
-static void tdx_pamt_put(kvm_pfn_t pfn)
+void tdx_pamt_put(kvm_pfn_t pfn)
 {
 	struct page *pamt_pages[TDX_DPAMT_ENTRY_PAGE_CNT] = {};
 	atomic_t *pamt_refcount;
@@ -2152,6 +2174,37 @@ static void tdx_pamt_put(kvm_pfn_t pfn)
 
 	free_pamt_array(pamt_pages);
 }
+EXPORT_SYMBOL_FOR_KVM(tdx_pamt_put);
+
+void tdx_free_pamt_cache(struct tdx_pamt_cache *cache)
+{
+	struct page *page;
+
+	while ((page = tdx_alloc_page_pamt_cache(cache)))
+		__free_page(page);
+}
+EXPORT_SYMBOL_FOR_KVM(tdx_free_pamt_cache);
+
+int tdx_topup_pamt_cache(struct tdx_pamt_cache *cache, unsigned long npages)
+{
+	if (WARN_ON_ONCE(!tdx_supports_dynamic_pamt(&tdx_sysinfo)))
+		return 0;
+
+	npages *= TDX_DPAMT_ENTRY_PAGE_CNT;
+
+	while (cache->cnt < npages) {
+		struct page *page = alloc_page(GFP_KERNEL_ACCOUNT);
+
+		if (!page)
+			return -ENOMEM;
+
+		list_add(&page->lru, &cache->page_list);
+		cache->cnt++;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_FOR_KVM(tdx_topup_pamt_cache);
 
 /*
  * Return a page that can be gifted to the TDX-Module for use as a "control"
@@ -2167,7 +2220,7 @@ struct page *tdx_alloc_control_page(void)
 	if (!page)
 		return NULL;
 
-	if (tdx_pamt_get(page_to_pfn(page))) {
+	if (tdx_pamt_get(page_to_pfn(page), NULL)) {
 		__free_page(page);
 		return NULL;
 	}
-- 
2.54.0


^ permalink raw reply related

* [PATCH v6 09/11] KVM: TDX: Get/put PAMT pages when (un)mapping private memory
From: Rick Edgecombe @ 2026-05-26  2:35 UTC (permalink / raw)
  To: bp, dave.hansen, hpa, kas, kvm, linux-coco, linux-doc,
	linux-kernel, mingo, nik.borisov, pbonzini, seanjc, tglx,
	vannapurve, x86, chao.gao, yan.y.zhao, kai.huang
  Cc: rick.p.edgecombe, Kirill A. Shutemov
In-Reply-To: <20260526023515.288829-1-rick.p.edgecombe@intel.com>

From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>

Add Dynamic PAMT support to KVM's S-EPT MMU by "getting" a PAMT page when
adding guest memory (PAGE.ADD or PAGE.AUG), and "putting" the page when
removing guest memory (PAGE.REMOVE).

To access the per-vCPU PAMT caches without plumbing @vcpu throughout the
TDP MMU, begrudgingly use kvm_get_running_vcpu() to get the vCPU, and bug
the VM if KVM attempts to set an S-EPT leaf without an active vCPU.  KVM
only supports creating _new_ mappings in page (pre)fault paths, all of
which require an active vCPU.

The PAMT memory holds metadata for TDX-protected memory. With Dynamic
PAMT, PAMT_4K is allocated on demand. The kernel supplies the TDX module
with a few pages that cover 2M of host physical memory.

Releases are balanced via tdx_pamt_put(): every control-page free goes
through tdx_free_control_page(), and guest data pages are put directly on
the successful tdh_mem_page_remove() path and in the
tdx_mem_page_add/aug() error path.

Assisted-by: Sashiko:claude-opus-4-6 GitHub Copilot:claude-opus-4-6 Claude:claude-opus-4-7
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Co-developed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Co-developed-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
---
v6:
 - Don't have topup op take a min param (Yan, Sean)
 - Make log match style of the rest of the series
 - Adjustments from dropping error helper patches
---
 arch/x86/include/asm/kvm-x86-ops.h |  1 +
 arch/x86/include/asm/kvm_host.h    |  2 +
 arch/x86/kvm/mmu/mmu.c             |  4 ++
 arch/x86/kvm/vmx/tdx.c             | 65 ++++++++++++++++++++++++++----
 arch/x86/kvm/vmx/tdx.h             |  2 +
 5 files changed, 66 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 10ccf6ea9d9a2..320f1d30edacc 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -97,6 +97,7 @@ KVM_X86_OP_OPTIONAL_RET0(get_mt_mask)
 KVM_X86_OP(load_mmu_pgd)
 KVM_X86_OP_OPTIONAL_RET0(set_external_spte)
 KVM_X86_OP_OPTIONAL(free_external_spt)
+KVM_X86_OP_OPTIONAL_RET0(topup_external_cache)
 KVM_X86_OP(has_wbinvd_exit)
 KVM_X86_OP(get_l2_tsc_offset)
 KVM_X86_OP(get_l2_tsc_multiplier)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6b28dd387bc61..bfe92e993a212 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1898,6 +1898,8 @@ struct kvm_x86_ops {
 	/* Update external page tables for page table about to be freed. */
 	void (*free_external_spt)(struct kvm *kvm, struct kvm_mmu_page *sp);
 
+	int (*topup_external_cache)(struct kvm_vcpu *vcpu, int min_nr_spts);
+
 
 	bool (*has_wbinvd_exit)(void);
 
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 892246204435c..2a48fc7fccc11 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -607,6 +607,10 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
 					       PT64_ROOT_MAX_LEVEL);
 		if (r)
 			return r;
+
+		r = kvm_x86_call(topup_external_cache)(vcpu, PT64_ROOT_MAX_LEVEL);
+		if (r)
+			return r;
 	}
 	r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
 				       PT64_ROOT_MAX_LEVEL);
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 3e67e2471ffe3..ee073cacafbec 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -685,6 +685,8 @@ int tdx_vcpu_create(struct kvm_vcpu *vcpu)
 	if (!irqchip_split(vcpu->kvm))
 		return -EINVAL;
 
+	tdx_init_pamt_cache(&tdx->pamt_cache);
+
 	fpstate_set_confidential(&vcpu->arch.guest_fpu);
 	vcpu->arch.apic->guest_apic_protected = true;
 	INIT_LIST_HEAD(&tdx->vt.pi_wakeup_list);
@@ -870,6 +872,8 @@ void tdx_vcpu_free(struct kvm_vcpu *vcpu)
 	struct vcpu_tdx *tdx = to_tdx(vcpu);
 	int i;
 
+	tdx_free_pamt_cache(&tdx->pamt_cache);
+
 	if (vcpu->cpu != -1) {
 		KVM_BUG_ON(tdx->state == VCPU_TD_STATE_INITIALIZED, vcpu->kvm);
 		tdx_flush_vp_on_cpu(vcpu);
@@ -1611,6 +1615,16 @@ void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level)
 	td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa);
 }
 
+static int tdx_topup_external_pamt_cache(struct kvm_vcpu *vcpu, int min_nr_spts)
+{
+	/*
+	 * Don't cover the root SPT, but cover a possible 4KB private
+	 * page in addition to the SPTs. So -1 to exclude the root
+	 * SPT, and +1 for the guest page cancel out.
+	 */
+	return tdx_topup_pamt_cache(&to_tdx(vcpu)->pamt_cache, min_nr_spts);
+}
+
 static int tdx_mem_page_add(struct kvm *kvm, gfn_t gfn, enum pg_level level,
 			    kvm_pfn_t pfn)
 {
@@ -1669,16 +1683,29 @@ static struct page *tdx_spte_to_sept_pt(struct kvm *kvm, gfn_t gfn,
 static int tdx_sept_map_nonleaf_spte(struct kvm *kvm, gfn_t gfn,
 				     enum pg_level level, u64 new_spte)
 {
+	struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
+	struct vcpu_tdx *tdx = to_tdx(vcpu);
 	gpa_t gpa = gfn_to_gpa(gfn);
 	u64 err, entry, level_state;
 	struct page *sept_pt;
+	int ret;
+
+	if (KVM_BUG_ON(!vcpu, kvm))
+		return -EIO;
 
 	sept_pt = tdx_spte_to_sept_pt(kvm, gfn, new_spte, level);
 	if (!sept_pt)
 		return -EIO;
 
+	ret = tdx_pamt_get(page_to_pfn(sept_pt), &tdx->pamt_cache);
+	if (ret)
+		return ret;
+
 	err = tdh_mem_sept_add(&to_kvm_tdx(kvm)->td, gpa, level, sept_pt,
 			       &entry, &level_state);
+	if (err)
+		tdx_pamt_put(page_to_pfn(sept_pt));
+
 	if (unlikely(tdx_operand_busy(err)))
 		return -EBUSY;
 
@@ -1691,8 +1718,14 @@ static int tdx_sept_map_nonleaf_spte(struct kvm *kvm, gfn_t gfn,
 static int tdx_sept_map_leaf_spte(struct kvm *kvm, gfn_t gfn, enum pg_level level,
 				  u64 new_spte)
 {
+	struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
 	kvm_pfn_t pfn = spte_to_pfn(new_spte);
+	struct vcpu_tdx *tdx = to_tdx(vcpu);
+	int ret;
+
+	if (KVM_BUG_ON(!vcpu, kvm))
+		return -EIO;
 
 	/* TODO: handle large pages. */
 	if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
@@ -1700,6 +1733,10 @@ static int tdx_sept_map_leaf_spte(struct kvm *kvm, gfn_t gfn, enum pg_level leve
 
 	WARN_ON_ONCE((new_spte & VMX_EPT_RWX_MASK) != VMX_EPT_RWX_MASK);
 
+	ret = tdx_pamt_get(pfn, &tdx->pamt_cache);
+	if (ret)
+		return ret;
+
 	/*
 	 * Ensure pre_fault_allowed is read by kvm_arch_vcpu_pre_fault_memory()
 	 * before kvm_tdx->state.  Userspace must not be allowed to pre-fault
@@ -1712,10 +1749,15 @@ static int tdx_sept_map_leaf_spte(struct kvm *kvm, gfn_t gfn, enum pg_level leve
 	 * If the TD isn't finalized/runnable, then userspace is initializing
 	 * the VM image via KVM_TDX_INIT_MEM_REGION; ADD the page to the TD.
 	 */
-	if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE))
-		return tdx_mem_page_add(kvm, gfn, level, pfn);
+	if (likely(kvm_tdx->state == TD_STATE_RUNNABLE))
+		ret = tdx_mem_page_aug(kvm, gfn, level, pfn);
+	else
+		ret = tdx_mem_page_add(kvm, gfn, level, pfn);
 
-	return tdx_mem_page_aug(kvm, gfn, level, pfn);
+	if (ret)
+		tdx_pamt_put(pfn);
+
+	return ret;
 }
 
 /*
@@ -1812,6 +1854,7 @@ static int tdx_sept_remove_leaf_spte(struct kvm *kvm, gfn_t gfn,
 		return -EIO;
 
 	tdx_quirk_reset_paddr(PFN_PHYS(pfn), PAGE_SIZE);
+	tdx_pamt_put(pfn);
 	return 0;
 }
 
@@ -1855,6 +1898,8 @@ static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte,
  */
 static void tdx_sept_free_private_spt(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
+	struct page *sept_pt = virt_to_page(sp->external_spt);
+
 	/*
 	 * KVM doesn't (yet) zap page table pages in mirror page table while
 	 * TD is active, though guest pages mapped in mirror page table could be
@@ -1868,15 +1913,15 @@ static void tdx_sept_free_private_spt(struct kvm *kvm, struct kvm_mmu_page *sp)
 	 * the page to prevent the kernel from accessing the encrypted page.
 	 */
 	if (KVM_BUG_ON(is_hkid_assigned(to_kvm_tdx(kvm)), kvm) ||
-	    tdx_reclaim_page(virt_to_page(sp->external_spt)))
+	    tdx_reclaim_page(sept_pt))
 		goto out;
 
 	/*
-	 * Immediately free the S-EPT page because RCU-time free is unnecessary
-	 * after TDH.PHYMEM.PAGE.RECLAIM ensures there are no outstanding
-	 * readers.
+	 * Immediately free the S-EPT page as the TDX subsystem doesn't support
+	 * freeing pages from RCU callbacks, and more importantly because
+	 * TDH.PHYMEM.PAGE.RECLAIM ensures there are no outstanding readers.
 	 */
-	free_page((unsigned long)sp->external_spt);
+	tdx_free_control_page(sept_pt);
 out:
 	sp->external_spt = NULL;
 }
@@ -3468,6 +3513,10 @@ int __init tdx_hardware_setup(void)
 
 	vt_x86_ops.set_external_spte = tdx_sept_set_private_spte;
 	vt_x86_ops.free_external_spt = tdx_sept_free_private_spt;
+
+	if (tdx_supports_dynamic_pamt(tdx_sysinfo))
+		vt_x86_ops.topup_external_cache = tdx_topup_external_pamt_cache;
+
 	vt_x86_ops.protected_apic_has_interrupt = tdx_protected_apic_has_interrupt;
 	return 0;
 
diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
index b5cd2ffb303e5..47334a5a74eab 100644
--- a/arch/x86/kvm/vmx/tdx.h
+++ b/arch/x86/kvm/vmx/tdx.h
@@ -73,6 +73,8 @@ struct vcpu_tdx {
 
 	u64 map_gpa_next;
 	u64 map_gpa_end;
+
+	struct tdx_pamt_cache pamt_cache;
 };
 
 void tdh_vp_rd_failed(struct vcpu_tdx *tdx, char *uclass, u32 field, u64 err);
-- 
2.54.0


^ permalink raw reply related

* [PATCH v6 10/11] x86/virt/tdx: Enable Dynamic PAMT
From: Rick Edgecombe @ 2026-05-26  2:35 UTC (permalink / raw)
  To: bp, dave.hansen, hpa, kas, kvm, linux-coco, linux-doc,
	linux-kernel, mingo, nik.borisov, pbonzini, seanjc, tglx,
	vannapurve, x86, chao.gao, yan.y.zhao, kai.huang
  Cc: rick.p.edgecombe, Kirill A. Shutemov
In-Reply-To: <20260526023515.288829-1-rick.p.edgecombe@intel.com>

From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>

The Physical Address Metadata Table (PAMT) holds TDX metadata for
physical memory and must be allocated by the kernel during TDX module
initialization. Dynamic PAMT is a TDX module feature that can reduce this
memory use by allocating part of the PAMT dynamically.

All pieces are in place to enable Dynamic PAMT if it is supported.
Determine if the TDX module supports it by checking the 'features0' bit
exposed by the TDX module.

The TDX module also exposes information about whether the *system* (and
not the module) supports Dynamic PAMT.

The TDX module documentation describes how PAMT works internally. To allow
the last level to be dynamically allocated, it uses a 3 level tree
structure, not unlike page tables. Like page tables, it has a maximum
address space that it can cover. This address space can be covered in 48
bits. If the host physical address space is higher than this, then the
TDX module can't guarantee the tree will be able to cover the TDX memory.

The TDX module exposes this system support via metadata stating the
minimum number of HKIDs that need to be available in order for Dynamic
PAMT to be usable. The reasoning appears to be that more HKIDs can shrink
the "real" addressable physical address bits enough to make the 48 bit
Dynamic PAMT limit workable on high physical address width HW. However,
the docs also clearly explain the 48 bit limit and how this fits into the
Dynamic PAMT tree constraints.

The handy x86_phys_bits value is already read and adjusted for keyid bits.
So just compare that against 48 instead of reading more metadata and
burdening the code with the more tenuous connection to minimum HKID bits.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Co-developed-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
---
v6:
 - After Nikolai pointed out that the TDX docs actually have the Dynamic
   PAMT pages-per-2MB region fixed at 2 instead of variable sized, I
   checked over the docs more closely looking for anything else that might
   have been missed. Spotted this 48 bit physical address bit check in the
   docs, so added it.
---
 arch/x86/include/asm/tdx.h  | 11 ++++++++++-
 arch/x86/virt/vmx/tdx/tdx.c | 11 +++++++++--
 arch/x86/virt/vmx/tdx/tdx.h |  3 ---
 3 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index 191da84bbf2a1..187014686df3e 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -33,6 +33,10 @@
 #define TDX_SUCCESS		0ULL
 #define TDX_RND_NO_ENTROPY	0x8000020300000000ULL
 
+/* Bit definitions of TDX_FEATURES0 metadata field */
+#define TDX_FEATURES0_NO_RBP_MOD		BIT_ULL(18)
+#define TDX_FEATURES0_DYNAMIC_PAMT		BIT_ULL(36)
+
 #ifndef __ASSEMBLER__
 
 #include <uapi/asm/mce.h>
@@ -152,7 +156,12 @@ const struct tdx_sys_info *tdx_get_sysinfo(void);
 
 static inline bool tdx_supports_dynamic_pamt(const struct tdx_sys_info *sysinfo)
 {
-	return false; /* To be enabled when kernel is ready */
+	/*
+	 * The TDX Module's internal Dynamic PAMT tree structure can't
+	 * handle physical addresses with more than 48 bits.
+	 */
+	return sysinfo->features.tdx_features0 & TDX_FEATURES0_DYNAMIC_PAMT &&
+	       boot_cpu_data.x86_phys_bits <= 48;
 }
 
 /* Simple structure for pre-allocating Dynamic PAMT pages outside of locks. */
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index 3544794fb092a..75140511571bf 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -1028,8 +1028,9 @@ static __init int construct_tdmrs(struct list_head *tmb_list,
 	return ret;
 }
 
-static __init int config_tdx_module(struct tdmr_info_list *tdmr_list,
-				    u64 global_keyid)
+#define TDX_SYS_CONFIG_DYNAMIC_PAMT	BIT(16)
+
+static __init int config_tdx_module(struct tdmr_info_list *tdmr_list, u64 global_keyid)
 {
 	struct tdx_module_args args = {};
 	u64 *tdmr_pa_array;
@@ -1056,6 +1057,12 @@ static __init int config_tdx_module(struct tdmr_info_list *tdmr_list,
 	args.rcx = __pa(tdmr_pa_array);
 	args.rdx = tdmr_list->nr_consumed_tdmrs;
 	args.r8 = global_keyid;
+
+	if (tdx_supports_dynamic_pamt(&tdx_sysinfo)) {
+		pr_info("Enable Dynamic PAMT\n");
+		args.r8 |= TDX_SYS_CONFIG_DYNAMIC_PAMT;
+	}
+
 	ret = seamcall_prerr(TDH_SYS_CONFIG, &args);
 
 	/* Free the array as it is not required anymore. */
diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h
index 8c39dde347cc2..68a68468fbeb6 100644
--- a/arch/x86/virt/vmx/tdx/tdx.h
+++ b/arch/x86/virt/vmx/tdx/tdx.h
@@ -86,9 +86,6 @@ struct tdmr_info {
 	DECLARE_FLEX_ARRAY(struct tdmr_reserved_area, reserved_areas);
 } __packed __aligned(TDMR_INFO_ALIGNMENT);
 
-/* Bit definitions of TDX_FEATURES0 metadata field */
-#define TDX_FEATURES0_NO_RBP_MOD	BIT(18)
-
 /*
  * Do not put any hardware-defined TDX structure representations below
  * this comment!
-- 
2.54.0


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox