Linux Confidential Computing Development

Linux Confidential Computing Development
 help / color / mirror / Atom feed

* Re: [PATCH v6 01/11] x86/virt/tdx: Simplify tdmr_get_pamt_sz()
From: Kiryl Shutsemau @ 2026-06-04 16:05 UTC (permalink / raw)
  To: Rick Edgecombe
  Cc: bp, dave.hansen, hpa, kvm, linux-coco, linux-doc, linux-kernel,
	mingo, nik.borisov, pbonzini, seanjc, tglx, vannapurve, x86,
	chao.gao, yan.y.zhao, kai.huang, Binbin Wu
In-Reply-To: <20260526023515.288829-2-rick.p.edgecombe@intel.com>

On Mon, May 25, 2026 at 07:35:05PM -0700, Rick Edgecombe wrote:
> For each memory region that the TDX module might use (called TDMR), three
> separate traditional PAMT allocations are needed. One for each supported
> page size (1GB, 2MB, 4KB). These store information on each page in the
> TDMR. In Linux, they are allocated out of one physically contiguous block,
> in order to more efficiently use some internal TDX module book keeping
> resources. So some simple math is needed to break the single large
> allocation into three smaller allocations for each page size.
> 
> There are some commonalities in the math needed to calculate the base and
> size for each smaller allocation, and so an effort was made to share logic
> across the three. Unfortunately doing this turned out unnaturally tortured,
> with a loop iterating over the three page sizes, only to call into a
> function with cases statement for each page size. In the future Dynamic
> PAMT will add more logic that is special to the 4KB page size, making the
> benefit of the math sharing even more questionable.
> 
> Three is not a very high number, so get rid of the loop and just duplicate
> the small calculation three times. In doing so, setup for future Dynamic
> PAMT changes.
> 
> Since the loop that iterates over it is gone, further simplify the code by
> dropping the array of intermediate size and base storage. Just store the
> values to their final locations. Accept the small complication of having
> to clear tdmr->pamt_4k_base in the error path, so that tdmr_do_pamt_func()
> will not try to operate on the TDMR struct when attempting to free it.
> 
> Assisted-by: GitHub Copilot:claude-opus-4-6 Claude:claude-opus-4-7
> Reviewed-by: Binbin Wu <binbin.wu@linux.intel.com>
> Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>

Reviewed-by: Kiryl Shutsemau (Meta) <kas@kernel.org>

Couple of nits below.

> ---
> v6:
>  - Drop {} by moving a comment (Binbin)
>  - Log tweaks
> 
> v4:
>  - Just refer to global var instead of passing pamt_entry_size around
>    (Xiaoyao)
>  - Remove setting pamt_4k_base to zero, because it already is zero.
>    Adjust the comment appropriately (Kai)
> 
> v3:
>  - New patch
> ---
>  arch/x86/virt/vmx/tdx/tdx.c | 93 ++++++++++++-------------------------
>  1 file changed, 29 insertions(+), 64 deletions(-)
> 
> diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
> index 967482ae3c801..487f389f52f4b 100644
> --- a/arch/x86/virt/vmx/tdx/tdx.c
> +++ b/arch/x86/virt/vmx/tdx/tdx.c
> @@ -516,31 +516,21 @@ static __init int fill_out_tdmrs(struct list_head *tmb_list,
>   * Calculate PAMT size given a TDMR and a page size.  The returned
>   * PAMT size is always aligned up to 4K page boundary.
>   */
> -static __init unsigned long tdmr_get_pamt_sz(struct tdmr_info *tdmr, int pgsz,
> -					     u16 pamt_entry_size)
> +static __init unsigned long tdmr_get_pamt_sz(struct tdmr_info *tdmr, int pgsz)
>  {
>  	unsigned long pamt_sz, nr_pamt_entries;
> +	const int tdx_pg_size_shift[] = { PAGE_SHIFT, PMD_SHIFT, PUD_SHIFT };
> +	const u16 pamt_entry_size[TDX_PS_NR] = {
> +		tdx_sysinfo.tdmr.pamt_4k_entry_size,
> +		tdx_sysinfo.tdmr.pamt_2m_entry_size,
> +		tdx_sysinfo.tdmr.pamt_1g_entry_size,
> +	};
>  
> -	switch (pgsz) {
> -	case TDX_PS_4K:
> -		nr_pamt_entries = tdmr->size >> PAGE_SHIFT;
> -		break;
> -	case TDX_PS_2M:
> -		nr_pamt_entries = tdmr->size >> PMD_SHIFT;
> -		break;
> -	case TDX_PS_1G:
> -		nr_pamt_entries = tdmr->size >> PUD_SHIFT;
> -		break;
> -	default:
> -		WARN_ON_ONCE(1);
> -		return 0;
> -	}
> +	nr_pamt_entries = tdmr->size >> tdx_pg_size_shift[pgsz];
> +	pamt_sz = nr_pamt_entries * pamt_entry_size[pgsz];
>  
> -	pamt_sz = nr_pamt_entries * pamt_entry_size;
>  	/* TDX requires PAMT size must be 4K aligned */
> -	pamt_sz = ALIGN(pamt_sz, PAGE_SIZE);
> -
> -	return pamt_sz;
> +	return PAGE_ALIGN(pamt_sz);
>  }
>  
>  /*
> @@ -578,28 +568,21 @@ static __init int tdmr_get_nid(struct tdmr_info *tdmr, struct list_head *tmb_lis
>   * within @tdmr, and set up PAMTs for @tdmr.
>   */
>  static __init int tdmr_set_up_pamt(struct tdmr_info *tdmr,
> -				   struct list_head *tmb_list,
> -				   u16 pamt_entry_size[])
> +				   struct list_head *tmb_list)
>  {
> -	unsigned long pamt_base[TDX_PS_NR];
> -	unsigned long pamt_size[TDX_PS_NR];
> -	unsigned long tdmr_pamt_base;
>  	unsigned long tdmr_pamt_size;
>  	struct page *pamt;
> -	int pgsz, nid;
> -
> +	int nid;

Add a newline here?

>  	nid = tdmr_get_nid(tdmr, tmb_list);
>  
>  	/*
>  	 * Calculate the PAMT size for each TDX supported page size
>  	 * and the total PAMT size.
>  	 */
> -	tdmr_pamt_size = 0;
> -	for (pgsz = TDX_PS_4K; pgsz < TDX_PS_NR; pgsz++) {
> -		pamt_size[pgsz] = tdmr_get_pamt_sz(tdmr, pgsz,
> -					pamt_entry_size[pgsz]);
> -		tdmr_pamt_size += pamt_size[pgsz];
> -	}
> +	tdmr->pamt_4k_size = tdmr_get_pamt_sz(tdmr, TDX_PS_4K);
> +	tdmr->pamt_2m_size = tdmr_get_pamt_sz(tdmr, TDX_PS_2M);
> +	tdmr->pamt_1g_size = tdmr_get_pamt_sz(tdmr, TDX_PS_1G);
> +	tdmr_pamt_size = tdmr->pamt_4k_size + tdmr->pamt_2m_size + tdmr->pamt_1g_size;
>  
>  	/*
>  	 * Allocate one chunk of physically contiguous memory for all
> @@ -607,26 +590,18 @@ static __init int tdmr_set_up_pamt(struct tdmr_info *tdmr,
>  	 * in overlapped TDMRs.
>  	 */
>  	pamt = alloc_contig_pages(tdmr_pamt_size >> PAGE_SHIFT, GFP_KERNEL,
> -			nid, &node_online_map);
> +				  nid, &node_online_map);
> +

Looks like unrelated whitespace change. Is it intentional?

> +	/*
> +	 * tdmr->pamt_4k_base is still zero so the error
> +	 * path of the caller will skip freeing the pamt.
> +	 */
>  	if (!pamt)
>  		return -ENOMEM;
>  
> -	/*
> -	 * Break the contiguous allocation back up into the
> -	 * individual PAMTs for each page size.
> -	 */
> -	tdmr_pamt_base = page_to_pfn(pamt) << PAGE_SHIFT;
> -	for (pgsz = TDX_PS_4K; pgsz < TDX_PS_NR; pgsz++) {
> -		pamt_base[pgsz] = tdmr_pamt_base;
> -		tdmr_pamt_base += pamt_size[pgsz];
> -	}
> -
> -	tdmr->pamt_4k_base = pamt_base[TDX_PS_4K];
> -	tdmr->pamt_4k_size = pamt_size[TDX_PS_4K];
> -	tdmr->pamt_2m_base = pamt_base[TDX_PS_2M];
> -	tdmr->pamt_2m_size = pamt_size[TDX_PS_2M];
> -	tdmr->pamt_1g_base = pamt_base[TDX_PS_1G];
> -	tdmr->pamt_1g_size = pamt_size[TDX_PS_1G];
> +	tdmr->pamt_4k_base = page_to_phys(pamt);
> +	tdmr->pamt_2m_base = tdmr->pamt_4k_base + tdmr->pamt_4k_size;
> +	tdmr->pamt_1g_base = tdmr->pamt_2m_base + tdmr->pamt_2m_size;
>  
>  	return 0;
>  }
> @@ -657,10 +632,7 @@ static __init void tdmr_do_pamt_func(struct tdmr_info *tdmr,
>  	tdmr_get_pamt(tdmr, &pamt_base, &pamt_size);
>  
>  	/* Do nothing if PAMT hasn't been allocated for this TDMR */
> -	if (!pamt_size)
> -		return;
> -
> -	if (WARN_ON_ONCE(!pamt_base))
> +	if (!pamt_base)
>  		return;
>  
>  	pamt_func(pamt_base, pamt_size);
> @@ -686,14 +658,12 @@ static __init void tdmrs_free_pamt_all(struct tdmr_info_list *tdmr_list)
>  
>  /* Allocate and set up PAMTs for all TDMRs */
>  static __init int tdmrs_set_up_pamt_all(struct tdmr_info_list *tdmr_list,
> -					struct list_head *tmb_list,
> -					u16 pamt_entry_size[])
> +				 struct list_head *tmb_list)
>  {
>  	int i, ret = 0;
>  
>  	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
> -		ret = tdmr_set_up_pamt(tdmr_entry(tdmr_list, i), tmb_list,
> -				pamt_entry_size);
> +		ret = tdmr_set_up_pamt(tdmr_entry(tdmr_list, i), tmb_list);
>  		if (ret)
>  			goto err;
>  	}
> @@ -970,18 +940,13 @@ static __init int construct_tdmrs(struct list_head *tmb_list,
>  				  struct tdmr_info_list *tdmr_list,
>  				  struct tdx_sys_info_tdmr *sysinfo_tdmr)
>  {
> -	u16 pamt_entry_size[TDX_PS_NR] = {
> -		sysinfo_tdmr->pamt_4k_entry_size,
> -		sysinfo_tdmr->pamt_2m_entry_size,
> -		sysinfo_tdmr->pamt_1g_entry_size,
> -	};
>  	int ret;
>  
>  	ret = fill_out_tdmrs(tmb_list, tdmr_list);
>  	if (ret)
>  		return ret;
>  
> -	ret = tdmrs_set_up_pamt_all(tdmr_list, tmb_list, pamt_entry_size);
> +	ret = tdmrs_set_up_pamt_all(tdmr_list, tmb_list);
>  	if (ret)
>  		return ret;
>  
> -- 
> 2.54.0
> 

-- 
  Kiryl Shutsemau / Kirill A. Shutemov

^ permalink raw reply

* Re: [PATCH v14 14/44] arm64: RMI: Basic infrastructure for creating a realm.
From: Steven Price @ 2026-06-04 15:55 UTC (permalink / raw)
  To: Suzuki K Poulose, Marc Zyngier
  Cc: kvm, kvmarm, Catalin Marinas, Will Deacon, James Morse,
	Oliver Upton, Zenghui Yu, linux-arm-kernel, linux-kernel,
	Joey Gouly, Alexandru Elisei, Christoffer Dall, Fuad Tabba,
	linux-coco, Ganapatrao Kulkarni, Gavin Shan, Shanker Donthineni,
	Alper Gun, Aneesh Kumar K . V, Emi Kisanuki, Vishal Annapurve,
	WeiLin.Chang, Lorenzo.Pieralisi2
In-Reply-To: <1bb9da2a-bcc2-4232-944e-0730e9e1f45a@arm.com>

On 02/06/2026 15:49, Suzuki K Poulose wrote:
> Hi Marc
> 
> On 28/05/2026 08:10, Marc Zyngier wrote:
>> On Wed, 13 May 2026 14:17:22 +0100,
>> Steven Price <steven.price@arm.com> wrote:
>>>
>>> Introduce the skeleton functions for creating and destroying a realm.
>>> The IPA size requested is checked against what the RMM supports.
>>>
>>> The actual work of constructing the realm will be added in future
>>> patches.
>>
>> Again, $SUBJECT doesn't reflect that this is purely a KVM patch.

Indeed - "KVM: arm64: CCA" is a better prefix.

>>>
>>> Signed-off-by: Steven Price <steven.price@arm.com>
>>> ---
>>> Changes since v13:
>>>   * Rebased and updated to RMM-v2.0-bet1.
>>>   * Auxiliary granules have been removed in RMM-v2.0-bet1
>>> Changes since v12:
>>>   * Drop the RMM_PAGE_{SHIFT,SIZE} defines - the RMM is now
>>> configured to
>>>     be the same as the host's page size.
>>>   * Rework delegate/undelegate functions to use the new RMI range based
>>>     operations.
>>> Changes since v11:
>>>   * Major rework to drop the realm configuration and make the
>>>     construction of realms implicit rather than driven by the VMM
>>>     directly.
>>>   * The code to create RDs, handle VMIDs etc is moved to later patches.
>>> Changes since v10:
>>>   * Rename from RME to RMI.
>>>   * Move the stage2 cleanup to a later patch.
>>> Changes since v9:
>>>   * Avoid walking the stage 2 page tables when destroying the realm -
>>>     the real ones are not accessible to the non-secure world, and the
>>> RMM
>>>     may leave junk in the physical pages when returning them.
>>>   * Fix an error path in realm_create_rd() to actually return an
>>> error value.
>>> Changes since v8:
>>>   * Fix free_delegated_granule() to not call
>>> kvm_account_pgtable_pages();
>>>     a separate wrapper will be introduced in a later patch to deal with
>>>     RTTs.
>>>   * Minor code cleanups following review.
>>> Changes since v7:
>>>   * Minor code cleanup following Gavin's review.
>>> Changes since v6:
>>>   * Separate RMM RTT calculations from host PAGE_SIZE. This allows the
>>>     host page size to be larger than 4k while still communicating
>>> with an
>>>     RMM which uses 4k granules.
>>> Changes since v5:
>>>   * Introduce free_delegated_granule() to replace many
>>>     undelegate/free_page() instances and centralise the comment on
>>>     leaking when the undelegate fails.
>>>   * Several other minor improvements suggested by reviews - thanks for
>>>     the feedback!
>>> Changes since v2:
>>>   * Improved commit description.
>>>   * Improved return failures for rmi_check_version().
>>>   * Clear contents of PGD after it has been undelegated in case the RMM
>>>     left stale data.
>>>   * Minor changes to reflect changes in previous patches.
>>> ---
>>>   arch/arm64/include/asm/kvm_emulate.h | 29 ++++++++++++++
>>>   arch/arm64/include/asm/kvm_rmi.h     | 51 +++++++++++++++++++++++++
>>>   arch/arm64/kvm/arm.c                 | 12 ++++++
>>>   arch/arm64/kvm/mmu.c                 | 12 +++++-
>>>   arch/arm64/kvm/rmi.c                 | 57 ++++++++++++++++++++++++++++
>>>   5 files changed, 159 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/
>>> include/asm/kvm_emulate.h
>>> index 5bf3d7e1d92c..82fd777bd9bb 100644
>>> --- a/arch/arm64/include/asm/kvm_emulate.h
>>> +++ b/arch/arm64/include/asm/kvm_emulate.h
>>> @@ -688,4 +688,33 @@ static inline void vcpu_set_hcrx(struct kvm_vcpu
>>> *vcpu)
>>>               vcpu->arch.hcrx_el2 |= HCRX_EL2_EnASR;
>>>       }
>>>   }
>>> +
>>> +static inline bool kvm_is_realm(struct kvm *kvm)
>>> +{
>>> +    if (static_branch_unlikely(&kvm_rmi_is_available))
>>> +        return kvm->arch.is_realm;
>>> +    return false;
>>> +}
>>> +
>>> +static inline enum realm_state kvm_realm_state(struct kvm *kvm)
>>> +{
>>> +    return READ_ONCE(kvm->arch.realm.state);
>>> +}
>>> +
>>> +static inline void kvm_set_realm_state(struct kvm *kvm,
>>> +                       enum realm_state new_state)
>>> +{
>>> +    WRITE_ONCE(kvm->arch.realm.state, new_state);
>>> +}
>>> +
>>> +static inline bool kvm_realm_is_created(struct kvm *kvm)
>>> +{
>>> +    return kvm_is_realm(kvm) && kvm_realm_state(kvm) !=
>>> REALM_STATE_NONE;
>>> +}
>>> +
>>> +static inline bool vcpu_is_rec(const struct kvm_vcpu *vcpu)
>>> +{
>>> +    return false;
>>> +}
>>> +
>>>   #endif /* __ARM64_KVM_EMULATE_H__ */
>>> diff --git a/arch/arm64/include/asm/kvm_rmi.h b/arch/arm64/include/
>>> asm/kvm_rmi.h
>>> index 4936007947fd..9de34983ee52 100644
>>> --- a/arch/arm64/include/asm/kvm_rmi.h
>>> +++ b/arch/arm64/include/asm/kvm_rmi.h
>>> @@ -6,12 +6,63 @@
>>>   #ifndef __ASM_KVM_RMI_H
>>>   #define __ASM_KVM_RMI_H
>>>   +#include <asm/rmi_smc.h>
>>> +
>>> +/**
>>> + * enum realm_state - State of a Realm
>>> + */
>>> +enum realm_state {
>>> +    /**
>>> +     * @REALM_STATE_NONE:
>>> +     *      Realm has not yet been created. rmi_realm_create() has not
>>> +     *      yet been called.
>>> +     */
>>> +    REALM_STATE_NONE,
>>> +    /**
>>> +     * @REALM_STATE_NEW:
>>> +     *      Realm is under construction, rmi_realm_create() has been
>>> +     *      called, but it is not yet activated. Pages may be
>>> populated.
>>> +     */
>>> +    REALM_STATE_NEW,
>>> +    /**
>>> +     * @REALM_STATE_ACTIVE:
>>> +     *      Realm has been created and is eligible for execution with
>>> +     *      rmi_rec_enter(). Pages may no longer be populated with
>>> +     *      rmi_data_create().
>>> +     */
>>> +    REALM_STATE_ACTIVE,
>>> +    /**
>>> +     * @REALM_STATE_DYING:
>>> +     *      Realm is in the process of being destroyed or has
>>> already been
>>> +     *      destroyed.
>>> +     */
>>> +    REALM_STATE_DYING,
>>> +    /**
>>> +     * @REALM_STATE_DEAD:
>>> +     *      Realm has been destroyed.
>>> +     */
>>> +    REALM_STATE_DEAD
>>> +};
>>
>> What is the ABI status of this state? Is it purely internal to KVM? Or
>> is it something that the RMM actively tracks?
> 
> The states are in line with what the RMM maintains for the Realm state,
> (Section A2.2.5 Realm Lifecycle)
> except for :
> 
> 1. REALM_STATE_DYING is really a KVM internal state to indicate, we
> are in the process of destroying the Realm and no further requests
> needs to be serviced
> 
> 2. We don't track the REALM_SYSTEM_OFF, REALM_ZOMBIE states separately
> as we :
>  a) Always TERMINATE the Realm, just before the DESTROY
>  b) SYSTEM_OFF is naturally triggering the tear down path, leading to
> DYING.
> 

I'll add a comment:

+ * Mirrors the RMM's Realm lifecycle states where they are meaningful to KVM,
+ * with REALM_STATE_DYING being a KVM-internal state used to prevent further
+ * requests while teardown is in progress. KVM does not track REALM_SYSTEM_OFF
+ * or REALM_ZOMBIE separately as they naturally lead to teardown.

> 
> 
>>
>>> +
>>>   /**
>>>    * struct realm - Additional per VM data for a Realm
>>> + *
>>> + * @state: The lifetime state machine for the realm
>>> + * @rd: Kernel mapping of the Realm Descriptor (RD)
>>> + * @params: Parameters for the RMI_REALM_CREATE command
>>> + * @ia_bits: Number of valid Input Address bits in the IPA
>>>    */
>>>   struct realm {
>>> +    enum realm_state state;
>>> +    void *rd;
>>
>> Why is this void? Doesn't it have a proper type?
> 
> Not really. This is an object that RMM manages (Realm Descriptor)
> in the Realm world. We use it as a parameter to address the Realm.
> 
> 
>>
>>> +    struct realm_params *params;
>>> +    unsigned int ia_bits;
>>
>> Consider reordering this structure to avoid holes.

Sure

>>>   };
>>>     void kvm_init_rmi(void);
>>> +u32 kvm_realm_ipa_limit(void);
>>
>> The use of 'realm' is confusing. This is not a per-realm property, but
>> something global. I'd rather reserve the term 'realm' for CCA VMs (cue
>> the two prototypes below).
> 
> Agreed. Perhaps, kvm_rmm_ipa_limit() ?

Sounds good to me.

> 
>>
>>> +
>>> +int kvm_init_realm(struct kvm *kvm);
>>> +void kvm_destroy_realm(struct kvm *kvm);
>>>     #endif /* __ASM_KVM_RMI_H */
>>> diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
>>> index 247e03b33035..18251e561524 100644
>>> --- a/arch/arm64/kvm/arm.c
>>> +++ b/arch/arm64/kvm/arm.c
>>> @@ -264,6 +264,13 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned
>>> long type)
>>>         bitmap_zero(kvm->arch.vcpu_features, KVM_VCPU_MAX_FEATURES);
>>>   +    /* Initialise the realm bits after the generic bits are
>>> enabled */
>>> +    if (kvm_is_realm(kvm)) {
>>> +        ret = kvm_init_realm(kvm);
>>> +        if (ret)
>>> +            goto err_uninit_mmu;
>>> +    }
>>> +
>>>       return 0;
>>>     err_uninit_mmu:
>>> @@ -326,6 +333,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
>>>       kvm_unshare_hyp(kvm, kvm + 1);
>>>         kvm_arm_teardown_hypercalls(kvm);
>>> +    if (kvm_is_realm(kvm))
>>> +        kvm_destroy_realm(kvm);
>>>   }
>>>     static bool kvm_has_full_ptr_auth(void)
>>> @@ -486,6 +495,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm,
>>> long ext)
>>>           else
>>>               r = kvm_supports_cacheable_pfnmap();
>>>           break;
>>> +    case KVM_CAP_ARM_RMI:
>>> +        r = static_key_enabled(&kvm_rmi_is_available);
>>> +        break;
>>>         default:
>>>           r = 0;
>>> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
>>> index d089c107d9b7..ba8286472286 100644
>>> --- a/arch/arm64/kvm/mmu.c
>>> +++ b/arch/arm64/kvm/mmu.c
>>> @@ -877,10 +877,14 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
>>>     static int kvm_init_ipa_range(struct kvm_s2_mmu *mmu, unsigned
>>> long type)
>>>   {
>>> +    struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
>>>       u32 kvm_ipa_limit = get_kvm_ipa_limit();
>>>       u64 mmfr0, mmfr1;
>>>       u32 phys_shift;
>>>   +    if (kvm_is_realm(kvm))
>>> +        kvm_ipa_limit = kvm_realm_ipa_limit();
>>> +
>>>       phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type);
>>>       if (is_protected_kvm_enabled()) {
>>>           phys_shift = kvm_ipa_limit;
>>> @@ -974,6 +978,8 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct
>>> kvm_s2_mmu *mmu, unsigned long t
>>>           return -EINVAL;
>>>       }
>>>   +    mmu->arch = &kvm->arch;
>>> +
>>>       err = kvm_init_ipa_range(mmu, type);
>>>       if (err)
>>>           return err;
>>> @@ -982,7 +988,6 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct
>>> kvm_s2_mmu *mmu, unsigned long t
>>>       if (!pgt)
>>>           return -ENOMEM;
>>>   -    mmu->arch = &kvm->arch;
>>
>> Why moving this init?
> 
> Because, we need to know the "kvm" instance for kvm_init_ipa_range to
> detect the limit that applies to Realms.
> 
>>
>>>       err = KVM_PGT_FN(kvm_pgtable_stage2_init)(pgt, mmu,
>>> &kvm_s2_mm_ops);
>>>       if (err)
>>>           goto out_free_pgtable;
>>> @@ -1114,7 +1119,10 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
>>>       write_unlock(&kvm->mmu_lock);
>>>         if (pgt) {
>>> -        kvm_stage2_destroy(pgt);
>>> +        if (!kvm_is_realm(kvm))
>>> +            kvm_stage2_destroy(pgt);
>>> +        else
>>> +            kvm_pgtable_stage2_destroy_pgd(pgt);
>>
>> Why can't you make kvm_stage2_destroy() do the right thing? Surely the
>> PTs have to be reclaimed one way or another.
> 
> Actually yes, we could make it work. We need to skip walking the page
> table for Realms. We may be able to do the checks via pgt->mmu->arch-
>>kvm and skip the walking for Realms. ( The S2 is unmapped and torn
> down before the RD is destroyed in kvm_destroy_realm(). We can't
> rely on the contents of the PGDs to be zero - e.g., with MEC.)

Yes I'll move the check into kvm_stage2_destroy() instead with a comment
explaining what's going on.

>>
>>>           kfree(pgt);
>>>       }
>>>   }
>>> diff --git a/arch/arm64/kvm/rmi.c b/arch/arm64/kvm/rmi.c
>>> index 6e28b669ded2..f51ec667445e 100644
>>> --- a/arch/arm64/kvm/rmi.c
>>> +++ b/arch/arm64/kvm/rmi.c
>>> @@ -5,6 +5,8 @@
>>>     #include <linux/kvm_host.h>
>>>   +#include <asm/kvm_emulate.h>
>>> +#include <asm/kvm_mmu.h>
>>>   #include <asm/kvm_pgtable.h>
>>>   #include <asm/rmi_cmds.h>
>>>   #include <asm/virt.h>
>>> @@ -14,6 +16,61 @@ static bool rmi_has_feature(unsigned long feature)
>>>       return !!u64_get_bits(rmm_feat_reg0, feature);
>>>   }
>>>   +u32 kvm_realm_ipa_limit(void)
>>> +{
>>> +    return u64_get_bits(rmm_feat_reg0, RMI_FEATURE_REGISTER_0_S2SZ);
>>> +}
>>> +
>>> +void kvm_destroy_realm(struct kvm *kvm)
>>> +{
>>> +    struct realm *realm = &kvm->arch.realm;
>>> +    size_t pgd_size = kvm_pgtable_stage2_pgd_size(kvm->arch.mmu.vtcr);
>>> +
>>> +    if (realm->params) {
>>> +        free_page((unsigned long)realm->params);
>>> +        realm->params = NULL;
>>> +    }
>>> +
>>> +    if (!kvm_realm_is_created(kvm))
>>> +        return;
>>> +
>>> +    kvm_set_realm_state(kvm, REALM_STATE_DYING);
>>> +
>>> +    write_lock(&kvm->mmu_lock);
>>> +    kvm_stage2_unmap_range(&kvm->arch.mmu, 0,
>>> +                   BIT(realm->ia_bits - 1), true);
>>> +    write_unlock(&kvm->mmu_lock);
>>> +
>>> +    if (realm->rd) {
>>> +        phys_addr_t rd_phys = virt_to_phys(realm->rd);
>>> +
>>> +        if (WARN_ON(rmi_realm_terminate(rd_phys)))
>>> +            return;
>>> +
>>> +        if (WARN_ON(rmi_realm_destroy(rd_phys)))
>>> +            return;
>>> +        free_delegated_page(rd_phys);
>>> +        realm->rd = NULL;
>>> +    }
>>> +
>>> +    if (WARN_ON(rmi_undelegate_range(kvm->arch.mmu.pgd_phys,
>>> pgd_size)))
>>> +        return;
>>> +
>>> +    kvm_set_realm_state(kvm, REALM_STATE_DEAD);
>>> +
>>> +    /* Now that the Realm is destroyed, free the entry level RTTs */
>>> +    kvm_free_stage2_pgd(&kvm->arch.mmu);
>>> +}
>>
>> This really needs documentation: what happens at each stage? What
>> memory is reclaimed when?
> 
> Agreed.
> 
>>
>> But even more importantly, why is this built in a completely parallel
>> way, potentially deviating from the existing KVM S2 management?
> 
> 
> RMM requires a Realm is not live at the time of REALM_DESTROY.
> (See section A2.2.4 Realm Liveness).
> i.e., All RECs are destroyed, Root RTTs wiped clean (no live mappings)
> before the RD is destroyed. So, we need to make sure all of this is
> done at Realm Destroy. Hence we delay the kvm_free_stage2_pgd() until
> we destroy the RD.
> 
> Does that help? May be we could improve the comments around it.

I'll add a comment in kvm_destroy_realm().

Thanks,
Steve

> 
> Suzuki
> 
> 
> 
>> Thanks,>
>>     M.
>>
> 


^ permalink raw reply

* Re: [PATCH v7 20/42] KVM: SEV: Make 'uaddr' parameter optional for KVM_SEV_SNP_LAUNCH_UPDATE
From: Suzuki K Poulose @ 2026-06-04 15:29 UTC (permalink / raw)
  To: ackerleytng, aik, andrew.jones, binbin.wu, brauner, chao.p.peng,
	david, ira.weiny, jmattson, jthoughton, michael.roth, oupton,
	pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
	steven.price, tabba, willy, wyihan, yan.y.zhao, forkloop,
	pratyush, aneesh.kumar, liam, Paolo Bonzini, Sean Christopherson,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	H. Peter Anvin, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers, Jonathan Corbet, Shuah Khan, Shuah Khan,
	Vishal Annapurve, Andrew Morton, Chris Li, Kairui Song,
	Kemeng Shi, Nhat Pham, Baoquan He, Barry Song, Axel Rasmussen,
	Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng, Shakeel Butt,
	Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka
  Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <20260522-gmem-inplace-conversion-v7-20-2f0fae496530@google.com>

On 23/05/2026 01:18, Ackerley Tng via B4 Relay wrote:
> From: Michael Roth <michael.roth@amd.com>
> 
> For vm_memory_attributes=1, in-place conversion/population is not
> supported, so the initial contents necessarily must need to come
> from a separate src address, which is enforced by the current
> implementation. However, for vm_memory_attributes=0, it is possible for
> guest memory to be initialized directly from userspace by mmap()'ing the
> guest_memfd and writing to it while the corresponding GPA ranges are in
> a 'shared' state before converting them to the 'private' state expected
> by KVM_SEV_SNP_LAUNCH_UPDATE.
> 
> Update the handling/documentation for KVM_SEV_SNP_LAUNCH_UPDATE to allow
> for 'uaddr' to be set to NULL when vm_memory_attributes=0, which
> SNP_LAUNCH_UPDATE will then use to determine when it should/shouldn't
> copy in data from a separate memory location. Continue to enforce
> non-NULL for the original vm_memory_attributes=1 case.
> 
> Signed-off-by: Michael Roth <michael.roth@amd.com>
> [Added src_page check in error handling path when the firmware command fails]
> [Dropped ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES]
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>




> ---
>   Documentation/virt/kvm/x86/amd-memory-encryption.rst | 15 +++++++++++----
>   arch/x86/kvm/svm/sev.c                               | 18 +++++++++++++-----
>   virt/kvm/kvm_main.c                                  |  1 +
>   3 files changed, 25 insertions(+), 9 deletions(-)
> 
> diff --git a/Documentation/virt/kvm/x86/amd-memory-encryption.rst b/Documentation/virt/kvm/x86/amd-memory-encryption.rst
> index b2395dd4769de..43085f65b2d85 100644
> --- a/Documentation/virt/kvm/x86/amd-memory-encryption.rst
> +++ b/Documentation/virt/kvm/x86/amd-memory-encryption.rst
> @@ -503,7 +503,8 @@ secrets.
>   
>   It is required that the GPA ranges initialized by this command have had the
>   KVM_MEMORY_ATTRIBUTE_PRIVATE attribute set in advance. See the documentation
> -for KVM_SET_MEMORY_ATTRIBUTES for more details on this aspect.
> +for KVM_SET_MEMORY_ATTRIBUTES/KVM_SET_MEMORY_ATTRIBUTES2 for more details on
> +this aspect.
>   
>   Upon success, this command is not guaranteed to have processed the entire
>   range requested. Instead, the ``gfn_start``, ``uaddr``, and ``len`` fields of
> @@ -511,9 +512,15 @@ range requested. Instead, the ``gfn_start``, ``uaddr``, and ``len`` fields of
>   remaining range that has yet to be processed. The caller should continue
>   calling this command until those fields indicate the entire range has been
>   processed, e.g. ``len`` is 0, ``gfn_start`` is equal to the last GFN in the
> -range plus 1, and ``uaddr`` is the last byte of the userspace-provided source
> -buffer address plus 1. In the case where ``type`` is KVM_SEV_SNP_PAGE_TYPE_ZERO,
> -``uaddr`` will be ignored completely.
> +range plus 1, and ``uaddr`` (if specified) is the last byte of the
> +userspace-provided source buffer address plus 1.
> +
> +In the case where ``type`` is KVM_SEV_SNP_PAGE_TYPE_ZERO, ``uaddr`` will be
> +ignored completely. Otherwise, ``uaddr`` is required if
> +kvm.vm_memory_attributes=1 and optional if kvm.vm_memory_attributes=0, since
> +in the latter case guest memory can be initialized directly from userspace
> +prior to converting it to private and passing the GPA range on to this
> +interface.

Just to confirm, so the sev_gmem_prepare doesn't destroy the contents in 
the process of making it "private" ? i.e., the contents of a SNP shared
page are preserved while transitioning to "SNP Private" (via RMP
update).

Suzuki



>   
>   Parameters (in): struct  kvm_sev_snp_launch_update
>   
> diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
> index 1a361f08c7a3d..e1dbc827c2807 100644
> --- a/arch/x86/kvm/svm/sev.c
> +++ b/arch/x86/kvm/svm/sev.c
> @@ -2343,7 +2343,15 @@ static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
>   	int level;
>   	int ret;
>   
> -	if (WARN_ON_ONCE(sev_populate_args->type != KVM_SEV_SNP_PAGE_TYPE_ZERO && !src_page))
> +	/*
> +	 * For vm_memory_attributes=1, in-place conversion/population is not
> +	 * supported, so the initial contents necessarily need to come from a
> +	 * separate src address. For vm_memory_attributes=0, this isn't
> +	 * necessarily the case, since the pages may have been populated
> +	 * directly from userspace before calling KVM_SEV_SNP_LAUNCH_UPDATE.
> +	 */
> +	if (vm_memory_attributes &&
> +	    sev_populate_args->type != KVM_SEV_SNP_PAGE_TYPE_ZERO && !src_page)
>   		return -EINVAL;
>   
>   	ret = snp_lookup_rmpentry((u64)pfn, &assigned, &level);
> @@ -2390,7 +2398,7 @@ static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
>   	 */
>   	if (ret && !snp_page_reclaim(kvm, pfn) &&
>   	    sev_populate_args->type == KVM_SEV_SNP_PAGE_TYPE_CPUID &&
> -	    sev_populate_args->fw_error == SEV_RET_INVALID_PARAM) {
> +	    sev_populate_args->fw_error == SEV_RET_INVALID_PARAM && src_page) {
>   		void *src_vaddr = kmap_local_page(src_page);
>   		void *dst_vaddr = kmap_local_pfn(pfn);
>   
> @@ -2423,8 +2431,8 @@ static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp)
>   	if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
>   		return -EFAULT;
>   
> -	pr_debug("%s: GFN start 0x%llx length 0x%llx type %d flags %d\n", __func__,
> -		 params.gfn_start, params.len, params.type, params.flags);
> +	pr_debug("%s: GFN start 0x%llx length 0x%llx type %d flags %d src %llx\n", __func__,
> +		 params.gfn_start, params.len, params.type, params.flags, params.uaddr);
>   
>   	if (!params.len || !PAGE_ALIGNED(params.len) || params.flags ||
>   	    (params.type != KVM_SEV_SNP_PAGE_TYPE_NORMAL &&
> @@ -2481,7 +2489,7 @@ static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp)
>   
>   	params.gfn_start += count;
>   	params.len -= count * PAGE_SIZE;
> -	if (params.type != KVM_SEV_SNP_PAGE_TYPE_ZERO)
> +	if (src && params.type != KVM_SEV_SNP_PAGE_TYPE_ZERO)
>   		params.uaddr += count * PAGE_SIZE;
>   
>   	if (copy_to_user(u64_to_user_ptr(argp->data), &params, sizeof(params)))
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index ba195bb239aaa..3bf212fd99193 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -105,6 +105,7 @@ module_param(allow_unsafe_mappings, bool, 0444);
>   #ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
>   bool vm_memory_attributes = true;
>   module_param(vm_memory_attributes, bool, 0444);
> +EXPORT_SYMBOL_FOR_KVM_INTERNAL(vm_memory_attributes);
>   #endif
>   DEFINE_STATIC_CALL_RET0(__kvm_get_memory_attributes, kvm_get_memory_attributes_t);
>   EXPORT_SYMBOL_FOR_KVM_INTERNAL(STATIC_CALL_KEY(__kvm_get_memory_attributes));
> 


^ permalink raw reply

* Re: [PATCH v14 13/44] arm64: RMI: Define the user ABI
From: Steven Price @ 2026-06-04 15:27 UTC (permalink / raw)
  To: Marc Zyngier
  Cc: kvm, kvmarm, Catalin Marinas, Will Deacon, James Morse,
	Oliver Upton, Suzuki K Poulose, Zenghui Yu, linux-arm-kernel,
	linux-kernel, Joey Gouly, Alexandru Elisei, Christoffer Dall,
	Fuad Tabba, linux-coco, Ganapatrao Kulkarni, Gavin Shan,
	Shanker Donthineni, Alper Gun, Aneesh Kumar K . V, Emi Kisanuki,
	Vishal Annapurve, WeiLin.Chang, Lorenzo.Pieralisi2
In-Reply-To: <86jysovpxf.wl-maz@kernel.org>

On 27/05/2026 16:21, Marc Zyngier wrote:
> On Wed, 13 May 2026 14:17:21 +0100,
> Steven Price <steven.price@arm.com> wrote:
>>
>> There is one CAP which identified the presence of CCA, and one ioctl.
>> The ioctl is used to populate memory during creation of the realm as
>> this requires the RMM to copy data from an unprotected address to the
>> protected memory - CCA does not support memory conversion where the
>> memory contents is preserved as this is incompatible with memory
>> encryption.
>>
>> Signed-off-by: Steven Price <steven.price@arm.com>
>> ---
>> Changes since v13:
>>  * KVM_ARM_VCPU_RMI_PSCI_COMPLETE removed.
>>  * KVM_ARM_RMI_POPULATE documentation updated to reflect that the
>>    structure is written by the kernel.
>>  * CAP number bumped.
>> Changes since v12:
>>  * Change KVM_ARM_RMI_POPULATE to update the structure with the amount
>>    that has been progressed rather than return the number of bytes
>>    populated.
>>  * Describe the flag KVM_ARM_RMI_POPULATE_FLAGS_MEASURE.
>>  * CAP number is bumped.
>>  * NOTE: The PSCI ioctl may be removed in a future spec release.
>> Changes since v11:
>>  * Completely reworked to be more implicit. Rather than having explicit
>>    CAP operations to progress the realm construction these operations
>>    are done when needed (on populating and on first vCPU run).
>>  * Populate and PSCI complete are promoted to proper ioctls.
>> Changes since v10:
>>  * Rename symbols from RME to RMI.
>> Changes since v9:
>>  * Improvements to documentation.
>>  * Bump the magic number for KVM_CAP_ARM_RME to avoid conflicts.
>> Changes since v8:
>>  * Minor improvements to documentation following review.
>>  * Bump the magic numbers to avoid conflicts.
>> Changes since v7:
>>  * Add documentation of new ioctls
>>  * Bump the magic numbers to avoid conflicts
>> Changes since v6:
>>  * Rename some of the symbols to make their usage clearer and avoid
>>    repetition.
>> Changes from v5:
>>  * Actually expose the new VCPU capability (KVM_ARM_VCPU_REC) by bumping
>>    KVM_VCPU_MAX_FEATURES - note this also exposes KVM_ARM_VCPU_HAS_EL2!
>> ---
>>  Documentation/virt/kvm/api.rst | 40 ++++++++++++++++++++++++++++++++++
>>  include/uapi/linux/kvm.h       | 13 +++++++++++
>>  2 files changed, 53 insertions(+)
> 
> $SUBJECT looks wrong. This is a KVM change, not an RMI change.

Ah, true I guess "KVM: arm64: Define the user ABI for CCA" is more accurate.

>>
>> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
>> index 52bbbb553ce1..ca68aae7faa2 100644
>> --- a/Documentation/virt/kvm/api.rst
>> +++ b/Documentation/virt/kvm/api.rst
>> @@ -6553,6 +6553,37 @@ KVM_S390_KEYOP_SSKE
>>    Sets the storage key for the guest address ``guest_addr`` to the key
>>    specified in ``key``, returning the previous value in ``key``.
>>  
>> +4.145 KVM_ARM_RMI_POPULATE
>> +--------------------------
>> +
>> +:Capability: KVM_CAP_ARM_RMI
>> +:Architectures: arm64
>> +:Type: vm ioctl
>> +:Parameters: struct kvm_arm_rmi_populate (in/out)
>> +:Returns: 0 on success, < 0 on error
>> +
>> +::
>> +
>> +  struct kvm_arm_rmi_populate {
>> +	__u64 base;
>> +	__u64 size;
>> +	__u64 source_uaddr;
>> +	__u32 flags;
>> +	__u32 reserved;
>> +  };
>> +
>> +Populate a region of protected address space by copying the data from the
>> +(non-protected) user space pointer provided into a protected region (backed by
>> +guestmem_fd). It implicitly sets the destination region to RIPAS RAM. This is
>> +only valid before any VCPUs have been run. The ioctl might not populate the
>> +entire region and in this case the kernel updates the fields `base`, `size` and
>> +`source_uaddr`. User space may have to repeatedly call it until `size` is 0 to
>> +populate the entire region.
>> +
>> +`flags` can be set to `KVM_ARM_RMI_POPULATE_FLAGS_MEASURE` to request that the
>> +populated data is hashed and added to the guest's Realm Initial Measurement
>> +(RIM).
> 
> Where is that measurement stored? And retrieved? At least a pointer to
> that would help.

It's stored within the RMM and retrieved by the guest (using the RSI
interface). I'll update to:

`flags` can be set to `KVM_ARM_RMI_POPULATE_FLAGS_MEASURE` to request
that the populated data is hashed and added to the guest's Realm Initial
Measurement (RIM) stored by the RMM. This can then be retrieved by the
guest (using the RSI interface) to present to an attestation server.

Thanks,

Steve

>> +
>>  .. _kvm_run:
>>  
>>  5. The kvm_run structure
>> @@ -8904,6 +8935,15 @@ helpful if user space wants to emulate instructions which are not
>>  This capability can be enabled dynamically even if VCPUs were already
>>  created and are running.
>>  
>> +7.47 KVM_CAP_ARM_RMI
>> +--------------------
>> +
>> +:Architectures: arm64
>> +:Target: VM
>> +:Parameters: None
>> +
>> +This capability indicates that support for CCA realms is available.
>> +
>>  8. Other capabilities.
>>  ======================
>>  
>> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
>> index 6c8afa2047bf..b8cff0938041 100644
>> --- a/include/uapi/linux/kvm.h
>> +++ b/include/uapi/linux/kvm.h
>> @@ -996,6 +996,7 @@ struct kvm_enable_cap {
>>  #define KVM_CAP_S390_USER_OPEREXEC 246
>>  #define KVM_CAP_S390_KEYOP 247
>>  #define KVM_CAP_S390_VSIE_ESAMODE 248
>> +#define KVM_CAP_ARM_RMI 249
>>  
>>  struct kvm_irq_routing_irqchip {
>>  	__u32 irqchip;
>> @@ -1669,4 +1670,16 @@ struct kvm_pre_fault_memory {
>>  	__u64 padding[5];
>>  };
>>  
>> +/* Available with KVM_CAP_ARM_RMI, only for VMs with KVM_VM_TYPE_ARM_REALM */
>> +#define KVM_ARM_RMI_POPULATE	_IOWR(KVMIO, 0xd7, struct kvm_arm_rmi_populate)
>> +#define KVM_ARM_RMI_POPULATE_FLAGS_MEASURE	(1 << 0)
>> +
>> +struct kvm_arm_rmi_populate {
>> +	__u64 base;
>> +	__u64 size;
>> +	__u64 source_uaddr;
>> +	__u32 flags;
>> +	__u32 reserved;
>> +};
>> +
>>  #endif /* __LINUX_KVM_H */
> 
> Thanks,
> 
> 	M.
> 


^ permalink raw reply

* Re: [PATCH v14 13/44] arm64: RMI: Define the user ABI
From: Steven Price @ 2026-06-04 15:27 UTC (permalink / raw)
  To: Wei-Lin Chang, kvm, kvmarm
  Cc: Catalin Marinas, Marc Zyngier, Will Deacon, James Morse,
	Oliver Upton, Suzuki K Poulose, Zenghui Yu, linux-arm-kernel,
	linux-kernel, Joey Gouly, Alexandru Elisei, Christoffer Dall,
	Fuad Tabba, linux-coco, Ganapatrao Kulkarni, Gavin Shan,
	Shanker Donthineni, Alper Gun, Aneesh Kumar K . V, Emi Kisanuki,
	Vishal Annapurve, Lorenzo.Pieralisi2
In-Reply-To: <vixej2afqamvr7eaksoepr3wmpwzp73rtgrageeyuawa6azotw@payxzyhymwbt>

On 26/05/2026 23:17, Wei-Lin Chang wrote:
> On Wed, May 13, 2026 at 02:17:21PM +0100, Steven Price wrote:
>> There is one CAP which identified the presence of CCA, and one ioctl.
>> The ioctl is used to populate memory during creation of the realm as
>> this requires the RMM to copy data from an unprotected address to the
>> protected memory - CCA does not support memory conversion where the
>> memory contents is preserved as this is incompatible with memory
>> encryption.
> 
> Nit:
> I believe spelling out the CAP and ioctl names can improve the commit
> message. Also "memory conversion" is a little vague, maybe
> 
> ... CCA does not support shared <-> private memory conversion where ...
> 
> would make this clearer?

Thanks for the suggestions - yes I agree that would make it clearer.

Thanks,
Steve

> Thanks,
> Wei-Lin Chang
> 
>>
>> Signed-off-by: Steven Price <steven.price@arm.com>
>> ---
>> Changes since v13:
>>  * KVM_ARM_VCPU_RMI_PSCI_COMPLETE removed.
>>  * KVM_ARM_RMI_POPULATE documentation updated to reflect that the
>>    structure is written by the kernel.
>>  * CAP number bumped.
>> Changes since v12:
>>  * Change KVM_ARM_RMI_POPULATE to update the structure with the amount
>>    that has been progressed rather than return the number of bytes
>>    populated.
>>  * Describe the flag KVM_ARM_RMI_POPULATE_FLAGS_MEASURE.
>>  * CAP number is bumped.
>>  * NOTE: The PSCI ioctl may be removed in a future spec release.
>> Changes since v11:
>>  * Completely reworked to be more implicit. Rather than having explicit
>>    CAP operations to progress the realm construction these operations
>>    are done when needed (on populating and on first vCPU run).
>>  * Populate and PSCI complete are promoted to proper ioctls.
>> Changes since v10:
>>  * Rename symbols from RME to RMI.
>> Changes since v9:
>>  * Improvements to documentation.
>>  * Bump the magic number for KVM_CAP_ARM_RME to avoid conflicts.
>> Changes since v8:
>>  * Minor improvements to documentation following review.
>>  * Bump the magic numbers to avoid conflicts.
>> Changes since v7:
>>  * Add documentation of new ioctls
>>  * Bump the magic numbers to avoid conflicts
>> Changes since v6:
>>  * Rename some of the symbols to make their usage clearer and avoid
>>    repetition.
>> Changes from v5:
>>  * Actually expose the new VCPU capability (KVM_ARM_VCPU_REC) by bumping
>>    KVM_VCPU_MAX_FEATURES - note this also exposes KVM_ARM_VCPU_HAS_EL2!
>> ---
>>  Documentation/virt/kvm/api.rst | 40 ++++++++++++++++++++++++++++++++++
>>  include/uapi/linux/kvm.h       | 13 +++++++++++
>>  2 files changed, 53 insertions(+)
>>
>> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
>> index 52bbbb553ce1..ca68aae7faa2 100644
>> --- a/Documentation/virt/kvm/api.rst
>> +++ b/Documentation/virt/kvm/api.rst
>> @@ -6553,6 +6553,37 @@ KVM_S390_KEYOP_SSKE
>>    Sets the storage key for the guest address ``guest_addr`` to the key
>>    specified in ``key``, returning the previous value in ``key``.
>>  
>> +4.145 KVM_ARM_RMI_POPULATE
>> +--------------------------
>> +
>> +:Capability: KVM_CAP_ARM_RMI
>> +:Architectures: arm64
>> +:Type: vm ioctl
>> +:Parameters: struct kvm_arm_rmi_populate (in/out)
>> +:Returns: 0 on success, < 0 on error
>> +
>> +::
>> +
>> +  struct kvm_arm_rmi_populate {
>> +	__u64 base;
>> +	__u64 size;
>> +	__u64 source_uaddr;
>> +	__u32 flags;
>> +	__u32 reserved;
>> +  };
>> +
>> +Populate a region of protected address space by copying the data from the
>> +(non-protected) user space pointer provided into a protected region (backed by
>> +guestmem_fd). It implicitly sets the destination region to RIPAS RAM. This is
>> +only valid before any VCPUs have been run. The ioctl might not populate the
>> +entire region and in this case the kernel updates the fields `base`, `size` and
>> +`source_uaddr`. User space may have to repeatedly call it until `size` is 0 to
>> +populate the entire region.
>> +
>> +`flags` can be set to `KVM_ARM_RMI_POPULATE_FLAGS_MEASURE` to request that the
>> +populated data is hashed and added to the guest's Realm Initial Measurement
>> +(RIM).
>> +
>>  .. _kvm_run:
>>  
>>  5. The kvm_run structure
>> @@ -8904,6 +8935,15 @@ helpful if user space wants to emulate instructions which are not
>>  This capability can be enabled dynamically even if VCPUs were already
>>  created and are running.
>>  
>> +7.47 KVM_CAP_ARM_RMI
>> +--------------------
>> +
>> +:Architectures: arm64
>> +:Target: VM
>> +:Parameters: None
>> +
>> +This capability indicates that support for CCA realms is available.
>> +
>>  8. Other capabilities.
>>  ======================
>>  
>> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
>> index 6c8afa2047bf..b8cff0938041 100644
>> --- a/include/uapi/linux/kvm.h
>> +++ b/include/uapi/linux/kvm.h
>> @@ -996,6 +996,7 @@ struct kvm_enable_cap {
>>  #define KVM_CAP_S390_USER_OPEREXEC 246
>>  #define KVM_CAP_S390_KEYOP 247
>>  #define KVM_CAP_S390_VSIE_ESAMODE 248
>> +#define KVM_CAP_ARM_RMI 249
>>  
>>  struct kvm_irq_routing_irqchip {
>>  	__u32 irqchip;
>> @@ -1669,4 +1670,16 @@ struct kvm_pre_fault_memory {
>>  	__u64 padding[5];
>>  };
>>  
>> +/* Available with KVM_CAP_ARM_RMI, only for VMs with KVM_VM_TYPE_ARM_REALM */
>> +#define KVM_ARM_RMI_POPULATE	_IOWR(KVMIO, 0xd7, struct kvm_arm_rmi_populate)
>> +#define KVM_ARM_RMI_POPULATE_FLAGS_MEASURE	(1 << 0)
>> +
>> +struct kvm_arm_rmi_populate {
>> +	__u64 base;
>> +	__u64 size;
>> +	__u64 source_uaddr;
>> +	__u32 flags;
>> +	__u32 reserved;
>> +};
>> +
>>  #endif /* __LINUX_KVM_H */
>> -- 
>> 2.43.0
>>


^ permalink raw reply

* Re: [PATCH v14 10/44] arm64: RMI: Add support for SRO
From: Steven Price @ 2026-06-04 15:19 UTC (permalink / raw)
  To: Marc Zyngier
  Cc: kvm, kvmarm, Catalin Marinas, Will Deacon, James Morse,
	Oliver Upton, Suzuki K Poulose, Zenghui Yu, linux-arm-kernel,
	linux-kernel, Joey Gouly, Alexandru Elisei, Christoffer Dall,
	Fuad Tabba, linux-coco, Ganapatrao Kulkarni, Gavin Shan,
	Shanker Donthineni, Alper Gun, Aneesh Kumar K . V, Emi Kisanuki,
	Vishal Annapurve, WeiLin.Chang, Lorenzo.Pieralisi2
In-Reply-To: <864ik0x22q.wl-maz@kernel.org>

On 21/05/2026 15:35, Marc Zyngier wrote:
> On Wed, 13 May 2026 14:17:18 +0100,
> Steven Price <steven.price@arm.com> wrote:
>>
>> RMM v2.0 introduces the concept of "Stateful RMI Operations" (SRO). This
>> means that an SMC can return with an operation still in progress. The
>> host is excepted to continue the operation until is reaches a conclusion
>> (either success or failure). During this process the RMM can request
>> additional memory ('donate') or hand memory back to the host
>> ('reclaim'). The host can request an in progress operation is cancelled,
>> but still continue the operation until it has completed (otherwise the
>> incomplete operation may cause future RMM operations to fail).
>>
>> The SRO is tracked using a struct rmi_sro_state object which keeps track
>> of any memory which has been allocated but not yet consumed by the RMM
>> or reclaimed from the RMM. This allows the memory to be reused in a
>> future request within the same operation. It will also permit an
>> operation to be done in a context where memory allocation may be
>> difficult (e.g. atomic context) with the option to abort the operation
>> and retry the memory allocation outside of the atomic context. The
>> memory stored in the struct rmi_sro_state object can then be reused on
>> the subsequent attempt.
>>
>> Signed-off-by: Steven Price <steven.price@arm.com>
>> ---
>> v14:
>>  * SRO support has improved although is still not fully complete. The
>>    infrastructure has been moved out of KVM.
>> ---
>>  arch/arm64/include/asm/rmi_cmds.h |   1 +
>>  arch/arm64/kernel/rmi.c           | 359 ++++++++++++++++++++++++++++++
>>  2 files changed, 360 insertions(+)
>>
>> diff --git a/arch/arm64/include/asm/rmi_cmds.h b/arch/arm64/include/asm/rmi_cmds.h
>> index eb213c8e6f26..1a7b0c8f1e38 100644
>> --- a/arch/arm64/include/asm/rmi_cmds.h
>> +++ b/arch/arm64/include/asm/rmi_cmds.h
>> @@ -35,6 +35,7 @@ struct rmi_sro_state {
>>  
>>  int rmi_delegate_range(phys_addr_t phys, unsigned long size);
>>  int rmi_undelegate_range(phys_addr_t phys, unsigned long size);
>> +int free_delegated_page(phys_addr_t phys);
>>  
>>  static inline int rmi_delegate_page(phys_addr_t phys)
>>  {
>> diff --git a/arch/arm64/kernel/rmi.c b/arch/arm64/kernel/rmi.c
>> index 08cef54acadb..a8107ca9bb6d 100644
>> --- a/arch/arm64/kernel/rmi.c
>> +++ b/arch/arm64/kernel/rmi.c
>> @@ -48,6 +48,365 @@ int rmi_undelegate_range(phys_addr_t phys, unsigned long size)
>>  	return ret;
>>  }
>>  
>> +static unsigned long donate_req_to_size(unsigned long donatereq)
>> +{
>> +	unsigned long unit_size = RMI_DONATE_SIZE(donatereq);
>> +
>> +	switch (unit_size) {
>> +	case 0:
>> +		return PAGE_SIZE;
>> +	case 1:
>> +		return PMD_SIZE;
>> +	case 2:
>> +		return PUD_SIZE;
>> +	case 3:
>> +		return P4D_SIZE;
> 
> How does this work when we have folded levels? If this is supposed to
> be the architected size, then it should actively express that:
> 
> 	return BIT(unit_size * (PAGE_SHIFT - 3) + PAGE_SHIFT);

It doesn't work (as Gavin also pointed out). There's an existing macro
to make this even cleaner:

return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(3 - unit_size));

>> +	}
>> +	unreachable();
>> +}
>> +
>> +static void rmi_smccc_invoke(struct arm_smccc_1_2_regs *regs_in,
>> +			     struct arm_smccc_1_2_regs *regs_out)
>> +{
>> +	struct arm_smccc_1_2_regs regs = *regs_in;
>> +	unsigned long status;
>> +
>> +	do {
>> +		arm_smccc_1_2_invoke(&regs, regs_out);
>> +		status = RMI_RETURN_STATUS(regs_out->a0);
>> +	} while (status == RMI_BUSY || status == RMI_BLOCKED);
>> +}
>> +
>> +int free_delegated_page(phys_addr_t phys)
>> +{
>> +	if (WARN_ON(rmi_undelegate_page(phys))) {
> 
> Please drop this WARN_ON(). Or at least make it ONCE. Everywhere.

Happy to change to WARN_ON_ONCE(). I think we should keep a WARN of some
sort as this is causing Linux to leak pages - it's definitely something
the sysadmin would want to know about.

>> +		/* Undelegate failed: leak the page */
>> +		return -EBUSY;
>> +	}
>> +
>> +	free_page((unsigned long)phys_to_virt(phys));
>> +
>> +	return 0;
>> +}
>> +
>> +static int rmi_sro_ensure_capacity(struct rmi_sro_state *sro,
>> +				   unsigned long count)
>> +{
>> +	if (WARN_ON_ONCE(sro->addr_count > RMI_MAX_ADDR_LIST))
>> +		return -EOVERFLOW;
>> +
>> +	if (count > RMI_MAX_ADDR_LIST - sro->addr_count)
>> +		return -ENOSPC;
>> +
>> +	return 0;
>> +}
>> +
>> +static int rmi_sro_donate_contig(struct rmi_sro_state *sro,
>> +				 unsigned long sro_handle,
>> +				 unsigned long donatereq,
>> +				 struct arm_smccc_1_2_regs *out_regs,
>> +				 gfp_t gfp)
>> +{
>> +	unsigned long unit_size = RMI_DONATE_SIZE(donatereq);
>> +	unsigned long unit_size_bytes = donate_req_to_size(donatereq);
>> +	unsigned long count = RMI_DONATE_COUNT(donatereq);
>> +	unsigned long state = RMI_DONATE_STATE(donatereq);
>> +	unsigned long size = unit_size_bytes * count;
>> +	unsigned long addr_range;
>> +	int ret;
>> +	void *virt;
>> +	phys_addr_t phys;
>> +	struct arm_smccc_1_2_regs regs = {
>> +		SMC_RMI_OP_MEM_DONATE,
>> +		sro_handle
>> +	};
>> +
>> +	for (int i = 0; i < sro->addr_count; i++) {
>> +		unsigned long entry = sro->addr_list[i];
>> +
>> +		if (RMI_ADDR_RANGE_SIZE(entry) == unit_size &&
>> +		    RMI_ADDR_RANGE_COUNT(entry) == count &&
>> +		    RMI_ADDR_RANGE_STATE(entry) == state) {
>> +			sro->addr_count--;
>> +			swap(sro->addr_list[sro->addr_count],
>> +			     sro->addr_list[i]);
>> +
>> +			goto out;
>> +		}
>> +	}
>> +
>> +	ret = rmi_sro_ensure_capacity(sro, 1);
>> +	if (ret)
>> +		return ret;
>> +
>> +	virt = alloc_pages_exact(size, gfp);
>> +	if (!virt)
>> +		return -ENOMEM;
>> +	phys = virt_to_phys(virt);
>> +
>> +	if (state == RMI_OP_MEM_DELEGATED) {
>> +		if (rmi_delegate_range(phys, size)) {
>> +			free_pages_exact(virt, size);
>> +			return -ENXIO;
>> +		}
>> +	}
>> +
>> +	addr_range = phys & RMI_ADDR_RANGE_ADDR_MASK;
>> +	FIELD_MODIFY(RMI_ADDR_RANGE_SIZE_MASK, &addr_range, unit_size);
>> +	FIELD_MODIFY(RMI_ADDR_RANGE_COUNT_MASK, &addr_range, count);
>> +	FIELD_MODIFY(RMI_ADDR_RANGE_STATE_MASK, &addr_range, state);
>> +
>> +	sro->addr_list[sro->addr_count] = addr_range;
>> +
> 
> Shouldn't this be moved to a helper that ensures capacity, and returns
> an error otherwise?

I'm not sure quite what you are suggesting. I already have a
rmi_sro_ensure_capacity() helper. By this point we know there's space.

>> +out:
>> +	regs.a2 = virt_to_phys(&sro->addr_list[sro->addr_count]);
>> +	regs.a3 = 1;
> 
> This could really do with context specific helpers that populate regs
> based on a set of parameters. I have no idea what this 1 here is, and
> the init is spread over too much code. Think of the children!
> 
> That's valid for the whole patch.

That's a good point. SRO is a bit tricky because I wanted the actual SMC
call to be done in one place so we can handle all the RMI_INCOMPLETE
cases together. But I could certainly add some helpers to setup the
registers rather than assigning directly to regs.a<n>.

Thanks,
Steve

> 	M.
>> +	rmi_smccc_invoke(&regs, out_regs);
>> +
>> +	unsigned long donated_granules = out_regs->a1;
>> +	unsigned long donated_size = donated_granules << PAGE_SHIFT;
>> +
>> +	if (donated_granules == 0) {
>> +		/* No pages used by the RMM */
>> +		sro->addr_count++;
>> +	} else if (donated_size < size) {
>> +		phys = sro->addr_list[sro->addr_count] & RMI_ADDR_RANGE_ADDR_MASK;
>> +
>> +		/* Not all granules used by the RMM, free the remaining pages */
>> +		for (long i = donated_size; i < size; i += PAGE_SIZE) {
>> +			if (state == RMI_OP_MEM_DELEGATED)
>> +				free_delegated_page(phys + i);
>> +			else
>> +				__free_page(phys_to_page(phys + i));
>> +		}
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +static int rmi_sro_donate_noncontig(struct rmi_sro_state *sro,
>> +				    unsigned long sro_handle,
>> +				    unsigned long donatereq,
>> +				    struct arm_smccc_1_2_regs *out_regs,
>> +				    gfp_t gfp)
>> +{
>> +	unsigned long unit_size = RMI_DONATE_SIZE(donatereq);
>> +	unsigned long unit_size_bytes = donate_req_to_size(donatereq);
>> +	unsigned long count = RMI_DONATE_COUNT(donatereq);
>> +	unsigned long state = RMI_DONATE_STATE(donatereq);
>> +	unsigned long found = 0;
>> +	unsigned long addr_list_start = sro->addr_count;
>> +	int ret;
>> +	struct arm_smccc_1_2_regs regs = {
>> +		SMC_RMI_OP_MEM_DONATE,
>> +		sro_handle
>> +	};
>> +
>> +	for (int i = 0; i < addr_list_start && found < count; i++) {
>> +		unsigned long entry = sro->addr_list[i];
>> +
>> +		if (RMI_ADDR_RANGE_SIZE(entry) == unit_size &&
>> +		    RMI_ADDR_RANGE_COUNT(entry) == 1 &&
>> +		    RMI_ADDR_RANGE_STATE(entry) == state) {
>> +			addr_list_start--;
>> +			swap(sro->addr_list[addr_list_start],
>> +			     sro->addr_list[i]);
>> +			found++;
>> +			i--;
>> +		}
>> +	}
>> +
>> +	ret = rmi_sro_ensure_capacity(sro, count - found);
>> +	if (ret)
>> +		return ret;
>> +
>> +	while (found < count) {
>> +		unsigned long addr_range;
>> +		void *virt = alloc_pages_exact(unit_size_bytes, gfp);
>> +		phys_addr_t phys;
>> +
>> +		if (!virt)
>> +			return -ENOMEM;
>> +
>> +		phys = virt_to_phys(virt);
>> +
>> +		if (state == RMI_OP_MEM_DELEGATED) {
>> +			if (rmi_delegate_range(phys, unit_size_bytes)) {
>> +				free_pages_exact(virt, unit_size_bytes);
>> +				return -ENXIO;
>> +			}
>> +		}
>> +
>> +		addr_range = phys & RMI_ADDR_RANGE_ADDR_MASK;
>> +		FIELD_MODIFY(RMI_ADDR_RANGE_SIZE_MASK, &addr_range, unit_size);
>> +		FIELD_MODIFY(RMI_ADDR_RANGE_COUNT_MASK, &addr_range, 1);
>> +		FIELD_MODIFY(RMI_ADDR_RANGE_STATE_MASK, &addr_range, state);
>> +
>> +		sro->addr_list[sro->addr_count++] = addr_range;
>> +		found++;
>> +	}
>> +
>> +	regs.a2 = virt_to_phys(&sro->addr_list[addr_list_start]);
>> +	regs.a3 = found;
>> +	rmi_smccc_invoke(&regs, out_regs);
>> +
>> +	unsigned long donated_granules = out_regs->a1;
>> +
>> +	if (WARN_ON(donated_granules & ((unit_size_bytes >> PAGE_SHIFT) - 1))) {
>> +		/*
>> +		 * FIXME: RMM has only consumed part of a huge page, this leaks
>> +		 * the rest of the huge page
>> +		 */
>> +		donated_granules = ALIGN(donated_granules,
>> +					 (unit_size_bytes >> PAGE_SHIFT));
>> +	}
>> +	unsigned long donated_blocks = donated_granules / (unit_size_bytes >> PAGE_SHIFT);
>> +
>> +	if (WARN_ON(donated_blocks > found))
>> +		donated_blocks = found;
>> +
>> +	unsigned long undonated_blocks = found - donated_blocks;
>> +
>> +	while (donated_blocks && undonated_blocks) {
>> +		sro->addr_count--;
>> +		swap(sro->addr_list[addr_list_start],
>> +		     sro->addr_list[sro->addr_count]);
>> +		addr_list_start++;
>> +
>> +		donated_blocks--;
>> +		undonated_blocks--;
>> +	}
>> +	sro->addr_count -= donated_blocks;
>> +
>> +	return 0;
>> +}
>> +
>> +static int rmi_sro_donate(struct rmi_sro_state *sro,
>> +			  unsigned long sro_handle,
>> +			  unsigned long donatereq,
>> +			  struct arm_smccc_1_2_regs *regs,
>> +			  gfp_t gfp)
>> +{
>> +	unsigned long count = RMI_DONATE_COUNT(donatereq);
>> +
>> +	if (WARN_ON(!count))
>> +		return 0;
>> +
>> +	if (RMI_DONATE_CONTIG(donatereq)) {
>> +		return rmi_sro_donate_contig(sro, sro_handle, donatereq,
>> +					     regs, gfp);
>> +	} else {
>> +		return rmi_sro_donate_noncontig(sro, sro_handle, donatereq,
>> +						regs, gfp);
>> +	}
>> +}
>> +
>> +static int rmi_sro_reclaim(struct rmi_sro_state *sro,
>> +			   unsigned long sro_handle,
>> +			   struct arm_smccc_1_2_regs *out_regs)
>> +{
>> +	unsigned long capacity;
>> +	struct arm_smccc_1_2_regs regs;
>> +	int ret;
>> +
>> +	ret = rmi_sro_ensure_capacity(sro, 1);
>> +	if (ret)
>> +		rmi_sro_free(sro);
>> +
>> +	capacity = RMI_MAX_ADDR_LIST - sro->addr_count;
>> +
>> +	regs = (struct arm_smccc_1_2_regs){
>> +		SMC_RMI_OP_MEM_RECLAIM,
>> +		sro_handle,
>> +		virt_to_phys(&sro->addr_list[sro->addr_count]),
>> +		capacity
>> +	};
>> +	rmi_smccc_invoke(&regs, out_regs);
>> +
>> +	if (WARN_ON_ONCE(out_regs->a1 > capacity))
>> +		out_regs->a1 = capacity;
>> +
>> +	sro->addr_count += out_regs->a1;
>> +
>> +	return 0;
>> +}
>> +
>> +void rmi_sro_free(struct rmi_sro_state *sro)
>> +{
>> +	for (int i = 0; i < sro->addr_count; i++) {
>> +		unsigned long entry = sro->addr_list[i];
>> +		unsigned long addr = RMI_ADDR_RANGE_ADDR(entry);
>> +		unsigned long unit_size = RMI_ADDR_RANGE_SIZE(entry);
>> +		unsigned long count = RMI_ADDR_RANGE_COUNT(entry);
>> +		unsigned long state = RMI_ADDR_RANGE_STATE(entry);
>> +		unsigned long size = donate_req_to_size(unit_size) * count;
>> +
>> +		if (state == RMI_OP_MEM_DELEGATED) {
>> +			if (WARN_ON(rmi_undelegate_range(addr, size))) {
>> +				/* Leak the pages */
>> +				continue;
>> +			}
>> +		}
>> +		free_pages_exact(phys_to_virt(addr), size);
>> +	}
>> +
>> +	sro->addr_count = 0;
>> +}
>> +
>> +unsigned long rmi_sro_execute(struct rmi_sro_state *sro, gfp_t gfp)
>> +{
>> +	unsigned long sro_handle;
>> +	struct arm_smccc_1_2_regs regs;
>> +	struct arm_smccc_1_2_regs *regs_in = &sro->regs;
>> +
>> +	rmi_smccc_invoke(regs_in, &regs);
>> +
>> +	sro_handle = regs.a1;
>> +
>> +	while (RMI_RETURN_STATUS(regs.a0) == RMI_INCOMPLETE) {
>> +		bool can_cancel = RMI_RETURN_CAN_CANCEL(regs.a0);
>> +		int ret;
>> +
>> +		switch (RMI_RETURN_MEMREQ(regs.a0)) {
>> +		case RMI_OP_MEM_REQ_NONE:
>> +			regs = (struct arm_smccc_1_2_regs){
>> +				SMC_RMI_OP_CONTINUE, sro_handle, 0
>> +			};
>> +			rmi_smccc_invoke(&regs, &regs);
>> +			break;
>> +		case RMI_OP_MEM_REQ_DONATE:
>> +			ret = rmi_sro_donate(sro, sro_handle, regs.a2, &regs,
>> +					     gfp);
>> +			break;
>> +		case RMI_OP_MEM_REQ_RECLAIM:
>> +			ret = rmi_sro_reclaim(sro, sro_handle, &regs);
>> +			break;
>> +		default:
>> +			ret = WARN_ON(1);
>> +			break;
>> +		}
>> +
>> +		if (ret) {
>> +			if (can_cancel) {
>> +				/*
>> +				 * FIXME: Handle cancelling properly!
>> +				 *
>> +				 * If the operation has failed due to memory
>> +				 * allocation failure then the information on
>> +				 * the memory allocation should be saved, so
>> +				 * that the allocation can be repeated outside
>> +				 * of any context which prevented the
>> +				 * allocation.
> 
> Honestly, this is the sort of stuff that I'd expect to be solved
> *before* posting this code. Since this is so central to the whole
> memory management, it needs to be correct from day-1.
> 
> If you can't make it work in time, then tone the supported features
> down. But FIXMEs and WARN_ONs are not the way to go.
> 
> 	M.
> 


^ permalink raw reply

* Re: [PATCH v14 10/44] arm64: RMI: Add support for SRO
From: Steven Price @ 2026-06-04 15:19 UTC (permalink / raw)
  To: Gavin Shan, kvm, kvmarm
  Cc: Catalin Marinas, Marc Zyngier, Will Deacon, James Morse,
	Oliver Upton, Suzuki K Poulose, Zenghui Yu, linux-arm-kernel,
	linux-kernel, Joey Gouly, Alexandru Elisei, Christoffer Dall,
	Fuad Tabba, linux-coco, Ganapatrao Kulkarni, Shanker Donthineni,
	Alper Gun, Aneesh Kumar K . V, Emi Kisanuki, Vishal Annapurve,
	WeiLin.Chang, Lorenzo.Pieralisi2
In-Reply-To: <872ce762-0a83-42a4-bbe8-b21ea6f4e1cf@redhat.com>

On 21/05/2026 05:38, Gavin Shan wrote:
> Hi Steven,
> 
> On 5/13/26 11:17 PM, Steven Price wrote:
>> RMM v2.0 introduces the concept of "Stateful RMI Operations" (SRO). This
>> means that an SMC can return with an operation still in progress. The
>> host is excepted to continue the operation until is reaches a conclusion
>> (either success or failure). During this process the RMM can request
>> additional memory ('donate') or hand memory back to the host
>> ('reclaim'). The host can request an in progress operation is cancelled,
>> but still continue the operation until it has completed (otherwise the
>> incomplete operation may cause future RMM operations to fail).
>>
>> The SRO is tracked using a struct rmi_sro_state object which keeps track
>> of any memory which has been allocated but not yet consumed by the RMM
>> or reclaimed from the RMM. This allows the memory to be reused in a
>> future request within the same operation. It will also permit an
>> operation to be done in a context where memory allocation may be
>> difficult (e.g. atomic context) with the option to abort the operation
>> and retry the memory allocation outside of the atomic context. The
>> memory stored in the struct rmi_sro_state object can then be reused on
>> the subsequent attempt.
>>
>> Signed-off-by: Steven Price <steven.price@arm.com>
>> ---
>> v14:
>>   * SRO support has improved although is still not fully complete. The
>>     infrastructure has been moved out of KVM.
>> ---
>>   arch/arm64/include/asm/rmi_cmds.h |   1 +
>>   arch/arm64/kernel/rmi.c           | 359 ++++++++++++++++++++++++++++++
>>   2 files changed, 360 insertions(+)
>>
>> diff --git a/arch/arm64/include/asm/rmi_cmds.h b/arch/arm64/include/
>> asm/rmi_cmds.h
>> index eb213c8e6f26..1a7b0c8f1e38 100644
>> --- a/arch/arm64/include/asm/rmi_cmds.h
>> +++ b/arch/arm64/include/asm/rmi_cmds.h
>> @@ -35,6 +35,7 @@ struct rmi_sro_state {
>>     int rmi_delegate_range(phys_addr_t phys, unsigned long size);
>>   int rmi_undelegate_range(phys_addr_t phys, unsigned long size);
>> +int free_delegated_page(phys_addr_t phys);
>>     static inline int rmi_delegate_page(phys_addr_t phys)
>>   {
>> diff --git a/arch/arm64/kernel/rmi.c b/arch/arm64/kernel/rmi.c
>> index 08cef54acadb..a8107ca9bb6d 100644
>> --- a/arch/arm64/kernel/rmi.c
>> +++ b/arch/arm64/kernel/rmi.c
>> @@ -48,6 +48,365 @@ int rmi_undelegate_range(phys_addr_t phys,
>> unsigned long size)
>>       return ret;
>>   }
>>   +static unsigned long donate_req_to_size(unsigned long donatereq)
>> +{
>> +    unsigned long unit_size = RMI_DONATE_SIZE(donatereq);
>> +
>> +    switch (unit_size) {
>> +    case 0:
>> +        return PAGE_SIZE;
>> +    case 1:
>> +        return PMD_SIZE;
>> +    case 2:
>> +        return PUD_SIZE;
>> +    case 3:
>> +        return P4D_SIZE;
>> +    }
>> +    unreachable();
>> +}
>> +
> 
> It's worthy to have 'inline'.

Generally it's best to let the compiler make the decision. A small
static function like this is highly likely to be inlined by the compiler
already.

The exception of course is when the function is in a header and then
it's necessary to use "static inline".

> {P4D, PUD, PMD}_SIZE can be equal if there> are
> no P4D and PUD, depending on CONFIG_PGTABLE_LEVELS. In this case, can the
> 'unit_size' be translated to wrong value?

Technically yes. I think I can actually rewrite this more simply as:

static unsigned long donate_req_to_size(unsigned long donatereq)
{
	unsigned long unit_size = RMI_DONATE_SIZE(donatereq);

	return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(3 - unit_size));
}

which neatly sidesteps the CONFIG_PGTABLE_LEVELS problem too.

>> +static void rmi_smccc_invoke(struct arm_smccc_1_2_regs *regs_in,
>> +                 struct arm_smccc_1_2_regs *regs_out)
>> +{
>> +    struct arm_smccc_1_2_regs regs = *regs_in;
>> +    unsigned long status;
>> +
>> +    do {
>> +        arm_smccc_1_2_invoke(&regs, regs_out);
>> +        status = RMI_RETURN_STATUS(regs_out->a0);
>> +    } while (status == RMI_BUSY || status == RMI_BLOCKED);
>> +}
>> +
>> +int free_delegated_page(phys_addr_t phys)
>> +{
>> +    if (WARN_ON(rmi_undelegate_page(phys))) {
>> +        /* Undelegate failed: leak the page */
>> +        return -EBUSY;
>> +    }
>> +
>> +    free_page((unsigned long)phys_to_virt(phys));
>> +
>> +    return 0;
>> +}
>> +
>> +static int rmi_sro_ensure_capacity(struct rmi_sro_state *sro,
>> +                   unsigned long count)
>> +{
>> +    if (WARN_ON_ONCE(sro->addr_count > RMI_MAX_ADDR_LIST))
>> +        return -EOVERFLOW;
>> +
>> +    if (count > RMI_MAX_ADDR_LIST - sro->addr_count)
>> +        return -ENOSPC;
>> +
>> +    return 0;
>> +}
>> +
>> +static int rmi_sro_donate_contig(struct rmi_sro_state *sro,
>> +                 unsigned long sro_handle,
>> +                 unsigned long donatereq,
>> +                 struct arm_smccc_1_2_regs *out_regs,
>> +                 gfp_t gfp)
>> +{
>> +    unsigned long unit_size = RMI_DONATE_SIZE(donatereq);
>> +    unsigned long unit_size_bytes = donate_req_to_size(donatereq);
>> +    unsigned long count = RMI_DONATE_COUNT(donatereq);
>> +    unsigned long state = RMI_DONATE_STATE(donatereq);
>> +    unsigned long size = unit_size_bytes * count;
>> +    unsigned long addr_range;
>> +    int ret;
>> +    void *virt;
>> +    phys_addr_t phys;
>> +    struct arm_smccc_1_2_regs regs = {
>> +        SMC_RMI_OP_MEM_DONATE,
>> +        sro_handle
>> +    };
>> +
>> +    for (int i = 0; i < sro->addr_count; i++) {
>> +        unsigned long entry = sro->addr_list[i];
>> +
>> +        if (RMI_ADDR_RANGE_SIZE(entry) == unit_size &&
>> +            RMI_ADDR_RANGE_COUNT(entry) == count &&
>> +            RMI_ADDR_RANGE_STATE(entry) == state) {
>> +            sro->addr_count--;
>> +            swap(sro->addr_list[sro->addr_count],
>> +                 sro->addr_list[i]);
>> +
>> +            goto out;
>> +        }
>> +    }
>> +
>> +    ret = rmi_sro_ensure_capacity(sro, 1);
>> +    if (ret)
>> +        return ret;
>> +
>> +    virt = alloc_pages_exact(size, gfp);
>> +    if (!virt)
>> +        return -ENOMEM;
>> +    phys = virt_to_phys(virt);
>> +
> 
> alloc_pages_exact() will fail if the requested size exceeds the maximal
> allowed
> size (1 << MAX_PAGE_ORDER). The maximal size is usually smaller than
> PUD_SIZE
> but PUD_SIZE is allowed by the RMM.

This is an area where to be honest I'm really not sure what to do.
Technically the RMM is allowed to ask for a contiguous range of 512GB
pages (on a 4K system - larger with larger page sizes) - but clearly no
real OS is going to be able to provide anything like that.

In practise we don't expect the RMM to do anything so crazy. It's not
really clear to be whether even 2MB (PMD_SIZE) is needed. But the spec
is written to be generic.

So my current approach is to calculate the required size and pass it
into alloc_pages_exact(). For "stupidly large" values this will fail and
Linux just doesn't support an RMM which attempts this. If there is ever
a usecase which needs this then we'd need to find a different method of
providing the memory (most likely some form of carveout to avoid
fragmentation). But my view is we should wait for that usecase to be
identified first.

>> +    if (state == RMI_OP_MEM_DELEGATED) {
>> +        if (rmi_delegate_range(phys, size)) {
>> +            free_pages_exact(virt, size);
>> +            return -ENXIO;
>> +        }
>> +    }
>> +
>> +    addr_range = phys & RMI_ADDR_RANGE_ADDR_MASK;
>> +    FIELD_MODIFY(RMI_ADDR_RANGE_SIZE_MASK, &addr_range, unit_size);
>> +    FIELD_MODIFY(RMI_ADDR_RANGE_COUNT_MASK, &addr_range, count);
>> +    FIELD_MODIFY(RMI_ADDR_RANGE_STATE_MASK, &addr_range, state);
>> +
>> +    sro->addr_list[sro->addr_count] = addr_range;
>> +
>> +out:
>> +    regs.a2 = virt_to_phys(&sro->addr_list[sro->addr_count]);
>> +    regs.a3 = 1;
>> +    rmi_smccc_invoke(&regs, out_regs);
>> +
>> +    unsigned long donated_granules = out_regs->a1;
>> +    unsigned long donated_size = donated_granules << PAGE_SHIFT;
>> +
>> +    if (donated_granules == 0) {
>> +        /* No pages used by the RMM */
>> +        sro->addr_count++;
>> +    } else if (donated_size < size) {
>> +        phys = sro->addr_list[sro->addr_count] &
>> RMI_ADDR_RANGE_ADDR_MASK;
>> +
>> +        /* Not all granules used by the RMM, free the remaining pages */
>> +        for (long i = donated_size; i < size; i += PAGE_SIZE) {
>> +            if (state == RMI_OP_MEM_DELEGATED)
>> +                free_delegated_page(phys + i);
>> +            else
>> +                __free_page(phys_to_page(phys + i));
>> +        }
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +static int rmi_sro_donate_noncontig(struct rmi_sro_state *sro,
>> +                    unsigned long sro_handle,
>> +                    unsigned long donatereq,
>> +                    struct arm_smccc_1_2_regs *out_regs,
>> +                    gfp_t gfp)
>> +{
>> +    unsigned long unit_size = RMI_DONATE_SIZE(donatereq);
>> +    unsigned long unit_size_bytes = donate_req_to_size(donatereq);
>> +    unsigned long count = RMI_DONATE_COUNT(donatereq);
>> +    unsigned long state = RMI_DONATE_STATE(donatereq);
>> +    unsigned long found = 0;
>> +    unsigned long addr_list_start = sro->addr_count;
>> +    int ret;
>> +    struct arm_smccc_1_2_regs regs = {
>> +        SMC_RMI_OP_MEM_DONATE,
>> +        sro_handle
>> +    };
>> +
>> +    for (int i = 0; i < addr_list_start && found < count; i++) {
>> +        unsigned long entry = sro->addr_list[i];
>> +
>> +        if (RMI_ADDR_RANGE_SIZE(entry) == unit_size &&
>> +            RMI_ADDR_RANGE_COUNT(entry) == 1 &&
>> +            RMI_ADDR_RANGE_STATE(entry) == state) {
>> +            addr_list_start--;
>> +            swap(sro->addr_list[addr_list_start],
>> +                 sro->addr_list[i]);
>> +            found++;
>> +            i--;
>> +        }
>> +    }
>> +
>> +    ret = rmi_sro_ensure_capacity(sro, count - found);
>> +    if (ret)
>> +        return ret;
>> +
>> +    while (found < count) {
>> +        unsigned long addr_range;
>> +        void *virt = alloc_pages_exact(unit_size_bytes, gfp);
>> +        phys_addr_t phys;
>> +
>> +        if (!virt)
>> +            return -ENOMEM;
>> +
>> +        phys = virt_to_phys(virt);
>> +
>> +        if (state == RMI_OP_MEM_DELEGATED) {
>> +            if (rmi_delegate_range(phys, unit_size_bytes)) {
>> +                free_pages_exact(virt, unit_size_bytes);
>> +                return -ENXIO;
>> +            }
>> +        }
>> +
>> +        addr_range = phys & RMI_ADDR_RANGE_ADDR_MASK;
>> +        FIELD_MODIFY(RMI_ADDR_RANGE_SIZE_MASK, &addr_range, unit_size);
>> +        FIELD_MODIFY(RMI_ADDR_RANGE_COUNT_MASK, &addr_range, 1);
>> +        FIELD_MODIFY(RMI_ADDR_RANGE_STATE_MASK, &addr_range, state);
>> +
>> +        sro->addr_list[sro->addr_count++] = addr_range;
>> +        found++;
>> +    }
>> +
>> +    regs.a2 = virt_to_phys(&sro->addr_list[addr_list_start]);
>> +    regs.a3 = found;
>> +    rmi_smccc_invoke(&regs, out_regs);
>> +
>> +    unsigned long donated_granules = out_regs->a1;
>> +
>> +    if (WARN_ON(donated_granules & ((unit_size_bytes >> PAGE_SHIFT) -
>> 1))) {
>> +        /*
>> +         * FIXME: RMM has only consumed part of a huge page, this leaks
>> +         * the rest of the huge page
>> +         */
>> +        donated_granules = ALIGN(donated_granules,
>> +                     (unit_size_bytes >> PAGE_SHIFT));
>> +    }
>> +    unsigned long donated_blocks = donated_granules /
>> (unit_size_bytes >> PAGE_SHIFT);
>> +
>> +    if (WARN_ON(donated_blocks > found))
>> +        donated_blocks = found;
>> +
>> +    unsigned long undonated_blocks = found - donated_blocks;
>> +
>> +    while (donated_blocks && undonated_blocks) {
>> +        sro->addr_count--;
>> +        swap(sro->addr_list[addr_list_start],
>> +             sro->addr_list[sro->addr_count]);
>> +        addr_list_start++;
>> +
>> +        donated_blocks--;
>> +        undonated_blocks--;
>> +    }
>> +    sro->addr_count -= donated_blocks;
>> +
>> +    return 0;
>> +}
>> +
>> +static int rmi_sro_donate(struct rmi_sro_state *sro,
>> +              unsigned long sro_handle,
>> +              unsigned long donatereq,
>> +              struct arm_smccc_1_2_regs *regs,
>> +              gfp_t gfp)
>> +{
>> +    unsigned long count = RMI_DONATE_COUNT(donatereq);
>> +
>> +    if (WARN_ON(!count))
>> +        return 0;
>> +
>> +    if (RMI_DONATE_CONTIG(donatereq)) {
>> +        return rmi_sro_donate_contig(sro, sro_handle, donatereq,
>> +                         regs, gfp);
>> +    } else {
>> +        return rmi_sro_donate_noncontig(sro, sro_handle, donatereq,
>> +                        regs, gfp);
>> +    }
>> +}
>> +
>> +static int rmi_sro_reclaim(struct rmi_sro_state *sro,
>> +               unsigned long sro_handle,
>> +               struct arm_smccc_1_2_regs *out_regs)
>> +{
>> +    unsigned long capacity;
>> +    struct arm_smccc_1_2_regs regs;
>> +    int ret;
>> +
>> +    ret = rmi_sro_ensure_capacity(sro, 1);
>> +    if (ret)
>> +        rmi_sro_free(sro);
>> +
>> +    capacity = RMI_MAX_ADDR_LIST - sro->addr_count;
>> +
>> +    regs = (struct arm_smccc_1_2_regs){
>> +        SMC_RMI_OP_MEM_RECLAIM,
>> +        sro_handle,
>> +        virt_to_phys(&sro->addr_list[sro->addr_count]),
>> +        capacity
>> +    };
>> +    rmi_smccc_invoke(&regs, out_regs);
>> +
>> +    if (WARN_ON_ONCE(out_regs->a1 > capacity))
>> +        out_regs->a1 = capacity;
>> +
>> +    sro->addr_count += out_regs->a1;
>> +
>> +    return 0;
>> +}
>> +
>> +void rmi_sro_free(struct rmi_sro_state *sro)
>> +{
>> +    for (int i = 0; i < sro->addr_count; i++) {
>> +        unsigned long entry = sro->addr_list[i];
>> +        unsigned long addr = RMI_ADDR_RANGE_ADDR(entry);
>> +        unsigned long unit_size = RMI_ADDR_RANGE_SIZE(entry);
>> +        unsigned long count = RMI_ADDR_RANGE_COUNT(entry);
>> +        unsigned long state = RMI_ADDR_RANGE_STATE(entry);
>> +        unsigned long size = donate_req_to_size(unit_size) * count;
>> +
>> +        if (state == RMI_OP_MEM_DELEGATED) {
>> +            if (WARN_ON(rmi_undelegate_range(addr, size))) {
>> +                /* Leak the pages */
>> +                continue;
>> +            }
>> +        }
>> +        free_pages_exact(phys_to_virt(addr), size);
>> +    }
>> +
>> +    sro->addr_count = 0;
>> +}
>> +
>> +unsigned long rmi_sro_execute(struct rmi_sro_state *sro, gfp_t gfp)
>> +{
>> +    unsigned long sro_handle;
>> +    struct arm_smccc_1_2_regs regs;
>> +    struct arm_smccc_1_2_regs *regs_in = &sro->regs;
>> +
>> +    rmi_smccc_invoke(regs_in, &regs);
>> +
>> +    sro_handle = regs.a1;
>> +
>> +    while (RMI_RETURN_STATUS(regs.a0) == RMI_INCOMPLETE) {
>> +        bool can_cancel = RMI_RETURN_CAN_CANCEL(regs.a0);
>> +        int ret;
>> +
>> +        switch (RMI_RETURN_MEMREQ(regs.a0)) {
>> +        case RMI_OP_MEM_REQ_NONE:
>> +            regs = (struct arm_smccc_1_2_regs){
>> +                SMC_RMI_OP_CONTINUE, sro_handle, 0
>> +            };
>> +            rmi_smccc_invoke(&regs, &regs);
>> +            break;
> 
> 'ret' isn't initialized for case RMI_OP_MEM_REQ_NONE.

Good spot - ret should be initialised to 0.

Thanks,
Steve

>> +        case RMI_OP_MEM_REQ_DONATE:
>> +            ret = rmi_sro_donate(sro, sro_handle, regs.a2, &regs,
>> +                         gfp);
>> +            break;
>> +        case RMI_OP_MEM_REQ_RECLAIM:
>> +            ret = rmi_sro_reclaim(sro, sro_handle, &regs);
>> +            break;
>> +        default:
>> +            ret = WARN_ON(1);
>> +            break;
>> +        }
>> +
>> +        if (ret) {
>> +            if (can_cancel) {
>> +                /*
>> +                 * FIXME: Handle cancelling properly!
>> +                 *
>> +                 * If the operation has failed due to memory
>> +                 * allocation failure then the information on
>> +                 * the memory allocation should be saved, so
>> +                 * that the allocation can be repeated outside
>> +                 * of any context which prevented the
>> +                 * allocation.
>> +                 */
>> +            }
>> +            if (WARN_ON(ret))
>> +                return ret;
>> +        }
>> +    }
>> +
>> +    return regs.a0;
>> +}
>> +
>>   static int rmi_check_version(void)
>>   {
>>       struct arm_smccc_res res;
> 
> Thanks,
> Gavin
> 


^ permalink raw reply

* Re: [PATCH v14 10/44] arm64: RMI: Add support for SRO
From: Steven Price @ 2026-06-04 15:19 UTC (permalink / raw)
  To: Aneesh Kumar K.V, kvm, kvmarm
  Cc: Catalin Marinas, Marc Zyngier, Will Deacon, James Morse,
	Oliver Upton, Suzuki K Poulose, Zenghui Yu, linux-arm-kernel,
	linux-kernel, Joey Gouly, Alexandru Elisei, Christoffer Dall,
	Fuad Tabba, linux-coco, Ganapatrao Kulkarni, Gavin Shan,
	Shanker Donthineni, Alper Gun, Emi Kisanuki, Vishal Annapurve,
	WeiLin.Chang, Lorenzo.Pieralisi2
In-Reply-To: <yq5a7bp0szrw.fsf@kernel.org>

On 19/05/2026 07:02, Aneesh Kumar K.V wrote:
> Steven Price <steven.price@arm.com> writes:
> 
>> +static unsigned long donate_req_to_size(unsigned long donatereq)
>> +{
>> +	unsigned long unit_size = RMI_DONATE_SIZE(donatereq);
>> +
>> +	switch (unit_size) {
>> +	case 0:
>> +		return PAGE_SIZE;
>> +	case 1:
>> +		return PMD_SIZE;
>> +	case 2:
>> +		return PUD_SIZE;
>> +	case 3:
>> +		return P4D_SIZE;
>> +	}
>> +	unreachable();
>> +}
>>
>> +
>> +static void rmi_smccc_invoke(struct arm_smccc_1_2_regs *regs_in,
>> +			     struct arm_smccc_1_2_regs *regs_out)
>> +{
>> +	struct arm_smccc_1_2_regs regs = *regs_in;
>> +	unsigned long status;
>> +
>> +	do {
>> +		arm_smccc_1_2_invoke(&regs, regs_out);
>> +		status = RMI_RETURN_STATUS(regs_out->a0);
>> +	} while (status == RMI_BUSY || status == RMI_BLOCKED);
>> +}
>> +
>> +int free_delegated_page(phys_addr_t phys)
>> +{
>> +	if (WARN_ON(rmi_undelegate_page(phys))) {
>> +		/* Undelegate failed: leak the page */
>> +		return -EBUSY;
>> +	}
>> +
>> +	free_page((unsigned long)phys_to_virt(phys));
>> +
>> +	return 0;
>> +}
>> +
>> +static int rmi_sro_ensure_capacity(struct rmi_sro_state *sro,
>> +				   unsigned long count)
>> +{
>> +	if (WARN_ON_ONCE(sro->addr_count > RMI_MAX_ADDR_LIST))
>> +		return -EOVERFLOW;
>> +
>> +	if (count > RMI_MAX_ADDR_LIST - sro->addr_count)
>> +		return -ENOSPC;
>> +
>> +	return 0;
>> +}
>> +
>> +static int rmi_sro_donate_contig(struct rmi_sro_state *sro,
>> +				 unsigned long sro_handle,
>> +				 unsigned long donatereq,
>> +				 struct arm_smccc_1_2_regs *out_regs,
>> +				 gfp_t gfp)
>> +{
>> +	unsigned long unit_size = RMI_DONATE_SIZE(donatereq);
>> +	unsigned long unit_size_bytes = donate_req_to_size(donatereq);
>> +	unsigned long count = RMI_DONATE_COUNT(donatereq);
>> +	unsigned long state = RMI_DONATE_STATE(donatereq);
>> +	unsigned long size = unit_size_bytes * count;
>> +	unsigned long addr_range;
>>
> 
> Looking at above and the related code, I am wondering whether we should
> use u64 instead of unsigned long for everything that the specification
> defines as 64-bit.

I'm split on this. The kernel makes use of "(unsigned) long" for a
register sized value in quite a few places. Not least in struct
arm_smccc_1_2_regs which is ultimately where most of the values are read
from or written to.

I'm not a great fan of the kernel's approach to using long like this -
there's a good argument that uintptr_t is more correct. Equally when
we're in arch code for a 64 bit architecture (i.e. "arm64") then we know
the size is u64.

The disadvantage here is that if I use u64 then there's a bunch of
implicit conversions going on between unsigned long and u64 - which
might come back to bite if anything changes. Hence my current view that
"unsigned long" is the best option here in the kernel.

Anyone else have any view on the best type here?

Thanks,
Steve


^ permalink raw reply

* RE: [PATCH v5 05/20] dma-pool: track decrypted atomic pools and select them via attrs
From: Aneesh Kumar K.V @ 2026-06-04 14:57 UTC (permalink / raw)
  To: Michael Kelley, Jason Gunthorpe, Michael Kelley
  Cc: iommu@lists.linux.dev, linux-arm-kernel@lists.infradead.org,
	linux-kernel@vger.kernel.org, linux-coco@lists.linux.dev,
	Robin Murphy, Marek Szyprowski, Will Deacon, Marc Zyngier,
	Steven Price, Suzuki K Poulose, Catalin Marinas, Jiri Pirko,
	Mostafa Saleh, Petr Tesarik, Alexey Kardashevskiy, Dan Williams,
	Xu Yilun, linuxppc-dev@lists.ozlabs.org,
	linux-s390@vger.kernel.org, Madhavan Srinivasan, Michael Ellerman,
	Nicholas Piggin, Christophe Leroy (CS GROUP), Alexander Gordeev,
	Gerald Schaefer, Heiko Carstens, Vasily Gorbik,
	Christian Borntraeger, Sven Schnelle, x86@kernel.org, Jiri Pirko
In-Reply-To: <SN6PR02MB4157F94C902B78E55E99372DD4102@SN6PR02MB4157.namprd02.prod.outlook.com>

Michael Kelley <mhklinux@outlook.com> writes:

> From: Jason Gunthorpe <jgg@ziepe.ca> Sent: Tuesday, June 2, 2026 5:55 PM
>> 
>> On Tue, Jun 02, 2026 at 02:24:40PM +0000, Michael Kelley wrote:
>> 
>> > Except that in a normal VM, the "unencrypted" pool attribute does *not*
>> > describe the state of the memory itself.  In a normal VM, the memory is
>> > unencrypted, but the "unencrypted" pool attribute is false. That
>> > contradiction is the essence of my concern.
>> 
>> I would argue no..
>> 
>> When CC is enabled the default state of memory in a Linux environment
>> is "encrypted". You have to take a special action to "decrypt" it.
>> 
>> Thus the default state of memory in a non-CC environment is also
>> paradoxically "encrypted" too. 
>
> The need to have such an unnatural premise is usually an indication
> of a conceptual problem with the overall model, or perhaps just a
> terminology problem. 
>
> Here's a proposal. The new DMA attribute is DMA_ATTR_CC_SHARED.
> Name the pool attribute "cc_shared" instead of "unencrypted". Having
> "cc_shared" set to false in a normal VM doesn't lead to the non-sensical
> situation of claiming that a normal VM is encrypted. The boolean
> "unencrypted" parameter that has been added to various calls also
> becomes "cc_shared".  If "CC_SHARED" is a suitable name for the DMA
> attribute, it ought to be suitable as the pool attribute. And everything
> matches as well.
>

That is better. It would also simplify:

	if (mem->unencrypted != !!(attrs & DMA_ATTR_CC_SHARED))
		return NULL;

to
	if (mem->cc_shared != !!(attrs & DMA_ATTR_CC_SHARED))
		return NULL;


I already sent a v6 in the hope of getting this merged for the next
merge window. Should I send a v7, or would you prefer that I do the
rename on top of v6?

-aneesh

^ permalink raw reply

* [PATCH v4 3/3] x86/tdx: Fix zero-extension for 32-bit port I/O
From: Kiryl Shutsemau (Meta) @ 2026-06-04 14:47 UTC (permalink / raw)
  To: tglx, mingo, bp, dave.hansen
  Cc: seanjc, pbonzini, sathyanarayanan.kuppuswamy, kai.huang,
	xiaoyao.li, binbin.wu, rick.p.edgecombe, david.laight.linux, ak,
	djbw, tsyrulnikov.borys, x86, kvm, linux-coco, linux-kernel,
	Kiryl Shutsemau (Meta), stable
In-Reply-To: <cover.1780584300.git.kas@kernel.org>

According to x86 architecture rules, 32-bit operations zero-extend the
result to 64 bits. The current implementation of handle_in() only masks
the lower 32 bits, which preserves the upper 32 bits of RAX when a
32-bit port IN instruction is emulated.

Use insn_assign_reg() to write the result back into RAX with proper
partial-register-write semantics: 1- and 2-byte forms leave the upper
bits untouched, the 4-byte form zero-extends to the full register.

Fixes: 03149948832a ("x86/tdx: Port I/O: Add runtime hypercalls")
Reported-by: Borys Tsyrulnikov <tsyrulnikov.borys@gmail.com>
Link: https://lore.kernel.org/all/CAKw_Dz96rfSQc6Rn+9QBcUFHhmkK+9zu+P=bxowfZwxrATCBRg@mail.gmail.com/
Signed-off-by: Kiryl Shutsemau <kas@kernel.org>
Cc: stable@vger.kernel.org
---
 arch/x86/coco/tdx/tdx.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index 65119362f9a2..41cc23cc63dd 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -693,8 +693,8 @@ static bool handle_in(struct pt_regs *regs, int size, int port)
 		.r13 = PORT_READ,
 		.r14 = port,
 	};
-	u64 mask = GENMASK(BITS_PER_BYTE * size - 1, 0);
 	bool success;
+	u64 val;
 
 	/*
 	 * Emulate the I/O read via hypercall. More info about ABI can be found
@@ -702,11 +702,9 @@ static bool handle_in(struct pt_regs *regs, int size, int port)
 	 * "TDG.VP.VMCALL<Instruction.IO>".
 	 */
 	success = !__tdx_hypercall(&args);
+	val = success ? args.r11 : 0;
 
-	/* Update part of the register affected by the emulated instruction */
-	regs->ax &= ~mask;
-	if (success)
-		regs->ax |= args.r11 & mask;
+	insn_assign_reg(&regs->ax, val, size);
 
 	return success;
 }
-- 
2.54.0


^ permalink raw reply related

* [PATCH v4 2/3] x86/insn-eval: Add insn_assign_reg() helper
From: Kiryl Shutsemau (Meta) @ 2026-06-04 14:47 UTC (permalink / raw)
  To: tglx, mingo, bp, dave.hansen
  Cc: seanjc, pbonzini, sathyanarayanan.kuppuswamy, kai.huang,
	xiaoyao.li, binbin.wu, rick.p.edgecombe, david.laight.linux, ak,
	djbw, tsyrulnikov.borys, x86, kvm, linux-coco, linux-kernel,
	Kiryl Shutsemau (Meta)
In-Reply-To: <cover.1780584300.git.kas@kernel.org>

KVM's instruction emulator has a small helper, assign_register(), that
writes a value into a sub-register with x86 partial-register-write
semantics: 1- and 2-byte writes leave the upper bits of the destination
untouched, 4-byte writes zero-extend to 64 bits, 8-byte writes overwrite
the full register.

The TDX guest #VE handler needs the same logic for port I/O emulation
to get 32-bit zero-extension right. Rather than copy-pasting the helper,
lift it to <asm/insn-eval.h> as insn_assign_reg() so both can use it.

Rewrite the body using arithmetic instead of pointer punning so the
helper does not depend on -fno-strict-aliasing or little-endian byte
order, and add <asm/insn.h> to the header's includes so it builds
standalone in callers that have not pulled it in transitively.

No functional change.

Signed-off-by: Kiryl Shutsemau <kas@kernel.org>
---
 arch/x86/include/asm/insn-eval.h | 25 +++++++++++++++++++++++++
 arch/x86/kvm/emulate.c           | 26 ++++----------------------
 2 files changed, 29 insertions(+), 22 deletions(-)

diff --git a/arch/x86/include/asm/insn-eval.h b/arch/x86/include/asm/insn-eval.h
index 4733e9064ee5..85251e718a77 100644
--- a/arch/x86/include/asm/insn-eval.h
+++ b/arch/x86/include/asm/insn-eval.h
@@ -9,6 +9,7 @@
 #include <linux/compiler.h>
 #include <linux/bug.h>
 #include <linux/err.h>
+#include <asm/insn.h>
 #include <asm/ptrace.h>
 
 #define INSN_CODE_SEG_ADDR_SZ(params) ((params >> 4) & 0xf)
@@ -46,4 +47,28 @@ enum insn_mmio_type insn_decode_mmio(struct insn *insn, int *bytes);
 
 bool insn_is_nop(struct insn *insn);
 
+/*
+ * Write @val into *@reg with x86 partial-register-write semantics: a 1-
+ * or 2-byte write leaves the upper bits of the destination untouched; a
+ * 4-byte write zero-extends to 64 bits (matching IN[BWL], MOV[BWL]
+ * etc.); an 8-byte write overwrites the full register.
+ */
+static inline void insn_assign_reg(unsigned long *reg, u64 val, int bytes)
+{
+	switch (bytes) {
+	case 1:
+		*reg = (*reg & ~0xfful)   | (val & 0xff);
+		break;
+	case 2:
+		*reg = (*reg & ~0xfffful) | (val & 0xffff);
+		break;
+	case 4:
+		*reg = (u32)val;
+		break;
+	case 8:
+		*reg = val;
+		break;
+	}
+}
+
 #endif /* _ASM_X86_INSN_EVAL_H */
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 8013dccb3110..74972c17edb8 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -24,6 +24,7 @@
 #include "kvm_emulate.h"
 #include <linux/stringify.h>
 #include <asm/debugreg.h>
+#include <asm/insn-eval.h>
 #include <asm/nospec-branch.h>
 #include <asm/ibt.h>
 #include <asm/text-patching.h>
@@ -439,25 +440,6 @@ static void assign_masked(ulong *dest, ulong src, ulong mask)
 	*dest = (*dest & ~mask) | (src & mask);
 }
 
-static void assign_register(unsigned long *reg, u64 val, int bytes)
-{
-	/* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
-	switch (bytes) {
-	case 1:
-		*(u8 *)reg = (u8)val;
-		break;
-	case 2:
-		*(u16 *)reg = (u16)val;
-		break;
-	case 4:
-		*reg = (u32)val;
-		break;	/* 64b: zero-extend */
-	case 8:
-		*reg = val;
-		break;
-	}
-}
-
 static inline unsigned long ad_mask(struct x86_emulate_ctxt *ctxt)
 {
 	return (1UL << (ctxt->ad_bytes << 3)) - 1;
@@ -505,7 +487,7 @@ register_address_increment(struct x86_emulate_ctxt *ctxt, int reg, int inc)
 {
 	ulong *preg = reg_rmw(ctxt, reg);
 
-	assign_register(preg, *preg + inc, ctxt->ad_bytes);
+	insn_assign_reg(preg, *preg + inc, ctxt->ad_bytes);
 }
 
 static void rsp_increment(struct x86_emulate_ctxt *ctxt, int inc)
@@ -1766,7 +1748,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 
 static void write_register_operand(struct operand *op)
 {
-	return assign_register(op->addr.reg, op->val, op->bytes);
+	return insn_assign_reg(op->addr.reg, op->val, op->bytes);
 }
 
 static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op)
@@ -2007,7 +1989,7 @@ static int em_popa(struct x86_emulate_ctxt *ctxt)
 		rc = emulate_pop(ctxt, &val, ctxt->op_bytes);
 		if (rc != X86EMUL_CONTINUE)
 			break;
-		assign_register(reg_rmw(ctxt, reg), val, ctxt->op_bytes);
+		insn_assign_reg(reg_rmw(ctxt, reg), val, ctxt->op_bytes);
 		--reg;
 	}
 	return rc;
-- 
2.54.0


^ permalink raw reply related

* [PATCH v4 1/3] x86/tdx: Fix off-by-one in port I/O handling
From: Kiryl Shutsemau (Meta) @ 2026-06-04 14:46 UTC (permalink / raw)
  To: tglx, mingo, bp, dave.hansen
  Cc: seanjc, pbonzini, sathyanarayanan.kuppuswamy, kai.huang,
	xiaoyao.li, binbin.wu, rick.p.edgecombe, david.laight.linux, ak,
	djbw, tsyrulnikov.borys, x86, kvm, linux-coco, linux-kernel,
	Kiryl Shutsemau (Meta), stable
In-Reply-To: <cover.1780584300.git.kas@kernel.org>

handle_in() and handle_out() in arch/x86/coco/tdx/tdx.c use:

    u64 mask = GENMASK(BITS_PER_BYTE * size, 0);

GENMASK(h, l) includes bit h. For size=1 (INB), this produces
GENMASK(8, 0) = 0x1FF (9 bits) instead of GENMASK(7, 0) = 0xFF (8
bits). The mask is one bit too wide for all I/O sizes.

Fix the mask calculation.

Fixes: 03149948832a ("x86/tdx: Port I/O: Add runtime hypercalls")
Reported-by: Borys Tsyrulnikov <tsyrulnikov.borys@gmail.com>
Link: https://lore.kernel.org/all/CAKw_Dz96rfSQc6Rn+9QBcUFHhmkK+9zu+P=bxowfZwxrATCBRg@mail.gmail.com/
Signed-off-by: Kiryl Shutsemau (Meta) <kas@kernel.org>
Reviewed-by: Kai Huang <kai.huang@intel.com>
Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Cc: stable@vger.kernel.org
---
 arch/x86/coco/tdx/tdx.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index 186915a17c50..65119362f9a2 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -693,7 +693,7 @@ static bool handle_in(struct pt_regs *regs, int size, int port)
 		.r13 = PORT_READ,
 		.r14 = port,
 	};
-	u64 mask = GENMASK(BITS_PER_BYTE * size, 0);
+	u64 mask = GENMASK(BITS_PER_BYTE * size - 1, 0);
 	bool success;
 
 	/*
@@ -713,7 +713,7 @@ static bool handle_in(struct pt_regs *regs, int size, int port)
 
 static bool handle_out(struct pt_regs *regs, int size, int port)
 {
-	u64 mask = GENMASK(BITS_PER_BYTE * size, 0);
+	u64 mask = GENMASK(BITS_PER_BYTE * size - 1, 0);
 
 	/*
 	 * Emulate the I/O write via hypercall. More info about ABI can be found
-- 
2.54.0


^ permalink raw reply related

* [PATCH v4 0/3] x86/tdx: Fix port I/O handling bugs
From: Kiryl Shutsemau (Meta) @ 2026-06-04 14:46 UTC (permalink / raw)
  To: tglx, mingo, bp, dave.hansen
  Cc: seanjc, pbonzini, sathyanarayanan.kuppuswamy, kai.huang,
	xiaoyao.li, binbin.wu, rick.p.edgecombe, david.laight.linux, ak,
	djbw, tsyrulnikov.borys, x86, kvm, linux-coco, linux-kernel,
	Kiryl Shutsemau (Meta)

Two bugs in the TDX guest port I/O #VE emulation, plus a small helper
extracted from KVM to avoid open-coding partial-register-write logic
in the second fix.

Patch 1 is an off-by-one in the mask used to clip the I/O value:
GENMASK(BITS_PER_BYTE * size, 0) is one bit too wide. Unchanged from
v3 1/2.

Patch 2 lifts KVM's instruction-emulator helper assign_register() out
of arch/x86/kvm/emulate.c into <asm/insn-eval.h>, renamed to
insn_assign_reg(). Dave suggested consolidating rather than adding a
third copy of the same partial-register switch; the body is rewritten
using plain arithmetic (suggested by David Laight) so the helper does
not rely on -fno-strict-aliasing or little-endian byte order. KVM
behaviour is unchanged.

Patch 3 fixes the architectural zero-extension of 32-bit IN: the old
mask-based handle_in() preserves RAX[63:32] after inl, which is wrong.
Now done by calling the helper.

Changes since v3:
  - Patch 1/2 carried over unchanged as 1/3.
  - Helper extracted from KVM (new patch 2/3) and used from
    handle_in() (Dave, David Laight).
  - Reviewed-by tags from v3 2/2 dropped on patch 3/3 because the
    implementation changed substantially. v3 1/2 -> v4 1/3 Rb tags
    preserved (patch unchanged).

v3: https://lore.kernel.org/all/20260527120544.2903923-1-kas@kernel.org/

Kiryl Shutsemau (Meta) (3):
  x86/tdx: Fix off-by-one in port I/O handling
  x86/insn-eval: Add insn_assign_reg() helper
  x86/tdx: Fix zero-extension for 32-bit port I/O

 arch/x86/coco/tdx/tdx.c          | 10 ++++------
 arch/x86/include/asm/insn-eval.h | 25 +++++++++++++++++++++++++
 arch/x86/kvm/emulate.c           | 26 ++++----------------------
 3 files changed, 33 insertions(+), 28 deletions(-)

-- 
2.54.0

^ permalink raw reply

* Re: [PATCH v14 09/44] arm64: RMI: Provide functions to delegate/undelegate ranges of memory
From: Steven Price @ 2026-06-04 14:43 UTC (permalink / raw)
  To: Marc Zyngier
  Cc: kvm, kvmarm, Catalin Marinas, Will Deacon, James Morse,
	Oliver Upton, Suzuki K Poulose, Zenghui Yu, linux-arm-kernel,
	linux-kernel, Joey Gouly, Alexandru Elisei, Christoffer Dall,
	Fuad Tabba, linux-coco, Ganapatrao Kulkarni, Gavin Shan,
	Shanker Donthineni, Alper Gun, Aneesh Kumar K . V, Emi Kisanuki,
	Vishal Annapurve, WeiLin.Chang, Lorenzo.Pieralisi2
In-Reply-To: <867bowx3qx.wl-maz@kernel.org>

On 21/05/2026 14:59, Marc Zyngier wrote:
> On Wed, 13 May 2026 14:17:17 +0100,
> Steven Price <steven.price@arm.com> wrote:
>>
>> The RMM requires memory is 'delegated' to it so that it can be used
>> either for a realm guest or for various tracking purposes within the RMM
>> (e.g. for metadata or page tables). Memory that has been delegated
>> cannot be accessed by the host (it will result in a Granule Protection
>> Fault).
>>
>> Undelegation may fail if the memory is still in use by the RMM. This
>> shouldn't happen (Linux should ensure it has destroyed the RMM objects
>> before attempting to undelegate). In the event that it does happen this
>> points to a programming bug and the only reasonable approach is for the
>> physical pages to be leaked - it is up to the caller of
>> rmi_undelegate_range() to handle this.
>>
>> Signed-off-by: Steven Price <steven.price@arm.com>
>> ---
>> v14:
>>  * Split into separate patch and moved out of KVM
>> ---
>>  arch/arm64/include/asm/rmi_cmds.h | 13 +++++++++++
>>  arch/arm64/kernel/rmi.c           | 36 +++++++++++++++++++++++++++++++
>>  2 files changed, 49 insertions(+)
>>
>> diff --git a/arch/arm64/include/asm/rmi_cmds.h b/arch/arm64/include/asm/rmi_cmds.h
>> index 9078a2920a7c..eb213c8e6f26 100644
>> --- a/arch/arm64/include/asm/rmi_cmds.h
>> +++ b/arch/arm64/include/asm/rmi_cmds.h
>> @@ -33,6 +33,19 @@ struct rmi_sro_state {
>>  } while (RMI_RETURN_STATUS(res.a0) == RMI_BUSY ||			\
>>  	 RMI_RETURN_STATUS(res.a0) == RMI_BLOCKED)
>>  
>> +int rmi_delegate_range(phys_addr_t phys, unsigned long size);
>> +int rmi_undelegate_range(phys_addr_t phys, unsigned long size);
>> +
>> +static inline int rmi_delegate_page(phys_addr_t phys)
>> +{
>> +	return rmi_delegate_range(phys, PAGE_SIZE);
>> +}
>> +
>> +static inline int rmi_undelegate_page(phys_addr_t phys)
>> +{
>> +	return rmi_undelegate_range(phys, PAGE_SIZE);
>> +}
>> +
>>  bool rmi_is_available(void);
>>  
>>  unsigned long rmi_sro_execute(struct rmi_sro_state *sro, gfp_t gfp);
>> diff --git a/arch/arm64/kernel/rmi.c b/arch/arm64/kernel/rmi.c
>> index 52a415e99500..08cef54acadb 100644
>> --- a/arch/arm64/kernel/rmi.c
>> +++ b/arch/arm64/kernel/rmi.c
>> @@ -12,6 +12,42 @@ static bool arm64_rmi_is_available;
>>  unsigned long rmm_feat_reg0;
>>  unsigned long rmm_feat_reg1;
>>  
>> +int rmi_delegate_range(phys_addr_t phys, unsigned long size)
>> +{
>> +	unsigned long ret = 0;
>> +	unsigned long top = phys + size;
>> +	unsigned long out_top;
>> +
>> +	while (phys < top) {
>> +		ret = rmi_granule_range_delegate(phys, top, &out_top);
>> +		if (ret == RMI_SUCCESS)
>> +			phys = out_top;
>> +		else if (ret != RMI_BUSY && ret != RMI_BLOCKED)
>> +			return ret;
>> +	}
>> +
>> +	return ret;
>> +}
>> +
>> +int rmi_undelegate_range(phys_addr_t phys, unsigned long size)
>> +{
>> +	unsigned long ret = 0;
>> +	unsigned long top = phys + size;
>> +	unsigned long out_top;
>> +
>> +	WARN_ON(size == 0);
> 
> I find it odd to warn on size = 0. After all, free(NULL) is not an
> error. But even then, you continue feeding this to the RMM.

Ok, I'll admit that this is left over debugging - although this is a
condition that shouldn't happen.

Note that the while() condition prevents this from actually getting to
the RMM.

I'll drop the WARN_ON() since it's confusing.

Thanks,
Steve

> You also don't seem to be bothered with that on the delegation side...
> 
>> +
>> +	while (phys < top) {
>> +		ret = rmi_granule_range_undelegate(phys, top, &out_top);
>> +		if (ret == RMI_SUCCESS)
>> +			phys = out_top;
> 
> and size==0 doesn't violate any of the failure conditions listed in
> B4.5.18.2 (beta2). Will you end-up looping around forever?
> 
> Same questions for the delegation, obviously.
> 
> 	M.
> 


^ permalink raw reply

* Re: [PATCH v5 05/20] dma-pool: track decrypted atomic pools and select them via attrs
From: Jason Gunthorpe @ 2026-06-04 14:30 UTC (permalink / raw)
  To: Michael Kelley
  Cc: Aneesh Kumar K.V, iommu@lists.linux.dev,
	linux-arm-kernel@lists.infradead.org,
	linux-kernel@vger.kernel.org, linux-coco@lists.linux.dev,
	Robin Murphy, Marek Szyprowski, Will Deacon, Marc Zyngier,
	Steven Price, Suzuki K Poulose, Catalin Marinas, Jiri Pirko,
	Mostafa Saleh, Petr Tesarik, Alexey Kardashevskiy, Dan Williams,
	Xu Yilun, linuxppc-dev@lists.ozlabs.org,
	linux-s390@vger.kernel.org, Madhavan Srinivasan, Michael Ellerman,
	Nicholas Piggin, Christophe Leroy (CS GROUP), Alexander Gordeev,
	Gerald Schaefer, Heiko Carstens, Vasily Gorbik,
	Christian Borntraeger, Sven Schnelle, x86@kernel.org, Jiri Pirko
In-Reply-To: <SN6PR02MB4157F94C902B78E55E99372DD4102@SN6PR02MB4157.namprd02.prod.outlook.com>

On Thu, Jun 04, 2026 at 02:05:35PM +0000, Michael Kelley wrote:
> From: Jason Gunthorpe <jgg@ziepe.ca> Sent: Tuesday, June 2, 2026 5:55 PM
> > 
> > On Tue, Jun 02, 2026 at 02:24:40PM +0000, Michael Kelley wrote:
> > 
> > > Except that in a normal VM, the "unencrypted" pool attribute does *not*
> > > describe the state of the memory itself.  In a normal VM, the memory is
> > > unencrypted, but the "unencrypted" pool attribute is false. That
> > > contradiction is the essence of my concern.
> > 
> > I would argue no..
> > 
> > When CC is enabled the default state of memory in a Linux environment
> > is "encrypted". You have to take a special action to "decrypt" it.
> > 
> > Thus the default state of memory in a non-CC environment is also
> > paradoxically "encrypted" too. 
> 
> The need to have such an unnatural premise is usually an indication
> of a conceptual problem with the overall model, or perhaps just a
> terminology problem. 

Oh yes I do think the AMD derived terminogy is aweful :(

> Here's a proposal. The new DMA attribute is DMA_ATTR_CC_SHARED.
> Name the pool attribute "cc_shared" instead of "unencrypted". 

Yeah maybe. I sometimes imagine replacing the encrypted/decrypted
names with cc_shared too just to make it sane.

> "cc_shared" set to false in a normal VM doesn't lead to the non-sensical
> situation of claiming that a normal VM is encrypted.

It seems like a good idea to me

Jason

^ permalink raw reply

* RE: [PATCH v5 05/20] dma-pool: track decrypted atomic pools and select them via attrs
From: Michael Kelley @ 2026-06-04 14:05 UTC (permalink / raw)
  To: Jason Gunthorpe, Michael Kelley
  Cc: Aneesh Kumar K.V, iommu@lists.linux.dev,
	linux-arm-kernel@lists.infradead.org,
	linux-kernel@vger.kernel.org, linux-coco@lists.linux.dev,
	Robin Murphy, Marek Szyprowski, Will Deacon, Marc Zyngier,
	Steven Price, Suzuki K Poulose, Catalin Marinas, Jiri Pirko,
	Mostafa Saleh, Petr Tesarik, Alexey Kardashevskiy, Dan Williams,
	Xu Yilun, linuxppc-dev@lists.ozlabs.org,
	linux-s390@vger.kernel.org, Madhavan Srinivasan, Michael Ellerman,
	Nicholas Piggin, Christophe Leroy (CS GROUP), Alexander Gordeev,
	Gerald Schaefer, Heiko Carstens, Vasily Gorbik,
	Christian Borntraeger, Sven Schnelle, x86@kernel.org, Jiri Pirko
In-Reply-To: <20260603005454.GM2487554@ziepe.ca>

From: Jason Gunthorpe <jgg@ziepe.ca> Sent: Tuesday, June 2, 2026 5:55 PM
> 
> On Tue, Jun 02, 2026 at 02:24:40PM +0000, Michael Kelley wrote:
> 
> > Except that in a normal VM, the "unencrypted" pool attribute does *not*
> > describe the state of the memory itself.  In a normal VM, the memory is
> > unencrypted, but the "unencrypted" pool attribute is false. That
> > contradiction is the essence of my concern.
> 
> I would argue no..
> 
> When CC is enabled the default state of memory in a Linux environment
> is "encrypted". You have to take a special action to "decrypt" it.
> 
> Thus the default state of memory in a non-CC environment is also
> paradoxically "encrypted" too. 

The need to have such an unnatural premise is usually an indication
of a conceptual problem with the overall model, or perhaps just a
terminology problem. 

Here's a proposal. The new DMA attribute is DMA_ATTR_CC_SHARED.
Name the pool attribute "cc_shared" instead of "unencrypted". Having
"cc_shared" set to false in a normal VM doesn't lead to the non-sensical
situation of claiming that a normal VM is encrypted. The boolean
"unencrypted" parameter that has been added to various calls also
becomes "cc_shared".  If "CC_SHARED" is a suitable name for the DMA
attribute, it ought to be suitable as the pool attribute. And everything
matches as well.

Michael  


> "decryption" is impossible.
> 
> Therefore the "unencrypted" state is a special state that only memory
> inside a CC VM can have. A normal VM can never have "unencrypted"
> memory at all, so having it be false in the pool is accurate as far as
> the APIs go.
> 
> un-encrypted = true means "the memory in this pool was transformed with
> set_memory_decrypted()" - which is impossible on a normal VM.
> 
> Jason


^ permalink raw reply

* Re: [PATCH v6 3/4] firmware: smccc: arm-cca-guest: Bind the TSM provider to an SMCCC device
From: Sudeep Holla @ 2026-06-04 13:45 UTC (permalink / raw)
  To: Aneesh Kumar K.V
  Cc: linux-coco, linux-arm-kernel, linux-kernel, Catalin Marinas,
	Sudeep Holla, Greg KH, Jeremy Linton, Jonathan Cameron,
	Lorenzo Pieralisi, Mark Rutland, Will Deacon, Steven Price,
	Suzuki K Poulose
In-Reply-To: <yq5ase72qvwb.fsf@kernel.org>

On Thu, Jun 04, 2026 at 06:56:28PM +0530, Aneesh Kumar K.V wrote:
> Sudeep Holla <sudeep.holla@kernel.org> writes:
> 
> ...
> 
> > +static const struct smccc_device_info smccc_devices[] __initconst = {
> > +       {
> > +               .func_id        = ARM_SMCCC_TRNG_VERSION,
> > +               .requires_smc   = false,
> > +               .min_return     = ARM_SMCCC_TRNG_MIN_VERSION,
> > +               .device_name    = "arm-smccc-trng",
> > +       },
> > +};
> > +
> > +static bool __init
> > +smccc_probe_smccc_device(const struct smccc_device_info *smccc_dev)
> > +{
> > +       struct arm_smccc_res res;
> > +       unsigned long ret;
> > +
> > +       if (!IS_ENABLED(CONFIG_ARM64))
> > +               return false;
> > +
> > +       if (smccc_conduit == SMCCC_CONDUIT_NONE)
> > +               return false;
> > +
> > +       if (smccc_dev->requires_smc && smccc_conduit != SMCCC_CONDUIT_SMC)
> > +               return false;
> > +
> > +       arm_smccc_1_1_invoke(smccc_dev->func_id, &res);
> > +       ret = res.a0;
> > +
> > +       if ((s32)ret < 0)
> > +               return false;
> > +
> > +       return ret >= smccc_dev->min_return;
> > +}
> > +
> >
> 
> I am not sure we want the check to be as simple as ret < 0. Some
> function IDs may return input errors based on the supplied arguments
> (for example, RMI_ERROR_INPUT). In those cases, we would likely want
> this to be handled via a callback.
> 

As I mentioned in response to Suzuki, we can defer that to probe of
that device. If *_VERSION, succeeds SMCCC core can add that device and
leave the rest to the core keeping the core and bus layer simple IMO.

> We also want to use conditional compilation for some function IDs.
> Given the callback approach and the #ifdefs, I wonder whether what we
> currently have is actually simpler and more flexible.”
> 

I was trying to avoid conditional compilation altogether and hence the
reason for keeping it as simple as possible. Also IS_ENABLED(CONFIG_ARM64)
in above snippet must come as some condition to this generic probe.

Adding any more logic or callback defeats the bus idea here if we need
to rely/depend on multiple conditional compilation or callbacks IMO.

Let's find see if it can work with what we are adding now and may add in
near future and then decide.

-- 
Regards,
Sudeep

^ permalink raw reply

* Re: [PATCH v6 3/4] firmware: smccc: arm-cca-guest: Bind the TSM provider to an SMCCC device
From: Aneesh Kumar K.V @ 2026-06-04 13:26 UTC (permalink / raw)
  To: Sudeep Holla
  Cc: linux-coco, linux-arm-kernel, linux-kernel, Catalin Marinas,
	Sudeep Holla, Greg KH, Jeremy Linton, Jonathan Cameron,
	Lorenzo Pieralisi, Mark Rutland, Will Deacon, Steven Price,
	Suzuki K Poulose
In-Reply-To: <20260603-determined-bumblebee-of-promise-e633d6@sudeepholla>

Sudeep Holla <sudeep.holla@kernel.org> writes:

...

> +static const struct smccc_device_info smccc_devices[] __initconst = {
> +       {
> +               .func_id        = ARM_SMCCC_TRNG_VERSION,
> +               .requires_smc   = false,
> +               .min_return     = ARM_SMCCC_TRNG_MIN_VERSION,
> +               .device_name    = "arm-smccc-trng",
> +       },
> +};
> +
> +static bool __init
> +smccc_probe_smccc_device(const struct smccc_device_info *smccc_dev)
> +{
> +       struct arm_smccc_res res;
> +       unsigned long ret;
> +
> +       if (!IS_ENABLED(CONFIG_ARM64))
> +               return false;
> +
> +       if (smccc_conduit == SMCCC_CONDUIT_NONE)
> +               return false;
> +
> +       if (smccc_dev->requires_smc && smccc_conduit != SMCCC_CONDUIT_SMC)
> +               return false;
> +
> +       arm_smccc_1_1_invoke(smccc_dev->func_id, &res);
> +       ret = res.a0;
> +
> +       if ((s32)ret < 0)
> +               return false;
> +
> +       return ret >= smccc_dev->min_return;
> +}
> +
>

I am not sure we want the check to be as simple as ret < 0. Some
function IDs may return input errors based on the supplied arguments
(for example, RMI_ERROR_INPUT). In those cases, we would likely want
this to be handled via a callback.

We also want to use conditional compilation for some function IDs.
Given the callback approach and the #ifdefs, I wonder whether what we
currently have is actually simpler and more flexible.”

>  void __init arm_smccc_version_init(u32 version, enum arm_smccc_conduit conduit)
>  {
>         struct arm_smccc_res res;
> @@ -31,7 +68,7 @@ void __init arm_smccc_version_init(u32 version, enum arm_smccc_conduit conduit)
>         smccc_version = version;
>         smccc_conduit = conduit;
>
> -       smccc_trng_available = smccc_probe_trng();
> +       smccc_trng_available = smccc_probe_smccc_device(&smccc_devices[0]);
>
>         if ((smccc_version >= ARM_SMCCC_VERSION_1_2) &&
>             (smccc_conduit != SMCCC_CONDUIT_NONE)) {
> @@ -241,14 +278,20 @@ subsys_initcall(arm_smccc_bus_init);
>
>  static int __init smccc_devices_init(void)
>  {
> -       struct platform_device *pdev;
> -
> -       if (smccc_trng_available) {
> -               pdev = platform_device_register_simple("smccc_trng", -1,
> -                                                      NULL, 0);
> -               if (IS_ERR(pdev))
> -                       pr_err("smccc_trng: could not register device: %ld\n",
> -                              PTR_ERR(pdev));
> +       const struct smccc_device_info *smccc_dev;
> +       struct arm_smccc_device *sdev;
> +       int i;
> +
> +       for (i = 0; i < ARRAY_SIZE(smccc_devices); i++) {
> +               smccc_dev = &smccc_devices[i];
> +
> +               if (!smccc_probe_smccc_device(smccc_dev))
> +                       continue;
> +
> +               sdev = arm_smccc_device_register(smccc_dev->device_name);
> +               if (IS_ERR(sdev))
> +                       pr_err("%s: could not register device: %ld\n",
> +                              smccc_dev->device_name, PTR_ERR(sdev));
>         }
>
>         return 0;
>

-aneesh

^ permalink raw reply

* Re: [PATCH v6 0/4] Switch Arm SMCCC firmware services to an SMCCC bus
From: Aneesh Kumar K.V @ 2026-06-04 12:58 UTC (permalink / raw)
  To: gregkh, linux-coco, linux-arm-kernel, linux-kernel
  Cc: Catalin Marinas, Jeremy Linton, Jonathan Cameron,
	Lorenzo Pieralisi, Mark Rutland, Sudeep Holla, Will Deacon,
	Steven Price, Suzuki K Poulose
In-Reply-To: <20260527100233.428018-1-aneesh.kumar@kernel.org>


Hi Greg,

"Aneesh Kumar K.V (Arm)" <aneesh.kumar@kernel.org> writes:

> As discussed here:
> https://lore.kernel.org/all/20250728135216.48084-12-aneesh.kumar@kernel.org
>
> The earlier CCA guest support used an arm-cca-dev platform device as a pure
> software anchor for the TSM class device. That platform device did not
> correspond to a DT/ACPI described device, MMIO range, interrupt, or other
> platform resource; it existed only to make the CCA guest driver bind and to
> place the resulting TSM device in the driver model. The same pattern also
> exists for smccc_trng. Creating separate platform devices for such
> SMCCC-discovered features is misleading, because those features are not
> independent platform devices.
>
> This series adds an Arm SMCCC bus for services discovered through the SMCCC
> firmware interface. The bus provides SMCCC device and driver registration
> helpers, name-based matching, uevent modalias generation, and a sysfs modalias
> attribute. SMCCC service drivers can use MODULE_DEVICE_TABLE(arm_smccc, ...)
> to emit arm_smccc:<name> aliases, allowing userspace to autoload service
> drivers when the SMCCC core registers matching firmware-service devices.
>
> The series then moves SMCCC TRNG and the Arm CCA guest RSI service off the
> platform bus. When the SMCCC core discovers the corresponding firmware
> service, it registers an arm-smccc device for that service. The hwrng
> arm_smccc_trng driver and the Arm CCA guest TSM provider are converted to
> SMCCC drivers that bind to those discovered devices.
>
> The old arm-cca-dev platform device has also been used by userspace as a Realm
> guest indicator. Removing it without a replacement would leave userspace
> depending on an internal driver-binding device. This series therefore adds
> /sys/firmware/cca/realm_guest as a stable, architecture-provided ABI for
> detecting whether the kernel is running as an Arm CCA Realm guest, and then
> removes the dummy arm-cca-dev platform-device registration.
>


Gentle ping. Based on your feedback in [1], I reworked the series to use
an SMCCC bus, with smccc-trng and arm-cca-dev represented as devices on
that bus. Could you let me know whether this approach addresses your
concerns?

[1] https://lore.kernel.org/all/2026051451-comfort-museum-4d2a@gregkh/

-aneesh

^ permalink raw reply

* Re: [PATCH v6 3/4] firmware: smccc: arm-cca-guest: Bind the TSM provider to an SMCCC device
From: Sudeep Holla @ 2026-06-04 10:55 UTC (permalink / raw)
  To: Suzuki K Poulose
  Cc: Aneesh Kumar K.V (Arm), linux-coco, linux-arm-kernel,
	linux-kernel, Sudeep Holla, Catalin Marinas, Greg KH,
	Jeremy Linton, Jonathan Cameron, Lorenzo Pieralisi, Mark Rutland,
	Will Deacon, Steven Price
In-Reply-To: <bee6166c-f63e-4692-a874-b80b7a6f6dc4@arm.com>

On Thu, Jun 04, 2026 at 11:24:05AM +0100, Suzuki K Poulose wrote:
> On 04/06/2026 10:21, Sudeep Holla wrote:
> > On Wed, May 27, 2026 at 03:32:32PM +0530, Aneesh Kumar K.V (Arm) wrote:
> > > The Arm CCA guest TSM provider currently binds through the arm-cca-dev
> > > platform device. Like arm-smccc-trng, this device is not an independent
> > > platform resource; it is a software representation of the RSI firmware
> > > service discovered through SMCCC.
> > > 
> > > Move RSI discovery into the SMCCC firmware driver. When the SMCCC conduit
> > > is SMC and the RSI ABI version check succeeds, create an arm-rsi-dev SMCCC
> > > device. Convert the Arm CCA guest TSM provider to an SMCCC driver so it
> > > binds to that discovered RSI service and keeps module autoloading through
> > > the SMCCC device id table.
> > > 
> > > Keep the old arm-cca-dev platform-device registration for now. Userspace
> > > has used that device as a Realm-guest indicator, so removing it is left to
> > > a follow-up patch that adds a replacement sysfs ABI.
> > > 
> > > Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
> > > ---
> > >   arch/arm64/include/asm/rsi.h                  |  2 +-
> > >   arch/arm64/kernel/rsi.c                       |  2 +-
> > >   drivers/firmware/smccc/Makefile               |  4 ++
> > >   drivers/firmware/smccc/rmm.c                  | 25 ++++++++
> > >   drivers/firmware/smccc/rmm.h                  | 17 ++++++
> > >   drivers/firmware/smccc/smccc.c                |  8 +++
> > >   drivers/virt/coco/arm-cca-guest/Kconfig       |  1 +
> > >   drivers/virt/coco/arm-cca-guest/Makefile      |  2 +
> > >   .../{arm-cca-guest.c => arm-cca.c}            | 60 +++++++++----------
> > >   9 files changed, 89 insertions(+), 32 deletions(-)
> > >   create mode 100644 drivers/firmware/smccc/rmm.c
> > >   create mode 100644 drivers/firmware/smccc/rmm.h
> > >   rename drivers/virt/coco/arm-cca-guest/{arm-cca-guest.c => arm-cca.c} (85%)
> > > 
> > > diff --git a/arch/arm64/include/asm/rsi.h b/arch/arm64/include/asm/rsi.h
> > > index 88b50d660e85..2d2d363aaaee 100644
> > > --- a/arch/arm64/include/asm/rsi.h
> > > +++ b/arch/arm64/include/asm/rsi.h
> > > @@ -10,7 +10,7 @@
> > >   #include <linux/jump_label.h>
> > >   #include <asm/rsi_cmds.h>
> > > -#define RSI_PDEV_NAME "arm-cca-dev"
> > > +#define RSI_DEV_NAME "arm-rsi-dev"
> > >   DECLARE_STATIC_KEY_FALSE(rsi_present);
> > > diff --git a/arch/arm64/kernel/rsi.c b/arch/arm64/kernel/rsi.c
> > > index 92160f2e57ff..da440f71bb64 100644
> > > --- a/arch/arm64/kernel/rsi.c
> > > +++ b/arch/arm64/kernel/rsi.c
> > > @@ -161,7 +161,7 @@ void __init arm64_rsi_init(void)
> > >   }
> > >   static struct platform_device rsi_dev = {
> > > -	.name = RSI_PDEV_NAME,
> > > +	.name = "arm-cca-dev",
> > >   	.id = PLATFORM_DEVID_NONE
> > >   };
> > > diff --git a/drivers/firmware/smccc/Makefile b/drivers/firmware/smccc/Makefile
> > > index 40d19144a860..33c850aaff4d 100644
> > > --- a/drivers/firmware/smccc/Makefile
> > > +++ b/drivers/firmware/smccc/Makefile
> > > @@ -2,3 +2,7 @@
> > >   #
> > >   obj-$(CONFIG_HAVE_ARM_SMCCC_DISCOVERY)	+= smccc.o kvm_guest.o
> > >   obj-$(CONFIG_ARM_SMCCC_SOC_ID)	+= soc_id.o
> > > +
> > > +ifeq ($(CONFIG_HAVE_ARM_SMCCC_DISCOVERY),y)
> > > +obj-$(CONFIG_ARM64) += rmm.o
> > > +endif
> > > diff --git a/drivers/firmware/smccc/rmm.c b/drivers/firmware/smccc/rmm.c
> > > new file mode 100644
> > > index 000000000000..d572f47e955c
> > > --- /dev/null
> > > +++ b/drivers/firmware/smccc/rmm.c
> > > @@ -0,0 +1,25 @@
> > > +// SPDX-License-Identifier: GPL-2.0-only
> > > +/*
> > > + * Copyright (C) 2026 Arm Limited
> > > + */
> > > +
> > > +#include <linux/arm-smccc-bus.h>
> > > +#include <linux/err.h>
> > > +#include <linux/printk.h>
> > > +
> > > +#include "rmm.h"
> > > +
> > > +void __init register_rsi_device(void)
> > > +{
> > > +	unsigned long ret;
> > > +
> > > +	if (arm_smccc_1_1_get_conduit() != SMCCC_CONDUIT_SMC)
> > > +		return;
> > > +
> > > +	ret = rsi_request_version(RSI_ABI_VERSION, NULL, NULL);
> > > +	if (ret != RSI_SUCCESS)
> > > +		return;
> > > +
> > > +	if (IS_ERR(arm_smccc_device_register(RSI_DEV_NAME)))
> > > +		pr_err("%s: could not register device\n", RSI_DEV_NAME);
> > > +}
> > 
> > OK, I had something else in my mind when I started looking at 1/4. I didn't
> > expect each device added on this bus comes up with it's own way to enumerate
> > it. IMO, it defeats the purpose of building the smccc bus. We may find the
> > specs for each feature deviated a bit but we can have a generic probe
> > IMO, let's try that before exploring per feature probe function.
> 
> I guess this is ideal, but see below.
> 
> > 
> > I have a brief sketch of what I think we should aim for(uncompiled/untested)
> > below. Let me know if that makes sense. I just based it on your bus code.
> > 
> > Regards,
> > Sudeep
> > 
> > -->8
> > 
> > diff --git c/drivers/firmware/smccc/smccc.c w/drivers/firmware/smccc/smccc.c
> > index 695c920a8087..450605ddfab6 100644
> > --- c/drivers/firmware/smccc/smccc.c
> > +++ w/drivers/firmware/smccc/smccc.c
> > @@ -9,21 +9,58 @@
> >   #include <linux/init.h>
> >   #include <linux/arm-smccc.h>
> >   #include <linux/kernel.h>
> > -#include <linux/platform_device.h>
> >   #include <linux/arm-smccc-bus.h>
> >   #include <linux/idr.h>
> >   #include <linux/slab.h>
> > 
> > -#include <asm/archrandom.h>
> > -
> >   static u32 smccc_version = ARM_SMCCC_VERSION_1_0;
> >   static enum arm_smccc_conduit smccc_conduit = SMCCC_CONDUIT_NONE;
> >   static DEFINE_IDA(arm_smccc_bus_id);
> > 
> > -bool __ro_after_init smccc_trng_available = false;
> > +struct smccc_device_info {
> > +       u32 func_id;
> > +       bool requires_smc;

I wanted to ask this but just forgot.

RSI uses SMC because the Realm is calling the RMM, not the host hypervisor.
Using HVC would make RSI look like a hypervisor ABI and would blur the trust
boundary, is that right assumption ?

I assume the conduit derived from PSCI node for the realms will always be
SMC and it shouldn't be a problem. I mean there won't be a case where you
would need HVC as conduit within the realm VM kernel ?

> > +       unsigned long min_return;
> 
> Is this viable for all ? There may be additional restrictions around
> the return values and further SMC calls ?

Fair enough, but do you have examples currently ?

> Which brings us to call backs and we kind of have a variant of that here.
> 

I am fine with callback but need to keep the scope of it as minimum as
possible IMO. For me it's simple if that main FID for that feature is
implemented we create SMCCC device and probe can deal with all the extra
details, why won't that work ? Just trying to understand why the core
SMCCC bus need to know more or must provide a callback.

-- 
Regards,
Sudeep

^ permalink raw reply

* Re: [PATCH v6 3/4] firmware: smccc: arm-cca-guest: Bind the TSM provider to an SMCCC device
From: Suzuki K Poulose @ 2026-06-04 10:24 UTC (permalink / raw)
  To: Sudeep Holla, Aneesh Kumar K.V (Arm)
  Cc: linux-coco, linux-arm-kernel, linux-kernel, Catalin Marinas,
	Greg KH, Jeremy Linton, Jonathan Cameron, Lorenzo Pieralisi,
	Mark Rutland, Will Deacon, Steven Price
In-Reply-To: <20260603-determined-bumblebee-of-promise-e633d6@sudeepholla>

On 04/06/2026 10:21, Sudeep Holla wrote:
> On Wed, May 27, 2026 at 03:32:32PM +0530, Aneesh Kumar K.V (Arm) wrote:
>> The Arm CCA guest TSM provider currently binds through the arm-cca-dev
>> platform device. Like arm-smccc-trng, this device is not an independent
>> platform resource; it is a software representation of the RSI firmware
>> service discovered through SMCCC.
>>
>> Move RSI discovery into the SMCCC firmware driver. When the SMCCC conduit
>> is SMC and the RSI ABI version check succeeds, create an arm-rsi-dev SMCCC
>> device. Convert the Arm CCA guest TSM provider to an SMCCC driver so it
>> binds to that discovered RSI service and keeps module autoloading through
>> the SMCCC device id table.
>>
>> Keep the old arm-cca-dev platform-device registration for now. Userspace
>> has used that device as a Realm-guest indicator, so removing it is left to
>> a follow-up patch that adds a replacement sysfs ABI.
>>
>> Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
>> ---
>>   arch/arm64/include/asm/rsi.h                  |  2 +-
>>   arch/arm64/kernel/rsi.c                       |  2 +-
>>   drivers/firmware/smccc/Makefile               |  4 ++
>>   drivers/firmware/smccc/rmm.c                  | 25 ++++++++
>>   drivers/firmware/smccc/rmm.h                  | 17 ++++++
>>   drivers/firmware/smccc/smccc.c                |  8 +++
>>   drivers/virt/coco/arm-cca-guest/Kconfig       |  1 +
>>   drivers/virt/coco/arm-cca-guest/Makefile      |  2 +
>>   .../{arm-cca-guest.c => arm-cca.c}            | 60 +++++++++----------
>>   9 files changed, 89 insertions(+), 32 deletions(-)
>>   create mode 100644 drivers/firmware/smccc/rmm.c
>>   create mode 100644 drivers/firmware/smccc/rmm.h
>>   rename drivers/virt/coco/arm-cca-guest/{arm-cca-guest.c => arm-cca.c} (85%)
>>
>> diff --git a/arch/arm64/include/asm/rsi.h b/arch/arm64/include/asm/rsi.h
>> index 88b50d660e85..2d2d363aaaee 100644
>> --- a/arch/arm64/include/asm/rsi.h
>> +++ b/arch/arm64/include/asm/rsi.h
>> @@ -10,7 +10,7 @@
>>   #include <linux/jump_label.h>
>>   #include <asm/rsi_cmds.h>
>>   
>> -#define RSI_PDEV_NAME "arm-cca-dev"
>> +#define RSI_DEV_NAME "arm-rsi-dev"
>>   
>>   DECLARE_STATIC_KEY_FALSE(rsi_present);
>>   
>> diff --git a/arch/arm64/kernel/rsi.c b/arch/arm64/kernel/rsi.c
>> index 92160f2e57ff..da440f71bb64 100644
>> --- a/arch/arm64/kernel/rsi.c
>> +++ b/arch/arm64/kernel/rsi.c
>> @@ -161,7 +161,7 @@ void __init arm64_rsi_init(void)
>>   }
>>   
>>   static struct platform_device rsi_dev = {
>> -	.name = RSI_PDEV_NAME,
>> +	.name = "arm-cca-dev",
>>   	.id = PLATFORM_DEVID_NONE
>>   };
>>   
>> diff --git a/drivers/firmware/smccc/Makefile b/drivers/firmware/smccc/Makefile
>> index 40d19144a860..33c850aaff4d 100644
>> --- a/drivers/firmware/smccc/Makefile
>> +++ b/drivers/firmware/smccc/Makefile
>> @@ -2,3 +2,7 @@
>>   #
>>   obj-$(CONFIG_HAVE_ARM_SMCCC_DISCOVERY)	+= smccc.o kvm_guest.o
>>   obj-$(CONFIG_ARM_SMCCC_SOC_ID)	+= soc_id.o
>> +
>> +ifeq ($(CONFIG_HAVE_ARM_SMCCC_DISCOVERY),y)
>> +obj-$(CONFIG_ARM64) += rmm.o
>> +endif
>> diff --git a/drivers/firmware/smccc/rmm.c b/drivers/firmware/smccc/rmm.c
>> new file mode 100644
>> index 000000000000..d572f47e955c
>> --- /dev/null
>> +++ b/drivers/firmware/smccc/rmm.c
>> @@ -0,0 +1,25 @@
>> +// SPDX-License-Identifier: GPL-2.0-only
>> +/*
>> + * Copyright (C) 2026 Arm Limited
>> + */
>> +
>> +#include <linux/arm-smccc-bus.h>
>> +#include <linux/err.h>
>> +#include <linux/printk.h>
>> +
>> +#include "rmm.h"
>> +
>> +void __init register_rsi_device(void)
>> +{
>> +	unsigned long ret;
>> +
>> +	if (arm_smccc_1_1_get_conduit() != SMCCC_CONDUIT_SMC)
>> +		return;
>> +
>> +	ret = rsi_request_version(RSI_ABI_VERSION, NULL, NULL);
>> +	if (ret != RSI_SUCCESS)
>> +		return;
>> +
>> +	if (IS_ERR(arm_smccc_device_register(RSI_DEV_NAME)))
>> +		pr_err("%s: could not register device\n", RSI_DEV_NAME);
>> +}
> 
> OK, I had something else in my mind when I started looking at 1/4. I didn't
> expect each device added on this bus comes up with it's own way to enumerate
> it. IMO, it defeats the purpose of building the smccc bus. We may find the
> specs for each feature deviated a bit but we can have a generic probe
> IMO, let's try that before exploring per feature probe function.

I guess this is ideal, but see below.

> 
> I have a brief sketch of what I think we should aim for(uncompiled/untested)
> below. Let me know if that makes sense. I just based it on your bus code.
> 
> Regards,
> Sudeep
> 
> -->8
> 
> diff --git c/drivers/firmware/smccc/smccc.c w/drivers/firmware/smccc/smccc.c
> index 695c920a8087..450605ddfab6 100644
> --- c/drivers/firmware/smccc/smccc.c
> +++ w/drivers/firmware/smccc/smccc.c
> @@ -9,21 +9,58 @@
>   #include <linux/init.h>
>   #include <linux/arm-smccc.h>
>   #include <linux/kernel.h>
> -#include <linux/platform_device.h>
>   #include <linux/arm-smccc-bus.h>
>   #include <linux/idr.h>
>   #include <linux/slab.h>
> 
> -#include <asm/archrandom.h>
> -
>   static u32 smccc_version = ARM_SMCCC_VERSION_1_0;
>   static enum arm_smccc_conduit smccc_conduit = SMCCC_CONDUIT_NONE;
>   static DEFINE_IDA(arm_smccc_bus_id);
> 
> -bool __ro_after_init smccc_trng_available = false;
> +struct smccc_device_info {
> +       u32 func_id;
> +       bool requires_smc;
> +       unsigned long min_return;

Is this viable for all ? There may be additional restrictions around
the return values and further SMC calls ? Which brings us to call backs
and we kind of have a variant of that here.

Suzuki


> +       const char *device_name;
> +};
> +
> +bool __ro_after_init smccc_trng_available;
>   s32 __ro_after_init smccc_soc_id_version = SMCCC_RET_NOT_SUPPORTED;
>   s32 __ro_after_init smccc_soc_id_revision = SMCCC_RET_NOT_SUPPORTED;
> 
> +static const struct smccc_device_info smccc_devices[] __initconst = {
> +       {
> +               .func_id        = ARM_SMCCC_TRNG_VERSION,
> +               .requires_smc   = false,
> +               .min_return     = ARM_SMCCC_TRNG_MIN_VERSION,
> +               .device_name    = "arm-smccc-trng",
> +       },
> +};
> +
> +static bool __init
> +smccc_probe_smccc_device(const struct smccc_device_info *smccc_dev)
> +{
> +       struct arm_smccc_res res;
> +       unsigned long ret;
> +
> +       if (!IS_ENABLED(CONFIG_ARM64))
> +               return false;
> +
> +       if (smccc_conduit == SMCCC_CONDUIT_NONE)
> +               return false;
> +
> +       if (smccc_dev->requires_smc && smccc_conduit != SMCCC_CONDUIT_SMC)
> +               return false;
> +
> +       arm_smccc_1_1_invoke(smccc_dev->func_id, &res);
> +       ret = res.a0;
> +
> +       if ((s32)ret < 0)
> +               return false;
> +
> +       return ret >= smccc_dev->min_return;
> +}
> +
>   void __init arm_smccc_version_init(u32 version, enum arm_smccc_conduit conduit)
>   {
>          struct arm_smccc_res res;
> @@ -31,7 +68,7 @@ void __init arm_smccc_version_init(u32 version, enum arm_smccc_conduit conduit)
>          smccc_version = version;
>          smccc_conduit = conduit;
> 
> -       smccc_trng_available = smccc_probe_trng();
> +       smccc_trng_available = smccc_probe_smccc_device(&smccc_devices[0]);
> 
>          if ((smccc_version >= ARM_SMCCC_VERSION_1_2) &&
>              (smccc_conduit != SMCCC_CONDUIT_NONE)) {
> @@ -241,14 +278,20 @@ subsys_initcall(arm_smccc_bus_init);
> 
>   static int __init smccc_devices_init(void)
>   {
> -       struct platform_device *pdev;
> -
> -       if (smccc_trng_available) {
> -               pdev = platform_device_register_simple("smccc_trng", -1,
> -                                                      NULL, 0);
> -               if (IS_ERR(pdev))
> -                       pr_err("smccc_trng: could not register device: %ld\n",
> -                              PTR_ERR(pdev));
> +       const struct smccc_device_info *smccc_dev;
> +       struct arm_smccc_device *sdev;
> +       int i;
> +
> +       for (i = 0; i < ARRAY_SIZE(smccc_devices); i++) {
> +               smccc_dev = &smccc_devices[i];
> +
> +               if (!smccc_probe_smccc_device(smccc_dev))
> +                       continue;
> +
> +               sdev = arm_smccc_device_register(smccc_dev->device_name);
> +               if (IS_ERR(sdev))
> +                       pr_err("%s: could not register device: %ld\n",
> +                              smccc_dev->device_name, PTR_ERR(sdev));
>          }
> 
>          return 0;
> 


^ permalink raw reply

* [RFC PATCH 6/6] x86/tdx: Release private memory before private->shared conversion
From: Zhenzhong Duan @ 2026-06-04  9:35 UTC (permalink / raw)
  To: marcandre.lureau, david, kas, rick.p.edgecombe, prsampat,
	pbonzini, mst, peterx, chenyi.qiang, elena.reshetova, michaeluth,
	ackerleytng
  Cc: linux-kernel, linux-coco, virtualization, x86, yilun.xu,
	xiaoyao.li, chao.p.peng
In-Reply-To: <20260604093551.1511079-1-zhenzhong.duan@intel.com>

TDX supports a PAGE.RELEASE feature, when configured, host can only
remove a private page until guest releases it and puts it in a PENDING
state through TDG.MEM.PAGE.RELEASE.

When TDX PAGE.RELEASE is supported, release private memory pages before
converting them to shared state, this ensures pages transition from
accepted to pending state.

The release operation helps handle scenarios where the hypervisor may
retain old private pages during conversion. Without proper release,
subsequent shared->private conversions could encounter re-acceptance
errors when attempting to accept pages that are still in accepted state.

If the release operation fails, abort the conversion to prevent
inconsistent memory state. Note that if tdx_map_gpa() fails after
successful release, we cannot safely rollback because the GPA mapping may
have partially succeeded, creating a mix of shared and private pages that
cannot be reliably tracked or recovered.

Co-developed-by: Xu Yilun <yilun.xu@linux.intel.com>
Signed-off-by: Xu Yilun <yilun.xu@linux.intel.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
---
 arch/x86/coco/tdx/tdx.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index 0abfb3505093..ecee6df92395 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -1121,7 +1121,25 @@ static bool tdx_enc_status_changed(unsigned long vaddr, int numpages, bool enc)
 {
 	phys_addr_t start = __pa(vaddr);
 	phys_addr_t end   = __pa(vaddr + numpages * PAGE_SIZE);
+	bool release_required = !enc && tdx_page_release_supported;

+	/*
+	 * For private->shared conversion, release memory pages first.
+	 * This transitions pages from accepted to pending state to be
+	 * more robust with buggy VMM, e.g., VMM may keep old pages,
+	 * when converting back to private, re-accept error triggers.
+	 */
+	if (release_required && !tdx_release_memory(start, end))
+		return false;
+
+	/*
+	 * Update the GPA mapping state. If this fails, we cannot rollback
+	 * by calling tdx_accept_memory() because tdx_map_gpa() may have
+	 * partially succeeded, creating a mix of shared and private pages.
+	 * Attempting to accept the entire range would fail on pages that
+	 * are still in shared state, and we have no way to determine which
+	 * pages are in which state after partial failure.
+	 */
 	if (!tdx_map_gpa(start, end, enc))
 		return false;

-- 
2.52.0

^ permalink raw reply related

* [RFC PATCH 5/6] x86/tdx: Register memory pre-unplug callback for TDX guests
From: Zhenzhong Duan @ 2026-06-04  9:35 UTC (permalink / raw)
  To: marcandre.lureau, david, kas, rick.p.edgecombe, prsampat,
	pbonzini, mst, peterx, chenyi.qiang, elena.reshetova, michaeluth,
	ackerleytng
  Cc: linux-kernel, linux-coco, virtualization, x86, yilun.xu,
	xiaoyao.li, chao.p.peng
In-Reply-To: <20260604093551.1511079-1-zhenzhong.duan@intel.com>

Add support for releasing memory pages before unplugging in TDX guests.
When memory is about to be unplugged by virtio-mem or other memory
hotplug drivers, the TDX guest should release the memory pages back to the
hypervisor using TDG.MEM.PAGE.RELEASE TDCALL to be more robust for buggy
VMM behavior, e.g., VMM may do nothing for unplug request.

The implementation detects TDG.MEM.PAGE.RELEASE support and optimizes
release operations by trying larger page sizes 1G/2M before falling back
to 4K pages. If release fails, the function re-accepts any released pages
to maintain consistency. Without proper memory release, re-plugging memory
in TDX guests fails when guest accepts those memory because hypervisor can
do no-op to memory unplug request and memory is already in "accepted"
state.

Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
---
 arch/x86/include/asm/shared/tdx.h |   2 +
 arch/x86/coco/tdx/tdx.c           | 135 ++++++++++++++++++++++++++++++
 2 files changed, 137 insertions(+)

diff --git a/arch/x86/include/asm/shared/tdx.h b/arch/x86/include/asm/shared/tdx.h
index 049638e3da74..910ec1e57528 100644
--- a/arch/x86/include/asm/shared/tdx.h
+++ b/arch/x86/include/asm/shared/tdx.h
@@ -19,6 +19,7 @@
 #define TDG_MEM_PAGE_ACCEPT		6
 #define TDG_VM_RD			7
 #define TDG_VM_WR			8
+#define TDG_MEM_PAGE_RELEASE		30
 
 /* TDX TD attributes */
 #define TDX_TD_ATTR_DEBUG_BIT		0
@@ -54,6 +55,7 @@
 
 /* TDCS_CONFIG_FLAGS bits */
 #define TDCS_CONFIG_FLEXIBLE_PENDING_VE	BIT_ULL(1)
+#define TDCS_CONFIG_PAGE_RELEASE	BIT_ULL(6)
 
 /* TDCS_TD_CTLS bits */
 #define TD_CTLS_PENDING_VE_DISABLE_BIT	0
diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index d93ba092d311..0abfb3505093 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -345,6 +345,139 @@ static int tdx_memory_post_plug(u64 addr, u64 size)
 	return -EINVAL;
 }
 
+static bool tdx_page_release_supported;
+
+static void detect_mem_page_release(void)
+{
+	u64 config = 0;
+
+	tdg_vm_rd(TDCS_CONFIG_FLAGS, &config);
+
+	tdx_page_release_supported = !!(config & TDCS_CONFIG_PAGE_RELEASE);
+}
+
+static unsigned long try_release_one(phys_addr_t start, unsigned long len,
+				     enum pg_level pg_level)
+{
+	unsigned long release_size = page_level_size(pg_level);
+	struct tdx_module_args args = {};
+	u8 page_size;
+	u64 ret;
+
+	if (!IS_ALIGNED(start, release_size))
+		return 0;
+
+	if (len < release_size)
+		return 0;
+
+	/*
+	 * Pass the page physical address to TDX module to release the
+	 * private page and to put it in PENDING state.
+	 *
+	 * Bits 2:0 of RCX encode page size: 0 - 4K, 1 - 2M, 2 - 1G.
+	 */
+	switch (pg_level) {
+	case PG_LEVEL_4K:
+		page_size = TDX_PS_4K;
+		break;
+	case PG_LEVEL_2M:
+		page_size = TDX_PS_2M;
+		break;
+	case PG_LEVEL_1G:
+		page_size = TDX_PS_1G;
+		break;
+	default:
+		return 0;
+	}
+
+	args.rcx = start | page_size;
+	ret = __tdcall(TDG_MEM_PAGE_RELEASE, &args);
+	if (ret)
+		return 0;
+
+	return release_size;
+}
+
+static bool _tdx_release_memory(phys_addr_t start, phys_addr_t end, phys_addr_t *cur)
+{
+	*cur = start;
+
+	while (*cur < end) {
+		unsigned long len = end - *cur;
+		unsigned long release_size;
+
+		/*
+		 * Try larger release first. It speeds up process by cutting
+		 * number of hypercalls (if successful).
+		 */
+
+		release_size = try_release_one(*cur, len, PG_LEVEL_1G);
+		if (!release_size)
+			release_size = try_release_one(*cur, len, PG_LEVEL_2M);
+		if (!release_size)
+			release_size = try_release_one(*cur, len, PG_LEVEL_4K);
+		if (!release_size)
+			return false;
+		*cur += release_size;
+	}
+
+	return true;
+}
+
+/*
+ * Release memory pages back to the hypervisor in TDX guests.
+ *
+ * @start: Physical start address of memory range to release
+ * @end:   Physical end address of memory range to release
+ *
+ * Uses TDG.MEM.PAGE.RELEASE TDCALL to transition private pages back to
+ * pending state. If PAGE_RELEASE is not supported by the TDX
+ * configuration, returns true (success) as no action is needed.
+ *
+ * On partial failure, automatically re-accepts any successfully released
+ * pages to restore consistent memory state. Re-acceptance failure is
+ * treated as a fatal error since it indicates severe TDX module issues.
+ *
+ * Returns: true on success, false on failure
+ */
+static bool tdx_release_memory(phys_addr_t start, phys_addr_t end)
+{
+	phys_addr_t released = start;
+	bool ret;
+
+	if (!tdx_page_release_supported)
+		return true;
+
+	ret = _tdx_release_memory(start, end, &released);
+	if (!ret) {
+		pr_err("Failed to release memory [0x%llx, 0x%llx)\n",
+		       (unsigned long long)start, (unsigned long long)end);
+
+		/*
+		 * Re-accept any pages that were successfully released before
+		 * the failure occurred. This should never fail since we're
+		 * just restoring the previous accepted state.
+		 */
+		if (!tdx_accept_memory(start, released))
+			panic("%s Failed to re-accept memory\n", __func__);
+	}
+
+	return ret;
+}
+
+static int tdx_memory_pre_unplug(u64 addr, u64 size)
+{
+	u64 end;
+
+	if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(size))
+		return -EINVAL;
+
+	if (check_add_overflow(addr, size, &end))
+		return -EINVAL;
+
+	return tdx_release_memory(addr, end) ? 0 : -EINVAL;
+}
+
 static void tdx_setup(u64 *cc_mask)
 {
 	struct tdx_module_args args = {};
@@ -380,6 +513,8 @@ static void tdx_setup(u64 *cc_mask)
 	reduce_unnecessary_ve();
 
 	set_memory_post_plug_callback(tdx_memory_post_plug);
+	detect_mem_page_release();
+	set_memory_pre_unplug_callback(tdx_memory_pre_unplug);
 }
 
 /*
-- 
2.52.0


^ permalink raw reply related

* [RFC PATCH 4/6] x86/tdx: Register memory post-plug callback for TDX guests
From: Zhenzhong Duan @ 2026-06-04  9:35 UTC (permalink / raw)
  To: marcandre.lureau, david, kas, rick.p.edgecombe, prsampat,
	pbonzini, mst, peterx, chenyi.qiang, elena.reshetova, michaeluth,
	ackerleytng
  Cc: linux-kernel, linux-coco, virtualization, x86, yilun.xu,
	xiaoyao.li, chao.p.peng
In-Reply-To: <20260604093551.1511079-1-zhenzhong.duan@intel.com>

Register a callback to handle memory acceptance after memory plugging in
TDX guests. When memory is added by virtio-mem or other memory hotplug
drivers, the TDX guest must accept the memory pages using
TDG.MEM.PAGE.ACCEPT TDCALL before they can be safely accessed.

The callback uses the existing tdx_accept_memory() function to accept all
pages in the newly plugged memory range. Without this callback, newly
added memory would remain in "unaccepted" state, and any access to these
pages would trigger VM exits and potentially cause guest crashes. The
callback is registered during TDX setup and remains active for the
lifetime of the guest, ensuring all dynamically added memory is properly
accepted before being made available to the kernel's memory management
subsystem.

Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
---
 arch/x86/coco/tdx/tdx.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index 186915a17c50..d93ba092d311 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -326,6 +326,25 @@ static void reduce_unnecessary_ve(void)
 	enable_cpu_topology_enumeration();
 }

+static int tdx_memory_post_plug(u64 addr, u64 size)
+{
+	u64 end;
+
+	if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(size))
+		return -EINVAL;
+
+	if (check_add_overflow(addr, size, &end))
+		return -EINVAL;
+
+	if (tdx_accept_memory(addr, end))
+		return 0;
+
+	pr_err("Failed to accept memory [0x%llx, 0x%llx)\n",
+	       (unsigned long long)addr, (unsigned long long)end);
+
+	return -EINVAL;
+}
+
 static void tdx_setup(u64 *cc_mask)
 {
 	struct tdx_module_args args = {};
@@ -359,6 +378,8 @@ static void tdx_setup(u64 *cc_mask)
 	disable_sept_ve(td_attr);

 	reduce_unnecessary_ve();
+
+	set_memory_post_plug_callback(tdx_memory_post_plug);
 }

 /*
-- 
2.52.0

^ permalink raw reply related

* [RFC PATCH 3/6] virtio-mem: Integrate memory acceptance and release callbacks
From: Zhenzhong Duan @ 2026-06-04  9:35 UTC (permalink / raw)
  To: marcandre.lureau, david, kas, rick.p.edgecombe, prsampat,
	pbonzini, mst, peterx, chenyi.qiang, elena.reshetova, michaeluth,
	ackerleytng
  Cc: linux-kernel, linux-coco, virtualization, x86, yilun.xu,
	xiaoyao.li, chao.p.peng
In-Reply-To: <20260604093551.1511079-1-zhenzhong.duan@intel.com>

Integrate the memory post-plug and pre-unplug callbacks into virtio-mem's
plug and unplug operations to support TDX memory acceptance and release.

For memory plugging, call the post-plug callback after successfully
requesting memory from the hypervisor to ensure newly added memory is
accepted by TDX guests. If acceptance fails, return -EINVAL to mark the
device as broken rather than attempting rollback, since unplug operations
may also fail and partial acceptance creates difficult-to-recover state.

For memory unplugging, call the pre-unplug callback before requesting
memory removal from the hypervisor to allow TDX guests to release memory
pages. If release fails, return -EINVAL to mark the device as broken.

If the hypervisor unplug request fails after successful memory release,
attempt to re-accept the memory to restore consistent state for retry. If
re-acceptance fails, mark the device as broken to prevent corruption.

The config_changed check is moved to the wrapper functions to ensure
callbacks are not invoked unnecessarily when operations will be retried.

This integration ensures proper memory lifecycle management in
confidential computing environments while maintaining backward
compatibility with non-TDX systems where the callbacks are no-ops.

Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
---
 drivers/virtio/virtio_mem.c | 80 ++++++++++++++++++++++++++++++++-----
 1 file changed, 70 insertions(+), 10 deletions(-)

diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
index 48051e9e98ab..12b8229dab0d 100644
--- a/drivers/virtio/virtio_mem.c
+++ b/drivers/virtio/virtio_mem.c
@@ -1416,8 +1416,8 @@ static uint64_t virtio_mem_send_request(struct virtio_mem *vm,
 	return virtio16_to_cpu(vm->vdev, vm->resp.type);
 }
 
-static int virtio_mem_send_plug_request(struct virtio_mem *vm, uint64_t addr,
-					uint64_t size)
+static int _virtio_mem_send_plug_request(struct virtio_mem *vm, uint64_t addr,
+					 uint64_t size)
 {
 	const uint64_t nb_vm_blocks = size / vm->device_block_size;
 	const struct virtio_mem_req req = {
@@ -1427,9 +1427,6 @@ static int virtio_mem_send_plug_request(struct virtio_mem *vm, uint64_t addr,
 	};
 	int rc = -ENOMEM;
 
-	if (atomic_read(&vm->config_changed))
-		return -EAGAIN;
-
 	dev_dbg(&vm->vdev->dev, "plugging memory: 0x%llx - 0x%llx\n", addr,
 		addr + size - 1);
 
@@ -1454,8 +1451,8 @@ static int virtio_mem_send_plug_request(struct virtio_mem *vm, uint64_t addr,
 	return rc;
 }
 
-static int virtio_mem_send_unplug_request(struct virtio_mem *vm, uint64_t addr,
-					  uint64_t size)
+static int _virtio_mem_send_unplug_request(struct virtio_mem *vm, uint64_t addr,
+					   uint64_t size)
 {
 	const uint64_t nb_vm_blocks = size / vm->device_block_size;
 	const struct virtio_mem_req req = {
@@ -1465,9 +1462,6 @@ static int virtio_mem_send_unplug_request(struct virtio_mem *vm, uint64_t addr,
 	};
 	int rc = -ENOMEM;
 
-	if (atomic_read(&vm->config_changed))
-		return -EAGAIN;
-
 	dev_dbg(&vm->vdev->dev, "unplugging memory: 0x%llx - 0x%llx\n", addr,
 		addr + size - 1);
 
@@ -1489,6 +1483,72 @@ static int virtio_mem_send_unplug_request(struct virtio_mem *vm, uint64_t addr,
 	return rc;
 }
 
+static int virtio_mem_send_plug_request(struct virtio_mem *vm, uint64_t addr,
+					uint64_t size)
+{
+	int ret;
+
+	if (atomic_read(&vm->config_changed))
+		return -EAGAIN;
+
+	ret = _virtio_mem_send_plug_request(vm, addr, size);
+	if (ret)
+		return ret;
+
+	/*
+	 * If memory acceptance fails, we cannot safely rollback to the pre-plug
+	 * state because the unplug operation may also fail (e.g., hypervisor
+	 * out of memory, VM migration in progress). Additionally, acceptance
+	 * failures may be partial, leaving some pages accepted and others not,
+	 * creating inconsistent memory state that is difficult to track and
+	 * recover from.
+	 *
+	 * Rather than attempting complex state recovery that may fail, we treat
+	 * acceptance failure as a critical error and return -EINVAL. This causes
+	 * the caller to set the broken flag and stop processing further requests,
+	 * preventing potential memory corruption or system instability. As a
+	 * consequence, the hypervisor-side memory for the failing range is
+	 * leaked for the lifetime of the device.
+	 */
+	if (memory_post_plug_call(addr, size))
+		return -EINVAL;
+
+	return 0;
+}
+
+static int virtio_mem_send_unplug_request(struct virtio_mem *vm, uint64_t addr,
+					  uint64_t size)
+{
+	int ret;
+
+	if (atomic_read(&vm->config_changed))
+		return -EAGAIN;
+
+	/*
+	 * If memory release fails, treat it as a critical error similar to
+	 * acceptance failure. See virtio_mem_send_plug_request() for detailed
+	 * rationale on why we avoid complex error recovery.
+	 */
+	ret = memory_pre_unplug_call(addr, size);
+	if (ret)
+		return -EINVAL;
+
+	ret = _virtio_mem_send_unplug_request(vm, addr, size);
+	/*
+	 * If the hypervisor unplug request fails (e.g., out of memory, VM
+	 * migration), the operation will be retried later. Since we already
+	 * released the memory from TDX perspective, we must re-accept it to
+	 * restore consistent state for the next retry. If re-acceptance fails,
+	 * treat it as critical error to prevent state corruption. As a
+	 * consequence, the hypervisor-side memory for the failing range is
+	 * leaked for the lifetime of the device.
+	 */
+	if (ret && memory_post_plug_call(addr, size))
+		return -EINVAL;
+
+	return ret;
+}
+
 static int virtio_mem_send_unplug_all_request(struct virtio_mem *vm)
 {
 	const struct virtio_mem_req req = {
-- 
2.52.0


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox