From: Marc Zyngier <maz@kernel.org>
To: Fuad Tabba <tabba@google.com>
Cc: kvmarm@lists.linux.dev, linux-arm-kernel@lists.infradead.org,
will@kernel.org, qperret@google.com, seanjc@google.com,
alexandru.elisei@arm.com, catalin.marinas@arm.com,
philmd@linaro.org, james.morse@arm.com, suzuki.poulose@arm.com,
oliver.upton@linux.dev, mark.rutland@arm.com, broonie@kernel.org,
joey.gouly@arm.com, rananta@google.com, yuzenghui@huawei.com
Subject: Re: [PATCH v2 6/7] KVM: arm64: Eagerly restore host fpsimd/sve state in pKVM
Date: Tue, 21 May 2024 23:52:21 +0100
Message-ID: <86o78yn5ga.wl-maz@kernel.org>
In-Reply-To: <20240521163720.3812851-7-tabba@google.com>

On Tue, 21 May 2024 17:37:19 +0100,
Fuad Tabba <tabba@google.com> wrote:
>
> When running in protected mode we don't want to leak protected
> guest state to the host, including whether a guest has used
> fpsimd/sve. Therefore, eagerly restore the host state on guest
> exit when running in protected mode, which happens only if the
> guest has used fpsimd/sve.
>
> As a future optimisation, we could go back to lazy restoring
> state at the host after exiting non-protected guests.
No. This sort of thing is way too invasive and would require mapping
the VHE host data at EL2. If we need to optimise this crap, then we
apply the same techniques as we use for guests. If that's good enough
for the guests, it's good enough for the host.
>
> Signed-off-by: Fuad Tabba <tabba@google.com>
> ---
> arch/arm64/kvm/hyp/include/hyp/switch.h | 13 +++++-
> arch/arm64/kvm/hyp/nvhe/hyp-main.c | 53 +++++++++++++++++++++++--
> arch/arm64/kvm/hyp/nvhe/pkvm.c | 2 +
> arch/arm64/kvm/hyp/nvhe/switch.c | 16 +++++++-
> 4 files changed, 79 insertions(+), 5 deletions(-)
>
> diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
> index 1897b73e635c..2fa29bfec0b6 100644
> --- a/arch/arm64/kvm/hyp/include/hyp/switch.h
> +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
> @@ -320,6 +320,16 @@ static inline void __hyp_sve_restore_guest(struct kvm_vcpu *vcpu)
> write_sysreg_el1(__vcpu_sys_reg(vcpu, ZCR_EL1), SYS_ZCR);
> }
>
> +static inline void __hyp_sve_save_host(void)
> +{
> + struct user_sve_state *sve_state = *host_data_ptr(sve_state);
> +
> + sve_state->zcr_el1 = read_sysreg_el1(SYS_ZCR);
> + sve_cond_update_zcr_vq(ZCR_ELx_LEN_MASK, SYS_ZCR_EL2);
> + __sve_save_state(sve_state->sve_regs + sve_ffr_offset(kvm_host_sve_max_vl),
> + &sve_state->fpsr);
> +}
> +
> static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu);
>
> /*
> @@ -356,7 +366,8 @@ static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code)
>
> /* First disable enough traps to allow us to update the registers */
> reg = CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN;
> - if (sve_guest)
> + if (sve_guest ||
> + (is_protected_kvm_enabled() && system_supports_sve()))
This looks really ugly. Why can't we just compute sve_guest to take
these conditions into account?
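Something like this, perhaps (completely untested sketch, using only
the predicates already visible in this patch, and assuming nothing
else in the function depends on the narrower meaning of sve_guest):

	/*
	 * Hypothetical: fold the pKVM host-SVE case into sve_guest so
	 * that the ZEN handling stays in a single place.
	 */
	bool sve_guest = vcpu_has_sve(vcpu) ||
			 (is_protected_kvm_enabled() && system_supports_sve());

	reg = CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN;
	if (sve_guest)
		reg |= CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN;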
> reg |= CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN;
> cpacr_clear_set(0, reg);
> isb();
> diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
> index b07d44484f42..f79f0f7b2759 100644
> --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
> +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
> @@ -23,20 +23,66 @@ DEFINE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
>
> void __kvm_hyp_host_forward_smc(struct kvm_cpu_context *host_ctxt);
>
> +static void __hyp_sve_save_guest(struct kvm_vcpu *vcpu)
> +{
> + __vcpu_sys_reg(vcpu, ZCR_EL1) = read_sysreg_el1(SYS_ZCR);
> + sve_cond_update_zcr_vq(vcpu_sve_max_vq(vcpu) - 1, SYS_ZCR_EL2);
> + __sve_save_state(vcpu_sve_pffr(vcpu), &vcpu->arch.ctxt.fp_regs.fpsr);
> + sve_cond_update_zcr_vq(ZCR_ELx_LEN_MASK, SYS_ZCR_EL2);
> +}
> +
> +static void __hyp_sve_restore_host(void)
> +{
> + struct user_sve_state *sve_state = *host_data_ptr(sve_state);
> +
> + sve_cond_update_zcr_vq(ZCR_ELx_LEN_MASK, SYS_ZCR_EL2);
> + __sve_restore_state(sve_state->sve_regs + sve_ffr_offset(kvm_host_sve_max_vl),
This is what I was worried about in a previous patch.
kvm_host_sve_max_vl represents the max VL across all CPUs. If CPU0
supports 128-bit SVE and CPU1 256-bit, the value is 256. But on CPU0,
ZCR_ELx_LEN_MASK will result in 128-bit accesses, and the offset
will be wrong. I can't convince myself that anything really goes wrong
as long as we're consistent between save and restore, but that's at
best ugly and needs extensive documenting.
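To illustrate with concrete numbers (layout details from memory of the
sve_save macros, so take this with a pinch of salt):

	/*
	 * kvm_host_sve_max_vl = 32 bytes (CPU1's 256-bit VL), so the
	 * FFR pointer is always computed for the 256-bit layout:
	 */
	void *ffr = sve_state->sve_regs + sve_ffr_offset(kvm_host_sve_max_vl);

	/*
	 * __sve_save_state() then addresses the Z/P registers at
	 * VL-scaled offsets relative to that pointer, using whatever
	 * VL is live in ZCR_EL2: 16 bytes on CPU0, 32 on CPU1. Save
	 * and restore agree as long as both run at the same VL, but
	 * the buffer layout is no longer a function of the buffer
	 * alone, hence the need for documentation.
	 */
	__sve_save_state(ffr, &sve_state->fpsr);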
On top of that, a conditional update of ZCR_EL2 with ZCR_ELx_LEN_MASK
is unlikely to be beneficial, since nobody implements 2k vectors. A
direct write followed by an ISB is likely to be better.
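i.e. something along the lines of (untested):

	/* Hypothetical: unconditionally set LEN to the maximum and sync. */
	write_sysreg_s(ZCR_ELx_LEN_MASK, SYS_ZCR_EL2);
	isb();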
> + &sve_state->fpsr);
> + write_sysreg_el1(sve_state->zcr_el1, SYS_ZCR);
> +}
> +
> +static void fpsimd_sve_flush(void)
> +{
> + *host_data_ptr(fp_owner) = FP_STATE_HOST_OWNED;
> +}
> +
> +static void fpsimd_sve_sync(struct kvm_vcpu *vcpu)
> +{
> + if (!guest_owns_fp_regs())
> + return;
> +
> + cpacr_clear_set(0, (CPACR_EL1_ZEN_EL1EN | CPACR_EL1_ZEN_EL0EN |
> + CPACR_EL1_FPEN_EL1EN | CPACR_EL1_FPEN_EL0EN));
> + isb();
> +
> + if (vcpu_has_sve(vcpu))
> + __hyp_sve_save_guest(vcpu);
> + else
> + __fpsimd_save_state(&vcpu->arch.ctxt.fp_regs);
> +
> + if (system_supports_sve())
> + __hyp_sve_restore_host();
> + else
> + __fpsimd_restore_state(*host_data_ptr(fpsimd_state));
> +
> + *host_data_ptr(fp_owner) = FP_STATE_HOST_OWNED;
> +}
> +
> static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu)
> {
> struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu;
>
> + fpsimd_sve_flush();
> +
> hyp_vcpu->vcpu.arch.ctxt = host_vcpu->arch.ctxt;
>
> hyp_vcpu->vcpu.arch.sve_state = kern_hyp_va(host_vcpu->arch.sve_state);
> - hyp_vcpu->vcpu.arch.sve_max_vl = host_vcpu->arch.sve_max_vl;
> + hyp_vcpu->vcpu.arch.sve_max_vl = min(host_vcpu->arch.sve_max_vl, kvm_host_sve_max_vl);
>
> hyp_vcpu->vcpu.arch.hw_mmu = host_vcpu->arch.hw_mmu;
>
> hyp_vcpu->vcpu.arch.hcr_el2 = host_vcpu->arch.hcr_el2;
> hyp_vcpu->vcpu.arch.mdcr_el2 = host_vcpu->arch.mdcr_el2;
> - hyp_vcpu->vcpu.arch.cptr_el2 = host_vcpu->arch.cptr_el2;
>
> hyp_vcpu->vcpu.arch.iflags = host_vcpu->arch.iflags;
>
> @@ -54,10 +100,11 @@ static void sync_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu)
> struct vgic_v3_cpu_if *host_cpu_if = &host_vcpu->arch.vgic_cpu.vgic_v3;
> unsigned int i;
>
> + fpsimd_sve_sync(&hyp_vcpu->vcpu);
> +
> host_vcpu->arch.ctxt = hyp_vcpu->vcpu.arch.ctxt;
>
> host_vcpu->arch.hcr_el2 = hyp_vcpu->vcpu.arch.hcr_el2;
> - host_vcpu->arch.cptr_el2 = hyp_vcpu->vcpu.arch.cptr_el2;
>
> host_vcpu->arch.fault = hyp_vcpu->vcpu.arch.fault;
>
> diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c
> index 25e9a94f6d76..feb27b4ce459 100644
> --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c
> +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c
> @@ -588,6 +588,8 @@ int __pkvm_init_vcpu(pkvm_handle_t handle, struct kvm_vcpu *host_vcpu,
> if (ret)
> unmap_donated_memory(hyp_vcpu, sizeof(*hyp_vcpu));
>
> + hyp_vcpu->vcpu.arch.cptr_el2 = kvm_get_reset_cptr_el2(&hyp_vcpu->vcpu);
> +
Eventually, we need to rename this to cpacr_el2 and make sure we
consistently use the VHE format everywhere. I'm starting to worry
that we're mixing the two formats badly.
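The polarity difference alone is asking for trouble. A hypothetical
sketch of the mismatch (bit names from kvm_arm.h and sysreg.h; the
real conversion has more moving parts than this):

	/*
	 * In the nVHE CPTR_EL2 format, a set bit *traps* the access
	 * (CPTR_EL2_TFP, CPTR_EL2_TZ). In the VHE/CPACR format, a set
	 * bit *enables* it. Mixing the two formats in a single field
	 * called cptr_el2 is how we end up confusing ourselves.
	 */
	static u64 cptr_to_cpacr(u64 cptr_el2)
	{
		u64 cpacr = 0;

		if (!(cptr_el2 & CPTR_EL2_TFP))
			cpacr |= CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN;
		if (!(cptr_el2 & CPTR_EL2_TZ))
			cpacr |= CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN;

		return cpacr;
	}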
> return ret;
> }
>
> diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c
> index 019f863922fa..5bf437682b94 100644
> --- a/arch/arm64/kvm/hyp/nvhe/switch.c
> +++ b/arch/arm64/kvm/hyp/nvhe/switch.c
> @@ -184,7 +184,21 @@ static bool kvm_handle_pvm_sys64(struct kvm_vcpu *vcpu, u64 *exit_code)
>
> static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu)
> {
> - __fpsimd_save_state(*host_data_ptr(fpsimd_state));
> + /*
> + * Non-protected kvm relies on the host restoring its sve state.
> + * Protected kvm restores the host's sve state so as not to reveal
> + * that fpsimd was used by a guest, nor to leak the upper sve bits.
> + */
> + if (unlikely(is_protected_kvm_enabled() && system_supports_sve())) {
> + __hyp_sve_save_host();
> +
> + /* Re-enable SVE traps if not supported for the guest vcpu. */
> + if (!vcpu_has_sve(vcpu))
> + cpacr_clear_set(CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN, 0);
> +
nit: spurious blank line
Thanks,
M.
--
Without deviation from the norm, progress is not possible.