From: Marc Zyngier <maz@kernel.org>
To: Vincent Donnefort <vdonnefort@google.com>
Cc: oliver.upton@linux.dev, joey.gouly@arm.com,
	suzuki.poulose@arm.com, yuzenghui@huawei.com,
	catalin.marinas@arm.com, will@kernel.org, qperret@google.com,
	linux-arm-kernel@lists.infradead.org, kvmarm@lists.linux.dev,
	linux-kernel@vger.kernel.org, kernel-team@android.com
Subject: Re: [PATCH v4 10/10] KVM: arm64: np-guest CMOs with PMD_SIZE fixmap
Date: Fri, 16 May 2025 14:55:27 +0100
Message-ID: <864ixkfynk.wl-maz@kernel.org>
In-Reply-To: <20250509131706.2336138-11-vdonnefort@google.com>

On Fri, 09 May 2025 14:17:06 +0100,
Vincent Donnefort <vdonnefort@google.com> wrote:
> 
> With the introduction of stage-2 huge mappings in the pKVM hypervisor,
> CMOs on guest pages may now be needed at PMD_SIZE granularity. The
> fixmap only supports PAGE_SIZE, and iterating over the huge page is
> time consuming (mostly due to the TLBI in hyp_fixmap_unmap()), which is
> a problem for EL2 latency.
> 
> Introduce a shared PMD_SIZE fixmap (hyp_fixblock_map/hyp_fixblock_unmap)
> to improve guest page CMOs when stage-2 huge mappings are installed.
> 
> On a Pixel6, the iterative solution resulted in a latency of ~700us,
> while the PMD_SIZE fixmap reduces it to ~100us.
> 
> Because of the horrendous private range allocation that would be
> necessary, this is disabled on systems using 64KiB pages.
> 
> Suggested-by: Quentin Perret <qperret@google.com>
> Signed-off-by: Vincent Donnefort <vdonnefort@google.com>
> Signed-off-by: Quentin Perret <qperret@google.com>
> 
> diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
> index 1b43bcd2a679..2888b5d03757 100644
> --- a/arch/arm64/include/asm/kvm_pgtable.h
> +++ b/arch/arm64/include/asm/kvm_pgtable.h
> @@ -59,6 +59,11 @@ typedef u64 kvm_pte_t;
>  
>  #define KVM_PHYS_INVALID		(-1ULL)
>  
> +#define KVM_PTE_TYPE			BIT(1)
> +#define KVM_PTE_TYPE_BLOCK		0
> +#define KVM_PTE_TYPE_PAGE		1
> +#define KVM_PTE_TYPE_TABLE		1
> +
>  #define KVM_PTE_LEAF_ATTR_LO		GENMASK(11, 2)
>  
>  #define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX	GENMASK(4, 2)
> diff --git a/arch/arm64/kvm/hyp/include/nvhe/mm.h b/arch/arm64/kvm/hyp/include/nvhe/mm.h
> index 230e4f2527de..b0c72bc2d5ba 100644
> --- a/arch/arm64/kvm/hyp/include/nvhe/mm.h
> +++ b/arch/arm64/kvm/hyp/include/nvhe/mm.h
> @@ -13,9 +13,11 @@
>  extern struct kvm_pgtable pkvm_pgtable;
>  extern hyp_spinlock_t pkvm_pgd_lock;
>  
> -int hyp_create_pcpu_fixmap(void);
> +int hyp_create_fixmap(void);
>  void *hyp_fixmap_map(phys_addr_t phys);
>  void hyp_fixmap_unmap(void);
> +void *hyp_fixblock_map(phys_addr_t phys);
> +void hyp_fixblock_unmap(void);
>  
>  int hyp_create_idmap(u32 hyp_va_bits);
>  int hyp_map_vectors(void);
> diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
> index 97e0fea9db4e..9f3ffa4e0690 100644
> --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
> +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
> @@ -220,16 +220,52 @@ static void guest_s2_put_page(void *addr)
>  	hyp_put_page(&current_vm->pool, addr);
>  }
>  
> +static void *__fixmap_guest_page(void *va, size_t *size)
> +{
> +	if (IS_ALIGNED(*size, PMD_SIZE)) {
> +		void *addr = hyp_fixblock_map(__hyp_pa(va));
> +
> +		if (addr)
> +			return addr;
> +
> +		*size = PAGE_SIZE;
> +	}
> +
> +	if (IS_ALIGNED(*size, PAGE_SIZE))
> +		return hyp_fixmap_map(__hyp_pa(va));
> +
> +	WARN_ON(1);
> +
> +	return NULL;
> +}
> +
> +static void __fixunmap_guest_page(size_t size)
> +{
> +	switch (size) {
> +	case PAGE_SIZE:
> +		hyp_fixmap_unmap();
> +		break;
> +	case PMD_SIZE:
> +		hyp_fixblock_unmap();
> +		break;
> +	default:
> +		WARN_ON(1);
> +	}

This is pretty ugly. How can we end up there in the first place? I'd
rather you make sure we can't reach this default path at all. See also
towards the end of this patch (tl;dr: hyp_fixblock_unmap() should
never explode).
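
Something along these lines (a completely untested sketch) would remove
the unreachable default arm altogether, given that __fixmap_guest_page()
only ever settles on PMD_SIZE or PAGE_SIZE:

	static void __fixunmap_guest_page(size_t size)
	{
		/* __fixmap_guest_page() only ever hands back PMD_SIZE or PAGE_SIZE */
		if (size == PMD_SIZE)
			hyp_fixblock_unmap();
		else
			hyp_fixmap_unmap();
	}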

> +}
> +
>  static void clean_dcache_guest_page(void *va, size_t size)
>  {
>  	WARN_ON(!PAGE_ALIGNED(size));
>  
>  	while (size) {
> -		__clean_dcache_guest_page(hyp_fixmap_map(__hyp_pa(va)),
> -					  PAGE_SIZE);
> -		hyp_fixmap_unmap();
> -		va += PAGE_SIZE;
> -		size -= PAGE_SIZE;
> +		size_t fixmap_size = size == PMD_SIZE ? size : PAGE_SIZE;
> +		void *addr = __fixmap_guest_page(va, &fixmap_size);
> +
> +		__clean_dcache_guest_page(addr, fixmap_size);
> +		__fixunmap_guest_page(fixmap_size);
> +
> +		size -= fixmap_size;
> +		va += fixmap_size;

Can this ever be called with a *multiple* of PMD_SIZE? In this case
you'd still end up doing PAGE_SIZE-sized CMOs until there is only
PMD_SIZE left, ruining the optimisation.

I think this needs fixing.
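
Something like the below (untested, and assuming va is PMD-aligned
whenever a block-sized chunk is picked) would keep the block fixmap in
play for any multiple of PMD_SIZE:

	while (size) {
		size_t fixmap_size = PAGE_SIZE;
		void *addr;

		/* Prefer the block fixmap whenever a whole PMD worth remains */
		if (IS_ALIGNED((unsigned long)va, PMD_SIZE) && size >= PMD_SIZE)
			fixmap_size = PMD_SIZE;

		addr = __fixmap_guest_page(va, &fixmap_size);

		__clean_dcache_guest_page(addr, fixmap_size);
		__fixunmap_guest_page(fixmap_size);

		size -= fixmap_size;
		va += fixmap_size;
	}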

>  	}
>  }
>  
> @@ -238,11 +274,14 @@ static void invalidate_icache_guest_page(void *va, size_t size)
>  	WARN_ON(!PAGE_ALIGNED(size));
>  
>  	while (size) {
> -		__invalidate_icache_guest_page(hyp_fixmap_map(__hyp_pa(va)),
> -					       PAGE_SIZE);
> -		hyp_fixmap_unmap();
> -		va += PAGE_SIZE;
> -		size -= PAGE_SIZE;
> +		size_t fixmap_size = size == PMD_SIZE ? size : PAGE_SIZE;
> +		void *addr = __fixmap_guest_page(va, &fixmap_size);
> +
> +		__invalidate_icache_guest_page(addr, fixmap_size);
> +		__fixunmap_guest_page(fixmap_size);
> +
> +		size -= fixmap_size;
> +		va += fixmap_size;
>  	}
>  }
>  
> diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c
> index f41c7440b34b..e3b1bece8504 100644
> --- a/arch/arm64/kvm/hyp/nvhe/mm.c
> +++ b/arch/arm64/kvm/hyp/nvhe/mm.c
> @@ -229,9 +229,8 @@ int hyp_map_vectors(void)
>  	return 0;
>  }
>  
> -void *hyp_fixmap_map(phys_addr_t phys)
> +static void *fixmap_map_slot(struct hyp_fixmap_slot *slot, phys_addr_t phys)
>  {
> -	struct hyp_fixmap_slot *slot = this_cpu_ptr(&fixmap_slots);
>  	kvm_pte_t pte, *ptep = slot->ptep;
>  
>  	pte = *ptep;
> @@ -243,10 +242,21 @@ void *hyp_fixmap_map(phys_addr_t phys)
>  	return (void *)slot->addr;
>  }
>  
> +void *hyp_fixmap_map(phys_addr_t phys)
> +{
> +	return fixmap_map_slot(this_cpu_ptr(&fixmap_slots), phys);
> +}
> +
>  static void fixmap_clear_slot(struct hyp_fixmap_slot *slot)
>  {
>  	kvm_pte_t *ptep = slot->ptep;
>  	u64 addr = slot->addr;
> +	u32 level;
> +
> +	if (FIELD_GET(KVM_PTE_TYPE, *ptep) == KVM_PTE_TYPE_PAGE)
> +		level = KVM_PGTABLE_LAST_LEVEL;
> +	else
> +		level = KVM_PGTABLE_LAST_LEVEL - 1; /* create_fixblock() guarantees PMD level */

Seeing this, (KVM_PGTABLE_LAST_LEVEL - 1) looks nicer than the "2" I
suggested in one of the previous patches.

>
>  	WRITE_ONCE(*ptep, *ptep & ~KVM_PTE_VALID);
>  
> @@ -260,7 +270,7 @@ static void fixmap_clear_slot(struct hyp_fixmap_slot *slot)
>  	 * https://lore.kernel.org/kvm/20221017115209.2099-1-will@kernel.org/T/#mf10dfbaf1eaef9274c581b81c53758918c1d0f03
>  	 */
>  	dsb(ishst);
> -	__tlbi_level(vale2is, __TLBI_VADDR(addr, 0), KVM_PGTABLE_LAST_LEVEL);
> +	__tlbi_level(vale2is, __TLBI_VADDR(addr, 0), level);
>  	dsb(ish);
>  	isb();
>  }
> @@ -273,9 +283,9 @@ void hyp_fixmap_unmap(void)
>  static int __create_fixmap_slot_cb(const struct kvm_pgtable_visit_ctx *ctx,
>  				   enum kvm_pgtable_walk_flags visit)
>  {
> -	struct hyp_fixmap_slot *slot = per_cpu_ptr(&fixmap_slots, (u64)ctx->arg);
> +	struct hyp_fixmap_slot *slot = (struct hyp_fixmap_slot *)ctx->arg;
>  
> -	if (!kvm_pte_valid(ctx->old) || ctx->level != KVM_PGTABLE_LAST_LEVEL)
> +	if (!kvm_pte_valid(ctx->old) || (ctx->end - ctx->start) != kvm_granule_size(ctx->level))
>  		return -EINVAL;
>  
>  	slot->addr = ctx->addr;
> @@ -296,13 +306,73 @@ static int create_fixmap_slot(u64 addr, u64 cpu)
>  	struct kvm_pgtable_walker walker = {
>  		.cb	= __create_fixmap_slot_cb,
>  		.flags	= KVM_PGTABLE_WALK_LEAF,
> -		.arg = (void *)cpu,
> +		.arg = (void *)per_cpu_ptr(&fixmap_slots, cpu),

Do you really need this cast?
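
per_cpu_ptr() already gives you a pointer, and assuming .arg is still a
plain void *, the conversion is implicit:

	.arg	= per_cpu_ptr(&fixmap_slots, cpu),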

>  	};
>  
>  	return kvm_pgtable_walk(&pkvm_pgtable, addr, PAGE_SIZE, &walker);
>  }
>  
> -int hyp_create_pcpu_fixmap(void)
> +#ifndef CONFIG_ARM64_64K_PAGES

I don't have much faith in this symbol. We have changed the config
stuff so often over the years that I wouldn't trust it long term.

Using something like PAGE_SIZE or PAGE_SHIFT is likely to be more
robust.
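
i.e. something like (untested):

	#if PAGE_SHIFT < 16	/* no PMD_SIZE fixmap with 64KiB pages */

which would survive the next Kconfig reshuffle.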

> +static struct hyp_fixmap_slot hyp_fixblock_slot;
> +static DEFINE_HYP_SPINLOCK(hyp_fixblock_lock);
> +
> +void *hyp_fixblock_map(phys_addr_t phys)
> +{
> +	hyp_spin_lock(&hyp_fixblock_lock);
> +	return fixmap_map_slot(&hyp_fixblock_slot, phys);
> +}
> +
> +void hyp_fixblock_unmap(void)
> +{
> +	fixmap_clear_slot(&hyp_fixblock_slot);
> +	hyp_spin_unlock(&hyp_fixblock_lock);
> +}
> +
> +static int create_fixblock(void)
> +{
> +	struct kvm_pgtable_walker walker = {
> +		.cb	= __create_fixmap_slot_cb,
> +		.flags	= KVM_PGTABLE_WALK_LEAF,
> +		.arg = (void *)&hyp_fixblock_slot,
> +	};
> +	unsigned long addr;
> +	phys_addr_t phys;
> +	int ret, i;
> +
> +	/* Find a RAM phys address, PMD aligned */
> +	for (i = 0; i < hyp_memblock_nr; i++) {
> +		phys = ALIGN(hyp_memory[i].base, PMD_SIZE);
> +		if (phys + PMD_SIZE < (hyp_memory[i].base + hyp_memory[i].size))
> +			break;
> +	}
> +
> +	if (i >= hyp_memblock_nr)
> +		return -EINVAL;
> +
> +	hyp_spin_lock(&pkvm_pgd_lock);
> +	addr = ALIGN(__io_map_base, PMD_SIZE);
> +	ret = __pkvm_alloc_private_va_range(addr, PMD_SIZE);
> +	if (ret)
> +		goto unlock;
> +
> +	ret = kvm_pgtable_hyp_map(&pkvm_pgtable, addr, PMD_SIZE, phys, PAGE_HYP);
> +	if (ret)
> +		goto unlock;
> +
> +	ret = kvm_pgtable_walk(&pkvm_pgtable, addr, PMD_SIZE, &walker);
> +
> +unlock:
> +	hyp_spin_unlock(&pkvm_pgd_lock);
> +
> +	return ret;
> +}
> +#else
> +void hyp_fixblock_unmap(void) { WARN_ON(1); }
> +void *hyp_fixblock_map(phys_addr_t phys) { return NULL; }
> +static int create_fixblock(void) { return 0; }
> +#endif

I can't say I like this. Can't you have a fallback that does the
iteration rather than these placeholders that are only there to make
things catch fire?
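
One (completely untested) way to avoid the stubs would be to keep a
single implementation, have create_fixblock() bail out early on 64KiB
pages so the block slot is never populated, and let a missing slot
trigger the existing PAGE_SIZE fallback in __fixmap_guest_page():

	void *hyp_fixblock_map(phys_addr_t phys)
	{
		/* No block slot (64KiB pages): NULL makes the caller iterate */
		if (!hyp_fixblock_slot.ptep)
			return NULL;

		hyp_spin_lock(&hyp_fixblock_lock);
		return fixmap_map_slot(&hyp_fixblock_slot, phys);
	}

hyp_fixblock_unmap() then becomes unreachable on those systems by
construction, rather than by luck.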

Thanks,

	M.

-- 
Without deviation from the norm, progress is not possible.

Thread overview: 24+ messages
2025-05-09 13:16 [PATCH v4 00/10] Stage-2 huge mappings for pKVM np-guests Vincent Donnefort
2025-05-09 13:16 ` [PATCH v4 01/10] KVM: arm64: Handle huge mappings for np-guest CMOs Vincent Donnefort
2025-05-16 12:15   ` Marc Zyngier
2025-05-16 17:53     ` Vincent Donnefort
2025-05-17  8:51       ` Marc Zyngier
2025-05-09 13:16 ` [PATCH v4 02/10] KVM: arm64: Introduce for_each_hyp_page Vincent Donnefort
2025-05-16 12:57   ` Marc Zyngier
2025-05-19 14:46     ` Quentin Perret
2025-05-09 13:16 ` [PATCH v4 03/10] KVM: arm64: Add a range to __pkvm_host_share_guest() Vincent Donnefort
2025-05-16 13:10   ` Marc Zyngier
2025-05-09 13:17 ` [PATCH v4 04/10] KVM: arm64: Add a range to __pkvm_host_unshare_guest() Vincent Donnefort
2025-05-09 13:17 ` [PATCH v4 05/10] KVM: arm64: Add a range to __pkvm_host_wrprotect_guest() Vincent Donnefort
2025-05-09 13:17 ` [PATCH v4 06/10] KVM: arm64: Add a range to __pkvm_host_test_clear_young_guest() Vincent Donnefort
2025-05-09 13:17 ` [PATCH v4 07/10] KVM: arm64: Convert pkvm_mappings to interval tree Vincent Donnefort
2025-05-16 13:15   ` Marc Zyngier
2025-05-19 14:22     ` Quentin Perret
2025-05-09 13:17 ` [PATCH v4 08/10] KVM: arm64: Add a range to pkvm_mappings Vincent Donnefort
2025-05-09 13:17 ` [PATCH v4 09/10] KVM: arm64: Stage-2 huge mappings for np-guests Vincent Donnefort
2025-05-16 13:28   ` Marc Zyngier
2025-05-19 14:34     ` Quentin Perret
2025-05-09 13:17 ` [PATCH v4 10/10] KVM: arm64: np-guest CMOs with PMD_SIZE fixmap Vincent Donnefort
2025-05-16 13:55   ` Marc Zyngier [this message]
2025-05-16 18:03     ` Vincent Donnefort
2025-05-17  8:53       ` Marc Zyngier
