Re: [PATCH 2/2] KVM: x86/mmu: Use MMU shrinker to shrink KVM MMU memory caches

All of lore.kernel.org
 help / color / mirror / Atom feed

From: David Matlack <dmatlack@google.com>
To: Vipin Sharma <vipinsh@google.com>
Cc: seanjc@google.com, pbonzini@redhat.com, zhi.wang.linux@gmail.com,
	weijiang.yang@intel.com, mizhang@google.com,
	liangchen.linux@gmail.com, kvm@vger.kernel.org,
	linux-kernel@vger.kernel.org
Subject: Re: [PATCH 2/2] KVM: x86/mmu: Use MMU shrinker to shrink KVM MMU memory caches
Date: Tue, 1 Oct 2024 15:16:58 -0700	[thread overview]
Message-ID: <Zvx02r8XjllG7oI_@google.com> (raw)
In-Reply-To: <20240913214316.1945951-3-vipinsh@google.com>

On 2024-09-13 02:43 PM, Vipin Sharma wrote:
> Use MMU shrinker to iterate through all the vCPUs of all the VMs and
> free pages allocated in MMU memory caches. Protect cache allocation in
> page fault and MMU load path from MMU shrinker by using a per vCPU
> mutex. In MMU shrinker, move the iterated VM to the end of the VMs list
> so that the pain of emptying cache spread among other VMs too.
> 
> The specific caches to empty are mmu_shadow_page_cache and
> mmu_shadowed_info_cache as these caches store whole pages. Emptying them
> will give more impact to shrinker compared to other caches like
> mmu_pte_list_desc_cache{} and mmu_page_header_cache{}
> 
> Holding per vCPU mutex lock ensures that a vCPU doesn't get surprised
> by finding its cache emptied after filling them up for page table
> allocations during page fault handling and MMU load operation. Per vCPU
> mutex also makes sure there is only race between MMU shrinker and all
> other vCPUs. This should result in very less contention.
> 
> Suggested-by: Sean Christopherson <seanjc@google.com>
> Suggested-by: David Matlack <dmatlack@google.com>
> Signed-off-by: Vipin Sharma <vipinsh@google.com>
> ---
>  arch/x86/include/asm/kvm_host.h |  6 +++
>  arch/x86/kvm/mmu/mmu.c          | 69 +++++++++++++++++++++++++++------
>  arch/x86/kvm/mmu/paging_tmpl.h  | 14 ++++---
>  include/linux/kvm_host.h        |  1 +
>  virt/kvm/kvm_main.c             |  8 +++-
>  5 files changed, 81 insertions(+), 17 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index cbfe31bac6cf..63eaf03111eb 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -811,6 +811,12 @@ struct kvm_vcpu_arch {
>  	 */
>  	struct kvm_mmu *walk_mmu;
>  
> +	/*
> +	 * Protect cache from getting emptied in MMU shrinker while vCPU might
> +	 * use cache for fault handling or loading MMU.  As this is a per vCPU
> +	 * lock, only contention might happen when MMU shrinker runs.
> +	 */
> +	struct mutex mmu_memory_cache_lock;
>  	struct kvm_mmu_memory_cache mmu_pte_list_desc_cache;
>  	struct kvm_mmu_memory_cache mmu_shadow_page_cache;
>  	struct kvm_mmu_memory_cache mmu_shadowed_info_cache;
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 213e46b55dda..8e2935347615 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -4524,29 +4524,33 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
>  	if (r != RET_PF_INVALID)
>  		return r;
>  
> +	mutex_lock(&vcpu->arch.mmu_memory_cache_lock);
>  	r = mmu_topup_memory_caches(vcpu, false);
>  	if (r)
> -		return r;
> +		goto out_mmu_memory_cache_unlock;
>  
>  	r = kvm_faultin_pfn(vcpu, fault, ACC_ALL);
>  	if (r != RET_PF_CONTINUE)
> -		return r;
> +		goto out_mmu_memory_cache_unlock;
>  
>  	r = RET_PF_RETRY;
>  	write_lock(&vcpu->kvm->mmu_lock);
>  
>  	if (is_page_fault_stale(vcpu, fault))
> -		goto out_unlock;
> +		goto out_mmu_unlock;
>  
>  	r = make_mmu_pages_available(vcpu);
>  	if (r)
> -		goto out_unlock;
> +		goto out_mmu_unlock;
>  
>  	r = direct_map(vcpu, fault);
>  
> -out_unlock:
> +out_mmu_unlock:
>  	write_unlock(&vcpu->kvm->mmu_lock);
>  	kvm_release_pfn_clean(fault->pfn);
> +out_mmu_memory_cache_unlock:
> +	mutex_unlock(&vcpu->arch.mmu_memory_cache_lock);
> +
>  	return r;
>  }
>  
> @@ -4617,25 +4621,28 @@ static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu,
>  	if (r != RET_PF_INVALID)
>  		return r;
>  
> +	mutex_lock(&vcpu->arch.mmu_memory_cache_lock);
>  	r = mmu_topup_memory_caches(vcpu, false);
>  	if (r)
> -		return r;
> +		goto out_mmu_memory_cache_unlock;
>  
>  	r = kvm_faultin_pfn(vcpu, fault, ACC_ALL);
>  	if (r != RET_PF_CONTINUE)
> -		return r;
> +		goto out_mmu_memory_cache_unlock;
>  
>  	r = RET_PF_RETRY;
>  	read_lock(&vcpu->kvm->mmu_lock);
>  
>  	if (is_page_fault_stale(vcpu, fault))
> -		goto out_unlock;
> +		goto out_mmu_unlock;
>  
>  	r = kvm_tdp_mmu_map(vcpu, fault);
>  
> -out_unlock:
> +out_mmu_unlock:
>  	read_unlock(&vcpu->kvm->mmu_lock);
>  	kvm_release_pfn_clean(fault->pfn);
> +out_mmu_memory_cache_unlock:
> +	mutex_unlock(&vcpu->arch.mmu_memory_cache_lock);
>  	return r;
>  }
>  #endif
> @@ -5691,6 +5698,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
>  {
>  	int r;
>  
> +	mutex_lock(&vcpu->arch.mmu_memory_cache_lock);
>  	r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->root_role.direct);
>  	if (r)
>  		goto out;
> @@ -5717,6 +5725,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
>  	 */
>  	kvm_x86_call(flush_tlb_current)(vcpu);
>  out:
> +	mutex_unlock(&vcpu->arch.mmu_memory_cache_lock);
>  	return r;
>  }
>  
> @@ -6303,6 +6312,7 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
>  	if (!vcpu->arch.mmu_shadow_page_cache.init_value)
>  		vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
>  
> +	mutex_init(&vcpu->arch.mmu_memory_cache_lock);
>  	vcpu->arch.mmu = &vcpu->arch.root_mmu;
>  	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
>  
> @@ -6997,13 +7007,50 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
>  static unsigned long mmu_shrink_scan(struct shrinker *shrink,
>  				     struct shrink_control *sc)
>  {
> -	return SHRINK_STOP;
> +	struct kvm *kvm, *next_kvm, *first_kvm = NULL;
> +	unsigned long i, freed = 0;
> +	struct kvm_vcpu *vcpu;
> +
> +	mutex_lock(&kvm_lock);
> +	list_for_each_entry_safe(kvm, next_kvm, &vm_list, vm_list) {
> +		if (!first_kvm)
> +			first_kvm = kvm;
> +		else if (first_kvm == kvm)
> +			break;
> +
> +		list_move_tail(&kvm->vm_list, &vm_list);
> +
> +		kvm_for_each_vcpu(i, vcpu, kvm) {
> +			if (!mutex_trylock(&vcpu->arch.mmu_memory_cache_lock))
> +				continue;
> +			freed += kvm_mmu_empty_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
> +			freed += kvm_mmu_empty_memory_cache(&vcpu->arch.mmu_shadowed_info_cache);
> +			mutex_unlock(&vcpu->arch.mmu_memory_cache_lock);
> +			if (freed >= sc->nr_to_scan)
> +				goto out;

Looking at the caller in mm/shrinker.c, sc->nr_to_scan will be <= 128
(SHRINK_BATCH), which is only enough for 2 vCPUs. So I think the
shrinker will only ever free 2 vCPU caches of each VM (probably the
first 2 vCPUs) before reordering the list and moving onto the next VM on
the next call.

Does that match the behavior you observe?

> +		}
> +	}
> +out:
> +	mutex_unlock(&kvm_lock);
> +	return freed;
>  }
>  
>  static unsigned long mmu_shrink_count(struct shrinker *shrink,
>  				      struct shrink_control *sc)
>  {
> -	return SHRINK_EMPTY;
> +	unsigned long i, count = 0;
> +	struct kvm_vcpu *vcpu;
> +	struct kvm *kvm;
> +
> +	mutex_lock(&kvm_lock);
> +	list_for_each_entry(kvm, &vm_list, vm_list) {
> +		kvm_for_each_vcpu(i, vcpu, kvm) {
> +			count += READ_ONCE(vcpu->arch.mmu_shadow_page_cache.nobjs);
> +			count += READ_ONCE(vcpu->arch.mmu_shadowed_info_cache.nobjs);
> +		}
> +	}
> +	mutex_unlock(&kvm_lock);
> +	return !count ? SHRINK_EMPTY : count;
>  }
>  
>  static struct shrinker *mmu_shrinker;
> diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
> index 405bd7ceee2a..084a5c532078 100644
> --- a/arch/x86/kvm/mmu/paging_tmpl.h
> +++ b/arch/x86/kvm/mmu/paging_tmpl.h
> @@ -809,13 +809,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
>  		return RET_PF_EMULATE;
>  	}
>  
> +	mutex_lock(&vcpu->arch.mmu_memory_cache_lock);
>  	r = mmu_topup_memory_caches(vcpu, true);
>  	if (r)
> -		return r;
> +		goto out_mmu_memory_cache_unlock;
>  
>  	r = kvm_faultin_pfn(vcpu, fault, walker.pte_access);
>  	if (r != RET_PF_CONTINUE)
> -		return r;
> +		goto out_mmu_memory_cache_unlock;
>  
>  	/*
>  	 * Do not change pte_access if the pfn is a mmio page, otherwise
> @@ -840,16 +841,19 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
>  	write_lock(&vcpu->kvm->mmu_lock);
>  
>  	if (is_page_fault_stale(vcpu, fault))
> -		goto out_unlock;
> +		goto out_mmu_unlock;
>  
>  	r = make_mmu_pages_available(vcpu);
>  	if (r)
> -		goto out_unlock;
> +		goto out_mmu_unlock;
>  	r = FNAME(fetch)(vcpu, fault, &walker);
>  
> -out_unlock:
> +out_mmu_unlock:
>  	write_unlock(&vcpu->kvm->mmu_lock);
>  	kvm_release_pfn_clean(fault->pfn);
> +out_mmu_memory_cache_unlock:
> +	mutex_unlock(&vcpu->arch.mmu_memory_cache_lock);
> +
>  	return r;
>  }
>  
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index b23c6d48392f..288e503f14a0 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -1446,6 +1446,7 @@ void kvm_flush_remote_tlbs_memslot(struct kvm *kvm,
>  int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min);
>  int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min);
>  int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc);
> +int kvm_mmu_empty_memory_cache(struct kvm_mmu_memory_cache *mc);
>  void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc);
>  void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
>  #endif
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index cb2b78e92910..5d89ca218791 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -451,15 +451,21 @@ int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
>  	return mc->nobjs;
>  }
>  
> -void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
> +int kvm_mmu_empty_memory_cache(struct kvm_mmu_memory_cache *mc)
>  {
> +	int freed = mc->nobjs;
>  	while (mc->nobjs) {
>  		if (mc->kmem_cache)
>  			kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]);
>  		else
>  			free_page((unsigned long)mc->objects[--mc->nobjs]);
>  	}
> +	return freed;
> +}
>  
> +void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
> +{
> +	kvm_mmu_empty_memory_cache(mc);
>  	kvfree(mc->objects);
>  
>  	mc->objects = NULL;
> -- 
> 2.46.0.662.g92d0881bb0-goog
>

next prev parent reply	other threads:[~2024-10-01 22:17 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-09-13 21:43 [PATCH 0/2] KVM: x86/mmu: Repurpose MMU shrinker into page cache shrinker Vipin Sharma
2024-09-13 21:43 ` [PATCH 1/2] KVM: x86/mmu: Change KVM mmu shrinker to no-op Vipin Sharma
2024-09-25 23:54   ` David Matlack
2024-09-13 21:43 ` [PATCH 2/2] KVM: x86/mmu: Use MMU shrinker to shrink KVM MMU memory caches Vipin Sharma
2024-10-01 22:16   ` David Matlack [this message]
2024-10-02 16:17     ` Vipin Sharma
2024-09-25 23:51 ` [PATCH 0/2] KVM: x86/mmu: Repurpose MMU shrinker into page cache shrinker David Matlack
2024-09-30 16:42   ` Vipin Sharma
2024-09-30 16:50     ` David Matlack
  -- strict thread matches above, loose matches on Subject: below --
2024-09-15 20:34 [PATCH 2/2] KVM: x86/mmu: Use MMU shrinker to shrink KVM MMU memory caches kernel test robot

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=Zvx02r8XjllG7oI_@google.com \
    --to=dmatlack@google.com \
    --cc=kvm@vger.kernel.org \
    --cc=liangchen.linux@gmail.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mizhang@google.com \
    --cc=pbonzini@redhat.com \
    --cc=seanjc@google.com \
    --cc=vipinsh@google.com \
    --cc=weijiang.yang@intel.com \
    --cc=zhi.wang.linux@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.