From: Kai Huang
To: Radim Krčmář
Cc: pbonzini@redhat.com, gleb@kernel.org, linux@arm.linux.org.uk, kvm@vger.kernel.org
Subject: Re: [PATCH 2/6] KVM: MMU: Add mmu help functions to support PML
Date: Thu, 05 Feb 2015 13:59:37 +0800
Message-ID: <54D306C9.2060501@linux.intel.com>
In-Reply-To: <20150203173440.GG19731@potion.redhat.com>

On 02/04/2015 01:34 AM, Radim Krčmář wrote:
> 2015-01-28 10:54+0800, Kai Huang:
>> This patch adds new mmu layer functions to clear/set D-bit for memory slot, and
>> to write protect superpages for memory slot.
>>
>> In case of PML, CPU logs the dirty GPA automatically to PML buffer when CPU
>> updates D-bit from 0 to 1, therefore we don't have to write protect 4K pages,
>> instead, we only need to clear D-bit in order to log that GPA.
>>
>> For superpages, we still write protect it and let page fault code to handle
>> dirty page logging, as we still need to split superpage to 4K pages in PML.
>>
>> As PML is always enabled during guest's lifetime, to eliminate unnecessary PML
>> GPA logging, we set D-bit manually for the slot with dirty logging disabled.
>>
>> Signed-off-by: Kai Huang
> This message contains just several stylistic tips, I wasn't able to
> learn enough about large pages to review today.

Hi Radim,

Thanks for your review, and sorry for the late reply; I was working on
something else these days.

There are two reasons we still write protect large pages with PML.
First, we still need to split large pages into 4K pages in dirty
logging mode, and PML doesn't split large pages automatically. Second,
PML only logs the dirty GPA; it doesn't distinguish whether that GPA
lies in a 4K page or a large page. (A rough sketch of how the new
helpers are meant to combine when dirty logging is toggled for a slot
is at the end of this mail.)

For the rest of your comments: as this patch series is already in
Paolo's queue branch, I think I can send further patches to enhance
it, if Paolo agrees the enhancement is needed.

Thanks,
-Kai

>
>> --- a/arch/x86/include/asm/kvm_host.h
>> +++ b/arch/x86/include/asm/kvm_host.h
>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
>> index b18e65c..c438224 100644
>> --- a/arch/x86/kvm/mmu.c
>> +++ b/arch/x86/kvm/mmu.c
>> @@ -4368,6 +4448,121 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
>> +void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
>> +                                        struct kvm_memory_slot *memslot)
>> +{
>> +        gfn_t last_gfn;
>> +        int i;
>> +        bool flush = false;
>> +
>> +        last_gfn = memslot->base_gfn + memslot->npages - 1;
>> +
>> +        spin_lock(&kvm->mmu_lock);
>> +
>> +        for (i = PT_PAGE_TABLE_LEVEL + 1; /* skip rmap for 4K page */
> ("+ 1" is the only difference from kvm_mmu_slot_remove_write_access();
> new argument, initial level, would be better.
>
> Btw, PT_PAGE_TABLE_LEVEL + 1 == PT_DIRECTORY_LEVEL)
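For illustration, an untested sketch of that suggestion (not part of this
series; slot_remove_write_access() and its start_level argument are names
made up here):

/*
 * Untested sketch: one walker with a start_level argument.  The existing
 * kvm_mmu_slot_remove_write_access() would pass PT_PAGE_TABLE_LEVEL, the
 * large-page-only variant PT_DIRECTORY_LEVEL.
 */
static void slot_remove_write_access(struct kvm *kvm,
                                     struct kvm_memory_slot *memslot,
                                     int start_level)
{
        gfn_t last_gfn = memslot->base_gfn + memslot->npages - 1;
        bool flush = false;
        int i;

        spin_lock(&kvm->mmu_lock);

        for (i = start_level;
             i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
                unsigned long *rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL];
                unsigned long last_index = gfn_to_index(last_gfn,
                                                        memslot->base_gfn, i);
                unsigned long index;

                for (index = 0; index <= last_index; ++index, ++rmapp) {
                        if (*rmapp)
                                flush |= __rmap_write_protect(kvm, rmapp,
                                                              false);

                        if (need_resched() || spin_needbreak(&kvm->mmu_lock))
                                cond_resched_lock(&kvm->mmu_lock);
                }
        }

        spin_unlock(&kvm->mmu_lock);

        lockdep_assert_held(&kvm->slots_lock);

        if (flush)
                kvm_flush_remote_tlbs(kvm);
}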
>
>> +             i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
>> +                unsigned long *rmapp;
>> +                unsigned long last_index, index;
>> +
>> +                rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL];
>> +                last_index = gfn_to_index(last_gfn, memslot->base_gfn, i);
>> +
>> +                for (index = 0; index <= last_index; ++index, ++rmapp) {
>> +                        if (*rmapp)
>> +                                flush |= __rmap_write_protect(kvm, rmapp,
>> +                                                false);
>> +
>> +                        if (need_resched() || spin_needbreak(&kvm->mmu_lock))
>> +                                cond_resched_lock(&kvm->mmu_lock);
>> +                }
>> +        }
>> +        spin_unlock(&kvm->mmu_lock);
>> +
>> +        /* see kvm_mmu_slot_remove_write_access */
>> +        lockdep_assert_held(&kvm->slots_lock);
>> +
>> +        if (flush)
>> +                kvm_flush_remote_tlbs(kvm);
>> +}
>> +void kvm_mmu_slot_set_dirty(struct kvm *kvm,
>> +                            struct kvm_memory_slot *memslot)
>> +{
>> +        gfn_t last_gfn;
>> +        int i;
>> +        bool flush = false;
>> +
>> +        last_gfn = memslot->base_gfn + memslot->npages - 1;
>> +
>> +        spin_lock(&kvm->mmu_lock);
>> +
>> +        for (i = PT_PAGE_TABLE_LEVEL;
>> +             i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
>> +                unsigned long *rmapp;
>> +                unsigned long last_index, index;
>> +
>> +                rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL];
>> +                last_index = gfn_to_index(last_gfn, memslot->base_gfn, i);
>> +
>> +                for (index = 0; index <= last_index; ++index, ++rmapp) {
>> +                        if (*rmapp)
>> +                                flush |= __rmap_set_dirty(kvm, rmapp);
> (And yet another similar walker ... We can either pass a worker function
> accepting 'kvm' and 'rmapp', use a 'switch' with a new operations enum,
> or have code duplication.  Paolo?)
>
>> +
>> +                        if (need_resched() || spin_needbreak(&kvm->mmu_lock))
>> +                                cond_resched_lock(&kvm->mmu_lock);
>> +                }
>> +        }
>> +
>> +        spin_unlock(&kvm->mmu_lock);
>> +
>> +        lockdep_assert_held(&kvm->slots_lock);
>> +
>> +        /* see kvm_mmu_slot_leaf_clear_dirty */
>> +        if (flush)
>> +                kvm_flush_remote_tlbs(kvm);
>> +}
>> +EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
>> +void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
>> +                                   struct kvm_memory_slot *memslot)
>> +{
>> +        gfn_t last_gfn;
>> +        unsigned long *rmapp;
>> +        unsigned long last_index, index;
>> +        bool flush = false;
>> +
>> +        last_gfn = memslot->base_gfn + memslot->npages - 1;
>> +
>> +        spin_lock(&kvm->mmu_lock);
>> +
> (If we abstracted with switch or function, we could also add max level
> argument to cover this function.)
>
>> +        rmapp = memslot->arch.rmap[PT_PAGE_TABLE_LEVEL - 1];
>> +        last_index = gfn_to_index(last_gfn, memslot->base_gfn,
>> +                        PT_PAGE_TABLE_LEVEL);
>> +
>> +        for (index = 0; index <= last_index; ++index, ++rmapp) {
>> +                if (*rmapp)
>> +                        flush |= __rmap_clear_dirty(kvm, rmapp);
>> +
>> +                if (need_resched() || spin_needbreak(&kvm->mmu_lock))
>> +                        cond_resched_lock(&kvm->mmu_lock);
>> +        }
>> +
>> +        spin_unlock(&kvm->mmu_lock);
>> +
>> +        lockdep_assert_held(&kvm->slots_lock);
>> +
>> +        /*
>> +         * It's also safe to flush TLBs out of mmu lock here as currently this
>> +         * function is only used for dirty logging, in which case flushing TLB
>> +         * out of mmu lock also guarantees no dirty pages will be lost in
>> +         * dirty_bitmap.
>> +         */
>> +        if (flush)
>> +                kvm_flush_remote_tlbs(kvm);
>> +}
>> +EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty);
>> @@ -1215,6 +1215,60 @@ static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
> (After looking at following two pairs of functions, you'll hopefully
> understand why I don't like C very much.)
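To illustrate the worker-function option (again only an untested sketch,
not part of this series; slot_rmap_handler and slot_handle_level() are
made-up names), the walker could take both the worker and the level range,
so the new slot functions shrink to thin wrappers:

typedef bool (*slot_rmap_handler)(struct kvm *kvm, unsigned long *rmapp);

/* Walk the slot's rmaps for levels [min_level, max_level], applying fn. */
static bool slot_handle_level(struct kvm *kvm,
                              struct kvm_memory_slot *memslot,
                              slot_rmap_handler fn,
                              int min_level, int max_level)
{
        gfn_t last_gfn = memslot->base_gfn + memslot->npages - 1;
        bool flush = false;
        int i;

        for (i = min_level; i <= max_level; ++i) {
                unsigned long *rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL];
                unsigned long last_index = gfn_to_index(last_gfn,
                                                        memslot->base_gfn, i);
                unsigned long index;

                for (index = 0; index <= last_index; ++index, ++rmapp) {
                        if (*rmapp)
                                flush |= fn(kvm, rmapp);

                        if (need_resched() || spin_needbreak(&kvm->mmu_lock))
                                cond_resched_lock(&kvm->mmu_lock);
                }
        }

        return flush;
}

/* Example wrapper: the D-bit setter becomes lock, walk, unlock, flush. */
void kvm_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *memslot)
{
        bool flush;

        spin_lock(&kvm->mmu_lock);
        flush = slot_handle_level(kvm, memslot, __rmap_set_dirty,
                                  PT_PAGE_TABLE_LEVEL,
                                  PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES - 1);
        spin_unlock(&kvm->mmu_lock);

        lockdep_assert_held(&kvm->slots_lock);

        if (flush)
                kvm_flush_remote_tlbs(kvm);
}

kvm_mmu_slot_leaf_clear_dirty() would pass PT_PAGE_TABLE_LEVEL for both
levels, and the write-protect variants would need a tiny two-argument
wrapper around __rmap_write_protect() because of its extra pt_protect
parameter.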
>
>> +static bool spte_clear_dirty(struct kvm *kvm, u64 *sptep)
>> +{
>> +        u64 spte = *sptep;
>> +
>> +        rmap_printk("rmap_clear_dirty: spte %p %llx\n", sptep, *sptep);
>> +
>> +        spte &= ~shadow_dirty_mask;
>> +
>> +        return mmu_spte_update(sptep, spte);
>> +}
>> +static bool spte_set_dirty(struct kvm *kvm, u64 *sptep)
>> +{
>> +        u64 spte = *sptep;
>> +
>> +        rmap_printk("rmap_set_dirty: spte %p %llx\n", sptep, *sptep);
>> +
>> +        spte |= shadow_dirty_mask;
>> +
>> +        return mmu_spte_update(sptep, spte);
>> +}
>> +static bool __rmap_clear_dirty(struct kvm *kvm, unsigned long *rmapp)
>> +{
>> +        u64 *sptep;
>> +        struct rmap_iterator iter;
>> +        bool flush = false;
>> +
>> +        for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
>> +                BUG_ON(!(*sptep & PT_PRESENT_MASK));
>> +
>> +                flush |= spte_clear_dirty(kvm, sptep);
>> +                sptep = rmap_get_next(&iter);
>> +        }
>> +
>> +        return flush;
>> +}
>> +static bool __rmap_set_dirty(struct kvm *kvm, unsigned long *rmapp)
>> +{
>> +        u64 *sptep;
>> +        struct rmap_iterator iter;
>> +        bool flush = false;
>> +
>> +        for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
>> +                BUG_ON(!(*sptep & PT_PRESENT_MASK));
>> +
>> +                flush |= spte_set_dirty(kvm, sptep);
>> +                sptep = rmap_get_next(&iter);
>> +        }
>> +
>> +        return flush;
>> +}
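As mentioned above, here is a rough, untested sketch (not code from this
series; pml_slot_flags_changed() and log_dirty_pages are made-up names) of
how the new helpers are meant to combine when a slot's dirty-logging flag
changes, following the commit message:

/*
 * Untested sketch, not part of this series: how the new helpers combine
 * when userspace toggles dirty logging for a slot.
 */
static void pml_slot_flags_changed(struct kvm *kvm,
                                   struct kvm_memory_slot *memslot,
                                   bool log_dirty_pages)
{
        if (log_dirty_pages) {
                /*
                 * 4K mappings: clear the D-bit so that the next write takes
                 * the 0 -> 1 D-bit transition and the CPU logs the GPA into
                 * the PML buffer.
                 */
                kvm_mmu_slot_leaf_clear_dirty(kvm, memslot);
                /*
                 * Large pages: write protect them so the write fault path
                 * can split them into 4K pages; PML alone neither splits
                 * them nor says whether a logged GPA was mapped large.
                 */
                kvm_mmu_slot_largepage_remove_write_access(kvm, memslot);
        } else {
                /*
                 * PML stays enabled for the guest's whole lifetime, so set
                 * the D-bit manually for slots without dirty logging to
                 * avoid useless PML entries for them.
                 */
                kvm_mmu_slot_set_dirty(kvm, memslot);
        }
}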