From mboxrd@z Thu Jan 1 00:00:00 1970 From: Avi Kivity Subject: Re: [patch] kvm with mmu notifier v18 Date: Thu, 05 Jun 2008 18:54:30 +0300 Message-ID: <48480C36.6050309@qumranet.com> References: <20080605002626.GA15502@duo.random> Mime-Version: 1.0 Content-Type: text/plain; charset=ISO-8859-1; format=flowed Content-Transfer-Encoding: 7bit Cc: kvm@vger.kernel.org To: Andrea Arcangeli Return-path: Received: from il.qumranet.com ([212.179.150.194]:41863 "EHLO il.qumranet.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1753243AbYFEPyg (ORCPT ); Thu, 5 Jun 2008 11:54:36 -0400 In-Reply-To: <20080605002626.GA15502@duo.random> Sender: kvm-owner@vger.kernel.org List-ID: Andrea Arcangeli wrote: > Hello, > > this is an update of the patch to test kvm on mmu notifier v18. I'll > post the mmu notifier v18 tomorrow after some more review but I can > post the kvm side in the meantime (which works with the previous v17 > as well if anyone wants to test). > > This has a relevant fix for kvm_unmap_rmapp: rmap_remove while > deleting the current spte from the desc array, can overwrite the > deleted current spte with the last spte in the desc array in turn > reordering it. So if we restart rmap_next from the sptes after the > deleted current spte, we may miss the later sptes that have been moved > in the slot of the current spte. We've to tear down the whole desc > array so the fix was to simply pick from the first entry and wait the > others to come down. > > I also wonder if the update_pte done outside the mmu_lock is safe > without mmu notifiers, or if the below changes are required regardless > (I think they are). I cleaned up the fix but I probably need to > extract it from this patch. 
> > + > +static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp) > +{ > + u64 *spte; > + int young = 0; > + > + spte = rmap_next(kvm, rmapp, NULL); > + while (spte) { > + int _young; > + u64 _spte = *spte; > + BUG_ON(!(_spte & PT_PRESENT_MASK)); > + _young = _spte & PT_ACCESSED_MASK; > + if (_young) { > + young = !!_young; > young = 1? > + set_shadow_pte(spte, _spte & ~PT_ACCESSED_MASK); > This can theoretically lose the dirty bit. We don't track it now, so that's okay. > + } > + spte = rmap_next(kvm, rmapp, spte); > + } > + return young; > +} > + > +int kvm_age_hva(struct kvm *kvm, unsigned long hva) > +{ > + int i; > + int young = 0; > + > + /* > + * If mmap_sem isn't taken, we can look the memslots with only > + * the mmu_lock by skipping over the slots with userspace_addr == 0. > + */ > One day we want to sort the slots according to size. We'll need better locking then (rcu, likely). > > > @@ -1235,9 +1340,9 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) > int i; > struct kvm_mmu_page *sp; > > - if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) > - return; > spin_lock(&vcpu->kvm->mmu_lock); > + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) > + goto out; > vcpu-> stuff is protected by the vcpu mutex. 
> @@ -1626,18 +1744,20 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu) > return !!(spte && (*spte & shadow_accessed_mask)); > } > > -static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, > - const u8 *new, int bytes) > +static int mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, > + const u8 *new, int bytes, > + gfn_t *_gfn, pfn_t *_pfn, > + int *_mmu_seq, int *_largepage) > { > gfn_t gfn; > int r; > u64 gpte = 0; > pfn_t pfn; > - > - vcpu->arch.update_pte.largepage = 0; > + int mmu_seq; > + int largepage; > > if (bytes != 4 && bytes != 8) > - return; > + return 0; > > /* > * Assume that the pte write on a page table of the same type > @@ -1650,7 +1770,7 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, > if ((bytes == 4) && (gpa % 4 == 0)) { > r = kvm_read_guest(vcpu->kvm, gpa & ~(u64)7, &gpte, 8); > if (r) > - return; > + return 0; > memcpy((void *)&gpte + (gpa % 8), new, 4); > } else if ((bytes == 8) && (gpa % 8 == 0)) { > memcpy((void *)&gpte, new, 8); > @@ -1660,23 +1780,30 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, > memcpy((void *)&gpte, new, 4); > } > if (!is_present_pte(gpte)) > - return; > + return 0; > gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; > > + largepage = 0; > down_read(¤t->mm->mmap_sem); > if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) { > gfn &= ~(KVM_PAGES_PER_HPAGE-1); > - vcpu->arch.update_pte.largepage = 1; > + largepage = 1; > } > + mmu_seq = atomic_read(&vcpu->kvm->arch.mmu_notifier_seq); > + /* implicit mb(), we'll read before PT lock is unlocked */ > pfn = gfn_to_pfn(vcpu->kvm, gfn); > up_read(¤t->mm->mmap_sem); > > - if (is_error_pfn(pfn)) { > + if (unlikely(is_error_pfn(pfn))) { > kvm_release_pfn_clean(pfn); > - return; > + return 0; > } > - vcpu->arch.update_pte.gfn = gfn; > - vcpu->arch.update_pte.pfn = pfn; > + > + *_gfn = gfn; > + *_pfn = pfn; > + *_mmu_seq = mmu_seq; > + *_largepage = largepage; > 
+ return 1; > } > Alternatively, we can replace this with follow_page() in update_pte(). Probably best to defer it, though. > + /* > + * When ->invalidate_page runs, the linux pte has been zapped > + * already but the page is still allocated until > + * ->invalidate_page returns. So if we increase the sequence > + * here the kvm page fault will notice if the spte can't be > + * established because the page is going to be freed. If > + * instead the kvm page fault establishes the spte before > + * ->invalidate_page runs, kvm_unmap_hva will release it > + * before returning. > This too... Please move the registration to virt/kvm/kvm_main.c, and provide stubs for non-x86. This is definitely something that we want to do cross-arch (except s390 which have it in hardware). -- error compiling committee.c: too many arguments to function