* [patch] kvm with mmu notifier v18
@ 2008-06-05 0:26 Andrea Arcangeli
2008-06-05 15:54 ` Avi Kivity
0 siblings, 1 reply; 10+ messages in thread
From: Andrea Arcangeli @ 2008-06-05 0:26 UTC (permalink / raw)
To: kvm
Hello,
this is an update of the patch to test kvm on mmu notifier v18. I'll
post the mmu notifier v18 tomorrow after some more review but I can
post the kvm side in the meantime (which works with the previous v17
as well if anyone wants to test).
This has a relevant fix for kvm_unmap_rmapp: rmap_remove while
deleting the current spte from the desc array, can overwrite the
deleted current spte with the last spte in the desc array in turn
reordering it. So if we restart rmap_next from the sptes after the
deleted current spte, we may miss the later sptes that have been moved
in the slot of the current spte. We have to tear down the whole desc
array, so the fix was to simply pick from the first entry and wait for
the others to come down.
I also wonder if the update_pte done outside the mmu_lock is safe
without mmu notifiers, or if the below changes are required regardless
(I think they are). I cleaned up the fix but I probably need to
extract it from this patch.
Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 8d45fab..ce3251c 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -21,6 +21,7 @@ config KVM
tristate "Kernel-based Virtual Machine (KVM) support"
depends on HAVE_KVM
select PREEMPT_NOTIFIERS
+ select MMU_NOTIFIER
select ANON_INODES
---help---
Support hosting fully virtualized guest machines using hardware
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 491f645..4dee9e5 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -651,6 +651,98 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
account_shadowed(kvm, gfn);
}
+static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
+{
+ u64 *spte;
+ int need_tlb_flush = 0;
+
+ while ((spte = rmap_next(kvm, rmapp, NULL))) {
+ BUG_ON(!(*spte & PT_PRESENT_MASK));
+ rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
+ rmap_remove(kvm, spte);
+ set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+ need_tlb_flush = 1;
+ }
+ return need_tlb_flush;
+}
+
+int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
+{
+ int i;
+ int need_tlb_flush = 0;
+
+ /*
+ * If mmap_sem isn't taken, we can look the memslots with only
+ * the mmu_lock by skipping over the slots with userspace_addr == 0.
+ */
+ for (i = 0; i < kvm->nmemslots; i++) {
+ struct kvm_memory_slot *memslot = &kvm->memslots[i];
+ unsigned long start = memslot->userspace_addr;
+ unsigned long end;
+
+ /* mmu_lock protects userspace_addr */
+ if (!start)
+ continue;
+
+ end = start + (memslot->npages << PAGE_SHIFT);
+ if (hva >= start && hva < end) {
+ gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
+ need_tlb_flush |= kvm_unmap_rmapp(kvm,
+ &memslot->rmap[gfn_offset]);
+ }
+ }
+
+ return need_tlb_flush;
+}
+
+static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
+{
+ u64 *spte;
+ int young = 0;
+
+ spte = rmap_next(kvm, rmapp, NULL);
+ while (spte) {
+ int _young;
+ u64 _spte = *spte;
+ BUG_ON(!(_spte & PT_PRESENT_MASK));
+ _young = _spte & PT_ACCESSED_MASK;
+ if (_young) {
+ young = !!_young;
+ set_shadow_pte(spte, _spte & ~PT_ACCESSED_MASK);
+ }
+ spte = rmap_next(kvm, rmapp, spte);
+ }
+ return young;
+}
+
+int kvm_age_hva(struct kvm *kvm, unsigned long hva)
+{
+ int i;
+ int young = 0;
+
+ /*
+ * If mmap_sem isn't taken, we can look the memslots with only
+ * the mmu_lock by skipping over the slots with userspace_addr == 0.
+ */
+ for (i = 0; i < kvm->nmemslots; i++) {
+ struct kvm_memory_slot *memslot = &kvm->memslots[i];
+ unsigned long start = memslot->userspace_addr;
+ unsigned long end;
+
+ /* mmu_lock protects userspace_addr */
+ if (!start)
+ continue;
+
+ end = start + (memslot->npages << PAGE_SHIFT);
+ if (hva >= start && hva < end) {
+ gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
+ young |= kvm_age_rmapp(kvm, &memslot->rmap[gfn_offset]);
+ }
+ }
+
+ return young;
+}
+
#ifdef MMU_DEBUG
static int is_empty_shadow_page(u64 *spt)
{
@@ -1203,6 +1295,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
int r;
int largepage = 0;
pfn_t pfn;
+ int mmu_seq;
down_read(¤t->mm->mmap_sem);
if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
@@ -1210,6 +1303,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
largepage = 1;
}
+ mmu_seq = atomic_read(&vcpu->kvm->arch.mmu_notifier_seq);
+ /* implicit mb(), we'll read before PT lock is unlocked */
pfn = gfn_to_pfn(vcpu->kvm, gfn);
up_read(¤t->mm->mmap_sem);
@@ -1220,6 +1315,11 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
}
spin_lock(&vcpu->kvm->mmu_lock);
+ if (unlikely(atomic_read(&vcpu->kvm->arch.mmu_notifier_count)))
+ goto out_unlock;
+ smp_rmb();
+ if (unlikely(atomic_read(&vcpu->kvm->arch.mmu_notifier_seq) != mmu_seq))
+ goto out_unlock;
kvm_mmu_free_some_pages(vcpu);
r = __direct_map(vcpu, v, write, largepage, gfn, pfn,
PT32E_ROOT_LEVEL);
@@ -1227,6 +1327,11 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
return r;
+
+out_unlock:
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ kvm_release_pfn_clean(pfn);
+ return 0;
}
@@ -1235,9 +1340,9 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
int i;
struct kvm_mmu_page *sp;
- if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
- return;
spin_lock(&vcpu->kvm->mmu_lock);
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ goto out;
if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
hpa_t root = vcpu->arch.mmu.root_hpa;
@@ -1245,9 +1350,7 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
--sp->root_count;
if (!sp->root_count && sp->role.invalid)
kvm_mmu_zap_page(vcpu->kvm, sp);
- vcpu->arch.mmu.root_hpa = INVALID_PAGE;
- spin_unlock(&vcpu->kvm->mmu_lock);
- return;
+ goto out_invalid;
}
for (i = 0; i < 4; ++i) {
hpa_t root = vcpu->arch.mmu.pae_root[i];
@@ -1261,8 +1364,10 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
}
vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
}
- spin_unlock(&vcpu->kvm->mmu_lock);
+out_invalid:
vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+out:
+ spin_unlock(&vcpu->kvm->mmu_lock);
}
static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
@@ -1345,6 +1450,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
int r;
int largepage = 0;
gfn_t gfn = gpa >> PAGE_SHIFT;
+ int mmu_seq;
ASSERT(vcpu);
ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
@@ -1358,6 +1464,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
gfn &= ~(KVM_PAGES_PER_HPAGE-1);
largepage = 1;
}
+ mmu_seq = atomic_read(&vcpu->kvm->arch.mmu_notifier_seq);
+ /* implicit mb(), we'll read before PT lock is unlocked */
pfn = gfn_to_pfn(vcpu->kvm, gfn);
up_read(¤t->mm->mmap_sem);
if (is_error_pfn(pfn)) {
@@ -1365,12 +1473,22 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
return 1;
}
spin_lock(&vcpu->kvm->mmu_lock);
+ if (unlikely(atomic_read(&vcpu->kvm->arch.mmu_notifier_count)))
+ goto out_unlock;
+ smp_rmb();
+ if (unlikely(atomic_read(&vcpu->kvm->arch.mmu_notifier_seq) != mmu_seq))
+ goto out_unlock;
kvm_mmu_free_some_pages(vcpu);
r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
largepage, gfn, pfn, kvm_x86_ops->get_tdp_level());
spin_unlock(&vcpu->kvm->mmu_lock);
return r;
+
+out_unlock:
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ kvm_release_pfn_clean(pfn);
+ return 0;
}
static void nonpaging_free(struct kvm_vcpu *vcpu)
@@ -1626,18 +1744,20 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
return !!(spte && (*spte & shadow_accessed_mask));
}
-static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
- const u8 *new, int bytes)
+static int mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
+ const u8 *new, int bytes,
+ gfn_t *_gfn, pfn_t *_pfn,
+ int *_mmu_seq, int *_largepage)
{
gfn_t gfn;
int r;
u64 gpte = 0;
pfn_t pfn;
-
- vcpu->arch.update_pte.largepage = 0;
+ int mmu_seq;
+ int largepage;
if (bytes != 4 && bytes != 8)
- return;
+ return 0;
/*
* Assume that the pte write on a page table of the same type
@@ -1650,7 +1770,7 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
if ((bytes == 4) && (gpa % 4 == 0)) {
r = kvm_read_guest(vcpu->kvm, gpa & ~(u64)7, &gpte, 8);
if (r)
- return;
+ return 0;
memcpy((void *)&gpte + (gpa % 8), new, 4);
} else if ((bytes == 8) && (gpa % 8 == 0)) {
memcpy((void *)&gpte, new, 8);
@@ -1660,23 +1780,30 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
memcpy((void *)&gpte, new, 4);
}
if (!is_present_pte(gpte))
- return;
+ return 0;
gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
+ largepage = 0;
down_read(¤t->mm->mmap_sem);
if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) {
gfn &= ~(KVM_PAGES_PER_HPAGE-1);
- vcpu->arch.update_pte.largepage = 1;
+ largepage = 1;
}
+ mmu_seq = atomic_read(&vcpu->kvm->arch.mmu_notifier_seq);
+ /* implicit mb(), we'll read before PT lock is unlocked */
pfn = gfn_to_pfn(vcpu->kvm, gfn);
up_read(¤t->mm->mmap_sem);
- if (is_error_pfn(pfn)) {
+ if (unlikely(is_error_pfn(pfn))) {
kvm_release_pfn_clean(pfn);
- return;
+ return 0;
}
- vcpu->arch.update_pte.gfn = gfn;
- vcpu->arch.update_pte.pfn = pfn;
+
+ *_gfn = gfn;
+ *_pfn = pfn;
+ *_mmu_seq = mmu_seq;
+ *_largepage = largepage;
+ return 1;
}
static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn)
@@ -1711,9 +1838,24 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
int npte;
int r;
+ int update_pte;
+ gfn_t gpte_gfn;
+ pfn_t pfn;
+ int mmu_seq;
+ int largepage;
+
pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
- mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes);
+ update_pte = mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes,
+ &gpte_gfn, &pfn,
+ &mmu_seq, &largepage);
spin_lock(&vcpu->kvm->mmu_lock);
+ if (update_pte) {
+ BUG_ON(!is_error_pfn(vcpu->arch.update_pte.pfn));
+ vcpu->arch.update_pte.gfn = gpte_gfn;
+ vcpu->arch.update_pte.pfn = pfn;
+ vcpu->arch.update_pte.mmu_seq = mmu_seq;
+ vcpu->arch.update_pte.largepage = largepage;
+ }
kvm_mmu_access_page(vcpu, gfn);
kvm_mmu_free_some_pages(vcpu);
++vcpu->kvm->stat.mmu_pte_write;
@@ -1793,11 +1935,11 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
}
}
kvm_mmu_audit(vcpu, "post pte write");
- spin_unlock(&vcpu->kvm->mmu_lock);
if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
kvm_release_pfn_clean(vcpu->arch.update_pte.pfn);
vcpu->arch.update_pte.pfn = bad_pfn;
}
+ spin_unlock(&vcpu->kvm->mmu_lock);
}
int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 4d91822..eb87479 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -263,6 +263,12 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
pfn = vcpu->arch.update_pte.pfn;
if (is_error_pfn(pfn))
return;
+ if (unlikely(atomic_read(&vcpu->kvm->arch.mmu_notifier_count)))
+ return;
+ smp_rmb();
+ if (unlikely(atomic_read(&vcpu->kvm->arch.mmu_notifier_seq) !=
+ vcpu->arch.update_pte.mmu_seq))
+ return;
kvm_get_pfn(pfn);
mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte),
@@ -380,6 +386,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
int r;
pfn_t pfn;
int largepage = 0;
+ int mmu_seq;
pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
kvm_mmu_audit(vcpu, "pre page fault");
@@ -413,6 +420,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
largepage = 1;
}
}
+ mmu_seq = atomic_read(&vcpu->kvm->arch.mmu_notifier_seq);
+ /* implicit mb(), we'll read before PT lock is unlocked */
pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
up_read(¤t->mm->mmap_sem);
@@ -424,6 +433,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
}
spin_lock(&vcpu->kvm->mmu_lock);
+ if (unlikely(atomic_read(&vcpu->kvm->arch.mmu_notifier_count)))
+ goto out_unlock;
+ smp_rmb();
+ if (unlikely(atomic_read(&vcpu->kvm->arch.mmu_notifier_seq) != mmu_seq))
+ goto out_unlock;
kvm_mmu_free_some_pages(vcpu);
shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
largepage, &write_pt, pfn);
@@ -439,6 +453,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
spin_unlock(&vcpu->kvm->mmu_lock);
return write_pt;
+
+out_unlock:
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ kvm_release_pfn_clean(pfn);
+ return 0;
}
static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 77fb2bd..28019d3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -27,6 +27,7 @@
#include <linux/module.h>
#include <linux/mman.h>
#include <linux/highmem.h>
+#include <linux/mmu_notifier.h>
#include <asm/uaccess.h>
#include <asm/msr.h>
@@ -3903,16 +3904,127 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
free_page((unsigned long)vcpu->arch.pio_data);
}
-struct kvm *kvm_arch_create_vm(void)
+static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
{
- struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
+ struct kvm_arch *kvm_arch;
+ kvm_arch = container_of(mn, struct kvm_arch, mmu_notifier);
+ return container_of(kvm_arch, struct kvm, arch);
+}
- if (!kvm)
- return ERR_PTR(-ENOMEM);
+static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address)
+{
+ struct kvm *kvm = mmu_notifier_to_kvm(mn);
+ int need_tlb_flush;
- INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+ /*
+ * When ->invalidate_page runs, the linux pte has been zapped
+ * already but the page is still allocated until
+ * ->invalidate_page returns. So if we increase the sequence
+ * here the kvm page fault will notice if the spte can't be
+ * established because the page is going to be freed. If
+ * instead the kvm page fault establishes the spte before
+ * ->invalidate_page runs, kvm_unmap_hva will release it
+ * before returning.
+
+ * No need of memory barriers as the sequence increase only
+ * need to be seen at spin_unlock time, and not at spin_lock
+ * time.
+ *
+ * Increasing the sequence after the spin_unlock would be
+ * unsafe because the kvm page fault could then establish the
+ * pte after kvm_unmap_hva returned, without noticing the page
+ * is going to be freed.
+ */
+ atomic_inc(&kvm->arch.mmu_notifier_seq);
+ spin_lock(&kvm->mmu_lock);
+ need_tlb_flush = kvm_unmap_hva(kvm, address);
+ spin_unlock(&kvm->mmu_lock);
- return kvm;
+ /* we've to flush the tlb before the pages can be freed */
+ if (need_tlb_flush)
+ kvm_flush_remote_tlbs(kvm);
+
+}
+
+static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end)
+{
+ struct kvm *kvm = mmu_notifier_to_kvm(mn);
+ int need_tlb_flush = 0;
+
+ /*
+ * The count increase must become visible at unlock time as no
+ * spte can be established without taking the mmu_lock and
+ * count is also read inside the mmu_lock critical section.
+ */
+ atomic_inc(&kvm->arch.mmu_notifier_count);
+
+ spin_lock(&kvm->mmu_lock);
+ for (; start < end; start += PAGE_SIZE)
+ need_tlb_flush |= kvm_unmap_hva(kvm, start);
+ spin_unlock(&kvm->mmu_lock);
+
+ /* we've to flush the tlb before the pages can be freed */
+ if (need_tlb_flush)
+ kvm_flush_remote_tlbs(kvm);
+}
+
+static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end)
+{
+ struct kvm *kvm = mmu_notifier_to_kvm(mn);
+ /*
+ *
+ * This sequence increase will notify the kvm page fault that
+ * the page that is going to be mapped in the spte could have
+ * been freed.
+ *
+ * There's also an implicit mb() here in this comment,
+ * provided by the last PT lock taken to zap pagetables, and
+ * that the read side has to take too in follow_page(). The
+ * sequence increase in the worst case will become visible to
+ * the kvm page fault after the spin_lock of the last PT lock
+ * of the last PT-lock-protected critical section preceeding
+ * invalidate_range_end. So if the kvm page fault is about to
+ * establish the spte inside the mmu_lock, while we're freeing
+ * the pages, it will have to backoff and when it retries, it
+ * will have to take the PT lock before it can check the
+ * pagetables again. And after taking the PT lock it will
+ * re-establish the pte even if it will see the already
+ * increased sequence number before calling gfn_to_pfn.
+ */
+ atomic_inc(&kvm->arch.mmu_notifier_seq);
+ /*
+ * The sequence increase must be visible before count
+ * decrease. The page fault has to read count before sequence
+ * for this write order to be effective.
+ */
+ wmb();
+ atomic_dec(&kvm->arch.mmu_notifier_count);
+ BUG_ON(atomic_read(&kvm->arch.mmu_notifier_count) < 0);
+}
+
+static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address)
+{
+ struct kvm *kvm = mmu_notifier_to_kvm(mn);
+ int young;
+
+ spin_lock(&kvm->mmu_lock);
+ young = kvm_age_hva(kvm, address);
+ spin_unlock(&kvm->mmu_lock);
+
+ if (young)
+ kvm_flush_remote_tlbs(kvm);
+
+ return young;
}
static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
@@ -3922,16 +4034,58 @@ static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
vcpu_put(vcpu);
}
-static void kvm_free_vcpus(struct kvm *kvm)
+static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
+ struct mm_struct *mm)
{
- unsigned int i;
+ struct kvm *kvm = mmu_notifier_to_kvm(mn);
+ int i;
+
+ BUG_ON(mm != kvm->mm);
/*
- * Unpin any mmu pages first.
+ * All pages mapped by the sptes will be freed after this
+ * function returns, so it's good idea to invalidate the spte
+ * mappings before the pages are freed.
*/
+ mutex_lock(&kvm->lock); /* make vcpus array stable, we can sleep */
for (i = 0; i < KVM_MAX_VCPUS; ++i)
if (kvm->vcpus[i])
kvm_unload_vcpu_mmu(kvm->vcpus[i]);
+ mutex_unlock(&kvm->lock);
+}
+
+static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
+ .release = kvm_mmu_notifier_release,
+ .invalidate_page = kvm_mmu_notifier_invalidate_page,
+ .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
+ .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
+ .clear_flush_young = kvm_mmu_notifier_clear_flush_young,
+};
+
+struct kvm *kvm_arch_create_vm(void)
+{
+ struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
+ int err;
+
+ if (!kvm)
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+
+ kvm->arch.mmu_notifier.ops = &kvm_mmu_notifier_ops;
+ err = mmu_notifier_register(&kvm->arch.mmu_notifier, current->mm);
+ if (err) {
+ kfree(kvm);
+ return ERR_PTR(err);
+ }
+
+ return kvm;
+}
+
+static void kvm_free_vcpus(struct kvm *kvm)
+{
+ unsigned int i;
+
for (i = 0; i < KVM_MAX_VCPUS; ++i) {
if (kvm->vcpus[i]) {
kvm_arch_vcpu_free(kvm->vcpus[i]);
@@ -3946,6 +4100,12 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
kvm_free_pit(kvm);
kfree(kvm->arch.vpic);
kfree(kvm->arch.vioapic);
+ /*
+ * kvm_mmu_notifier_release() will be called before
+ * mmu_notifier_unregister returns, if it didn't run
+ * already.
+ */
+ mmu_notifier_unregister(&kvm->arch.mmu_notifier, kvm->mm);
kvm_free_vcpus(kvm);
kvm_free_physmem(kvm);
if (kvm->arch.apic_access_page)
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index cd50380..8cd420c 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -13,6 +13,7 @@
#include <linux/types.h>
#include <linux/mm.h>
+#include <linux/mmu_notifier.h>
#include <linux/kvm.h>
#include <linux/kvm_para.h>
@@ -249,6 +250,7 @@ struct kvm_vcpu_arch {
gfn_t gfn; /* presumed gfn during guest pte update */
pfn_t pfn; /* pfn corresponding to that gfn */
int largepage;
+ int mmu_seq;
} update_pte;
struct i387_fxsave_struct host_fx_image;
@@ -324,6 +326,10 @@ struct kvm_arch{
struct page *ept_identity_pagetable;
bool ept_identity_pagetable_done;
+
+ struct mmu_notifier mmu_notifier;
+ atomic_t mmu_notifier_seq;
+ atomic_t mmu_notifier_count;
};
struct kvm_vm_stat {
@@ -448,6 +454,8 @@ void kvm_mmu_set_base_ptes(u64 base_pte);
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
u64 dirty_mask, u64 nx_mask, u64 x_mask);
+int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
+int kvm_age_hva(struct kvm *kvm, unsigned long hva);
int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
void kvm_mmu_zap_all(struct kvm *kvm);
^ permalink raw reply related [flat|nested] 10+ messages in thread
* Re: [patch] kvm with mmu notifier v18
2008-06-05 0:26 [patch] kvm with mmu notifier v18 Andrea Arcangeli
@ 2008-06-05 15:54 ` Avi Kivity
2008-06-05 16:47 ` Andrea Arcangeli
0 siblings, 1 reply; 10+ messages in thread
From: Avi Kivity @ 2008-06-05 15:54 UTC (permalink / raw)
To: Andrea Arcangeli; +Cc: kvm
Andrea Arcangeli wrote:
> Hello,
>
> this is an update of the patch to test kvm on mmu notifier v18. I'll
> post the mmu notifier v18 tomorrow after some more review but I can
> post the kvm side in the meantime (which works with the previous v17
> as well if anyone wants to test).
>
> This has a relevant fix for kvm_unmap_rmapp: rmap_remove while
> deleting the current spte from the desc array, can overwrite the
> deleted current spte with the last spte in the desc array in turn
> reodering it. So if we restart rmap_next from the sptes after the
> deleted current spte, we may miss the later sptes that have been moved
> in the slot of the current spte. We've to teardown the whole desc
> array so the fix was to simply pick from the first entry and wait the
> others to come down.
>
> I also wonder if the update_pte done outside the mmu_lock is safe
> without mmu notifiers, or if the below changes are required regardless
> (I think they are). I cleaned up the fix but I probably need to
> extract it from this patch.
>
> +
> +static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
> +{
> + u64 *spte;
> + int young = 0;
> +
> + spte = rmap_next(kvm, rmapp, NULL);
> + while (spte) {
> + int _young;
> + u64 _spte = *spte;
> + BUG_ON(!(_spte & PT_PRESENT_MASK));
> + _young = _spte & PT_ACCESSED_MASK;
> + if (_young) {
> + young = !!_young;
>
young = 1?
> + set_shadow_pte(spte, _spte & ~PT_ACCESSED_MASK);
>
This can theoretically lose the dirty bit. We don't track it now, so
that's okay.
> + }
> + spte = rmap_next(kvm, rmapp, spte);
> + }
> + return young;
> +}
> +
> +int kvm_age_hva(struct kvm *kvm, unsigned long hva)
> +{
> + int i;
> + int young = 0;
> +
> + /*
> + * If mmap_sem isn't taken, we can look the memslots with only
> + * the mmu_lock by skipping over the slots with userspace_addr == 0.
> + */
>
One day we want to sort the slots according to size. We'll need better
locking then (rcu, likely).
>
>
> @@ -1235,9 +1340,9 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
> int i;
> struct kvm_mmu_page *sp;
>
> - if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
> - return;
> spin_lock(&vcpu->kvm->mmu_lock);
> + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
> + goto out;
>
vcpu-> stuff is protected by the vcpu mutex.
> @@ -1626,18 +1744,20 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
> return !!(spte && (*spte & shadow_accessed_mask));
> }
>
> -static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
> - const u8 *new, int bytes)
> +static int mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
> + const u8 *new, int bytes,
> + gfn_t *_gfn, pfn_t *_pfn,
> + int *_mmu_seq, int *_largepage)
> {
> gfn_t gfn;
> int r;
> u64 gpte = 0;
> pfn_t pfn;
> -
> - vcpu->arch.update_pte.largepage = 0;
> + int mmu_seq;
> + int largepage;
>
> if (bytes != 4 && bytes != 8)
> - return;
> + return 0;
>
> /*
> * Assume that the pte write on a page table of the same type
> @@ -1650,7 +1770,7 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
> if ((bytes == 4) && (gpa % 4 == 0)) {
> r = kvm_read_guest(vcpu->kvm, gpa & ~(u64)7, &gpte, 8);
> if (r)
> - return;
> + return 0;
> memcpy((void *)&gpte + (gpa % 8), new, 4);
> } else if ((bytes == 8) && (gpa % 8 == 0)) {
> memcpy((void *)&gpte, new, 8);
> @@ -1660,23 +1780,30 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
> memcpy((void *)&gpte, new, 4);
> }
> if (!is_present_pte(gpte))
> - return;
> + return 0;
> gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
>
> + largepage = 0;
> down_read(¤t->mm->mmap_sem);
> if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) {
> gfn &= ~(KVM_PAGES_PER_HPAGE-1);
> - vcpu->arch.update_pte.largepage = 1;
> + largepage = 1;
> }
> + mmu_seq = atomic_read(&vcpu->kvm->arch.mmu_notifier_seq);
> + /* implicit mb(), we'll read before PT lock is unlocked */
> pfn = gfn_to_pfn(vcpu->kvm, gfn);
> up_read(¤t->mm->mmap_sem);
>
> - if (is_error_pfn(pfn)) {
> + if (unlikely(is_error_pfn(pfn))) {
> kvm_release_pfn_clean(pfn);
> - return;
> + return 0;
> }
> - vcpu->arch.update_pte.gfn = gfn;
> - vcpu->arch.update_pte.pfn = pfn;
> +
> + *_gfn = gfn;
> + *_pfn = pfn;
> + *_mmu_seq = mmu_seq;
> + *_largepage = largepage;
> + return 1;
> }
>
Alternatively, we can replace this with follow_page() in update_pte().
Probably best to defer it, though.
> + /*
> + * When ->invalidate_page runs, the linux pte has been zapped
> + * already but the page is still allocated until
> + * ->invalidate_page returns. So if we increase the sequence
> + * here the kvm page fault will notice if the spte can't be
> + * established because the page is going to be freed. If
> + * instead the kvm page fault establishes the spte before
> + * ->invalidate_page runs, kvm_unmap_hva will release it
> + * before returning.
>
This too...
Please move the registration to virt/kvm/kvm_main.c, and provide stubs
for non-x86. This is definitely something that we want to do cross-arch
(except s390 which have it in hardware).
--
error compiling committee.c: too many arguments to function
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [patch] kvm with mmu notifier v18
2008-06-05 15:54 ` Avi Kivity
@ 2008-06-05 16:47 ` Andrea Arcangeli
2008-06-06 8:49 ` Avi Kivity
2008-06-10 20:41 ` Marcelo Tosatti
0 siblings, 2 replies; 10+ messages in thread
From: Andrea Arcangeli @ 2008-06-05 16:47 UTC (permalink / raw)
To: Avi Kivity; +Cc: kvm
On Thu, Jun 05, 2008 at 06:54:30PM +0300, Avi Kivity wrote:
>> + if (_young) {
>> + young = !!_young;
>>
>
> young = 1?
I liked !! more, but I guess =1 is more efficient, ok updated ;)
>> + set_shadow_pte(spte, _spte & ~PT_ACCESSED_MASK);
>>
>
> This can theoretically lose the dirty bit. We don't track it now, so
> that's okay.
good point, kvm_mmu_access_page uses set_bit, so I can as well use
clear_bit to replace the above with:
clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
> One day we want to sort the slots according to size. We'll need better
> locking then (rcu, likely).
I think it's more interesting to sort them on their start/end gfn
address, prio_tree would be the optimal structure for me to use in the
mmu notifier invalidates as I could ask the prio tree to show me in
O(log(N)) (N number of slots) all the slots that overlap with the
invalidated start/end range. However I'm afraid there aren't enough
slots for this to matter... but OTOH there aren't frequent
modifications either, so it may be a microoptimization (if there were
frequent modifications with such a small number of slots it likely
would be slower than a list).
>> @@ -1235,9 +1340,9 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
>> int i;
>> struct kvm_mmu_page *sp;
>> - if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
>> - return;
>> spin_lock(&vcpu->kvm->mmu_lock);
>> + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
>> + goto out;
>>
>
> vcpu-> stuff is protected by the vcpu mutex.
That was to run it in ->release but I'm undoing as I'm removing release.
> Alternatively, we can replace this with follow_page() in update_pte().
> Probably best to defer it, though.
follow_page seems not exported to modules, ok defer to a different
patch anyway.
> Please move the registration to virt/kvm/kvm_main.c, and provide stubs for
> non-x86. This is definitely something that we want to do cross-arch
> (except s390 which have it in hardware).
Sorry, but I did this originally in kvm/kvm_main.c and I got complaints
from s390 that didn't want the common code polluted with stuff not
needed on s390. So what should I do?
Untested updated patch attached. Previously I was also working from a
version that was not the latest; I removed ->release already before
posting the last one because by the time vm destroy runs no more guest
mode can run, so
sptes are irrelevant and no cpu can follow the secondary tlb anymore
because no cpu can be in guest mode for the 'mm', even if sptes are
actually destroyed later. The previous patch was taking a kvm mutex in
release under mmu_lock and that's forbidden, so it's simpler to remove
the release debug knob for now (you suggested to use
kvm_reload_remote_mmus in the future that shouldn't take sleeping
locks). The only reason for having a real ->release would be to avoid
any risk w.r.t. to tlb speculative accesses to gart alias with
different cache protocol (I doubt this is a realistic worry but anyway
it's not big deal to implement a ->release).
Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 8d45fab..ce3251c 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -21,6 +21,7 @@ config KVM
tristate "Kernel-based Virtual Machine (KVM) support"
depends on HAVE_KVM
select PREEMPT_NOTIFIERS
+ select MMU_NOTIFIER
select ANON_INODES
---help---
Support hosting fully virtualized guest machines using hardware
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 491f645..d3c56ad 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -651,6 +651,98 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
account_shadowed(kvm, gfn);
}
+static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
+{
+ u64 *spte;
+ int need_tlb_flush = 0;
+
+ while ((spte = rmap_next(kvm, rmapp, NULL))) {
+ BUG_ON(!(*spte & PT_PRESENT_MASK));
+ rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
+ rmap_remove(kvm, spte);
+ set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+ need_tlb_flush = 1;
+ }
+ return need_tlb_flush;
+}
+
+int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
+{
+ int i;
+ int need_tlb_flush = 0;
+
+ /*
+ * If mmap_sem isn't taken, we can look the memslots with only
+ * the mmu_lock by skipping over the slots with userspace_addr == 0.
+ */
+ for (i = 0; i < kvm->nmemslots; i++) {
+ struct kvm_memory_slot *memslot = &kvm->memslots[i];
+ unsigned long start = memslot->userspace_addr;
+ unsigned long end;
+
+ /* mmu_lock protects userspace_addr */
+ if (!start)
+ continue;
+
+ end = start + (memslot->npages << PAGE_SHIFT);
+ if (hva >= start && hva < end) {
+ gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
+ need_tlb_flush |= kvm_unmap_rmapp(kvm,
+ &memslot->rmap[gfn_offset]);
+ }
+ }
+
+ return need_tlb_flush;
+}
+
+static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
+{
+ u64 *spte;
+ int young = 0;
+
+ spte = rmap_next(kvm, rmapp, NULL);
+ while (spte) {
+ int _young;
+ u64 _spte = *spte;
+ BUG_ON(!(_spte & PT_PRESENT_MASK));
+ _young = _spte & PT_ACCESSED_MASK;
+ if (_young) {
+ young = 1;
+ clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
+ }
+ spte = rmap_next(kvm, rmapp, spte);
+ }
+ return young;
+}
+
+int kvm_age_hva(struct kvm *kvm, unsigned long hva)
+{
+ int i;
+ int young = 0;
+
+ /*
+ * If mmap_sem isn't taken, we can look the memslots with only
+ * the mmu_lock by skipping over the slots with userspace_addr == 0.
+ */
+ for (i = 0; i < kvm->nmemslots; i++) {
+ struct kvm_memory_slot *memslot = &kvm->memslots[i];
+ unsigned long start = memslot->userspace_addr;
+ unsigned long end;
+
+ /* mmu_lock protects userspace_addr */
+ if (!start)
+ continue;
+
+ end = start + (memslot->npages << PAGE_SHIFT);
+ if (hva >= start && hva < end) {
+ gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
+ young |= kvm_age_rmapp(kvm, &memslot->rmap[gfn_offset]);
+ }
+ }
+
+ return young;
+}
+
#ifdef MMU_DEBUG
static int is_empty_shadow_page(u64 *spt)
{
@@ -1203,6 +1295,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
int r;
int largepage = 0;
pfn_t pfn;
+ int mmu_seq;
down_read(&current->mm->mmap_sem);
if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
@@ -1210,6 +1303,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
largepage = 1;
}
+ mmu_seq = atomic_read(&vcpu->kvm->arch.mmu_notifier_seq);
+ /* implicit mb(), we'll read before PT lock is unlocked */
pfn = gfn_to_pfn(vcpu->kvm, gfn);
up_read(&current->mm->mmap_sem);
@@ -1220,6 +1315,11 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
}
spin_lock(&vcpu->kvm->mmu_lock);
+ if (unlikely(atomic_read(&vcpu->kvm->arch.mmu_notifier_count)))
+ goto out_unlock;
+ smp_rmb();
+ if (unlikely(atomic_read(&vcpu->kvm->arch.mmu_notifier_seq) != mmu_seq))
+ goto out_unlock;
kvm_mmu_free_some_pages(vcpu);
r = __direct_map(vcpu, v, write, largepage, gfn, pfn,
PT32E_ROOT_LEVEL);
@@ -1227,6 +1327,11 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
return r;
+
+out_unlock:
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ kvm_release_pfn_clean(pfn);
+ return 0;
}
@@ -1345,6 +1450,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
int r;
int largepage = 0;
gfn_t gfn = gpa >> PAGE_SHIFT;
+ int mmu_seq;
ASSERT(vcpu);
ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
@@ -1358,6 +1464,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
gfn &= ~(KVM_PAGES_PER_HPAGE-1);
largepage = 1;
}
+ mmu_seq = atomic_read(&vcpu->kvm->arch.mmu_notifier_seq);
+ /* implicit mb(), we'll read before PT lock is unlocked */
pfn = gfn_to_pfn(vcpu->kvm, gfn);
up_read(&current->mm->mmap_sem);
if (is_error_pfn(pfn)) {
@@ -1365,12 +1473,22 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
return 1;
}
spin_lock(&vcpu->kvm->mmu_lock);
+ if (unlikely(atomic_read(&vcpu->kvm->arch.mmu_notifier_count)))
+ goto out_unlock;
+ smp_rmb();
+ if (unlikely(atomic_read(&vcpu->kvm->arch.mmu_notifier_seq) != mmu_seq))
+ goto out_unlock;
kvm_mmu_free_some_pages(vcpu);
r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
largepage, gfn, pfn, kvm_x86_ops->get_tdp_level());
spin_unlock(&vcpu->kvm->mmu_lock);
return r;
+
+out_unlock:
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ kvm_release_pfn_clean(pfn);
+ return 0;
}
static void nonpaging_free(struct kvm_vcpu *vcpu)
@@ -1626,18 +1744,20 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
return !!(spte && (*spte & shadow_accessed_mask));
}
-static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
- const u8 *new, int bytes)
+static int mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
+ const u8 *new, int bytes,
+ gfn_t *_gfn, pfn_t *_pfn,
+ int *_mmu_seq, int *_largepage)
{
gfn_t gfn;
int r;
u64 gpte = 0;
pfn_t pfn;
-
- vcpu->arch.update_pte.largepage = 0;
+ int mmu_seq;
+ int largepage;
if (bytes != 4 && bytes != 8)
- return;
+ return 0;
/*
* Assume that the pte write on a page table of the same type
@@ -1650,7 +1770,7 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
if ((bytes == 4) && (gpa % 4 == 0)) {
r = kvm_read_guest(vcpu->kvm, gpa & ~(u64)7, &gpte, 8);
if (r)
- return;
+ return 0;
memcpy((void *)&gpte + (gpa % 8), new, 4);
} else if ((bytes == 8) && (gpa % 8 == 0)) {
memcpy((void *)&gpte, new, 8);
@@ -1660,23 +1780,30 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
memcpy((void *)&gpte, new, 4);
}
if (!is_present_pte(gpte))
- return;
+ return 0;
gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
+ largepage = 0;
down_read(&current->mm->mmap_sem);
if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) {
gfn &= ~(KVM_PAGES_PER_HPAGE-1);
- vcpu->arch.update_pte.largepage = 1;
+ largepage = 1;
}
+ mmu_seq = atomic_read(&vcpu->kvm->arch.mmu_notifier_seq);
+ /* implicit mb(), we'll read before PT lock is unlocked */
pfn = gfn_to_pfn(vcpu->kvm, gfn);
up_read(&current->mm->mmap_sem);
- if (is_error_pfn(pfn)) {
+ if (unlikely(is_error_pfn(pfn))) {
kvm_release_pfn_clean(pfn);
- return;
+ return 0;
}
- vcpu->arch.update_pte.gfn = gfn;
- vcpu->arch.update_pte.pfn = pfn;
+
+ *_gfn = gfn;
+ *_pfn = pfn;
+ *_mmu_seq = mmu_seq;
+ *_largepage = largepage;
+ return 1;
}
static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn)
@@ -1711,9 +1838,24 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
int npte;
int r;
+ int update_pte;
+ gfn_t gpte_gfn;
+ pfn_t pfn;
+ int mmu_seq;
+ int largepage;
+
pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
- mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes);
+ update_pte = mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes,
+ &gpte_gfn, &pfn,
+ &mmu_seq, &largepage);
spin_lock(&vcpu->kvm->mmu_lock);
+ if (update_pte) {
+ BUG_ON(!is_error_pfn(vcpu->arch.update_pte.pfn));
+ vcpu->arch.update_pte.gfn = gpte_gfn;
+ vcpu->arch.update_pte.pfn = pfn;
+ vcpu->arch.update_pte.mmu_seq = mmu_seq;
+ vcpu->arch.update_pte.largepage = largepage;
+ }
kvm_mmu_access_page(vcpu, gfn);
kvm_mmu_free_some_pages(vcpu);
++vcpu->kvm->stat.mmu_pte_write;
@@ -1793,11 +1935,11 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
}
}
kvm_mmu_audit(vcpu, "post pte write");
- spin_unlock(&vcpu->kvm->mmu_lock);
if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
kvm_release_pfn_clean(vcpu->arch.update_pte.pfn);
vcpu->arch.update_pte.pfn = bad_pfn;
}
+ spin_unlock(&vcpu->kvm->mmu_lock);
}
int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 4d91822..eb87479 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -263,6 +263,12 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
pfn = vcpu->arch.update_pte.pfn;
if (is_error_pfn(pfn))
return;
+ if (unlikely(atomic_read(&vcpu->kvm->arch.mmu_notifier_count)))
+ return;
+ smp_rmb();
+ if (unlikely(atomic_read(&vcpu->kvm->arch.mmu_notifier_seq) !=
+ vcpu->arch.update_pte.mmu_seq))
+ return;
kvm_get_pfn(pfn);
mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte),
@@ -380,6 +386,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
int r;
pfn_t pfn;
int largepage = 0;
+ int mmu_seq;
pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
kvm_mmu_audit(vcpu, "pre page fault");
@@ -413,6 +420,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
largepage = 1;
}
}
+ mmu_seq = atomic_read(&vcpu->kvm->arch.mmu_notifier_seq);
+ /* implicit mb(), we'll read before PT lock is unlocked */
pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
up_read(&current->mm->mmap_sem);
@@ -424,6 +433,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
}
spin_lock(&vcpu->kvm->mmu_lock);
+ if (unlikely(atomic_read(&vcpu->kvm->arch.mmu_notifier_count)))
+ goto out_unlock;
+ smp_rmb();
+ if (unlikely(atomic_read(&vcpu->kvm->arch.mmu_notifier_seq) != mmu_seq))
+ goto out_unlock;
kvm_mmu_free_some_pages(vcpu);
shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
largepage, &write_pt, pfn);
@@ -439,6 +453,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
spin_unlock(&vcpu->kvm->mmu_lock);
return write_pt;
+
+out_unlock:
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ kvm_release_pfn_clean(pfn);
+ return 0;
}
static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 77fb2bd..fff6d8d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -27,6 +27,7 @@
#include <linux/module.h>
#include <linux/mman.h>
#include <linux/highmem.h>
+#include <linux/mmu_notifier.h>
#include <asm/uaccess.h>
#include <asm/msr.h>
@@ -3903,16 +3904,127 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
free_page((unsigned long)vcpu->arch.pio_data);
}
-struct kvm *kvm_arch_create_vm(void)
+static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
{
- struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
+ struct kvm_arch *kvm_arch;
+ kvm_arch = container_of(mn, struct kvm_arch, mmu_notifier);
+ return container_of(kvm_arch, struct kvm, arch);
+}
- if (!kvm)
- return ERR_PTR(-ENOMEM);
+static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address)
+{
+ struct kvm *kvm = mmu_notifier_to_kvm(mn);
+ int need_tlb_flush;
- INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+ /*
+ * When ->invalidate_page runs, the linux pte has been zapped
+ * already but the page is still allocated until
+ * ->invalidate_page returns. So if we increase the sequence
+ * here the kvm page fault will notice if the spte can't be
+ * established because the page is going to be freed. If
+ * instead the kvm page fault establishes the spte before
+ * ->invalidate_page runs, kvm_unmap_hva will release it
+ * before returning.
+
+ * No need of memory barriers as the sequence increase only
+ * need to be seen at spin_unlock time, and not at spin_lock
+ * time.
+ *
+ * Increasing the sequence after the spin_unlock would be
+ * unsafe because the kvm page fault could then establish the
+ * pte after kvm_unmap_hva returned, without noticing the page
+ * is going to be freed.
+ */
+ atomic_inc(&kvm->arch.mmu_notifier_seq);
+ spin_lock(&kvm->mmu_lock);
+ need_tlb_flush = kvm_unmap_hva(kvm, address);
+ spin_unlock(&kvm->mmu_lock);
- return kvm;
+ /* we've to flush the tlb before the pages can be freed */
+ if (need_tlb_flush)
+ kvm_flush_remote_tlbs(kvm);
+
+}
+
+static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end)
+{
+ struct kvm *kvm = mmu_notifier_to_kvm(mn);
+ int need_tlb_flush = 0;
+
+ /*
+ * The count increase must become visible at unlock time as no
+ * spte can be established without taking the mmu_lock and
+ * count is also read inside the mmu_lock critical section.
+ */
+ atomic_inc(&kvm->arch.mmu_notifier_count);
+
+ spin_lock(&kvm->mmu_lock);
+ for (; start < end; start += PAGE_SIZE)
+ need_tlb_flush |= kvm_unmap_hva(kvm, start);
+ spin_unlock(&kvm->mmu_lock);
+
+ /* we've to flush the tlb before the pages can be freed */
+ if (need_tlb_flush)
+ kvm_flush_remote_tlbs(kvm);
+}
+
+static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end)
+{
+ struct kvm *kvm = mmu_notifier_to_kvm(mn);
+ /*
+ *
+ * This sequence increase will notify the kvm page fault that
+ * the page that is going to be mapped in the spte could have
+ * been freed.
+ *
+ * There's also an implicit mb() here in this comment,
+ * provided by the last PT lock taken to zap pagetables, and
+ * that the read side has to take too in follow_page(). The
+ * sequence increase in the worst case will become visible to
+ * the kvm page fault after the spin_lock of the last PT lock
+ * of the last PT-lock-protected critical section preceding
+ * invalidate_range_end. So if the kvm page fault is about to
+ * establish the spte inside the mmu_lock, while we're freeing
+ * the pages, it will have to backoff and when it retries, it
+ * will have to take the PT lock before it can check the
+ * pagetables again. And after taking the PT lock it will
+ * re-establish the pte even if it will see the already
+ * increased sequence number before calling gfn_to_pfn.
+ */
+ atomic_inc(&kvm->arch.mmu_notifier_seq);
+ /*
+ * The sequence increase must be visible before count
+ * decrease. The page fault has to read count before sequence
+ * for this write order to be effective.
+ */
+ wmb();
+ atomic_dec(&kvm->arch.mmu_notifier_count);
+ BUG_ON(atomic_read(&kvm->arch.mmu_notifier_count) < 0);
+}
+
+static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address)
+{
+ struct kvm *kvm = mmu_notifier_to_kvm(mn);
+ int young;
+
+ spin_lock(&kvm->mmu_lock);
+ young = kvm_age_hva(kvm, address);
+ spin_unlock(&kvm->mmu_lock);
+
+ if (young)
+ kvm_flush_remote_tlbs(kvm);
+
+ return young;
}
static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
@@ -3922,6 +4034,33 @@ static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
vcpu_put(vcpu);
}
+static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
+ .invalidate_page = kvm_mmu_notifier_invalidate_page,
+ .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
+ .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
+ .clear_flush_young = kvm_mmu_notifier_clear_flush_young,
+};
+
+struct kvm *kvm_arch_create_vm(void)
+{
+ struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
+ int err;
+
+ if (!kvm)
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+
+ kvm->arch.mmu_notifier.ops = &kvm_mmu_notifier_ops;
+ err = mmu_notifier_register(&kvm->arch.mmu_notifier, current->mm);
+ if (err) {
+ kfree(kvm);
+ return ERR_PTR(err);
+ }
+
+ return kvm;
+}
+
static void kvm_free_vcpus(struct kvm *kvm)
{
unsigned int i;
@@ -3946,6 +4085,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
kvm_free_pit(kvm);
kfree(kvm->arch.vpic);
kfree(kvm->arch.vioapic);
+ mmu_notifier_unregister(&kvm->arch.mmu_notifier, kvm->mm);
kvm_free_vcpus(kvm);
kvm_free_physmem(kvm);
if (kvm->arch.apic_access_page)
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index cd50380..8cd420c 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -13,6 +13,7 @@
#include <linux/types.h>
#include <linux/mm.h>
+#include <linux/mmu_notifier.h>
#include <linux/kvm.h>
#include <linux/kvm_para.h>
@@ -249,6 +250,7 @@ struct kvm_vcpu_arch {
gfn_t gfn; /* presumed gfn during guest pte update */
pfn_t pfn; /* pfn corresponding to that gfn */
int largepage;
+ int mmu_seq;
} update_pte;
struct i387_fxsave_struct host_fx_image;
@@ -324,6 +326,10 @@ struct kvm_arch{
struct page *ept_identity_pagetable;
bool ept_identity_pagetable_done;
+
+ struct mmu_notifier mmu_notifier;
+ atomic_t mmu_notifier_seq;
+ atomic_t mmu_notifier_count;
};
struct kvm_vm_stat {
@@ -448,6 +454,8 @@ void kvm_mmu_set_base_ptes(u64 base_pte);
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
u64 dirty_mask, u64 nx_mask, u64 x_mask);
+int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
+int kvm_age_hva(struct kvm *kvm, unsigned long hva);
int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
void kvm_mmu_zap_all(struct kvm *kvm);
^ permalink raw reply related [flat|nested] 10+ messages in thread
* Re: [patch] kvm with mmu notifier v18
2008-06-05 16:47 ` Andrea Arcangeli
@ 2008-06-06 8:49 ` Avi Kivity
2008-06-06 12:50 ` Andrea Arcangeli
2008-06-10 20:41 ` Marcelo Tosatti
1 sibling, 1 reply; 10+ messages in thread
From: Avi Kivity @ 2008-06-06 8:49 UTC (permalink / raw)
To: Andrea Arcangeli; +Cc: kvm
Andrea Arcangeli wrote:
>
>> One day we want to sort the slots according to size. We'll need better
>> locking then (rcu, likely).
>>
>
> I think it's more interesting to sort them on their start/end gfn
> address, prio_tree would be the optimal structure for me to use in the
> mmu notifier invalidates as I could ask the prio tree to show me in
> O(log(N)) (N number of slots) all the slots that overlap with the
> invalidated start/end range. However I'm afraid there aren't enough
> slots for this to matter... but OTOH there aren't frequent
> modifications either, so it may be a microoptimization (if there were
> frequent modifications with such a small number of slots it likely
> would be slower than a list).
>
There are only two interesting slots, 1G-end_of_mem and 0-pci_hole. A
sorted array gives an average of less than two lookups to find the slot,
with the smallest possible constant. Any other algorithm will give more
lookups with a far higher constant.
Sometimes linear search is the best algorithm.
>
>>> @@ -1235,9 +1340,9 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
>>> int i;
>>> struct kvm_mmu_page *sp;
>>> - if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
>>> - return;
>>> spin_lock(&vcpu->kvm->mmu_lock);
>>> + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
>>> + goto out;
>>>
>>>
>> vcpu-> stuff is protected by the vcpu mutex.
>>
>
> That was to run it in ->release but I'm undoing as I'm removing release.
>
>
Surely ->release can't happen while vcpus are still running?
>> Please move the registration to virt/kvm/kvm_main.c, and provide stubs for
>> non-x86. This is definitely something that we want to do cross-arch
>> (except s390 which have it in hardware).
>>
>
> Sorry but I did this originally in kvm/kvm_main.c and I got complains
> from s390 that didn't want the common code polluted with non-s390
> needed stuff. so what should I do?
>
>
x86, ia64, and ppc all need mmu notifiers as well as other new
architectures, so this better be in common code. We can #ifdef it so
s390 doesn't suffer a performance impact.
> +
> +static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
> +{
> + u64 *spte;
> + int young = 0;
> +
> + spte = rmap_next(kvm, rmapp, NULL);
> + while (spte) {
> + int _young;
> + u64 _spte = *spte;
> + BUG_ON(!(_spte & PT_PRESENT_MASK));
> + _young = _spte & PT_ACCESSED_MASK;
>
I forgot, last round: EPT doesn't have PT_ACCESSED_MASK, so you're
reading (and clearing) some random bit. We can easily qualify the test
by looking at shadow_accessed_mask; if zero, the hardware doesn't have a
young bit.
There's a bigger question of what to do in the case of EPT. We can
always return true, or return true and also unmap. The first means we
lie to the vm, the second means we take unnecessary exits to maintain
age information.
--
Do not meddle in the internals of kernels, for they are subtle and quick to panic.
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [patch] kvm with mmu notifier v18
2008-06-06 8:49 ` Avi Kivity
@ 2008-06-06 12:50 ` Andrea Arcangeli
2008-06-06 16:37 ` Avi Kivity
0 siblings, 1 reply; 10+ messages in thread
From: Andrea Arcangeli @ 2008-06-06 12:50 UTC (permalink / raw)
To: Avi Kivity; +Cc: kvm
On Fri, Jun 06, 2008 at 11:49:23AM +0300, Avi Kivity wrote:
> Andrea Arcangeli wrote:
>>
>>> One day we want to sort the slots according to size. We'll need better
>>> locking then (rcu, likely).
>>>
>>
>> I think it's more interesting to sort them on their start/end gfn
>> address, prio_tree would be the optimal structure for me to use in the
>> mmu notifier invalidates as I could ask the prio tree to show me in
>> O(log(N)) (N number of slots) all the slots that overlap with the
>> invalidated start/end range. However I'm afraid there aren't enough
>> slots for this to matter... but OTOH there aren't frequent
>> modifications either, so it may be a microoptimization (if there were
>> frequent modifications with such a small number of slots it likely
>> would be slower than a list).
>>
>
> There are only two interesting slots, 1G-end_of_mem and 0-pci_hole. A
> sorted array gives an average of less than two lookups to find the slot,
> with the smallest possible constant. Any other algorithm will give more
> lookups with a far higher constant.
>
> Sometimes linear search is the best algorithm.
But we can't break the loop after the first match! I think you're not
taking into account the aliasing done with overlapping gfn in the
memslots (which is not forbidden apparently and you planned to
obsolete the explicit aliasing logic with it). Only prio_tree can
reduce the number of walks in the most optimal way because of the gfn
overlaps and search by start-end address. Sorting by size won't be
useful because we can't break the loop after the first found match. If
prio tree is too heavyweight the next best would be sorting with a
single-index tree like rbtree by start address as it'll at least
eliminate all the memslots that have a start address higher than the
end of the invalidate range. But prio_tree will eliminate them all in
O(log(N)) so on the paper prio_tree is best (but perhaps in practice
rbtree is better, dunno).
> Surely ->release can't happen while vcpus are still running?
Exactly.
> x86, ia64, and ppc all need mmu notifiers as well as other new
> architectures, so this better be in common code. We can #ifdef it so s390
> doesn't suffer a performance impact.
Ok, that's fine with me (I'll undo the changes post the s390 comment ;).
> I forgot, last round: EPT doesn't have PT_ACCESSED_MASK, so you're reading
> (and clearing) some random bit. We can easily qualify the test by looking
> at shadow_accessed_mask; if zero, the hardware doesn't have a young bit.
>
> There's a bigger question of what to do in the case of EPT. We can always
> return true, or return true and also unmap. The first means we lie to the
> vm, the second means we take unnecessary exits to maintain age information.
If there's no access bit (and we can't use the guest pte access bit
because EPT represent a physical page and not a guest pte anymore)
we're forced to return 0 without doing anything at all. Returning 1 is
unsafe as it'd pin the page in host ram.
If there's no access bit all we can do is to wait the VM to unmap the
page, mark the spte not present during the invalidate_page mmu
notifier, and wait the guest to trigger a minor fault from swapcache.
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [patch] kvm with mmu notifier v18
2008-06-06 12:50 ` Andrea Arcangeli
@ 2008-06-06 16:37 ` Avi Kivity
2008-06-06 17:37 ` Andrea Arcangeli
0 siblings, 1 reply; 10+ messages in thread
From: Avi Kivity @ 2008-06-06 16:37 UTC (permalink / raw)
To: Andrea Arcangeli; +Cc: kvm
Andrea Arcangeli wrote:
> On Fri, Jun 06, 2008 at 11:49:23AM +0300, Avi Kivity wrote:
>
>> Andrea Arcangeli wrote:
>>
>>>> One day we want to sort the slots according to size. We'll need better
>>>> locking then (rcu, likely).
>>>>
>>>>
>>> I think it's more interesting to sort them on their start/end gfn
>>> address, prio_tree would be the optimal structure for me to use in the
>>> mmu notifier invalidates as I could ask the prio tree to show me in
>>> O(log(N)) (N number of slots) all the slots that overlap with the
>>> invalidated start/end range. However I'm afraid there aren't enough
>>> slots for this to matter... but OTOH there aren't frequent
>>> modifications either, so it may be a microoptimization (if there were
>>> frequent modifications with such a small number of slots it likely
>>> would be slower than a list).
>>>
>>>
>> There are only two interesting slots, 1G-end_of_mem and 0-pci_hole. A
>> sorted array gives an average of less than two lookups to find the slot,
>> with the smallest possible constant. Any other algorithm will give more
>> lookups with a far higher constant.
>>
>> Sometimes linear search is the best algorithm.
>>
>
> But we can't break the loop after the first match! I think you're not
> taking into account the aliasing done with overlapping gfn in the
> memslots (which is not forbidden apparently and you planned to
> obsolete the explicit aliasing logic with it)
For the gfn->hva (the common case) we can break immediately. For the
hva->gfn, in general we cannot, but we can add an "unaliased" flag to
the memslot and set it for slots which do not have aliases. That makes
the loop terminate soon again.
> . Only prio_tree can
> reduce the number of walks in the most optimal way because of the gfn
> overlaps and search by start-end address. Sorting by size won't be
> useful because we can't break the loop after the first found match. If
> prio tree is too heavyweight the next best would be sorting with a
> single-index tree like rbtree by start address as it'll at least
> eliminate all the memslots that have a start address higher than the
> end of the invalidate range. But prio_tree will eliminate them all in
> O(log(N)) so on the paper prio_tree is best (but perhaps in practice
> rbtree is better, dunno).
>
Any pointer-based data structure is bound to be much slower than a list
with such a small number of elements.
btw, on 64-bit userspace we can arrange the entire physical address
space in a contiguous region (some of it unmapped) and have just one
slot for the guest.
>> I forgot, last round: EPT doesn't have PT_ACCESSED_MASK, so you're reading
>> (and clearing) some random bit. We can easily qualify the test by looking
>> at shadow_accessed_mask; if zero, the hardware doesn't have a young bit.
>>
>> There's a bigger question of what to do in the case of EPT. We can always
>> return true, or return true and also unmap. The first means we lie to the
>> vm, the second means we take unnecessary exits to maintain age information.
>>
>
> If there's no access bit (and we can't use the guest pte access bit
> because EPT represent a physical page and not a guest pte anymore)
> we're forced to return 0 without doing anything at all. Returning 1 is
> unsafe as it'd pin the page in host ram.
>
> If there's no access bit all we can do is to wait the VM to unmap the
> page, mark the spte not present during the invalidate_page mmu
> notifier, and wait the guest to trigger a minor fault from swapcache.
>
Okay. It's sad, but I don't see any choice.
If anyone from Intel is listening, please give us an accessed bit (and a
dirty bit, too).
--
Do not meddle in the internals of kernels, for they are subtle and quick to panic.
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [patch] kvm with mmu notifier v18
2008-06-06 16:37 ` Avi Kivity
@ 2008-06-06 17:37 ` Andrea Arcangeli
2008-06-06 20:09 ` Avi Kivity
0 siblings, 1 reply; 10+ messages in thread
From: Andrea Arcangeli @ 2008-06-06 17:37 UTC (permalink / raw)
To: Avi Kivity; +Cc: kvm
On Fri, Jun 06, 2008 at 07:37:48PM +0300, Avi Kivity wrote:
> For the gfn->hva (the common case) we can break immediately. For the
> hva->gfn, in general we cannot, but we can add an "unaliased" flag to the
I was thinking at hva->gfn which is what the mmu notifier does. But
gfn_to_memslot is called much more frequently and is more performance
critical and for it, sorting the list by size is enough.
> memslot and set it for slots which do not have aliases. That makes the
> loop terminate soon again.
So we need to make sure that aliases (gart-like) aren't common, or if
we've an alias on ram we go back to scanning the whole list all the time.
> Any pointer-based data structure is bound to be much slower than a list
> with such a small number of elements.
Tree can only be slower than a list if there is the bitflag to signal
there is no alias so the list will break the loop always at the first
step. If you remove that bitflag, tree lookup can't be slower than
walking the entire list, even if there are only 3/4 elements
queued. Only the no_alias bitflag allows the list to be faster.
> btw, on 64-bit userspace we can arrange the entire physical address space
> in a contiguous region (some of it unmapped) and have just one slot for the
> guest.
I thought mmio regions would need to be separated for 64bit too? I
mean what's the point of the memslots in the first place if there's
only one for the whole physical address space?
> Okay. It's sad, but I don't see any choice.
>
> If anyone from Intel is listening, please give us an accessed bit (and a
> dirty bit, too).
Seconded.
The other thing we could do would be to mark the spte invalid, and
return 1, and then at the second ->clear_test_young if the spte is
still invalid, we return 0. That way we would limit the accessed bit
refresh to a kvm page fault without tearing down the linux pte (so
follow_page would be enough then). While if we return 0, if the linux
pte is already old, the page will be unmapped and go in swapcache and
follow_page won't be enough and get_user_pages will have to call
do_swap_page minor fault.
However to do the above, we would need to track with rmap non present
sptes, and that'd require changes to the kvm rmap logic. so initially
returning 0 is simpler.
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [patch] kvm with mmu notifier v18
2008-06-06 17:37 ` Andrea Arcangeli
@ 2008-06-06 20:09 ` Avi Kivity
0 siblings, 0 replies; 10+ messages in thread
From: Avi Kivity @ 2008-06-06 20:09 UTC (permalink / raw)
To: Andrea Arcangeli; +Cc: kvm
Andrea Arcangeli wrote:
>> memslot and set it for slots which do not have aliases. That makes the
>> loop terminate soon again.
>>
>
> So we need to make sure that aliases (gart-like) aren't common, or if
> we've an alias on ram we go back to scanning the whole list all the time.
>
>
They aren't. The two cases are the VGA window at 0xa0000-0xc0000 (the
motivation for aliasing support; before we had that, Windows would take
several seconds to clear the screen which switching to graphics mode)
and the BIOS at 0xe000 (which ought to be aliased to 0xfffe0000, but
isn't). Both are very rarely used.
>> Any pointer-based data structure is bound to be much slower than a list
>> with such a small number of elements.
>>
>
> Tree can only be slower than a list if there is the bitflag to signal
> there is no alias so the list will break the loop always at the first
> step. If you remove that bitflag, tree lookup can't be slower than
> walking the entire list, even if there are only 3/4 elements
> queued. Only the no_alias bitflag allows the list to be faster.
>
>
No. List cost is C1*N, while tree cost is C2*log(N). If N is small and
C2/C1 is sufficiently large, the list wins.
Modern processors will increase C2/C1, since for trees there are data
dependencies which reduce parallelism. For a tree, you need to chase
pointers (and the processor doesn't know which data to fetch until it
loads the pointer), and also the pointer depends on the result of the
comparison, further reducing speculation.
>> btw, on 64-bit userspace we can arrange the entire physical address space
>> in a contiguous region (some of it unmapped) and have just one slot for the
>> guest.
>>
>
> I thought mmio regions would need to be separated for 64bit too? I
> mean what's the point of the memslots in the first place if there's
> only one for the whole physical address space?
>
Right. Scratch that.
>> Okay. It's sad, but I don't see any choice.
>>
>> If anyone from Intel is listening, please give us an accessed bit (and a
>> dirty bit, too).
>>
>
> Seconded.
>
> The other thing we could do would be to mark the spte invalid, and
> return 1, and then at the second ->clear_test_young if the spte is
> still invalid, we return 0. That way we would limit the accessed bit
> refresh to a kvm page fault without tearing down the linux pte (so
> follow_page would be enough then). While if we return 0, if the linux
> pte is already old, the page will be unmapped and go in swapcache and
> follow_page won't be enough and get_user_pages will have to call
> do_swap_page minor fault.
>
> However to do the above, we would need to track with rmap non present
> sptes, and that'd require changes to the kvm rmap logic. so initially
> returning 0 is simpler.
>
This may be a good compromise. We'll have to measure and see.
Perhaps we can use a few bits in the spte to keep a counter of accessed
bit checks, and only return 0 after it has been incremented a few times.
--
Do not meddle in the internals of kernels, for they are subtle and quick to panic.
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [patch] kvm with mmu notifier v18
2008-06-05 16:47 ` Andrea Arcangeli
2008-06-06 8:49 ` Avi Kivity
@ 2008-06-10 20:41 ` Marcelo Tosatti
2008-06-12 1:33 ` Andrea Arcangeli
1 sibling, 1 reply; 10+ messages in thread
From: Marcelo Tosatti @ 2008-06-10 20:41 UTC (permalink / raw)
To: Andrea Arcangeli; +Cc: Avi Kivity, kvm
Hi Andrea,
On Thu, Jun 05, 2008 at 06:47:17PM +0200, Andrea Arcangeli wrote:
> latest version, I removed ->release already before posting the last
> one because by the time vm destroy runs no more guest mode can run, so
> sptes are irrelevant and no cpu can follow the secondary tlb anymore
> because no cpu can be in guest mode for the 'mm', even if sptes are
> actually destroyed later. The previous patch was taking a kvm mutex in
> release under mmu_lock and that's forbidden, so it's simpler to remove
> the release debug knob for now (you suggested to use
> kvm_reload_remote_mmus in the future that shouldn't take sleeping
> locks). The only reason for having a real ->release would be to avoid
> any risk w.r.t. tlb speculative accesses to gart alias with
> different cache protocol (I doubt this is a realistic worry but anyway
> it's not big deal to implement a ->release).
>
> Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>
> +static int mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
> + const u8 *new, int bytes,
> + gfn_t *_gfn, pfn_t *_pfn,
> + int *_mmu_seq, int *_largepage)
> {
> gfn_t gfn;
> int r;
> u64 gpte = 0;
> pfn_t pfn;
> -
> - vcpu->arch.update_pte.largepage = 0;
> + int mmu_seq;
> + int largepage;
>
> if (bytes != 4 && bytes != 8)
> - return;
> + return 0;
>
> /*
> * Assume that the pte write on a page table of the same type
> @@ -1650,7 +1770,7 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
> if ((bytes == 4) && (gpa % 4 == 0)) {
> r = kvm_read_guest(vcpu->kvm, gpa & ~(u64)7, &gpte, 8);
> if (r)
> - return;
> + return 0;
> memcpy((void *)&gpte + (gpa % 8), new, 4);
> } else if ((bytes == 8) && (gpa % 8 == 0)) {
> memcpy((void *)&gpte, new, 8);
> @@ -1660,23 +1780,30 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
> memcpy((void *)&gpte, new, 4);
> }
> if (!is_present_pte(gpte))
> - return;
> + return 0;
> gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
>
> + largepage = 0;
> down_read(&current->mm->mmap_sem);
> if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) {
> gfn &= ~(KVM_PAGES_PER_HPAGE-1);
> - vcpu->arch.update_pte.largepage = 1;
> + largepage = 1;
> }
> + mmu_seq = atomic_read(&vcpu->kvm->arch.mmu_notifier_seq);
> + /* implicit mb(), we'll read before PT lock is unlocked */
> pfn = gfn_to_pfn(vcpu->kvm, gfn);
> up_read(&current->mm->mmap_sem);
>
> - if (is_error_pfn(pfn)) {
> + if (unlikely(is_error_pfn(pfn))) {
> kvm_release_pfn_clean(pfn);
> - return;
> + return 0;
> }
> - vcpu->arch.update_pte.gfn = gfn;
> - vcpu->arch.update_pte.pfn = pfn;
> +
> + *_gfn = gfn;
> + *_pfn = pfn;
> + *_mmu_seq = mmu_seq;
> + *_largepage = largepage;
> + return 1;
> }
>
> static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn)
> @@ -1711,9 +1838,24 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
> int npte;
> int r;
>
> + int update_pte;
> + gfn_t gpte_gfn;
> + pfn_t pfn;
> + int mmu_seq;
> + int largepage;
> +
> pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
> - mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes);
> + update_pte = mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes,
> + &gpte_gfn, &pfn,
> + &mmu_seq, &largepage);
> spin_lock(&vcpu->kvm->mmu_lock);
> + if (update_pte) {
> + BUG_ON(!is_error_pfn(vcpu->arch.update_pte.pfn));
> + vcpu->arch.update_pte.gfn = gpte_gfn;
> + vcpu->arch.update_pte.pfn = pfn;
> + vcpu->arch.update_pte.mmu_seq = mmu_seq;
> + vcpu->arch.update_pte.largepage = largepage;
> + }
I don't get this. mmu_lock protects the shadow page tables, reverse
mappings and associated lists. vcpu->arch.update_pte is a per-vcpu
structure, so it does not need locking by itself.
The memslots are protected by memslots_lock, which is always taken when
mmu_guess_page_from_pte_write() is reached.
mmap_sem protects QEMU's virtual mmaping from changing during find_vma /
get_user_pages.
Can you explain please?
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [patch] kvm with mmu notifier v18
2008-06-10 20:41 ` Marcelo Tosatti
@ 2008-06-12 1:33 ` Andrea Arcangeli
0 siblings, 0 replies; 10+ messages in thread
From: Andrea Arcangeli @ 2008-06-12 1:33 UTC (permalink / raw)
To: Marcelo Tosatti; +Cc: Avi Kivity, kvm
On Tue, Jun 10, 2008 at 05:41:30PM -0300, Marcelo Tosatti wrote:
> Hi Andrea,
>
> On Thu, Jun 05, 2008 at 06:47:17PM +0200, Andrea Arcangeli wrote:
> > latest version, I removed ->release already before posting the last
> > one because by the time vm destroy runs no more guest mode can run, so
> > sptes are irrelevant and no cpu can follow the secondary tlb anymore
> > because no cpu can be in guest mode for the 'mm', even if sptes are
> > actually destroyed later. The previous patch was taking a kvm mutex in
> > release under mmu_lock and that's forbidden, so it's simpler to remove
> > the release debug knob for now (you suggested to use
> > kvm_reload_remote_mmus in the future that shouldn't take sleeping
> > locks). The only reason for having a real ->release would be to avoid
> > any risk w.r.t. tlb speculative accesses to gart alias with
> > different cache protocol (I doubt this is a realistic worry but anyway
> > it's not big deal to implement a ->release).
> >
> > Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>
>
> > +static int mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
> > + const u8 *new, int bytes,
> > + gfn_t *_gfn, pfn_t *_pfn,
> > + int *_mmu_seq, int *_largepage)
> > {
> > gfn_t gfn;
> > int r;
> > u64 gpte = 0;
> > pfn_t pfn;
> > -
> > - vcpu->arch.update_pte.largepage = 0;
> > + int mmu_seq;
> > + int largepage;
> >
> > if (bytes != 4 && bytes != 8)
> > - return;
> > + return 0;
> >
> > /*
> > * Assume that the pte write on a page table of the same type
> > @@ -1650,7 +1770,7 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
> > if ((bytes == 4) && (gpa % 4 == 0)) {
> > r = kvm_read_guest(vcpu->kvm, gpa & ~(u64)7, &gpte, 8);
> > if (r)
> > - return;
> > + return 0;
> > memcpy((void *)&gpte + (gpa % 8), new, 4);
> > } else if ((bytes == 8) && (gpa % 8 == 0)) {
> > memcpy((void *)&gpte, new, 8);
> > @@ -1660,23 +1780,30 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
> > memcpy((void *)&gpte, new, 4);
> > }
> > if (!is_present_pte(gpte))
> > - return;
> > + return 0;
> > gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
> >
> > + largepage = 0;
> > down_read(&current->mm->mmap_sem);
> > if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) {
> > gfn &= ~(KVM_PAGES_PER_HPAGE-1);
> > - vcpu->arch.update_pte.largepage = 1;
> > + largepage = 1;
> > }
> > + mmu_seq = atomic_read(&vcpu->kvm->arch.mmu_notifier_seq);
> > + /* implicit mb(), we'll read before PT lock is unlocked */
> > pfn = gfn_to_pfn(vcpu->kvm, gfn);
> > up_read(&current->mm->mmap_sem);
> >
> > - if (is_error_pfn(pfn)) {
> > + if (unlikely(is_error_pfn(pfn))) {
> > kvm_release_pfn_clean(pfn);
> > - return;
> > + return 0;
> > }
> > - vcpu->arch.update_pte.gfn = gfn;
> > - vcpu->arch.update_pte.pfn = pfn;
> > +
> > + *_gfn = gfn;
> > + *_pfn = pfn;
> > + *_mmu_seq = mmu_seq;
> > + *_largepage = largepage;
> > + return 1;
> > }
> >
> > static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn)
> > @@ -1711,9 +1838,24 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
> > int npte;
> > int r;
> >
> > + int update_pte;
> > + gfn_t gpte_gfn;
> > + pfn_t pfn;
> > + int mmu_seq;
> > + int largepage;
> > +
> > pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
> > - mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes);
> > + update_pte = mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes,
> > + &gpte_gfn, &pfn,
> > + &mmu_seq, &largepage);
> > spin_lock(&vcpu->kvm->mmu_lock);
> > + if (update_pte) {
> > + BUG_ON(!is_error_pfn(vcpu->arch.update_pte.pfn));
> > + vcpu->arch.update_pte.gfn = gpte_gfn;
> > + vcpu->arch.update_pte.pfn = pfn;
> > + vcpu->arch.update_pte.mmu_seq = mmu_seq;
> > + vcpu->arch.update_pte.largepage = largepage;
> > + }
>
> I don't get this. mmu_lock protects the shadow page tables, reverse
> mappings and associated lists. vcpu->arch.update_pte is a per-vcpu
> structure, so it does not need locking by itself.
Ok, I wasn't sure myself if this is needed. The question is if two
physical cpus could ever access vcpu->arch.update_pte structure at the
same time... I guess answer is no in which case I can freely undo the
above.
^ permalink raw reply [flat|nested] 10+ messages in thread
end of thread, other threads:[~2008-06-12 1:34 UTC | newest]
Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-06-05 0:26 [patch] kvm with mmu notifier v18 Andrea Arcangeli
2008-06-05 15:54 ` Avi Kivity
2008-06-05 16:47 ` Andrea Arcangeli
2008-06-06 8:49 ` Avi Kivity
2008-06-06 12:50 ` Andrea Arcangeli
2008-06-06 16:37 ` Avi Kivity
2008-06-06 17:37 ` Andrea Arcangeli
2008-06-06 20:09 ` Avi Kivity
2008-06-10 20:41 ` Marcelo Tosatti
2008-06-12 1:33 ` Andrea Arcangeli
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox