[PATCH 1/6] kvm.git mmu notifier patch

public inbox for kvm@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH 1/6] kvm.git mmu notifier patch
@ 2008-07-25 14:24 Andrea Arcangeli
  2008-07-25 14:26 ` [PATCH 2/6] kvm.git allow reading aliases with mmu_lock Andrea Arcangeli
                   ` (2 more replies)
  0 siblings, 3 replies; 16+ messages in thread
From: Andrea Arcangeli @ 2008-07-25 14:24 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, kvm

Hello,

This is the latest version of the kvm mmu notifier patch.

This allows largepages to be invalidated too, and a separate patch
takes care of alias locking (2/5) and memslot locking (3/5).

While in the long term we should remove the alias to microoptimize the
fast paths and avoid the need of unalias_gfn calls, I decided not to
remove alias immediately: that patch done best would have removed the
alias API as a whole from userland, and done bad it would have kept
the alias API intact while implementing the lowlevel kernel details
with memslots instead of the alias array. Problem is the memslot array
is already fragmented with the private entries, and so adding the
alias entries before or after the private memslot entries resulted in
different issues, often requiring a new loop over the alias range or
the private range depending where I put the alias slots (before or
after the private ones).

So I think to do it right the alias ioctl should be removed as a whole
and that was too large to do right now, especially given we're
concentrating on stability and fixing the alias with the mmu_lock was
a 2 liner change.

Marcelo please double check this code, I didn't effectively test it
myself with the largepages yet but it looks the rmap_next works the
same and the rmap_pde is just a rmapp with the only difference of the
PSE bit being set. I didn't hide the rmap_pde details with gfn_to_rmap
to avoid an entirely superflous gfn_to_memslot.

+                       retval |= handler(kvm,
+                                         &memslot->lpage_info[
+                                                 gfn_offset / KVM_PAGES_PER_HPAGE].rmap_pde);

Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>

diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 8d45fab..ce3251c 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -21,6 +21,7 @@ config KVM
 	tristate "Kernel-based Virtual Machine (KVM) support"
 	depends on HAVE_KVM
 	select PREEMPT_NOTIFIERS
+	select MMU_NOTIFIER
 	select ANON_INODES
 	---help---
 	  Support hosting fully virtualized guest machines using hardware
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 2b60b7d..9bc31fc 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -653,6 +653,84 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
 	account_shadowed(kvm, gfn);
 }
 
+static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
+{
+	u64 *spte;
+	int need_tlb_flush = 0;
+
+	while ((spte = rmap_next(kvm, rmapp, NULL))) {
+		BUG_ON(!(*spte & PT_PRESENT_MASK));
+		rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
+		rmap_remove(kvm, spte);
+		set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+		need_tlb_flush = 1;
+	}
+	return need_tlb_flush;
+}
+
+static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
+			  int (*handler)(struct kvm *kvm, unsigned long *rmapp))
+{
+	int i;
+	int retval = 0;
+
+	/*
+	 * If mmap_sem isn't taken, we can look the memslots with only
+	 * the mmu_lock by skipping over the slots with userspace_addr == 0.
+	 */
+	for (i = 0; i < kvm->nmemslots; i++) {
+		struct kvm_memory_slot *memslot = &kvm->memslots[i];
+		unsigned long start = memslot->userspace_addr;
+		unsigned long end;
+
+		/* mmu_lock protects userspace_addr */
+		if (!start)
+			continue;
+
+		end = start + (memslot->npages << PAGE_SHIFT);
+		if (hva >= start && hva < end) {
+			gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
+			retval |= handler(kvm, &memslot->rmap[gfn_offset]);
+			retval |= handler(kvm,
+					  &memslot->lpage_info[
+						  gfn_offset /
+						  KVM_PAGES_PER_HPAGE].rmap_pde);
+		}
+	}
+
+	return retval;
+}
+
+int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
+{
+	return kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
+}
+
+static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
+{
+	u64 *spte;
+	int young = 0;
+
+	spte = rmap_next(kvm, rmapp, NULL);
+	while (spte) {
+		int _young;
+		u64 _spte = *spte;
+		BUG_ON(!(_spte & PT_PRESENT_MASK));
+		_young = _spte & PT_ACCESSED_MASK;
+		if (_young) {
+			young = 1;
+			clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
+		}
+		spte = rmap_next(kvm, rmapp, spte);
+	}
+	return young;
+}
+
+int kvm_age_hva(struct kvm *kvm, unsigned long hva)
+{
+	return kvm_handle_hva(kvm, hva, kvm_age_rmapp);
+}
+
 #ifdef MMU_DEBUG
 static int is_empty_shadow_page(u64 *spt)
 {
@@ -1205,6 +1283,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 	int r;
 	int largepage = 0;
 	pfn_t pfn;
+	unsigned long mmu_seq;
 
 	down_read(&current->mm->mmap_sem);
 	if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
@@ -1212,6 +1291,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 		largepage = 1;
 	}
 
+	mmu_seq = vcpu->kvm->mmu_notifier_seq;
+	/* implicit mb(), we'll read before PT lock is unlocked */
 	pfn = gfn_to_pfn(vcpu->kvm, gfn);
 	up_read(&current->mm->mmap_sem);
 
@@ -1222,6 +1303,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 	}
 
 	spin_lock(&vcpu->kvm->mmu_lock);
+	if (mmu_notifier_retry(vcpu, mmu_seq))
+		goto out_unlock;
 	kvm_mmu_free_some_pages(vcpu);
 	r = __direct_map(vcpu, v, write, largepage, gfn, pfn,
 			 PT32E_ROOT_LEVEL);
@@ -1229,6 +1312,11 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 
 
 	return r;
+
+out_unlock:
+	spin_unlock(&vcpu->kvm->mmu_lock);
+	kvm_release_pfn_clean(pfn);
+	return 0;
 }
 
 
@@ -1347,6 +1435,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
 	int r;
 	int largepage = 0;
 	gfn_t gfn = gpa >> PAGE_SHIFT;
+	unsigned long mmu_seq;
 
 	ASSERT(vcpu);
 	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
@@ -1360,6 +1449,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
 		gfn &= ~(KVM_PAGES_PER_HPAGE-1);
 		largepage = 1;
 	}
+	mmu_seq = vcpu->kvm->mmu_notifier_seq;
+	/* implicit mb(), we'll read before PT lock is unlocked */
 	pfn = gfn_to_pfn(vcpu->kvm, gfn);
 	up_read(&current->mm->mmap_sem);
 	if (is_error_pfn(pfn)) {
@@ -1367,12 +1458,19 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
 		return 1;
 	}
 	spin_lock(&vcpu->kvm->mmu_lock);
+	if (mmu_notifier_retry(vcpu, mmu_seq))
+		goto out_unlock;
 	kvm_mmu_free_some_pages(vcpu);
 	r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
 			 largepage, gfn, pfn, kvm_x86_ops->get_tdp_level());
 	spin_unlock(&vcpu->kvm->mmu_lock);
 
 	return r;
+
+out_unlock:
+	spin_unlock(&vcpu->kvm->mmu_lock);
+	kvm_release_pfn_clean(pfn);
+	return 0;
 }
 
 static void nonpaging_free(struct kvm_vcpu *vcpu)
@@ -1672,6 +1770,8 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 		gfn &= ~(KVM_PAGES_PER_HPAGE-1);
 		vcpu->arch.update_pte.largepage = 1;
 	}
+	vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq;
+	/* implicit mb(), we'll read before PT lock is unlocked */
 	pfn = gfn_to_pfn(vcpu->kvm, gfn);
 	up_read(&current->mm->mmap_sem);
 
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 4d91822..f72ac1f 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -263,6 +263,8 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
 	pfn = vcpu->arch.update_pte.pfn;
 	if (is_error_pfn(pfn))
 		return;
+	if (mmu_notifier_retry(vcpu, vcpu->arch.update_pte.mmu_seq))
+		return;
 	kvm_get_pfn(pfn);
 	mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
 		     gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte),
@@ -380,6 +382,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	int r;
 	pfn_t pfn;
 	int largepage = 0;
+	unsigned long mmu_seq;
 
 	pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
 	kvm_mmu_audit(vcpu, "pre page fault");
@@ -413,6 +416,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 			largepage = 1;
 		}
 	}
+	mmu_seq = vcpu->kvm->mmu_notifier_seq;
+	/* implicit mb(), we'll read before PT lock is unlocked */
 	pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
 	up_read(&current->mm->mmap_sem);
 
@@ -424,6 +429,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	}
 
 	spin_lock(&vcpu->kvm->mmu_lock);
+	if (mmu_notifier_retry(vcpu, mmu_seq))
+		goto out_unlock;
 	kvm_mmu_free_some_pages(vcpu);
 	shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
 				  largepage, &write_pt, pfn);
@@ -439,6 +446,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	spin_unlock(&vcpu->kvm->mmu_lock);
 
 	return write_pt;
+
+out_unlock:
+	spin_unlock(&vcpu->kvm->mmu_lock);
+	kvm_release_pfn_clean(pfn);
+	return 0;
 }
 
 static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index 0b6b996..d4bbf3b 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -13,6 +13,7 @@
 
 #include <linux/types.h>
 #include <linux/mm.h>
+#include <linux/mmu_notifier.h>
 
 #include <linux/kvm.h>
 #include <linux/kvm_para.h>
@@ -262,6 +263,7 @@ struct kvm_vcpu_arch {
 		gfn_t gfn;	/* presumed gfn during guest pte update */
 		pfn_t pfn;	/* pfn corresponding to that gfn */
 		int largepage;
+		unsigned long mmu_seq;
 	} update_pte;
 
 	struct i387_fxsave_struct host_fx_image;
@@ -727,4 +729,8 @@ asmlinkage void kvm_handle_fault_on_reboot(void);
 	KVM_EX_ENTRY " 666b, 667b \n\t" \
 	".popsection"
 
+#define KVM_ARCH_WANT_MMU_NOTIFIER
+int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
+int kvm_age_hva(struct kvm *kvm, unsigned long hva);
+
 #endif
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 3798097..a18aaad 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -121,6 +121,12 @@ struct kvm {
 	struct kvm_coalesced_mmio_dev *coalesced_mmio_dev;
 	struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
 #endif
+
+#ifdef KVM_ARCH_WANT_MMU_NOTIFIER
+	struct mmu_notifier mmu_notifier;
+	unsigned long mmu_notifier_seq;
+	long mmu_notifier_count;
+#endif
 };
 
 /* The guest did something we don't support. */
@@ -351,4 +357,22 @@ int kvm_trace_ioctl(unsigned int ioctl, unsigned long arg)
 #define kvm_trace_cleanup() ((void)0)
 #endif
 
+#ifdef KVM_ARCH_WANT_MMU_NOTIFIER
+static inline int mmu_notifier_retry(struct kvm_vcpu *vcpu, unsigned long mmu_seq)
+{
+	if (unlikely(vcpu->kvm->mmu_notifier_count))
+		return 1;
+	/*
+	 * Both reads happen under the mmu_lock and both values are
+	 * modified under mmu_lock, so there's no need of smb_rmb()
+	 * here in between, otherwise mmu_notifier_count should be
+	 * read before mmu_notifier_seq, see
+	 * mmu_notifier_invalidate_range_end write side.
+	 */
+	if (vcpu->kvm->mmu_notifier_seq != mmu_seq)
+		return 1;
+	return 0;
+}
+#endif
+
 #endif
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 904d7b7..f1cb63d 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -192,6 +192,123 @@ void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
 
+#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
+{
+	return container_of(mn, struct kvm, mmu_notifier);
+}
+
+static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
+					     struct mm_struct *mm,
+					     unsigned long address)
+{
+	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+	int need_tlb_flush;
+
+	/*
+	 * When ->invalidate_page runs, the linux pte has been zapped
+	 * already but the page is still allocated until
+	 * ->invalidate_page returns. So if we increase the sequence
+	 * here the kvm page fault will notice if the spte can't be
+	 * established because the page is going to be freed. If
+	 * instead the kvm page fault establishes the spte before
+	 * ->invalidate_page runs, kvm_unmap_hva will release it
+	 * before returning.
+	 *
+	 * The sequence increase only need to be seen at spin_unlock
+	 * time, and not at spin_lock time.
+	 *
+	 * Increasing the sequence after the spin_unlock would be
+	 * unsafe because the kvm page fault could then establish the
+	 * pte after kvm_unmap_hva returned, without noticing the page
+	 * is going to be freed.
+	 */
+	spin_lock(&kvm->mmu_lock);
+	kvm->mmu_notifier_seq++;
+	need_tlb_flush = kvm_unmap_hva(kvm, address);
+	spin_unlock(&kvm->mmu_lock);
+
+	/* we've to flush the tlb before the pages can be freed */
+	if (need_tlb_flush)
+		kvm_flush_remote_tlbs(kvm);
+
+}
+
+static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
+						    struct mm_struct *mm,
+						    unsigned long start,
+						    unsigned long end)
+{
+	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+	int need_tlb_flush = 0;
+
+	spin_lock(&kvm->mmu_lock);
+	/*
+	 * The count increase must become visible at unlock time as no
+	 * spte can be established without taking the mmu_lock and
+	 * count is also read inside the mmu_lock critical section.
+	 */
+	kvm->mmu_notifier_count++;
+	for (; start < end; start += PAGE_SIZE)
+		need_tlb_flush |= kvm_unmap_hva(kvm, start);
+	spin_unlock(&kvm->mmu_lock);
+
+	/* we've to flush the tlb before the pages can be freed */
+	if (need_tlb_flush)
+		kvm_flush_remote_tlbs(kvm);
+}
+
+static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
+						  struct mm_struct *mm,
+						  unsigned long start,
+						  unsigned long end)
+{
+	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+
+	spin_lock(&kvm->mmu_lock);
+	/*
+	 * This sequence increase will notify the kvm page fault that
+	 * the page that is going to be mapped in the spte could have
+	 * been freed.
+	 */
+	kvm->mmu_notifier_seq++;
+	/*
+	 * The above sequence increase must be visible before the
+	 * below count decrease but both values are read by the kvm
+	 * page fault under mmu_lock spinlock so we don't need to add
+	 * a smb_wmb() here in between the two.
+	 */
+	kvm->mmu_notifier_count--;
+	spin_unlock(&kvm->mmu_lock);
+
+	BUG_ON(kvm->mmu_notifier_count < 0);
+}
+
+static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
+					      struct mm_struct *mm,
+					      unsigned long address)
+{
+	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+	int young;
+
+	spin_lock(&kvm->mmu_lock);
+	young = kvm_age_hva(kvm, address);
+	spin_unlock(&kvm->mmu_lock);
+
+	if (young)
+		kvm_flush_remote_tlbs(kvm);
+
+	return young;
+}
+
+static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
+	.invalidate_page	= kvm_mmu_notifier_invalidate_page,
+	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
+	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
+	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
+};
+#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
+
 static struct kvm *kvm_create_vm(void)
 {
 	struct kvm *kvm = kvm_arch_create_vm();
@@ -212,6 +329,21 @@ static struct kvm *kvm_create_vm(void)
 			(struct kvm_coalesced_mmio_ring *)page_address(page);
 #endif
 
+#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+	{
+		int err;
+		kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
+		err = mmu_notifier_register(&kvm->mmu_notifier, current->mm);
+		if (err) {
+#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
+			put_page(page);
+#endif
+			kfree(kvm);
+			return ERR_PTR(err);
+		}
+	}
+#endif
+
 	kvm->mm = current->mm;
 	atomic_inc(&kvm->mm->mm_count);
 	spin_lock_init(&kvm->mmu_lock);
@@ -272,6 +404,9 @@ static void kvm_destroy_vm(struct kvm *kvm)
 	if (kvm->coalesced_mmio_ring != NULL)
 		free_page((unsigned long)kvm->coalesced_mmio_ring);
 #endif
+#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+	mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
+#endif
 	kvm_arch_destroy_vm(kvm);
 	mmdrop(mm);
 }

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH 2/6] kvm.git allow reading aliases with mmu_lock
  2008-07-25 14:24 [PATCH 1/6] kvm.git mmu notifier patch Andrea Arcangeli
@ 2008-07-25 14:26 ` Andrea Arcangeli
  2008-07-25 14:32   ` [PATCH 3/6] kvm.git allow browsing memslots " Andrea Arcangeli
  2008-07-28 21:43   ` [PATCH 2/6] kvm.git allow reading aliases " Marcelo Tosatti
  2008-07-28 21:11 ` [PATCH 1/6] kvm.git mmu notifier patch Marcelo Tosatti
  2008-07-29  9:31 ` Avi Kivity
  2 siblings, 2 replies; 16+ messages in thread
From: Andrea Arcangeli @ 2008-07-25 14:26 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, kvm

This allows the mmu notifier code to run unalias_gfn with only the
mmu_lock held. Only alias writes need the mmu_lock held. Readers will
either take the slots_lock in read mode or the mmu_lock.

Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f7726a0..ddaf960 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1515,6 +1515,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
 		goto out;
 
 	down_write(&kvm->slots_lock);
+	spin_lock(&kvm->mmu_lock);
 
 	p = &kvm->arch.aliases[alias->slot];
 	p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
@@ -1526,6 +1527,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
 			break;
 	kvm->arch.naliases = n;
 
+	spin_unlock(&kvm->mmu_lock);
 	kvm_mmu_zap_all(kvm);
 
 	up_write(&kvm->slots_lock);

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH 3/6] kvm.git allow browsing memslots with mmu_lock
  2008-07-25 14:26 ` [PATCH 2/6] kvm.git allow reading aliases with mmu_lock Andrea Arcangeli
@ 2008-07-25 14:32   ` Andrea Arcangeli
  2008-07-25 14:38     ` [PATCH 4/6] kvm-userland.git mmu notifier compat Andrea Arcangeli
  2008-07-28 21:51     ` [PATCH 3/6] kvm.git allow browsing memslots with mmu_lock Marcelo Tosatti
  2008-07-28 21:43   ` [PATCH 2/6] kvm.git allow reading aliases " Marcelo Tosatti
  1 sibling, 2 replies; 16+ messages in thread
From: Andrea Arcangeli @ 2008-07-25 14:32 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, kvm

This allows reading memslots with only the mmu_lock hold for mmu
notifiers that runs in atomic context and with mmu_lock held.

Compared to previous versions I had to move this:

-       if (mem->slot >= kvm->nmemslots)
-               kvm->nmemslots = mem->slot + 1;

after the call to kvm_arch_flush_shadow but that's ok as they're
mutually exclusive: npages being null asks to remove the memslot and
you can't remove a memslot that doesn't exist yet even if there's no
explicit check that forbids it. In any case the ordering didn't seem
meaningful if slots is set higher than nmemslots. But please Marcelo
double check, thanks!

Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f7726a0..cf04489 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3999,16 +3999,23 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
 	 */
 	if (!user_alloc) {
 		if (npages && !old.rmap) {
+			unsigned long userspace_addr;
+
 			down_write(&current->mm->mmap_sem);
-			memslot->userspace_addr = do_mmap(NULL, 0,
-						     npages * PAGE_SIZE,
-						     PROT_READ | PROT_WRITE,
-						     MAP_SHARED | MAP_ANONYMOUS,
-						     0);
+			userspace_addr = do_mmap(NULL, 0,
+						 npages * PAGE_SIZE,
+						 PROT_READ | PROT_WRITE,
+						 MAP_SHARED | MAP_ANONYMOUS,
+						 0);
 			up_write(&current->mm->mmap_sem);
 
-			if (IS_ERR((void *)memslot->userspace_addr))
-				return PTR_ERR((void *)memslot->userspace_addr);
+			if (IS_ERR((void *)userspace_addr))
+				return PTR_ERR((void *)userspace_addr);
+
+			/* set userspace_addr atomically for kvm_hva_to_rmapp */
+			spin_lock(&kvm->mmu_lock);
+			memslot->userspace_addr = userspace_addr;
+			spin_unlock(&kvm->mmu_lock);
 		} else {
 			if (!old.user_alloc && old.rmap) {
 				int ret;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 904d7b7..132a26c 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -375,7 +375,15 @@ int __kvm_set_memory_region(struct kvm *kvm,
 		memset(new.rmap, 0, npages * sizeof(*new.rmap));
 
 		new.user_alloc = user_alloc;
-		new.userspace_addr = mem->userspace_addr;
+		/*
+		 * hva_to_rmmap() serialzies with the mmu_lock and to be
+		 * safe it has to ignore memslots with !user_alloc &&
+		 * !userspace_addr.
+		 */
+		if (user_alloc)
+			new.userspace_addr = mem->userspace_addr;
+		else
+			new.userspace_addr = 0;
 	}
 	if (npages && !new.lpage_info) {
 		int largepages = npages / KVM_PAGES_PER_HPAGE;
@@ -408,17 +416,21 @@ int __kvm_set_memory_region(struct kvm *kvm,
 	}
 #endif /* not defined CONFIG_S390 */
 
-	if (mem->slot >= kvm->nmemslots)
-		kvm->nmemslots = mem->slot + 1;
-
 	if (!npages)
 		kvm_arch_flush_shadow(kvm);
 
+	spin_lock(&kvm->mmu_lock);
+	if (mem->slot >= kvm->nmemslots)
+		kvm->nmemslots = mem->slot + 1;
+
 	*memslot = new;
+	spin_unlock(&kvm->mmu_lock);
 
 	r = kvm_arch_set_memory_region(kvm, mem, old, user_alloc);
 	if (r) {
+		spin_lock(&kvm->mmu_lock);
 		*memslot = old;
+		spin_unlock(&kvm->mmu_lock);
 		goto out_free;
 	}
 

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH 4/6] kvm-userland.git mmu notifier compat
  2008-07-25 14:32   ` [PATCH 3/6] kvm.git allow browsing memslots " Andrea Arcangeli
@ 2008-07-25 14:38     ` Andrea Arcangeli
  2008-07-25 14:40       ` [PATCH 5/6] kvm.git handle reserved pages as mmio Andrea Arcangeli
  2008-07-29 12:49       ` [PATCH 4/6] kvm-userland.git mmu notifier compat Avi Kivity
  2008-07-28 21:51     ` [PATCH 3/6] kvm.git allow browsing memslots with mmu_lock Marcelo Tosatti
  1 sibling, 2 replies; 16+ messages in thread
From: Andrea Arcangeli @ 2008-07-25 14:38 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, kvm

This patch to kvm-userland.git allows to build the tree on older
kernels without mmu notifiers.

Unfortunately emulating mmu notifiers with kprobes was feasible only
for do_wp_page because it has a special property where calling the
spte invalidate before the pte invalidate is ok thanks to the
guarantee of that pte being wprotect, not to tell about the start/end
functions. So old kernels without mmu notifiers will have to keep the
page pinned I'm afraid. So to remove the page pin in future patches in
a backwards compatible way, we'll have to go with #ifdef MMU_NOTIFIER
in kvm.git code I'm afraid and this should be all that's needed on the
kvm-userland side. Adding all page pinning needed by old kernels
through hack-module.awk sounds a bit extreme to me, even if it would
result in cleaner kvm.git code... anyway I think the below patch will
be still needed to build regardless how we decide to handle the future
changes.

Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>

diff --git a/kernel/include-compat/linux/mmu_notifier.h b/kernel/include-compat/linux/mmu_notifier.h
new file mode 100644
index 0000000..a6db4ba
--- /dev/null
+++ b/kernel/include-compat/linux/mmu_notifier.h
@@ -0,0 +1,6 @@
+#ifndef _LINUX_MMU_NOTIFIER_H
+#define _LINUX_MMU_NOTIFIER_H
+
+struct mmu_notifier {};
+
+#endif

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH 5/6] kvm.git handle reserved pages as mmio
  2008-07-25 14:38     ` [PATCH 4/6] kvm-userland.git mmu notifier compat Andrea Arcangeli
@ 2008-07-25 14:40       ` Andrea Arcangeli
  2008-07-25 14:44         ` [PATCH 6/6] kvm.git allow reserved pages to be used as guest phys ram Andrea Arcangeli
  2008-07-29 12:49       ` [PATCH 4/6] kvm-userland.git mmu notifier compat Avi Kivity
  1 sibling, 1 reply; 16+ messages in thread
From: Andrea Arcangeli @ 2008-07-25 14:40 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, kvm

I need the below patch applied to handle reserved pages as mmio
regions, the 1:1 reserved-ram patch generates all reserved pages with
pfn_valid and gfn_to_pfn currently doesn't handle that and bugs out.

From: Ben-Ami Yassour <benami@il.ibm.com>

In some cases it is not enough to identify mmio memory slots by
pfn_valid. This patch adds checking the PageReserved as well.

Signed-off-by: Ben-Ami Yassour <benami@il.ibm.com>
Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com>
---
 virt/kvm/kvm_main.c |   22 +++++++++++++++-------
 1 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index f9427e2..27b2eff 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -76,6 +76,14 @@ static inline int valid_vcpu(int n)
 	return likely(n >= 0 && n < KVM_MAX_VCPUS);
 }
 
+static inline int is_mmio_pfn(pfn_t pfn)
+{
+	if (pfn_valid(pfn))
+		return PageReserved(pfn_to_page(pfn));
+
+	return true;
+}
+
 /*
  * Switches to specified vcpu, until a matching vcpu_put()
  */
@@ -582,7 +590,7 @@ pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
 		}
 
 		pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
-		BUG_ON(pfn_valid(pfn));
+		BUG_ON(!is_mmio_pfn(pfn));
 	} else
 		pfn = page_to_pfn(page[0]);
 
@@ -596,10 +604,10 @@ struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
 	pfn_t pfn;
 
 	pfn = gfn_to_pfn(kvm, gfn);
-	if (pfn_valid(pfn))
+	if (!is_mmio_pfn(pfn))
 		return pfn_to_page(pfn);
 
-	WARN_ON(!pfn_valid(pfn));
+	WARN_ON(is_mmio_pfn(pfn));
 
 	get_page(bad_page);
 	return bad_page;
@@ -615,7 +623,7 @@ EXPORT_SYMBOL_GPL(kvm_release_page_clean);
 
 void kvm_release_pfn_clean(pfn_t pfn)
 {
-	if (pfn_valid(pfn))
+	if (!is_mmio_pfn(pfn))
 		put_page(pfn_to_page(pfn));
 }
 EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
@@ -641,7 +649,7 @@ EXPORT_SYMBOL_GPL(kvm_set_page_dirty);
 
 void kvm_set_pfn_dirty(pfn_t pfn)
 {
-	if (pfn_valid(pfn)) {
+	if (!is_mmio_pfn(pfn)) {
 		struct page *page = pfn_to_page(pfn);
 		if (!PageReserved(page))
 			SetPageDirty(page);
@@ -651,14 +659,14 @@ EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
 
 void kvm_set_pfn_accessed(pfn_t pfn)
 {
-	if (pfn_valid(pfn))
+	if (!is_mmio_pfn(pfn))
 		mark_page_accessed(pfn_to_page(pfn));
 }
 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
 
 void kvm_get_pfn(pfn_t pfn)
 {
-	if (pfn_valid(pfn))
+	if (!is_mmio_pfn(pfn))
 		get_page(pfn_to_page(pfn));
 }
 EXPORT_SYMBOL_GPL(kvm_get_pfn);


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH 6/6] kvm.git allow reserved pages to be used as guest phys ram
  2008-07-25 14:40       ` [PATCH 5/6] kvm.git handle reserved pages as mmio Andrea Arcangeli
@ 2008-07-25 14:44         ` Andrea Arcangeli
  0 siblings, 0 replies; 16+ messages in thread
From: Andrea Arcangeli @ 2008-07-25 14:44 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, kvm

This is an incremental fix on top of Ben-Ami reserved-pages-mmio
patch. I need this for the reserved-ram 1:1 patch because I must allow
to map reserved pages with a valid pfn in the user address space of
the kvm task. This is because those reserved pages aren't mmio regions
for me but the real backing of the guest physical ram.

Reserved pages can be mapped through the ->fault api just fine, as
long as a pfn exists for them, and the reserved-ram patch ensures that
a pfn exists for them and that they're marked reserved as expected by
special ram reserved at boot time.

Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>

 kvm_main.c |    5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

Index: virt/kvm/kvm_main.c
--- virt/kvm/kvm_main.c.orig	2008-06-25 02:39:51.000000000 +0200
+++ a/virt/kvm/kvm_main.c	2008-06-25 02:40:35.000000000 +0200
@@ -604,10 +604,9 @@ struct page *gfn_to_page(struct kvm *kvm
 	pfn_t pfn;

 	pfn = gfn_to_pfn(kvm, gfn);
-	if (!is_mmio_pfn(pfn))
+	if (pfn_valid(pfn))
 		return pfn_to_page(pfn);
-
-	WARN_ON(is_mmio_pfn(pfn));
+	WARN_ON(1);

 	get_page(bad_page);
 	return bad_page;

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 4/6] kvm-userland.git mmu notifier compat
  2008-07-25 14:38     ` [PATCH 4/6] kvm-userland.git mmu notifier compat Andrea Arcangeli
  2008-07-25 14:40       ` [PATCH 5/6] kvm.git handle reserved pages as mmio Andrea Arcangeli
@ 2008-07-29 12:49       ` Avi Kivity
  2008-07-29 13:03         ` Andrea Arcangeli
  1 sibling, 1 reply; 16+ messages in thread
From: Avi Kivity @ 2008-07-29 12:49 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Marcelo Tosatti, kvm

Andrea Arcangeli wrote:
> This patch to kvm-userland.git allows to build the tree on older
> kernels without mmu notifiers.
>
> Unfortunately emulating mmu notifiers with kprobes was feasible only
> for do_wp_page because it has a special property where calling the
> spte invalidate before the pte invalidate is ok thanks to the
> guarantee of that pte being wprotect, not to tell about the start/end
> functions. So old kernels without mmu notifiers will have to keep the
> page pinned I'm afraid. So to remove the page pin in future patches in
> a backwards compatible way, we'll have to go with #ifdef MMU_NOTIFIER
> in kvm.git code I'm afraid and this should be all that's needed on the
> kvm-userland side. Adding all page pinning needed by old kernels
> through hack-module.awk sounds a bit extreme to me, even if it would
> result in cleaner kvm.git code... anyway I think the below patch will
> be still needed to build regardless how we decide to handle the future
> changes.
>
>   

Applied; thanks.  I think page pinning can be worked around by pinning 
all pages on memslot registration (and unpinning on memslot removal); 
this will slow down virtual machine startup, but is at least simple.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 4/6] kvm-userland.git mmu notifier compat
  2008-07-29 12:49       ` [PATCH 4/6] kvm-userland.git mmu notifier compat Avi Kivity
@ 2008-07-29 13:03         ` Andrea Arcangeli
  2008-07-29 13:30           ` Avi Kivity
  0 siblings, 1 reply; 16+ messages in thread
From: Andrea Arcangeli @ 2008-07-29 13:03 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, kvm

On Tue, Jul 29, 2008 at 03:49:08PM +0300, Avi Kivity wrote:
> Applied; thanks.  I think page pinning can be worked around by pinning all 
> pages on memslot registration (and unpinning on memslot removal); this will 
> slow down virtual machine startup, but is at least simple.

This has the benefit that will also fix the tlb issues, but if we go
this way if a sles/rhel user takes the next kvm release, all ram will
be pinned and it won't swap anything for him, like with my patch that
added VM_LOCKED. Said that swap wasn't a reliable feature before so
nobody should have depended on that.

If we really go this memslot-wide-pinning way I could submit to rhel
and sles a mmu notifier backport so the rhel/sles users won't have to
wait to get their kernel upgraded to 2.6.27 to get full kvm-paging.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 4/6] kvm-userland.git mmu notifier compat
  2008-07-29 13:03         ` Andrea Arcangeli
@ 2008-07-29 13:30           ` Avi Kivity
  2008-07-29 13:37             ` Andrea Arcangeli
  0 siblings, 1 reply; 16+ messages in thread
From: Avi Kivity @ 2008-07-29 13:30 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Marcelo Tosatti, kvm

Andrea Arcangeli wrote:
> On Tue, Jul 29, 2008 at 03:49:08PM +0300, Avi Kivity wrote:
>   
>> Applied; thanks.  I think page pinning can be worked around by pinning all 
>> pages on memslot registration (and unpinning on memslot removal); this will 
>> slow down virtual machine startup, but is at least simple.
>>     
>
> This has the benefit that will also fix the tlb issues, but if we go
> this way if a sles/rhel user takes the next kvm release, all ram will
> be pinned and it won't swap anything for him, like with my patch that
> added VM_LOCKED. Said that swap wasn't a reliable feature before so
> nobody should have depended on that.
>   

A 64-bit guest would pin all pages anyway as soon as it touched them the 
first time, through the direct map.

> If we really go this memslot-wide-pinning way I could submit to rhel
> and sles a mmu notifier backport so the rhel/sles users won't have to
> wait to get their kernel upgraded to 2.6.27 to get full kvm-paging.
>   

I really doubt they would take it; Red Hat will want to point people at 
ovirt and Novel at Xen.

It's not a simple patch.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 4/6] kvm-userland.git mmu notifier compat
  2008-07-29 13:30           ` Avi Kivity
@ 2008-07-29 13:37             ` Andrea Arcangeli
  0 siblings, 0 replies; 16+ messages in thread
From: Andrea Arcangeli @ 2008-07-29 13:37 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, kvm

On Tue, Jul 29, 2008 at 04:30:42PM +0300, Avi Kivity wrote:
> A 64-bit guest would pin all pages anyway as soon as it touched them the 
> first time, through the direct map.

Linux indeed, ok!

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 3/6] kvm.git allow browsing memslots with mmu_lock
  2008-07-25 14:32   ` [PATCH 3/6] kvm.git allow browsing memslots " Andrea Arcangeli
  2008-07-25 14:38     ` [PATCH 4/6] kvm-userland.git mmu notifier compat Andrea Arcangeli
@ 2008-07-28 21:51     ` Marcelo Tosatti
  2008-07-28 22:06       ` Andrea Arcangeli
  1 sibling, 1 reply; 16+ messages in thread
From: Marcelo Tosatti @ 2008-07-28 21:51 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Avi Kivity, Marcelo Tosatti, kvm

On Fri, Jul 25, 2008 at 04:32:03PM +0200, Andrea Arcangeli wrote:
> This allows reading memslots with only the mmu_lock hold for mmu
> notifiers that runs in atomic context and with mmu_lock held.
> 
> Compared to previous versions I had to move this:
> 
> -       if (mem->slot >= kvm->nmemslots)
> -               kvm->nmemslots = mem->slot + 1;
> 
> after the call to kvm_arch_flush_shadow but that's ok as they're
> mutually exclusive: npages being null asks to remove the memslot and
> you can't remove a memslot that doesn't exist yet even if there's no
> explicit check that forbids it. In any case the ordering didn't seem
> meaningful if slots is set higher than nmemslots. But please Marcelo
> double check, thanks!
> 
> Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>

Looks good.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 3/6] kvm.git allow browsing memslots with mmu_lock
  2008-07-28 21:51     ` [PATCH 3/6] kvm.git allow browsing memslots with mmu_lock Marcelo Tosatti
@ 2008-07-28 22:06       ` Andrea Arcangeli
  0 siblings, 0 replies; 16+ messages in thread
From: Andrea Arcangeli @ 2008-07-28 22:06 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Avi Kivity, kvm

On Mon, Jul 28, 2008 at 06:51:50PM -0300, Marcelo Tosatti wrote:
> Looks good.

Cool, thanks for the review of the three patches! Now I only hope
Andrew will post his -mm patches to Linus without waiting any further.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/6] kvm.git allow reading aliases with mmu_lock
  2008-07-25 14:26 ` [PATCH 2/6] kvm.git allow reading aliases with mmu_lock Andrea Arcangeli
  2008-07-25 14:32   ` [PATCH 3/6] kvm.git allow browsing memslots " Andrea Arcangeli
@ 2008-07-28 21:43   ` Marcelo Tosatti
  1 sibling, 0 replies; 16+ messages in thread
From: Marcelo Tosatti @ 2008-07-28 21:43 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Avi Kivity, Marcelo Tosatti, kvm

On Fri, Jul 25, 2008 at 04:26:39PM +0200, Andrea Arcangeli wrote:
> This allows the mmu notifier code to run unalias_gfn with only the
> mmu_lock held. Only alias writes need the mmu_lock held. Readers will
> either take the slots_lock in read mode or the mmu_lock.
> 
> Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>
> 
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index f7726a0..ddaf960 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -1515,6 +1515,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
>  		goto out;
>  
>  	down_write(&kvm->slots_lock);
> +	spin_lock(&kvm->mmu_lock);
>  
>  	p = &kvm->arch.aliases[alias->slot];
>  	p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
> @@ -1526,6 +1527,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
>  			break;
>  	kvm->arch.naliases = n;
>  
> +	spin_unlock(&kvm->mmu_lock);
>  	kvm_mmu_zap_all(kvm);
>  
>  	up_write(&kvm->slots_lock);
> --

Looks good.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/6] kvm.git mmu notifier patch
  2008-07-25 14:24 [PATCH 1/6] kvm.git mmu notifier patch Andrea Arcangeli
  2008-07-25 14:26 ` [PATCH 2/6] kvm.git allow reading aliases with mmu_lock Andrea Arcangeli
@ 2008-07-28 21:11 ` Marcelo Tosatti
  2008-07-28 21:15   ` Marcelo Tosatti
  2008-07-29  9:31 ` Avi Kivity
  2 siblings, 1 reply; 16+ messages in thread
From: Marcelo Tosatti @ 2008-07-28 21:11 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Avi Kivity, Marcelo Tosatti, kvm

Hi Andrea,

On Fri, Jul 25, 2008 at 04:24:52PM +0200, Andrea Arcangeli wrote:
> Hello,
> 
> This is the latest version of the kvm mmu notifier patch.
> 
> This allows largepages to be invalidated too, and a separate patch
> takes care of alias locking (2/5) and memslot locking (3/5).
> 
> While in the long term we should remove the alias to microoptimize the
> fast paths and avoid the need of unalias_gfn calls, I decided not to
> remove alias immediately: that patch done best would have removed the
> alias API as a whole from userland, and done bad it would have kept
> the alias API intact while implementing the lowlevel kernel details
> with memslots instead of the alias array. Problem is the memslot array
> is already fragmented with the private entries, and so adding the
> alias entries before or after the private memslot entries resulted in
> different issues, often requiring a new loop over the alias range or
> the private range depending where I put the alias slots (before or
> after the private ones).
> 
> So I think to do it right the alias ioctl should be removed as a whole
> and that was too large to do right now, especially given we're
> concentrating on stability and fixing the alias with the mmu_lock was
> a 2 liner change.
> 
> Marcelo please double check this code, I didn't effectively test it
> myself with the largepages yet but it looks the rmap_next works the
> same and the rmap_pde is just a rmapp with the only difference of the
> PSE bit being set. I didn't hide the rmap_pde details with gfn_to_rmap
> to avoid an entirely superflous gfn_to_memslot.
> 
> +                       retval |= handler(kvm,
> +                                         &memslot->lpage_info[
> +                                                 gfn_offset / KVM_PAGES_PER_HPAGE].rmap_pde);
> 
> Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>
> 
> diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
> index 8d45fab..ce3251c 100644
> --- a/arch/x86/kvm/Kconfig
> +++ b/arch/x86/kvm/Kconfig
> @@ -21,6 +21,7 @@ config KVM
>  	tristate "Kernel-based Virtual Machine (KVM) support"
>  	depends on HAVE_KVM
>  	select PREEMPT_NOTIFIERS
> +	select MMU_NOTIFIER
>  	select ANON_INODES
>  	---help---
>  	  Support hosting fully virtualized guest machines using hardware
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 2b60b7d..9bc31fc 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -653,6 +653,84 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
>  	account_shadowed(kvm, gfn);
>  }
>  
> +static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
> +{
> +	u64 *spte;
> +	int need_tlb_flush = 0;
> +
> +	while ((spte = rmap_next(kvm, rmapp, NULL))) {
> +		BUG_ON(!(*spte & PT_PRESENT_MASK));
> +		rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
> +		rmap_remove(kvm, spte);
> +		set_shadow_pte(spte, shadow_trap_nonpresent_pte);
> +		need_tlb_flush = 1;
> +	}

Is rmap_remove() safe during rmap_next() iteration? Don't you need to
restart the rmap walk from the beginning?

Remember the discussion around commit
6597ca09e6c0e5aec7ffd2b8ab48c671d3c28414.

> +	return need_tlb_flush;
> +}
> +
> +static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
> +			  int (*handler)(struct kvm *kvm, unsigned long *rmapp))
> +{
> +	int i;
> +	int retval = 0;
> +
> +	/*
> +	 * If mmap_sem isn't taken, we can look the memslots with only
> +	 * the mmu_lock by skipping over the slots with userspace_addr == 0.
> +	 */
> +	for (i = 0; i < kvm->nmemslots; i++) {
> +		struct kvm_memory_slot *memslot = &kvm->memslots[i];
> +		unsigned long start = memslot->userspace_addr;
> +		unsigned long end;
> +
> +		/* mmu_lock protects userspace_addr */
> +		if (!start)
> +			continue;
> +
> +		end = start + (memslot->npages << PAGE_SHIFT);
> +		if (hva >= start && hva < end) {
> +			gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
> +			retval |= handler(kvm, &memslot->rmap[gfn_offset]);
> +			retval |= handler(kvm,
> +					  &memslot->lpage_info[
> +						  gfn_offset /
> +						  KVM_PAGES_PER_HPAGE].rmap_pde);

Other than that looks good.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/6] kvm.git mmu notifier patch
  2008-07-28 21:11 ` [PATCH 1/6] kvm.git mmu notifier patch Marcelo Tosatti
@ 2008-07-28 21:15   ` Marcelo Tosatti
  0 siblings, 0 replies; 16+ messages in thread
From: Marcelo Tosatti @ 2008-07-28 21:15 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Avi Kivity, Marcelo Tosatti, kvm

On Mon, Jul 28, 2008 at 06:11:04PM -0300, Marcelo Tosatti wrote:
> > +static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
> > +{
> > +	u64 *spte;
> > +	int need_tlb_flush = 0;
> > +
> > +	while ((spte = rmap_next(kvm, rmapp, NULL))) {
                                            ^^^^^^^^

Yeah, I can't read. :-)


> > +		BUG_ON(!(*spte & PT_PRESENT_MASK));
> > +		rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
> > +		rmap_remove(kvm, spte);
> > +		set_shadow_pte(spte, shadow_trap_nonpresent_pte);
> > +		need_tlb_flush = 1;
> > +	}
> 
> Is rmap_remove() safe during rmap_next() iteration? Don't you need to
> restart the rmap walk from the beginning?
> 
> Remember the discussion around commit
> 6597ca09e6c0e5aec7ffd2b8ab48c671d3c28414.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/6] kvm.git mmu notifier patch
  2008-07-25 14:24 [PATCH 1/6] kvm.git mmu notifier patch Andrea Arcangeli
  2008-07-25 14:26 ` [PATCH 2/6] kvm.git allow reading aliases with mmu_lock Andrea Arcangeli
  2008-07-28 21:11 ` [PATCH 1/6] kvm.git mmu notifier patch Marcelo Tosatti
@ 2008-07-29  9:31 ` Avi Kivity
  2 siblings, 0 replies; 16+ messages in thread
From: Avi Kivity @ 2008-07-29  9:31 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Marcelo Tosatti, kvm

Andrea Arcangeli wrote:
> Hello,
>
> This is the latest version of the kvm mmu notifier patch.
>   

Thanks - applied patches 1-3; I reordered the patches so the mmu 
notifier implementation is the last.

Will look at the rest later.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2008-07-29 13:37 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-07-25 14:24 [PATCH 1/6] kvm.git mmu notifier patch Andrea Arcangeli
2008-07-25 14:26 ` [PATCH 2/6] kvm.git allow reading aliases with mmu_lock Andrea Arcangeli
2008-07-25 14:32   ` [PATCH 3/6] kvm.git allow browsing memslots " Andrea Arcangeli
2008-07-25 14:38     ` [PATCH 4/6] kvm-userland.git mmu notifier compat Andrea Arcangeli
2008-07-25 14:40       ` [PATCH 5/6] kvm.git handle reserved pages as mmio Andrea Arcangeli
2008-07-25 14:44         ` [PATCH 6/6] kvm.git allow reserved pages to be used as guest phys ram Andrea Arcangeli
2008-07-29 12:49       ` [PATCH 4/6] kvm-userland.git mmu notifier compat Avi Kivity
2008-07-29 13:03         ` Andrea Arcangeli
2008-07-29 13:30           ` Avi Kivity
2008-07-29 13:37             ` Andrea Arcangeli
2008-07-28 21:51     ` [PATCH 3/6] kvm.git allow browsing memslots with mmu_lock Marcelo Tosatti
2008-07-28 22:06       ` Andrea Arcangeli
2008-07-28 21:43   ` [PATCH 2/6] kvm.git allow reading aliases " Marcelo Tosatti
2008-07-28 21:11 ` [PATCH 1/6] kvm.git mmu notifier patch Marcelo Tosatti
2008-07-28 21:15   ` Marcelo Tosatti
2008-07-29  9:31 ` Avi Kivity

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox