public inbox for kvm@vger.kernel.org
 help / color / mirror / Atom feed
* [patch 0/5] KVM shadow scalability enhancements
@ 2007-12-21  0:18 Marcelo Tosatti
  2007-12-21  0:18 ` [patch 1/5] KVM: concurrent guest walkers Marcelo Tosatti
                   ` (5 more replies)
  0 siblings, 6 replies; 26+ messages in thread
From: Marcelo Tosatti @ 2007-12-21  0:18 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel, mtosatti-H+wXaHxf7aLQT0dZR+AlfA

The following patchset increases KVM shadow scalability by allowing concurrent
guest walking, allocation and instruction emulation.

-- 


-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2005.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/

^ permalink raw reply	[flat|nested] 26+ messages in thread

* [patch 1/5] KVM: concurrent guest walkers
  2007-12-21  0:18 [patch 0/5] KVM shadow scalability enhancements Marcelo Tosatti
@ 2007-12-21  0:18 ` Marcelo Tosatti
  2007-12-21  0:18 ` [patch 2/5] KVM: add kvm_read_guest_atomic() Marcelo Tosatti
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 26+ messages in thread
From: Marcelo Tosatti @ 2007-12-21  0:18 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel, mtosatti-H+wXaHxf7aLQT0dZR+AlfA

[-- Attachment #1: use-mmap-sem --]
[-- Type: text/plain, Size: 17795 bytes --]

Do not hold the kvm->lock mutex across the entire pagefault code;
only acquire it in places where it is necessary, such as the mmu
hash list, active list, rmap and parent pte handling.

Allow concurrent guest walkers by switching walk_addr() to use
mmap_sem in read-mode.

And get rid of the lockless __gfn_to_page.

Signed-off-by: Marcelo Tosatti <mtosatti-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>

Index: kvm.quilt/arch/x86/kvm/mmu.c
===================================================================
--- kvm.quilt.orig/arch/x86/kvm/mmu.c
+++ kvm.quilt/arch/x86/kvm/mmu.c
@@ -1026,6 +1026,7 @@ static void mmu_free_roots(struct kvm_vc
 
 	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
 		return;
+	mutex_lock(&vcpu->kvm->lock);
 #ifdef CONFIG_X86_64
 	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
 		hpa_t root = vcpu->arch.mmu.root_hpa;
@@ -1033,6 +1034,7 @@ static void mmu_free_roots(struct kvm_vc
 		sp = page_header(root);
 		--sp->root_count;
 		vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+		mutex_unlock(&vcpu->kvm->lock);
 		return;
 	}
 #endif
@@ -1046,6 +1048,7 @@ static void mmu_free_roots(struct kvm_vc
 		}
 		vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
 	}
+	mutex_unlock(&vcpu->kvm->lock);
 	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
 }
 
@@ -1245,15 +1248,15 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
 {
 	int r;
 
-	mutex_lock(&vcpu->kvm->lock);
 	r = mmu_topup_memory_caches(vcpu);
 	if (r)
 		goto out;
+	mutex_lock(&vcpu->kvm->lock);
 	mmu_alloc_roots(vcpu);
+	mutex_unlock(&vcpu->kvm->lock);
 	kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
 	kvm_mmu_flush_tlb(vcpu);
 out:
-	mutex_unlock(&vcpu->kvm->lock);
 	return r;
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_load);
@@ -1420,13 +1423,22 @@ void kvm_mmu_pte_write(struct kvm_vcpu *
 
 int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
 {
-	gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
+	gpa_t gpa;
+	int r;
+
+	down_read(&current->mm->mmap_sem);
+	gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
+	up_read(&current->mm->mmap_sem);
 
-	return kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+	mutex_lock(&vcpu->kvm->lock);
+	r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+	mutex_unlock(&vcpu->kvm->lock);
+	return r;
 }
 
 void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
 {
+	mutex_lock(&vcpu->kvm->lock);
 	while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) {
 		struct kvm_mmu_page *sp;
 
@@ -1435,6 +1447,7 @@ void __kvm_mmu_free_some_pages(struct kv
 		kvm_mmu_zap_page(vcpu->kvm, sp);
 		++vcpu->kvm->stat.mmu_recycled;
 	}
+	mutex_unlock(&vcpu->kvm->lock);
 }
 
 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
@@ -1442,7 +1455,6 @@ int kvm_mmu_page_fault(struct kvm_vcpu *
 	int r;
 	enum emulation_result er;
 
-	mutex_lock(&vcpu->kvm->lock);
 	r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code);
 	if (r < 0)
 		goto out;
@@ -1457,7 +1469,6 @@ int kvm_mmu_page_fault(struct kvm_vcpu *
 		goto out;
 
 	er = emulate_instruction(vcpu, vcpu->run, cr2, error_code, 0);
-	mutex_unlock(&vcpu->kvm->lock);
 
 	switch (er) {
 	case EMULATE_DONE:
@@ -1472,7 +1483,6 @@ int kvm_mmu_page_fault(struct kvm_vcpu *
 		BUG();
 	}
 out:
-	mutex_unlock(&vcpu->kvm->lock);
 	return r;
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
@@ -1569,8 +1579,10 @@ void kvm_mmu_zap_all(struct kvm *kvm)
 {
 	struct kvm_mmu_page *sp, *node;
 
+	mutex_lock(&kvm->lock);
 	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
 		kvm_mmu_zap_page(kvm, sp);
+	mutex_unlock(&kvm->lock);
 
 	kvm_flush_remote_tlbs(kvm);
 }
Index: kvm.quilt/arch/x86/kvm/paging_tmpl.h
===================================================================
--- kvm.quilt.orig/arch/x86/kvm/paging_tmpl.h
+++ kvm.quilt/arch/x86/kvm/paging_tmpl.h
@@ -212,7 +212,9 @@ walk:
 		if (ret)
 			goto walk;
 		pte |= PT_DIRTY_MASK;
+		mutex_lock(&vcpu->kvm->lock);
 		kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte));
+		mutex_unlock(&vcpu->kvm->lock);
 		walker->ptes[walker->level - 1] = pte;
 	}
 
@@ -368,11 +370,13 @@ static int FNAME(page_fault)(struct kvm_
 	if (r)
 		return r;
 
+	down_read(&current->mm->mmap_sem);
 	/*
 	 * Look up the shadow pte for the faulting address.
 	 */
 	r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault,
 			     fetch_fault);
+	up_read(&current->mm->mmap_sem);
 
 	/*
 	 * The page is not mapped by the guest.  Let the guest handle it.
@@ -384,6 +388,7 @@ static int FNAME(page_fault)(struct kvm_
 		return 0;
 	}
 
+	mutex_lock(&vcpu->kvm->lock);
 	shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
 				  &write_pt);
 	pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__,
@@ -395,11 +400,14 @@ static int FNAME(page_fault)(struct kvm_
 	/*
 	 * mmio: emulate if accessible, otherwise its a guest fault.
 	 */
-	if (shadow_pte && is_io_pte(*shadow_pte))
+	if (shadow_pte && is_io_pte(*shadow_pte)) {
+		mutex_unlock(&vcpu->kvm->lock);
 		return 1;
+	}
 
 	++vcpu->stat.pf_fixed;
 	kvm_mmu_audit(vcpu, "post page fault (fixed)");
+	mutex_unlock(&vcpu->kvm->lock);
 
 	return write_pt;
 }
Index: kvm.quilt/arch/x86/kvm/vmx.c
===================================================================
--- kvm.quilt.orig/arch/x86/kvm/vmx.c
+++ kvm.quilt/arch/x86/kvm/vmx.c
@@ -1431,27 +1431,34 @@ static int init_rmode_tss(struct kvm *kv
 {
 	gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
 	u16 data = 0;
+	int ret = 0;
 	int r;
 
+	down_read(&current->mm->mmap_sem);
 	r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
 	if (r < 0)
-		return 0;
+		goto out;
 	data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
 	r = kvm_write_guest_page(kvm, fn++, &data, 0x66, sizeof(u16));
 	if (r < 0)
-		return 0;
+		goto out;
 	r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
 	if (r < 0)
-		return 0;
+		goto out;
 	r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
 	if (r < 0)
-		return 0;
+		goto out;
 	data = ~0;
-	r = kvm_write_guest_page(kvm, fn, &data, RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
-			sizeof(u8));
+	r = kvm_write_guest_page(kvm, fn, &data,
+				 RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
+				 sizeof(u8));
 	if (r < 0)
-		return 0;
-	return 1;
+		goto out;
+
+	ret = 1;
+out:
+	up_read(&current->mm->mmap_sem);
+	return ret;
 }
 
 static void seg_setup(int seg)
@@ -1470,6 +1477,7 @@ static int alloc_apic_access_page(struct
 	int r = 0;
 
 	mutex_lock(&kvm->lock);
+	down_write(&current->mm->mmap_sem);
 	if (kvm->arch.apic_access_page)
 		goto out;
 	kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
@@ -1481,6 +1489,7 @@ static int alloc_apic_access_page(struct
 		goto out;
 	kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
 out:
+	up_write(&current->mm->mmap_sem);
 	mutex_unlock(&kvm->lock);
 	return r;
 }
Index: kvm.quilt/arch/x86/kvm/x86.c
===================================================================
--- kvm.quilt.orig/arch/x86/kvm/x86.c
+++ kvm.quilt/arch/x86/kvm/x86.c
@@ -180,7 +180,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, u
 	int ret;
 	u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
 
-	mutex_lock(&vcpu->kvm->lock);
+	down_read(&current->mm->mmap_sem);
 	ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
 				  offset * sizeof(u64), sizeof(pdpte));
 	if (ret < 0) {
@@ -197,7 +197,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, u
 
 	memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
 out:
-	mutex_unlock(&vcpu->kvm->lock);
+	up_read(&current->mm->mmap_sem);
 
 	return ret;
 }
@@ -211,13 +211,13 @@ static bool pdptrs_changed(struct kvm_vc
 	if (is_long_mode(vcpu) || !is_pae(vcpu))
 		return false;
 
-	mutex_lock(&vcpu->kvm->lock);
+	down_read(&current->mm->mmap_sem);
 	r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
 	if (r < 0)
 		goto out;
 	changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
 out:
-	mutex_unlock(&vcpu->kvm->lock);
+	up_read(&current->mm->mmap_sem);
 
 	return changed;
 }
@@ -277,9 +277,7 @@ void set_cr0(struct kvm_vcpu *vcpu, unsi
 	kvm_x86_ops->set_cr0(vcpu, cr0);
 	vcpu->arch.cr0 = cr0;
 
-	mutex_lock(&vcpu->kvm->lock);
 	kvm_mmu_reset_context(vcpu);
-	mutex_unlock(&vcpu->kvm->lock);
 	return;
 }
 EXPORT_SYMBOL_GPL(set_cr0);
@@ -319,9 +317,7 @@ void set_cr4(struct kvm_vcpu *vcpu, unsi
 	}
 	kvm_x86_ops->set_cr4(vcpu, cr4);
 	vcpu->arch.cr4 = cr4;
-	mutex_lock(&vcpu->kvm->lock);
 	kvm_mmu_reset_context(vcpu);
-	mutex_unlock(&vcpu->kvm->lock);
 }
 EXPORT_SYMBOL_GPL(set_cr4);
 
@@ -359,7 +355,7 @@ void set_cr3(struct kvm_vcpu *vcpu, unsi
 		 */
 	}
 
-	mutex_lock(&vcpu->kvm->lock);
+	down_read(&current->mm->mmap_sem);
 	/*
 	 * Does the new cr3 value map to physical memory? (Note, we
 	 * catch an invalid cr3 even in real-mode, because it would
@@ -375,7 +371,7 @@ void set_cr3(struct kvm_vcpu *vcpu, unsi
 		vcpu->arch.cr3 = cr3;
 		vcpu->arch.mmu.new_cr3(vcpu);
 	}
-	mutex_unlock(&vcpu->kvm->lock);
+	up_read(&current->mm->mmap_sem);
 }
 EXPORT_SYMBOL_GPL(set_cr3);
 
@@ -1170,12 +1166,12 @@ static int kvm_vm_ioctl_set_nr_mmu_pages
 	if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
 		return -EINVAL;
 
-	mutex_lock(&kvm->lock);
+	down_write(&current->mm->mmap_sem);
 
 	kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
 	kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
 
-	mutex_unlock(&kvm->lock);
+	up_write(&current->mm->mmap_sem);
 	return 0;
 }
 
@@ -1224,7 +1220,7 @@ static int kvm_vm_ioctl_set_memory_alias
 	    < alias->target_phys_addr)
 		goto out;
 
-	mutex_lock(&kvm->lock);
+	down_write(&current->mm->mmap_sem);
 
 	p = &kvm->arch.aliases[alias->slot];
 	p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
@@ -1238,7 +1234,7 @@ static int kvm_vm_ioctl_set_memory_alias
 
 	kvm_mmu_zap_all(kvm);
 
-	mutex_unlock(&kvm->lock);
+	up_write(&current->mm->mmap_sem);
 
 	return 0;
 
@@ -1314,7 +1310,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kv
 	struct kvm_memory_slot *memslot;
 	int is_dirty = 0;
 
-	mutex_lock(&kvm->lock);
+	down_write(&current->mm->mmap_sem);
 
 	r = kvm_get_dirty_log(kvm, log, &is_dirty);
 	if (r)
@@ -1330,7 +1326,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kv
 	}
 	r = 0;
 out:
-	mutex_unlock(&kvm->lock);
+	up_write(&current->mm->mmap_sem);
 	return r;
 }
 
@@ -1524,25 +1520,32 @@ int emulator_read_std(unsigned long addr
 			     struct kvm_vcpu *vcpu)
 {
 	void *data = val;
+	int r = X86EMUL_CONTINUE;
 
+	down_read(&current->mm->mmap_sem);
 	while (bytes) {
 		gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
 		unsigned offset = addr & (PAGE_SIZE-1);
 		unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
 		int ret;
 
-		if (gpa == UNMAPPED_GVA)
-			return X86EMUL_PROPAGATE_FAULT;
+		if (gpa == UNMAPPED_GVA) {
+			r = X86EMUL_PROPAGATE_FAULT;
+			goto out;
+		}
 		ret = kvm_read_guest(vcpu->kvm, gpa, data, tocopy);
-		if (ret < 0)
-			return X86EMUL_UNHANDLEABLE;
+		if (ret < 0) {
+			r = X86EMUL_UNHANDLEABLE;
+			goto out;
+		}
 
 		bytes -= tocopy;
 		data += tocopy;
 		addr += tocopy;
 	}
-
-	return X86EMUL_CONTINUE;
+out:
+	up_read(&current->mm->mmap_sem);
+	return r;
 }
 EXPORT_SYMBOL_GPL(emulator_read_std);
 
@@ -1560,7 +1563,9 @@ static int emulator_read_emulated(unsign
 		return X86EMUL_CONTINUE;
 	}
 
+	down_read(&current->mm->mmap_sem);
 	gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+	up_read(&current->mm->mmap_sem);
 
 	/* For APIC access vmexit */
 	if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
@@ -1576,11 +1581,14 @@ mmio:
 	/*
 	 * Is this MMIO handled locally?
 	 */
+	mutex_lock(&vcpu->kvm->lock);
 	mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
 	if (mmio_dev) {
 		kvm_iodevice_read(mmio_dev, gpa, bytes, val);
+		mutex_unlock(&vcpu->kvm->lock);
 		return X86EMUL_CONTINUE;
 	}
+	mutex_unlock(&vcpu->kvm->lock);
 
 	vcpu->mmio_needed = 1;
 	vcpu->mmio_phys_addr = gpa;
@@ -1595,10 +1603,16 @@ static int emulator_write_phys(struct kv
 {
 	int ret;
 
+	down_read(&current->mm->mmap_sem);
 	ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
-	if (ret < 0)
+	if (ret < 0) {
+		up_read(&current->mm->mmap_sem);
 		return 0;
+	}
+	mutex_lock(&vcpu->kvm->lock);
 	kvm_mmu_pte_write(vcpu, gpa, val, bytes);
+	mutex_unlock(&vcpu->kvm->lock);
+	up_read(&current->mm->mmap_sem);
 	return 1;
 }
 
@@ -1608,7 +1622,11 @@ static int emulator_write_emulated_onepa
 					   struct kvm_vcpu *vcpu)
 {
 	struct kvm_io_device *mmio_dev;
-	gpa_t                 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+	gpa_t                 gpa;
+
+	down_read(&current->mm->mmap_sem);
+	gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+	up_read(&current->mm->mmap_sem);
 
 	if (gpa == UNMAPPED_GVA) {
 		kvm_inject_page_fault(vcpu, addr, 2);
@@ -1626,11 +1644,14 @@ mmio:
 	/*
 	 * Is this MMIO handled locally?
 	 */
+	mutex_lock(&vcpu->kvm->lock);
 	mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
 	if (mmio_dev) {
 		kvm_iodevice_write(mmio_dev, gpa, bytes, val);
+		mutex_unlock(&vcpu->kvm->lock);
 		return X86EMUL_CONTINUE;
 	}
+	mutex_unlock(&vcpu->kvm->lock);
 
 	vcpu->mmio_needed = 1;
 	vcpu->mmio_phys_addr = gpa;
@@ -1677,11 +1698,14 @@ static int emulator_cmpxchg_emulated(uns
 #ifndef CONFIG_X86_64
 	/* guests cmpxchg8b have to be emulated atomically */
 	if (bytes == 8) {
-		gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+		gpa_t gpa;
 		struct page *page;
 		char *addr;
 		u64 *val;
 
+		down_read(&current->mm->mmap_sem);
+		gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+
 		if (gpa == UNMAPPED_GVA ||
 		   (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
 			goto emul_write;
@@ -1697,6 +1721,7 @@ static int emulator_cmpxchg_emulated(uns
 		kvm_release_page_dirty(page);
 	}
 emul_write:
+	up_read(&current->mm->mmap_sem);
 #endif
 
 	return emulator_write_emulated(addr, new, bytes, vcpu);
@@ -2077,10 +2102,10 @@ int kvm_emulate_pio_string(struct kvm_vc
 		kvm_x86_ops->skip_emulated_instruction(vcpu);
 
 	for (i = 0; i < nr_pages; ++i) {
-		mutex_lock(&vcpu->kvm->lock);
+		down_read(&current->mm->mmap_sem);
 		page = gva_to_page(vcpu, address + i * PAGE_SIZE);
 		vcpu->arch.pio.guest_pages[i] = page;
-		mutex_unlock(&vcpu->kvm->lock);
+		up_read(&current->mm->mmap_sem);
 		if (!page) {
 			kvm_inject_gp(vcpu, 0);
 			free_pio_guest_pages(vcpu);
@@ -2203,7 +2228,6 @@ int kvm_fix_hypercall(struct kvm_vcpu *v
 	char instruction[3];
 	int ret = 0;
 
-	mutex_lock(&vcpu->kvm->lock);
 
 	/*
 	 * Blow out the MMU to ensure that no other VCPU has an active mapping
@@ -2218,8 +2242,6 @@ int kvm_fix_hypercall(struct kvm_vcpu *v
 	    != X86EMUL_CONTINUE)
 		ret = -EFAULT;
 
-	mutex_unlock(&vcpu->kvm->lock);
-
 	return ret;
 }
 
@@ -2827,13 +2849,13 @@ int kvm_arch_vcpu_ioctl_translate(struct
 	gpa_t gpa;
 
 	vcpu_load(vcpu);
-	mutex_lock(&vcpu->kvm->lock);
+	down_read(&current->mm->mmap_sem);
 	gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr);
+	up_read(&current->mm->mmap_sem);
 	tr->physical_address = gpa;
 	tr->valid = gpa != UNMAPPED_GVA;
 	tr->writeable = 1;
 	tr->usermode = 0;
-	mutex_unlock(&vcpu->kvm->lock);
 	vcpu_put(vcpu);
 
 	return 0;
@@ -3102,13 +3124,11 @@ int kvm_arch_set_memory_region(struct kv
 	 */
 	if (!user_alloc) {
 		if (npages && !old.rmap) {
-			down_write(&current->mm->mmap_sem);
 			memslot->userspace_addr = do_mmap(NULL, 0,
 						     npages * PAGE_SIZE,
 						     PROT_READ | PROT_WRITE,
 						     MAP_SHARED | MAP_ANONYMOUS,
 						     0);
-			up_write(&current->mm->mmap_sem);
 
 			if (IS_ERR((void *)memslot->userspace_addr))
 				return PTR_ERR((void *)memslot->userspace_addr);
@@ -3116,10 +3136,8 @@ int kvm_arch_set_memory_region(struct kv
 			if (!old.user_alloc && old.rmap) {
 				int ret;
 
-				down_write(&current->mm->mmap_sem);
 				ret = do_munmap(current->mm, old.userspace_addr,
 						old.npages * PAGE_SIZE);
-				up_write(&current->mm->mmap_sem);
 				if (ret < 0)
 					printk(KERN_WARNING
 				       "kvm_vm_ioctl_set_memory_region: "
Index: kvm.quilt/virt/kvm/kvm_main.c
===================================================================
--- kvm.quilt.orig/virt/kvm/kvm_main.c
+++ kvm.quilt/virt/kvm/kvm_main.c
@@ -227,7 +227,7 @@ static int kvm_vm_release(struct inode *
  *
  * Discontiguous memory is allowed, mostly for framebuffers.
  *
- * Must be called holding kvm->lock.
+ * Must be called holding mmap_sem for write.
  */
 int __kvm_set_memory_region(struct kvm *kvm,
 			    struct kvm_userspace_memory_region *mem,
@@ -338,9 +338,9 @@ int kvm_set_memory_region(struct kvm *kv
 {
 	int r;
 
-	mutex_lock(&kvm->lock);
+	down_write(&current->mm->mmap_sem);
 	r = __kvm_set_memory_region(kvm, mem, user_alloc);
-	mutex_unlock(&kvm->lock);
+	up_write(&current->mm->mmap_sem);
 	return r;
 }
 EXPORT_SYMBOL_GPL(kvm_set_memory_region);
@@ -456,7 +456,7 @@ static unsigned long gfn_to_hva(struct k
 /*
  * Requires current->mm->mmap_sem to be held
  */
-static struct page *__gfn_to_page(struct kvm *kvm, gfn_t gfn)
+struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
 {
 	struct page *page[1];
 	unsigned long addr;
@@ -481,17 +481,6 @@ static struct page *__gfn_to_page(struct
 	return page[0];
 }
 
-struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
-{
-	struct page *page;
-
-	down_read(&current->mm->mmap_sem);
-	page = __gfn_to_page(kvm, gfn);
-	up_read(&current->mm->mmap_sem);
-
-	return page;
-}
-
 EXPORT_SYMBOL_GPL(gfn_to_page);
 
 void kvm_release_page_clean(struct page *page)
@@ -977,8 +966,7 @@ static int kvm_vm_fault(struct vm_area_s
 
 	if (!kvm_is_visible_gfn(kvm, vmf->pgoff))
 		return VM_FAULT_SIGBUS;
-	/* current->mm->mmap_sem is already held so call lockless version */
-	page = __gfn_to_page(kvm, vmf->pgoff);
+	page = gfn_to_page(kvm, vmf->pgoff);
 	if (is_error_page(page)) {
 		kvm_release_page_clean(page);
 		return VM_FAULT_SIGBUS;

-- 


-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2005.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/

^ permalink raw reply	[flat|nested] 26+ messages in thread

* [patch 2/5] KVM: add kvm_read_guest_atomic()
  2007-12-21  0:18 [patch 0/5] KVM shadow scalability enhancements Marcelo Tosatti
  2007-12-21  0:18 ` [patch 1/5] KVM: concurrent guest walkers Marcelo Tosatti
@ 2007-12-21  0:18 ` Marcelo Tosatti
  2007-12-21  0:18 ` [patch 3/5] KVM: add kvm_follow_page() Marcelo Tosatti
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 26+ messages in thread
From: Marcelo Tosatti @ 2007-12-21  0:18 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel, mtosatti-H+wXaHxf7aLQT0dZR+AlfA

[-- Attachment #1: mmu-atomicity --]
[-- Type: text/plain, Size: 3849 bytes --]

In preparation for a mmu spinlock, add kvm_read_guest_atomic() 
and use it in fetch() and prefetch_page().

Signed-off-by: Marcelo Tosatti <mtosatti-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>

Index: kvm.quilt/arch/x86/kvm/paging_tmpl.h
===================================================================
--- kvm.quilt.orig/arch/x86/kvm/paging_tmpl.h
+++ kvm.quilt/arch/x86/kvm/paging_tmpl.h
@@ -318,10 +318,12 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
 					       metaphysical, access,
 					       shadow_ent, &new_page);
 		if (new_page && !metaphysical) {
+			int r;
 			pt_element_t curr_pte;
-			kvm_read_guest(vcpu->kvm, walker->pte_gpa[level - 2],
-				       &curr_pte, sizeof(curr_pte));
-			if (curr_pte != walker->ptes[level - 2])
+			r = kvm_read_guest_atomic(vcpu->kvm,
+						  walker->pte_gpa[level - 2],
+				       		  &curr_pte, sizeof(curr_pte));
+			if (r || curr_pte != walker->ptes[level - 2])
 				return NULL;
 		}
 		shadow_addr = __pa(shadow_page->spt);
@@ -431,9 +433,8 @@ static gpa_t FNAME(gva_to_gpa)(struct kv
 static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
 				 struct kvm_mmu_page *sp)
 {
-	int i, offset = 0;
-	pt_element_t *gpt;
-	struct page *page;
+	int i, offset = 0, r = 0;
+	pt_element_t pt;
 
 	if (sp->role.metaphysical
 	    || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) {
@@ -443,15 +444,18 @@ static void FNAME(prefetch_page)(struct 
 
 	if (PTTYPE == 32)
 		offset = sp->role.quadrant << PT64_LEVEL_BITS;
-	page = gfn_to_page(vcpu->kvm, sp->gfn);
-	gpt = kmap_atomic(page, KM_USER0);
-	for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
-		if (is_present_pte(gpt[offset + i]))
+
+	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+		gpa_t pte_gpa = gfn_to_gpa(sp->gfn);
+		pte_gpa += (i+offset) * sizeof(pt_element_t);
+
+		r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &pt,
+					  sizeof(pt_element_t));
+		if (r || is_present_pte(pt))
 			sp->spt[i] = shadow_trap_nonpresent_pte;
 		else
 			sp->spt[i] = shadow_notrap_nonpresent_pte;
-	kunmap_atomic(gpt, KM_USER0);
-	kvm_release_page_clean(page);
+	}
 }
 
 #undef pt_element_t
Index: kvm.quilt/include/linux/kvm_host.h
===================================================================
--- kvm.quilt.orig/include/linux/kvm_host.h
+++ kvm.quilt/include/linux/kvm_host.h
@@ -167,6 +167,8 @@ void kvm_release_page_clean(struct page 
 void kvm_release_page_dirty(struct page *page);
 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
 			int len);
+int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
+			  unsigned long len);
 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len);
 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
 			 int offset, int len);
Index: kvm.quilt/virt/kvm/kvm_main.c
===================================================================
--- kvm.quilt.orig/virt/kvm/kvm_main.c
+++ kvm.quilt/virt/kvm/kvm_main.c
@@ -541,6 +541,26 @@ int kvm_read_guest(struct kvm *kvm, gpa_
 }
 EXPORT_SYMBOL_GPL(kvm_read_guest);
 
+int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
+			  unsigned long len)
+{
+	int r;
+	unsigned long addr;
+	gfn_t gfn = gpa >> PAGE_SHIFT;
+	int offset = offset_in_page(gpa);
+
+	addr = gfn_to_hva(kvm, gfn);
+	if (kvm_is_error_hva(addr))
+		return -EFAULT;
+	pagefault_disable();
+	r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
+	pagefault_enable();
+	if (r)
+		return -EFAULT;
+	return 0;
+}
+EXPORT_SYMBOL(kvm_read_guest_atomic);
+
 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
 			 int offset, int len)
 {

-- 


-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2005.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/

^ permalink raw reply	[flat|nested] 26+ messages in thread

* [patch 3/5] KVM: add kvm_follow_page()
  2007-12-21  0:18 [patch 0/5] KVM shadow scalability enhancements Marcelo Tosatti
  2007-12-21  0:18 ` [patch 1/5] KVM: concurrent guest walkers Marcelo Tosatti
  2007-12-21  0:18 ` [patch 2/5] KVM: add kvm_read_guest_atomic() Marcelo Tosatti
@ 2007-12-21  0:18 ` Marcelo Tosatti
       [not found]   ` <20071221002024.345682789-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
  2007-12-21  0:18 ` [patch 4/5] KVM: export follow_page() Marcelo Tosatti
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 26+ messages in thread
From: Marcelo Tosatti @ 2007-12-21  0:18 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel, mtosatti-H+wXaHxf7aLQT0dZR+AlfA

[-- Attachment #1: mmu-setpte-atomic --]
[-- Type: text/plain, Size: 4557 bytes --]

In preparation for a mmu spinlock, avoid schedules in mmu_set_spte() 
by using follow_page() instead of get_user_pages().

The fault handling work done by get_user_pages() now happens outside the lock.

Signed-off-by: Marcelo Tosatti <mtosatti-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>

Index: kvm.quilt/arch/x86/kvm/mmu.c
===================================================================
--- kvm.quilt.orig/arch/x86/kvm/mmu.c
+++ kvm.quilt/arch/x86/kvm/mmu.c
@@ -907,7 +907,9 @@ static void mmu_set_spte(struct kvm_vcpu
 	if (!(pte_access & ACC_EXEC_MASK))
 		spte |= PT64_NX_MASK;
 
-	page = gfn_to_page(vcpu->kvm, gfn);
+	page = kvm_follow_page(vcpu->kvm, gfn);
+	if (!page)
+		return;
 
 	spte |= PT_PRESENT_MASK;
 	if (pte_access & ACC_USER_MASK)
@@ -983,8 +985,11 @@ static int nonpaging_map(struct kvm_vcpu
 		table = __va(table_addr);
 
 		if (level == 1) {
+			struct page *page;
+			page = gfn_to_page(vcpu->kvm, gfn);
 			mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
 				     0, write, 1, &pt_write, gfn);
+			kvm_release_page_clean(page);
 			return pt_write || is_io_pte(table[index]);
 		}
 
Index: kvm.quilt/include/linux/kvm_host.h
===================================================================
--- kvm.quilt.orig/include/linux/kvm_host.h
+++ kvm.quilt/include/linux/kvm_host.h
@@ -163,6 +163,7 @@ int kvm_arch_set_memory_region(struct kv
 				int user_alloc);
 gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn);
 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
+struct page *kvm_follow_page(struct kvm *kvm, gfn_t gfn);
 void kvm_release_page_clean(struct page *page);
 void kvm_release_page_dirty(struct page *page);
 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
Index: kvm.quilt/virt/kvm/kvm_main.c
===================================================================
--- kvm.quilt.orig/virt/kvm/kvm_main.c
+++ kvm.quilt/virt/kvm/kvm_main.c
@@ -453,6 +453,25 @@ static unsigned long gfn_to_hva(struct k
 	return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE);
 }
 
+struct page *kvm_follow_page(struct kvm *kvm, gfn_t gfn)
+{
+	unsigned long addr;
+	struct vm_area_struct *vma;
+
+	addr = gfn_to_hva(kvm, gfn);
+	/* MMIO access */
+	if (kvm_is_error_hva(addr)) {
+		get_page(bad_page);
+		return bad_page;
+	}
+
+	vma = find_vma(current->mm, addr);
+	if (!vma)
+		return NULL;
+
+	return follow_page(vma, addr, FOLL_GET|FOLL_TOUCH);
+}
+
 /*
  * Requires current->mm->mmap_sem to be held
  */
Index: kvm.quilt/arch/x86/kvm/paging_tmpl.h
===================================================================
--- kvm.quilt.orig/arch/x86/kvm/paging_tmpl.h
+++ kvm.quilt/arch/x86/kvm/paging_tmpl.h
@@ -67,6 +67,7 @@ struct guest_walker {
 	gfn_t table_gfn[PT_MAX_FULL_LEVELS];
 	pt_element_t ptes[PT_MAX_FULL_LEVELS];
 	gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
+	struct page *page;
 	unsigned pt_access;
 	unsigned pte_access;
 	gfn_t gfn;
@@ -203,14 +204,18 @@ walk:
 		--walker->level;
 	}
 
+	walker->page = gfn_to_page(vcpu->kvm, walker->gfn);
+
 	if (write_fault && !is_dirty_pte(pte)) {
 		bool ret;
 
 		mark_page_dirty(vcpu->kvm, table_gfn);
 		ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
 			    pte|PT_DIRTY_MASK);
-		if (ret)
+		if (ret) {
+			kvm_release_page_clean(walker->page);
 			goto walk;
+		}
 		pte |= PT_DIRTY_MASK;
 		mutex_lock(&vcpu->kvm->lock);
 		kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte));
@@ -323,8 +328,10 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
 			r = kvm_read_guest_atomic(vcpu->kvm,
 						  walker->pte_gpa[level - 2],
 				       		  &curr_pte, sizeof(curr_pte));
-			if (r || curr_pte != walker->ptes[level - 2])
-				return NULL;
+			if (r || curr_pte != walker->ptes[level - 2]) {
+				shadow_ent = NULL;
+				goto out;
+			}
 		}
 		shadow_addr = __pa(shadow_page->spt);
 		shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
@@ -336,7 +343,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
 		     user_fault, write_fault,
 		     walker->ptes[walker->level-1] & PT_DIRTY_MASK,
 		     ptwrite, walker->gfn);
-
+out:
+	kvm_release_page_clean(walker->page);
 	return shadow_ent;
 }
 
@@ -425,6 +433,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kv
 	if (r) {
 		gpa = gfn_to_gpa(walker.gfn);
 		gpa |= vaddr & ~PAGE_MASK;
+		kvm_release_page_clean(walker.page);
 	}
 
 	return gpa;

-- 


-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2005.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/

^ permalink raw reply	[flat|nested] 26+ messages in thread

* [patch 4/5] KVM: export follow_page()
  2007-12-21  0:18 [patch 0/5] KVM shadow scalability enhancements Marcelo Tosatti
                   ` (2 preceding siblings ...)
  2007-12-21  0:18 ` [patch 3/5] KVM: add kvm_follow_page() Marcelo Tosatti
@ 2007-12-21  0:18 ` Marcelo Tosatti
       [not found]   ` <20071221002024.423895760-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
  2007-12-21  0:18 ` [patch 5/5] KVM: switch to mmu spinlock Marcelo Tosatti
       [not found] ` <20071221001821.029994250-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
  5 siblings, 1 reply; 26+ messages in thread
From: Marcelo Tosatti @ 2007-12-21  0:18 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel, mtosatti-H+wXaHxf7aLQT0dZR+AlfA

[-- Attachment #1: export-follow-page --]
[-- Type: text/plain, Size: 799 bytes --]

follow_page() is required by KVM to find the struct page that maps
to a given address in spinlock-protected code.

Signed-off-by: Marcelo Tosatti <mtosatti-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>

Index: kvm.quilt/mm/memory.c
===================================================================
--- kvm.quilt.orig/mm/memory.c
+++ kvm.quilt/mm/memory.c
@@ -970,6 +970,7 @@ no_page_table:
 	}
 	return page;
 }
+EXPORT_SYMBOL(follow_page);
 
 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		unsigned long start, int len, int write, int force,

-- 


-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2005.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/

^ permalink raw reply	[flat|nested] 26+ messages in thread

* [patch 5/5] KVM: switch to mmu spinlock
  2007-12-21  0:18 [patch 0/5] KVM shadow scalability enhancements Marcelo Tosatti
                   ` (3 preceding siblings ...)
  2007-12-21  0:18 ` [patch 4/5] KVM: export follow_page() Marcelo Tosatti
@ 2007-12-21  0:18 ` Marcelo Tosatti
       [not found] ` <20071221001821.029994250-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
  5 siblings, 0 replies; 26+ messages in thread
From: Marcelo Tosatti @ 2007-12-21  0:18 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel, mtosatti-H+wXaHxf7aLQT0dZR+AlfA

[-- Attachment #1: convert-to-spinlock --]
[-- Type: text/plain, Size: 7291 bytes --]

Convert the synchronization of the shadow handling to a separate mmu_lock
spinlock.

Also guard fetch() by mmap_sem in read-mode to protect against alias
and memslot changes.

Signed-off-by: Marcelo Tosatti <mtosatti-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>

Index: kvm.quilt/arch/x86/kvm/mmu.c
===================================================================
--- kvm.quilt.orig/arch/x86/kvm/mmu.c
+++ kvm.quilt/arch/x86/kvm/mmu.c
@@ -1031,7 +1031,7 @@ static void mmu_free_roots(struct kvm_vc
 
 	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
 		return;
-	mutex_lock(&vcpu->kvm->lock);
+	spin_lock(&vcpu->kvm->mmu_lock);
 #ifdef CONFIG_X86_64
 	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
 		hpa_t root = vcpu->arch.mmu.root_hpa;
@@ -1039,7 +1039,7 @@ static void mmu_free_roots(struct kvm_vc
 		sp = page_header(root);
 		--sp->root_count;
 		vcpu->arch.mmu.root_hpa = INVALID_PAGE;
-		mutex_unlock(&vcpu->kvm->lock);
+		spin_unlock(&vcpu->kvm->mmu_lock);
 		return;
 	}
 #endif
@@ -1053,7 +1053,7 @@ static void mmu_free_roots(struct kvm_vc
 		}
 		vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
 	}
-	mutex_unlock(&vcpu->kvm->lock);
+	spin_unlock(&vcpu->kvm->mmu_lock);
 	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
 }
 
@@ -1256,9 +1256,9 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
 	r = mmu_topup_memory_caches(vcpu);
 	if (r)
 		goto out;
-	mutex_lock(&vcpu->kvm->lock);
+	spin_lock(&vcpu->kvm->mmu_lock);
 	mmu_alloc_roots(vcpu);
-	mutex_unlock(&vcpu->kvm->lock);
+	spin_unlock(&vcpu->kvm->mmu_lock);
 	kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
 	kvm_mmu_flush_tlb(vcpu);
 out:
@@ -1435,15 +1435,15 @@ int kvm_mmu_unprotect_page_virt(struct k
 	gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
 	up_read(&current->mm->mmap_sem);
 
-	mutex_lock(&vcpu->kvm->lock);
+	spin_lock(&vcpu->kvm->mmu_lock);
 	r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
-	mutex_unlock(&vcpu->kvm->lock);
+	spin_unlock(&vcpu->kvm->mmu_lock);
 	return r;
 }
 
 void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
 {
-	mutex_lock(&vcpu->kvm->lock);
+	spin_lock(&vcpu->kvm->mmu_lock);
 	while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) {
 		struct kvm_mmu_page *sp;
 
@@ -1452,7 +1452,7 @@ void __kvm_mmu_free_some_pages(struct kv
 		kvm_mmu_zap_page(vcpu->kvm, sp);
 		++vcpu->kvm->stat.mmu_recycled;
 	}
-	mutex_unlock(&vcpu->kvm->lock);
+	spin_unlock(&vcpu->kvm->mmu_lock);
 }
 
 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
@@ -1584,10 +1584,10 @@ void kvm_mmu_zap_all(struct kvm *kvm)
 {
 	struct kvm_mmu_page *sp, *node;
 
-	mutex_lock(&kvm->lock);
+	spin_lock(&kvm->mmu_lock);
 	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
 		kvm_mmu_zap_page(kvm, sp);
-	mutex_unlock(&kvm->lock);
+	spin_unlock(&kvm->mmu_lock);
 
 	kvm_flush_remote_tlbs(kvm);
 }
Index: kvm.quilt/arch/x86/kvm/paging_tmpl.h
===================================================================
--- kvm.quilt.orig/arch/x86/kvm/paging_tmpl.h
+++ kvm.quilt/arch/x86/kvm/paging_tmpl.h
@@ -217,9 +217,9 @@ walk:
 			goto walk;
 		}
 		pte |= PT_DIRTY_MASK;
-		mutex_lock(&vcpu->kvm->lock);
+		spin_lock(&vcpu->kvm->mmu_lock);
 		kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte));
-		mutex_unlock(&vcpu->kvm->lock);
+		spin_unlock(&vcpu->kvm->mmu_lock);
 		walker->ptes[walker->level - 1] = pte;
 	}
 
@@ -386,7 +386,6 @@ static int FNAME(page_fault)(struct kvm_
 	 */
 	r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault,
 			     fetch_fault);
-	up_read(&current->mm->mmap_sem);
 
 	/*
 	 * The page is not mapped by the guest.  Let the guest handle it.
@@ -395,10 +394,11 @@ static int FNAME(page_fault)(struct kvm_
 		pgprintk("%s: guest page fault\n", __FUNCTION__);
 		inject_page_fault(vcpu, addr, walker.error_code);
 		vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
+		up_read(&current->mm->mmap_sem);
 		return 0;
 	}
 
-	mutex_lock(&vcpu->kvm->lock);
+	spin_lock(&vcpu->kvm->mmu_lock);
 	shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
 				  &write_pt);
 	pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__,
@@ -411,13 +411,15 @@ static int FNAME(page_fault)(struct kvm_
 	 * mmio: emulate if accessible, otherwise its a guest fault.
 	 */
 	if (shadow_pte && is_io_pte(*shadow_pte)) {
-		mutex_unlock(&vcpu->kvm->lock);
+		spin_unlock(&vcpu->kvm->mmu_lock);
+		up_read(&current->mm->mmap_sem);
 		return 1;
 	}
 
 	++vcpu->stat.pf_fixed;
 	kvm_mmu_audit(vcpu, "post page fault (fixed)");
-	mutex_unlock(&vcpu->kvm->lock);
+	spin_unlock(&vcpu->kvm->mmu_lock);
+	up_read(&current->mm->mmap_sem);
 
 	return write_pt;
 }
Index: kvm.quilt/arch/x86/kvm/x86.c
===================================================================
--- kvm.quilt.orig/arch/x86/kvm/x86.c
+++ kvm.quilt/arch/x86/kvm/x86.c
@@ -1609,9 +1609,9 @@ static int emulator_write_phys(struct kv
 		up_read(&current->mm->mmap_sem);
 		return 0;
 	}
-	mutex_lock(&vcpu->kvm->lock);
+	spin_lock(&vcpu->kvm->mmu_lock);
 	kvm_mmu_pte_write(vcpu, gpa, val, bytes);
-	mutex_unlock(&vcpu->kvm->lock);
+	spin_unlock(&vcpu->kvm->mmu_lock);
 	up_read(&current->mm->mmap_sem);
 	return 1;
 }
Index: kvm.quilt/include/linux/kvm_host.h
===================================================================
--- kvm.quilt.orig/include/linux/kvm_host.h
+++ kvm.quilt/include/linux/kvm_host.h
@@ -104,7 +104,8 @@ struct kvm_memory_slot {
 };
 
 struct kvm {
-	struct mutex lock; /* protects everything except vcpus */
+	struct mutex lock; /* protects the vcpus array and APIC accesses */
+	spinlock_t mmu_lock;
 	struct mm_struct *mm; /* userspace tied to this vm */
 	int nmemslots;
 	struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS +
Index: kvm.quilt/virt/kvm/kvm_main.c
===================================================================
--- kvm.quilt.orig/virt/kvm/kvm_main.c
+++ kvm.quilt/virt/kvm/kvm_main.c
@@ -165,6 +165,7 @@ static struct kvm *kvm_create_vm(void)
 
 	kvm->mm = current->mm;
 	atomic_inc(&kvm->mm->mm_count);
+	spin_lock_init(&kvm->mmu_lock);
 	kvm_io_bus_init(&kvm->pio_bus);
 	mutex_init(&kvm->lock);
 	kvm_io_bus_init(&kvm->mmio_bus);
@@ -571,9 +572,7 @@ int kvm_read_guest_atomic(struct kvm *kv
 	addr = gfn_to_hva(kvm, gfn);
 	if (kvm_is_error_hva(addr))
 		return -EFAULT;
-	pagefault_disable();
 	r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
-	pagefault_enable();
 	if (r)
 		return -EFAULT;
 	return 0;
Index: kvm.quilt/arch/x86/kvm/vmx.c
===================================================================
--- kvm.quilt.orig/arch/x86/kvm/vmx.c
+++ kvm.quilt/arch/x86/kvm/vmx.c
@@ -1476,7 +1476,6 @@ static int alloc_apic_access_page(struct
 	struct kvm_userspace_memory_region kvm_userspace_mem;
 	int r = 0;
 
-	mutex_lock(&kvm->lock);
 	down_write(&current->mm->mmap_sem);
 	if (kvm->arch.apic_access_page)
 		goto out;
@@ -1490,7 +1489,6 @@ static int alloc_apic_access_page(struct
 	kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
 out:
 	up_write(&current->mm->mmap_sem);
-	mutex_unlock(&kvm->lock);
 	return r;
 }
 

-- 


-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2005.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [patch 3/5] KVM: add kvm_follow_page()
       [not found]   ` <20071221002024.345682789-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
@ 2007-12-22 19:42     ` Avi Kivity
       [not found]       ` <476D68A3.3000200-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
  0 siblings, 1 reply; 26+ messages in thread
From: Avi Kivity @ 2007-12-22 19:42 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm-devel

Marcelo Tosatti wrote:
> In preparation for a mmu spinlock, avoid schedules in mmu_set_spte()
> by using follow_page() instead of get_user_pages().
>
> The fault handling work by get_user_pages() now happens outside the lock.
>
> Signed-off-by: Marcelo Tosatti <mtosatti-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
>
> Index: kvm.quilt/arch/x86/kvm/mmu.c
> @@ -983,8 +985,11 @@ static int nonpaging_map(struct kvm_vcpu
> table = __va(table_addr);
>
> if (level == 1) {
> + struct page *page;
> + page = gfn_to_page(vcpu->kvm, gfn);
> mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
> 0, write, 1, &pt_write, gfn);
> + kvm_release_page_clean(page);
> return pt_write || is_io_pte(table[index]);
> }

Is this hunk necessary? It will swap in the page in case it's swapped
out, but not sure we care about performance in that case.

> n.c
> @@ -453,6 +453,25 @@ static unsigned long gfn_to_hva(struct k
> return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE);
> }
>
> +struct page *kvm_follow_page(struct kvm *kvm, gfn_t gfn)
> +{
> + unsigned long addr;
> + struct vm_area_struct *vma;
> +
> + addr = gfn_to_hva(kvm, gfn);
> + /* MMIO access */
> + if (kvm_is_error_hva(addr)) {
> + get_page(bad_page);
> + return bad_page;
> + }
> +
> + vma = find_vma(current->mm, addr);
> + if (!vma)
> + return NULL;
> +
> + return follow_page(vma, addr, FOLL_GET|FOLL_TOUCH);
> +}

It may be better to create get_user_page_inatomic() as the last four
lines rather than exporting follow_page(). Maybe an mm maintainer can
comment.

> --- kvm.quilt.orig/arch/x86/kvm/paging_tmpl.h
> +++ kvm.quilt/arch/x86/kvm/paging_tmpl.h
> @@ -67,6 +67,7 @@ struct guest_walker {
> gfn_t table_gfn[PT_MAX_FULL_LEVELS];
> pt_element_t ptes[PT_MAX_FULL_LEVELS];
> gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
> + struct page *page;
> unsigned pt_access;
> unsigned pte_access;
> gfn_t gfn;
> @@ -203,14 +204,18 @@ walk:
> --walker->level;
> }
>
> + walker->page = gfn_to_page(vcpu->kvm, walker->gfn);
> +
> if (write_fault && !is_dirty_pte(pte)) {
> bool ret;
>
> mark_page_dirty(vcpu->kvm, table_gfn);
> ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
> pte|PT_DIRTY_MASK);
> - if (ret)
> + if (ret) {
> + kvm_release_page_clean(walker->page);
> goto walk;
> + }
> pte |= PT_DIRTY_MASK;
> mutex_lock(&vcpu->kvm->lock);
> kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte));
> @@ -323,8 +328,10 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
> r = kvm_read_guest_atomic(vcpu->kvm,
> walker->pte_gpa[level - 2],
> &curr_pte, sizeof(curr_pte));
> - if (r || curr_pte != walker->ptes[level - 2])
> - return NULL;
> + if (r || curr_pte != walker->ptes[level - 2]) {
> + shadow_ent = NULL;
> + goto out;
> + }
> }
> shadow_addr = __pa(shadow_page->spt);
> shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
> @@ -336,7 +343,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
> user_fault, write_fault,
> walker->ptes[walker->level-1] & PT_DIRTY_MASK,
> ptwrite, walker->gfn);
> -
> +out:
> + kvm_release_page_clean(walker->page);
> return shadow_ent;
> }
>

walker->page is never used, so I think it can be completely dropped.
mmu_set_spte() already does the right thing without its help.

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.


-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2005.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [patch 4/5] KVM: export follow_page()
       [not found]   ` <20071221002024.423895760-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
@ 2007-12-22 19:43     ` Avi Kivity
  0 siblings, 0 replies; 26+ messages in thread
From: Avi Kivity @ 2007-12-22 19:43 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm-devel

Marcelo Tosatti wrote:
> follow_page() is required by KVM to find the struct page which maps
> to a given address in spinlock protected code.
>
> Signed-off-by: Marcelo Tosatti <mtosatti-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
>
> Index: kvm.quilt/mm/memory.c
> ===================================================================
> --- kvm.quilt.orig/mm/memory.c
> +++ kvm.quilt/mm/memory.c
> @@ -970,6 +970,7 @@ no_page_table:
>      }
>      return page;
>  }
> +EXPORT_SYMBOL(follow_page);
>  

This should come before its use in the patch series (and of course, it
may get expanded into a new get_user_page_inatomic() if we're told
that's the right thing).

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.


-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2005.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [patch 0/5] KVM shadow scalability enhancements
       [not found] ` <20071221001821.029994250-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
@ 2007-12-22 19:46   ` Avi Kivity
  0 siblings, 0 replies; 26+ messages in thread
From: Avi Kivity @ 2007-12-22 19:46 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm-devel

Marcelo Tosatti wrote:
> The following patchset increases KVM shadow scalability by allowing concurrent
> guest walking, allocation and instruction emulation.
>   

Thanks; the split patchset is _much_ easier to review, and hopefully to
bisect in case the review misses something.

I believe we can merge it at the next iteration.

Have you tested a real workload (like kernel compiles), for stability
and performance?

I can test Windows in case you don't have a copy handy.

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.


-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2005.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [patch 3/5] KVM: add kvm_follow_page()
       [not found]       ` <476D68A3.3000200-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
@ 2007-12-23  5:25         ` Marcelo Tosatti
  2007-12-23  7:03           ` Avi Kivity
  0 siblings, 1 reply; 26+ messages in thread
From: Marcelo Tosatti @ 2007-12-23  5:25 UTC (permalink / raw)
  To: Avi Kivity, Andrew Morton; +Cc: kvm-devel

Hi Avi,

Andrew, please see the comments on the need for a atomic get_user_pages() 
below.

On Sat, Dec 22, 2007 at 09:42:27PM +0200, Avi Kivity wrote:
> Marcelo Tosatti wrote:
> > In preparation for a mmu spinlock, avoid schedules in mmu_set_spte()
> > by using follow_page() instead of get_user_pages().
> >
> > The fault handling work by get_user_pages() now happens outside the lock.
> >
> > Signed-off-by: Marcelo Tosatti <mtosatti-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
> >
> > Index: kvm.quilt/arch/x86/kvm/mmu.c
> > @@ -983,8 +985,11 @@ static int nonpaging_map(struct kvm_vcpu
> > table = __va(table_addr);
> >
> > if (level == 1) {
> > + struct page *page;
> > + page = gfn_to_page(vcpu->kvm, gfn);
> > mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
> > 0, write, 1, &pt_write, gfn);
> > + kvm_release_page_clean(page);
> > return pt_write || is_io_pte(table[index]);
> > }
> 
> Is this hunk necessary? It will swap in the page in case it's swapped
> out, but not sure we care about performance in that case.

The issue is that the mmap call done by QEMU won't instantiate
the pte's of the guest memory area. So it needs to either do that
get_user_pages()->handle_mm_fault() sequence outside mmu_set_spte() or
mmap() has to pass MAP_POPULATE, prefaulting all guest pages (which is
highly suboptimal).

> > n.c
> > @@ -453,6 +453,25 @@ static unsigned long gfn_to_hva(struct k
> > return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE);
> > }
> >
> > +struct page *kvm_follow_page(struct kvm *kvm, gfn_t gfn)
> > +{
> > + unsigned long addr;
> > + struct vm_area_struct *vma;
> > +
> > + addr = gfn_to_hva(kvm, gfn);
> > + /* MMIO access */
> > + if (kvm_is_error_hva(addr)) {
> > + get_page(bad_page);
> > + return bad_page;
> > + }
> > +
> > + vma = find_vma(current->mm, addr);
> > + if (!vma)
> > + return NULL;
> > +
> > + return follow_page(vma, addr, FOLL_GET|FOLL_TOUCH);
> > +}
> 
> It may be better to create get_user_page_inatomic() as the last four
> lines rather than exporting follow_page(). Maybe an mm maintainer can
> comment.

Andrew, KVM needs to find the struct page which maps to a certain       
address in the QEMU process without sleeping since that lookup is       
performed with a spinlock held.

So it needs to either export follow_page() or use a get_user_pages() variant 
which doesn't call cond_resched() or handle_mm_fault(), simply failing 
if the page is not pagetable mapped.

What you think? 

> > --- kvm.quilt.orig/arch/x86/kvm/paging_tmpl.h
> > +++ kvm.quilt/arch/x86/kvm/paging_tmpl.h
> > @@ -67,6 +67,7 @@ struct guest_walker {
> > gfn_t table_gfn[PT_MAX_FULL_LEVELS];
> > pt_element_t ptes[PT_MAX_FULL_LEVELS];
> > gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
> > + struct page *page;
> > unsigned pt_access;
> > unsigned pte_access;
> > gfn_t gfn;
> > @@ -203,14 +204,18 @@ walk:
> > --walker->level;
> > }
> >
> > + walker->page = gfn_to_page(vcpu->kvm, walker->gfn);
> > +
> > if (write_fault && !is_dirty_pte(pte)) {
> > bool ret;
> >
> > mark_page_dirty(vcpu->kvm, table_gfn);
> > ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
> > pte|PT_DIRTY_MASK);
> > - if (ret)
> > + if (ret) {
> > + kvm_release_page_clean(walker->page);
> > goto walk;
> > + }
> > pte |= PT_DIRTY_MASK;
> > mutex_lock(&vcpu->kvm->lock);
> > kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte));
> > @@ -323,8 +328,10 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
> > r = kvm_read_guest_atomic(vcpu->kvm,
> > walker->pte_gpa[level - 2],
> > &curr_pte, sizeof(curr_pte));
> > - if (r || curr_pte != walker->ptes[level - 2])
> > - return NULL;
> > + if (r || curr_pte != walker->ptes[level - 2]) {
> > + shadow_ent = NULL;
> > + goto out;
> > + }
> > }
> > shadow_addr = __pa(shadow_page->spt);
> > shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
> > @@ -336,7 +343,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
> > user_fault, write_fault,
> > walker->ptes[walker->level-1] & PT_DIRTY_MASK,
> > ptwrite, walker->gfn);
> > -
> > +out:
> > + kvm_release_page_clean(walker->page);
> > return shadow_ent;
> > }
> >
> 
> walker->page is never used, so I think it can be completely dropped.
> mmu_set_spte() already does the right thing without its help.

The page being mapped by the guest needs to be faulted in by someone...
So while walker->page is unused, it guarantees that the handler will
properly fault in the pagetables to know which physical address the page
resides at. i.e. it is the place which guarantees that the page gets
mapped if it's already in memory or swapped in and then mapped.

Thanks.

> -- 
> Do not meddle in the internals of kernels, for they are subtle and quick to panic.

-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2005.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [patch 3/5] KVM: add kvm_follow_page()
  2007-12-23  5:25         ` Marcelo Tosatti
@ 2007-12-23  7:03           ` Avi Kivity
       [not found]             ` <476E083B.5040600-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
  0 siblings, 1 reply; 26+ messages in thread
From: Avi Kivity @ 2007-12-23  7:03 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm-devel, Andrew Morton

Marcelo Tosatti wrote:
> Hi Avi,
>
> Andrew, please see the comments on the need for a atomic get_user_pages() 
> below.
>
> On Sat, Dec 22, 2007 at 09:42:27PM +0200, Avi Kivity wrote:
>   
>> Marcelo Tosatti wrote:
>>     
>>> In preparation for a mmu spinlock, avoid schedules in mmu_set_spte()
>>> by using follow_page() instead of get_user_pages().
>>>
>>> The fault handling work by get_user_pages() now happens outside the lock.
>>>
>>> Signed-off-by: Marcelo Tosatti <mtosatti-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
>>>
>>> Index: kvm.quilt/arch/x86/kvm/mmu.c
>>> @@ -983,8 +985,11 @@ static int nonpaging_map(struct kvm_vcpu
>>> table = __va(table_addr);
>>>
>>> if (level == 1) {
>>> + struct page *page;
>>> + page = gfn_to_page(vcpu->kvm, gfn);
>>> mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
>>> 0, write, 1, &pt_write, gfn);
>>> + kvm_release_page_clean(page);
>>> return pt_write || is_io_pte(table[index]);
>>> }
>>>       
>> Is this hunk necessary? It will swap in the page in case it's swapped
>> out, but not sure we care about performance in that case.
>>     
>
> The issue is that the mmap call done by QEMU won't instantiate
> the pte's of the guest memory area. So it needs to either do that
> get_user_pages()->handle_mm_fault() sequence outside mmu_set_spte() 

But that's only on the very first access to the page.  Once the page has 
been accessed by the guest, it will be mapped in the host page table 
(unless it's subsequently swapped out).

Oh, but the page will never be faulted in this way because the path 
that's supposed to map it is mmu_set_spte().  In that case, your 
original implementation of passing in page directly is better (instead 
of being forced to pass it indirectly via the page tables).  Sorry about 
generating this confusion.

> or
> mmap() has to pass MAP_POPULATE, prefaulting all guest pages (which is
> highly suboptimal).
>   

Actually, MAP_POPULATE is optimal since it neatly batches all of the 
work.  I don't think we should do it since it gives the appearance of 
being slower (esp. with lots of guests memory).

>   
>>> n.c
>>> @@ -453,6 +453,25 @@ static unsigned long gfn_to_hva(struct k
>>> return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE);
>>> }
>>>
>>> +struct page *kvm_follow_page(struct kvm *kvm, gfn_t gfn)
>>> +{
>>> + unsigned long addr;
>>> + struct vm_area_struct *vma;
>>> +
>>> + addr = gfn_to_hva(kvm, gfn);
>>> + /* MMIO access */
>>> + if (kvm_is_error_hva(addr)) {
>>> + get_page(bad_page);
>>> + return bad_page;
>>> + }
>>> +
>>> + vma = find_vma(current->mm, addr);
>>> + if (!vma)
>>> + return NULL;
>>> +
>>> + return follow_page(vma, addr, FOLL_GET|FOLL_TOUCH);
>>> +}
>>>       
>> It may be better to create get_user_page_inatomic() as the last four
>> lines rather than exporting follow_page(). Maybe an mm maintainer can
>> comment.
>>     
>
> Andrew, KVM needs to find the struct page which maps to a certain       
> address in the QEMU process without sleeping since that lookup is       
> performed with a spinlock held.
>
> So it needs to either export follow_page() or use a get_user_pages() variant 
> which doesnt call cond_resched() or handle_mm_fault(), simply failing 
> if the page is not pagetable mapped.
>
> What you think? 
>
>   

[We need to pass FOLL_WRITE to follow_page()]

>>> --- kvm.quilt.orig/arch/x86/kvm/paging_tmpl.h
>>> +++ kvm.quilt/arch/x86/kvm/paging_tmpl.h
>>> @@ -67,6 +67,7 @@ struct guest_walker {
>>> gfn_t table_gfn[PT_MAX_FULL_LEVELS];
>>> pt_element_t ptes[PT_MAX_FULL_LEVELS];
>>> gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
>>> + struct page *page;
>>> unsigned pt_access;
>>> unsigned pte_access;
>>> gfn_t gfn;
>>> @@ -203,14 +204,18 @@ walk:
>>> --walker->level;
>>> }
>>>
>>> + walker->page = gfn_to_page(vcpu->kvm, walker->gfn);
>>> +
>>> if (write_fault && !is_dirty_pte(pte)) {
>>> bool ret;
>>>
>>> mark_page_dirty(vcpu->kvm, table_gfn);
>>> ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
>>> pte|PT_DIRTY_MASK);
>>> - if (ret)
>>> + if (ret) {
>>> + kvm_release_page_clean(walker->page);
>>> goto walk;
>>> + }
>>> pte |= PT_DIRTY_MASK;
>>> mutex_lock(&vcpu->kvm->lock);
>>> kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte));
>>> @@ -323,8 +328,10 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
>>> r = kvm_read_guest_atomic(vcpu->kvm,
>>> walker->pte_gpa[level - 2],
>>> &curr_pte, sizeof(curr_pte));
>>> - if (r || curr_pte != walker->ptes[level - 2])
>>> - return NULL;
>>> + if (r || curr_pte != walker->ptes[level - 2]) {
>>> + shadow_ent = NULL;
>>> + goto out;
>>> + }
>>> }
>>> shadow_addr = __pa(shadow_page->spt);
>>> shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
>>> @@ -336,7 +343,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
>>> user_fault, write_fault,
>>> walker->ptes[walker->level-1] & PT_DIRTY_MASK,
>>> ptwrite, walker->gfn);
>>> -
>>> +out:
>>> + kvm_release_page_clean(walker->page);
>>> return shadow_ent;
>>> }
>>>
>>>       
>> walker->page is never used, so I think it can be completely dropped.
>> mmu_set_spte() already does the right thing without its help.
>>     
>
> The page being mapped by the guest needs to be faulted in by someone...
> So while walker->page is unused, it guarantees that the handler will
> properly fault in the pagetables to know which physical address the page
> resides at. ie. it is the place which guarantees that the page gets
> mapped if its already in memory or swapped in and then mapped.

Exactly.  But it is better to be explicit about it and pass the page 
directly like you did before.  I hate to make you go back-and-forth, 
but I did not understand the issue completely before.

-- 
Any sufficiently difficult bug is indistinguishable from a feature.


-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2005.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [patch 3/5] KVM: add kvm_follow_page()
       [not found]             ` <476E083B.5040600-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
@ 2007-12-23  8:41               ` Avi Kivity
       [not found]                 ` <476E1F23.3020609-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
  0 siblings, 1 reply; 26+ messages in thread
From: Avi Kivity @ 2007-12-23  8:41 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm-devel, Andrew Morton

Avi Kivity wrote:
>
> Exactly.  But it is better to be explicit about it and pass the page
> directly like you did before.  I hate to make you go back-and-fourth,
> but I did not understand the issue completely before.
>

btw, the call to gfn_to_page() can happen in page_fault() instead of
walk_addr(); that will reduce the amount of error handling, and will
simplify the callers to walk_addr() that don't need the page.

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.


-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2005.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [patch 3/5] KVM: add kvm_follow_page()
       [not found]                 ` <476E1F23.3020609-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
@ 2007-12-23  8:59                   ` Avi Kivity
       [not found]                     ` <476E236A.9000800-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
  2007-12-23 19:14                   ` Marcelo Tosatti
  1 sibling, 1 reply; 26+ messages in thread
From: Avi Kivity @ 2007-12-23  8:59 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm-devel, Andrew Morton

Avi Kivity wrote:
> Avi Kivity wrote:
>   
>> Exactly.  But it is better to be explicit about it and pass the page
>> directly like you did before.  I hate to make you go back-and-fourth,
>> but I did not understand the issue completely before.
>>
>>     
>
> btw, the call to gfn_to_page() can happen in page_fault() instead of
> walk_addr(); that will reduce the amount of error handling, and will
> simplify the callers to walk_addr() that don't need the page.
>
>   

Note further that all this doesn't obviate the need for follow_page()
(or get_user_pages_inatomic()); we still need something in update_pte()
for the demand paging case.

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.


-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2005.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [patch 3/5] KVM: add kvm_follow_page()
       [not found]                     ` <476E236A.9000800-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
@ 2007-12-23  9:26                       ` Andrew Morton
       [not found]                         ` <20071223012618.e8cf8320.akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org>
  0 siblings, 1 reply; 26+ messages in thread
From: Andrew Morton @ 2007-12-23  9:26 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, kvm-devel

On Sun, 23 Dec 2007 10:59:22 +0200 Avi Kivity <avi-atKUWr5tajBWk0Htik3J/w@public.gmane.org> wrote:

> Avi Kivity wrote:
> > Avi Kivity wrote:
> >   
> >> Exactly.  But it is better to be explicit about it and pass the page
> >> directly like you did before.  I hate to make you go back-and-fourth,
> >> but I did not understand the issue completely before.
> >>
> >>     
> >
> > btw, the call to gfn_to_page() can happen in page_fault() instead of
> > walk_addr(); that will reduce the amount of error handling, and will
> > simplify the callers to walk_addr() that don't need the page.
> >
> >   
> 
> Note further that all this doesn't obviate the need for follow_page()
> (or get_user_pages_inatomic()); we still need something in update_pte()
> for the demand paging case.

Please review -mm's mm/pagewalk.c for suitability.

If it is unsuitable but repairable then please cc Matt Mackall
<mpm-VDJrAJ4Gl5ZBDgjK7y7TUQ@public.gmane.org> on the review.


-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2005.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [patch 3/5] KVM: add kvm_follow_page()
       [not found]                         ` <20071223012618.e8cf8320.akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org>
@ 2007-12-23 10:35                           ` Avi Kivity
       [not found]                             ` <476E39F2.4030202-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
  0 siblings, 1 reply; 26+ messages in thread
From: Avi Kivity @ 2007-12-23 10:35 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Marcelo Tosatti, kvm-devel, Matt Mackall

Andrew Morton wrote:
> On Sun, 23 Dec 2007 10:59:22 +0200 Avi Kivity <avi-atKUWr5tajBWk0Htik3J/w@public.gmane.org> wrote:
>
>   
>> Avi Kivity wrote:
>>     
>>> Avi Kivity wrote:
>>>   
>>>       
>>>> Exactly.  But it is better to be explicit about it and pass the page
>>>> directly like you did before.  I hate to make you go back-and-fourth,
>>>> but I did not understand the issue completely before.
>>>>
>>>>     
>>>>         
>>> btw, the call to gfn_to_page() can happen in page_fault() instead of
>>> walk_addr(); that will reduce the amount of error handling, and will
>>> simplify the callers to walk_addr() that don't need the page.
>>>
>>>   
>>>       
>> Note further that all this doesn't obviate the need for follow_page()
>> (or get_user_pages_inatomic()); we still need something in update_pte()
>> for the demand paging case.
>>     
>
> Please review -mm's mm/pagewalk.c for suitability.
>
> If is is unsuitable but repairable then please cc Matt Mackall
> <mpm-VDJrAJ4Gl5ZBDgjK7y7TUQ@public.gmane.org> on the review.
>
>   

The "no locks are taken" comment is very worrying.  We need accurate 
results.

Getting pte_t's in the callbacks is a little too low level for kvm's use 
(which wants struct page pointers) but of course that easily handled in 
a kvm wrapper.

I'd prefer an atomic version of get_user_pages(), but if pagewalk is 
fixed to take the necessary locks, it will do.


-- 
error compiling committee.c: too many arguments to function


-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2005.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [patch 3/5] KVM: add kvm_follow_page()
       [not found]                             ` <476E39F2.4030202-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
@ 2007-12-23 10:52                               ` Andrew Morton
       [not found]                                 ` <20071223025245.d839c86d.akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org>
  0 siblings, 1 reply; 26+ messages in thread
From: Andrew Morton @ 2007-12-23 10:52 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, kvm-devel, Matt Mackall

On Sun, 23 Dec 2007 12:35:30 +0200 Avi Kivity <avi-atKUWr5tajBWk0Htik3J/w@public.gmane.org> wrote:

> Andrew Morton wrote:
> > On Sun, 23 Dec 2007 10:59:22 +0200 Avi Kivity <avi-atKUWr5tajBWk0Htik3J/w@public.gmane.org> wrote:
> >
> >   
> >> Avi Kivity wrote:
> >>     
> >>> Avi Kivity wrote:
> >>>   
> >>>       
> >>>> Exactly.  But it is better to be explicit about it and pass the page
> >>>> directly like you did before.  I hate to make you go back-and-fourth,
> >>>> but I did not understand the issue completely before.
> >>>>
> >>>>     
> >>>>         
> >>> btw, the call to gfn_to_page() can happen in page_fault() instead of
> >>> walk_addr(); that will reduce the amount of error handling, and will
> >>> simplify the callers to walk_addr() that don't need the page.
> >>>
> >>>   
> >>>       
> >> Note further that all this doesn't obviate the need for follow_page()
> >> (or get_user_pages_inatomic()); we still need something in update_pte()
> >> for the demand paging case.
> >>     
> >
> > Please review -mm's mm/pagewalk.c for suitability.
> >
> > If is is unsuitable but repairable then please cc Matt Mackall
> > <mpm-VDJrAJ4Gl5ZBDgjK7y7TUQ@public.gmane.org> on the review.
> >
> >   
> 
> The "no locks are taken" comment is very worrying.  We need accurate 
> results.

take down_read(mm->mmap_sem) before calling it..

You have to do that anyway for its results to be meaningful in the caller. 
Ditto get_user_pages().

> Getting pte_t's in the callbacks is a little too low level for kvm's use 
> (which wants struct page pointers) but of course that easily handled in 
> a kvm wrapper.
> 
> I'd prefer an atomic version of get_user_pages(), but if pagewalk is 
> fixed to take the necessary locks, it will do.

It isn't exported to modules at present, although I see no problem in
changing that.


-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2005.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [patch 3/5] KVM: add kvm_follow_page()
       [not found]                                 ` <20071223025245.d839c86d.akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org>
@ 2007-12-23 11:01                                   ` Avi Kivity
       [not found]                                     ` <476E4005.90003-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
  0 siblings, 1 reply; 26+ messages in thread
From: Avi Kivity @ 2007-12-23 11:01 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Marcelo Tosatti, kvm-devel, Matt Mackall

Andrew Morton wrote:
> On Sun, 23 Dec 2007 12:35:30 +0200 Avi Kivity <avi-atKUWr5tajBWk0Htik3J/w@public.gmane.org> wrote:
>
>   
>> Andrew Morton wrote:
>>     
>>> On Sun, 23 Dec 2007 10:59:22 +0200 Avi Kivity <avi-atKUWr5tajBWk0Htik3J/w@public.gmane.org> wrote:
>>>
>>>   
>>>       
>>>> Avi Kivity wrote:
>>>>     
>>>>         
>>>>> Avi Kivity wrote:
>>>>>   
>>>>>       
>>>>>           
>>>>>> Exactly.  But it is better to be explicit about it and pass the page
>>>>>> directly like you did before.  I hate to make you go back-and-fourth,
>>>>>> but I did not understand the issue completely before.
>>>>>>
>>>>>>     
>>>>>>         
>>>>>>             
>>>>> btw, the call to gfn_to_page() can happen in page_fault() instead of
>>>>> walk_addr(); that will reduce the amount of error handling, and will
>>>>> simplify the callers to walk_addr() that don't need the page.
>>>>>
>>>>>   
>>>>>       
>>>>>           
>>>> Note further that all this doesn't obviate the need for follow_page()
>>>> (or get_user_pages_inatomic()); we still need something in update_pte()
>>>> for the demand paging case.
>>>>     
>>>>         
>>> Please review -mm's mm/pagewalk.c for suitability.
>>>
>>> If is is unsuitable but repairable then please cc Matt Mackall
>>> <mpm-VDJrAJ4Gl5ZBDgjK7y7TUQ@public.gmane.org> on the review.
>>>
>>>   
>>>       
>> The "no locks are taken" comment is very worrying.  We need accurate 
>> results.
>>     
>
> take down_read(mm->mmap_sem) before calling it..
>
> You have to do that anyway for its results to be meaningful in the caller. 
> Ditto get_user_pages().
>
>   

Yes, but what about the page table locks?

follow_page() is much more thorough.

-- 
error compiling committee.c: too many arguments to function


-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2005.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [patch 3/5] KVM: add kvm_follow_page()
       [not found]                 ` <476E1F23.3020609-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
  2007-12-23  8:59                   ` Avi Kivity
@ 2007-12-23 19:14                   ` Marcelo Tosatti
  2007-12-24  6:50                     ` Avi Kivity
  1 sibling, 1 reply; 26+ messages in thread
From: Marcelo Tosatti @ 2007-12-23 19:14 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, kvm-devel

On Sun, Dec 23, 2007 at 10:41:07AM +0200, Avi Kivity wrote:
> Avi Kivity wrote:
> >
> > Exactly.  But it is better to be explicit about it and pass the page
> > directly like you did before.  I hate to make you go back-and-fourth,
> > but I did not understand the issue completely before.
> >
> 
> btw, the call to gfn_to_page() can happen in page_fault() instead of
> walk_addr(); that will reduce the amount of error handling, and will
> simplify the callers to walk_addr() that don't need the page.

But the gfn in question is only known at walk_addr() time, so that's not
possible.


-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2005.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [patch 3/5] KVM: add kvm_follow_page()
       [not found]                                     ` <476E4005.90003-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
@ 2007-12-23 20:15                                       ` Marcelo Tosatti
  2007-12-23 20:41                                         ` Andrew Morton
  2007-12-24  6:56                                         ` Avi Kivity
  0 siblings, 2 replies; 26+ messages in thread
From: Marcelo Tosatti @ 2007-12-23 20:15 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, kvm-devel, Andrew Morton, Matt Mackall

On Sun, Dec 23, 2007 at 01:01:25PM +0200, Avi Kivity wrote:
> Andrew Morton wrote:
> >On Sun, 23 Dec 2007 12:35:30 +0200 Avi Kivity <avi-atKUWr5tajBWk0Htik3J/w@public.gmane.org> wrote:
> >
> >  
> >>Andrew Morton wrote:
> >>    
> >>>On Sun, 23 Dec 2007 10:59:22 +0200 Avi Kivity <avi-atKUWr5tajBWk0Htik3J/w@public.gmane.org> wrote:
> >>>
> >>>  
> >>>      
> >>>>Avi Kivity wrote:
> >>>>    
> >>>>        
> >>>>>Avi Kivity wrote:
> >>>>>  
> >>>>>      
> >>>>>          
> >>>>>>Exactly.  But it is better to be explicit about it and pass the page
> >>>>>>directly like you did before.  I hate to make you go back-and-fourth,
> >>>>>>but I did not understand the issue completely before.
> >>>>>>
> >>>>>>    
> >>>>>>        
> >>>>>>            
> >>>>>btw, the call to gfn_to_page() can happen in page_fault() instead of
> >>>>>walk_addr(); that will reduce the amount of error handling, and will
> >>>>>simplify the callers to walk_addr() that don't need the page.
> >>>>>
> >>>>>  
> >>>>>      
> >>>>>          
> >>>>Note further that all this doesn't obviate the need for follow_page()
> >>>>(or get_user_pages_inatomic()); we still need something in update_pte()
> >>>>for the demand paging case.
> >>>>    
> >>>>        
> >>>Please review -mm's mm/pagewalk.c for suitability.
> >>>
> >>>If is is unsuitable but repairable then please cc Matt Mackall
> >>><mpm-VDJrAJ4Gl5ZBDgjK7y7TUQ@public.gmane.org> on the review.
> >>>
> >>>  
> >>>      
> >>The "no locks are taken" comment is very worrying.  We need accurate 
> >>results.
> >>    
> >
> >take down_read(mm->mmap_sem) before calling it..
> >
> >You have to do that anyway for its results to be meaningful in the caller. 
> >Ditto get_user_pages().
> >
> >  
> 
> Yes, but what about the page table locks?
> 
> follow_page() is much more thorough.

It can acquire the pagetablelock in the callback handler. But then,
vm_normal_page() must also be exported.

Are you guys OK with this ?


Modular KVM needs walk_page_range(), and also vm_normal_page() to be 
used on pagewalk callback.

Signed-off-by: Marcelo Tosatti <mtosatti-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>

Index: kvm.quilt/mm/pagewalk.c
===================================================================
--- kvm.quilt.orig/mm/pagewalk.c
+++ kvm.quilt/mm/pagewalk.c
@@ -1,6 +1,7 @@
 #include <linux/mm.h>
 #include <linux/highmem.h>
 #include <linux/sched.h>
+#include <linux/module.h>
 
 static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 			  const struct mm_walk *walk, void *private)
@@ -129,3 +130,4 @@ int walk_page_range(const struct mm_stru
 
 	return err;
 }
+EXPORT_SYMBOL(walk_page_range);
Index: kvm.quilt/mm/memory.c
===================================================================
--- kvm.quilt.orig/mm/memory.c
+++ kvm.quilt/mm/memory.c
@@ -412,6 +412,7 @@ struct page *vm_normal_page(struct vm_ar
 	 */
 	return pfn_to_page(pfn);
 }
+EXPORT_SYMBOL(vm_normal_page);
 
 /*
  * copy one vm_area from one task to the other. Assumes the page tables




[PATCH] KVM: add kvm_follow_page()

In preparation for a mmu spinlock, avoid schedules in mmu_set_spte() 
by using walk_page_range() instead of get_user_pages().

The fault handling work by get_user_pages() now happens outside the lock.

Signed-off-by: Marcelo Tosatti <mtosatti-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>

Index: kvm.quilt/arch/x86/kvm/mmu.c
===================================================================
--- kvm.quilt.orig/arch/x86/kvm/mmu.c
+++ kvm.quilt/arch/x86/kvm/mmu.c
@@ -882,14 +882,18 @@ struct page *gva_to_page(struct kvm_vcpu
 	return gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
 }
 
+/*
+ * mmu_set_spte must be called with an additional reference on
+ * struct page *page, and it will release that if necessary (so
+ * the callers should not worry about it).
+ */
 static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 			 unsigned pt_access, unsigned pte_access,
 			 int user_fault, int write_fault, int dirty,
-			 int *ptwrite, gfn_t gfn)
+			 int *ptwrite, gfn_t gfn, struct page *page)
 {
 	u64 spte;
 	int was_rmapped = is_rmap_pte(*shadow_pte);
-	struct page *page;
 
 	pgprintk("%s: spte %llx access %x write_fault %d"
 		 " user_fault %d gfn %lx\n",
@@ -907,8 +911,6 @@ static void mmu_set_spte(struct kvm_vcpu
 	if (!(pte_access & ACC_EXEC_MASK))
 		spte |= PT64_NX_MASK;
 
-	page = gfn_to_page(vcpu->kvm, gfn);
-
 	spte |= PT_PRESENT_MASK;
 	if (pte_access & ACC_USER_MASK)
 		spte |= PT_USER_MASK;
@@ -983,8 +985,10 @@ static int nonpaging_map(struct kvm_vcpu
 		table = __va(table_addr);
 
 		if (level == 1) {
+			struct page *page;
+			page = gfn_to_page(vcpu->kvm, gfn);
 			mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
-				     0, write, 1, &pt_write, gfn);
+				     0, write, 1, &pt_write, gfn, page);
 			return pt_write || is_io_pte(table[index]);
 		}
 
Index: kvm.quilt/include/linux/kvm_host.h
===================================================================
--- kvm.quilt.orig/include/linux/kvm_host.h
+++ kvm.quilt/include/linux/kvm_host.h
@@ -163,6 +163,7 @@ int kvm_arch_set_memory_region(struct kv
 				int user_alloc);
 gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn);
 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
+struct page *kvm_find_page(struct kvm *kvm, gfn_t gfn);
 void kvm_release_page_clean(struct page *page);
 void kvm_release_page_dirty(struct page *page);
 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
Index: kvm.quilt/virt/kvm/kvm_main.c
===================================================================
--- kvm.quilt.orig/virt/kvm/kvm_main.c
+++ kvm.quilt/virt/kvm/kvm_main.c
@@ -453,6 +453,54 @@ static unsigned long gfn_to_hva(struct k
 	return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE);
 }
 
+static int kvm_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+		           void *private)
+{
+	struct page **page = private;
+	struct vm_area_struct *vma;
+	pte_t *ptep, pte;
+	spinlock_t *ptl;
+	int err = -EFAULT;
+
+	vma = find_vma(current->mm, addr);
+	if (!vma)
+		return err;
+
+	ptep = pte_offset_map_lock(current->mm, pmd, addr, &ptl);
+	pte = *ptep;
+	if (!pte_present(pte))
+		goto unlock;
+
+	*page = vm_normal_page(vma, addr, pte);
+	if (!*page)
+		goto unlock;
+
+	get_page(*page);
+	err = 0;
+unlock:
+	pte_unmap_unlock(ptep, ptl);
+	return err;
+}
+
+static struct mm_walk kvm_mm_walk = { .pmd_entry = kvm_pte_range };
+
+struct page *kvm_find_page(struct kvm *kvm, gfn_t gfn)
+{
+	unsigned long addr;
+	struct page *page = NULL;
+
+	addr = gfn_to_hva(kvm, gfn);
+	/* MMIO access */
+	if (kvm_is_error_hva(addr)) {
+		get_page(bad_page);
+		return bad_page;
+	}
+
+	walk_page_range(current->mm, addr, addr+PAGE_SIZE, &kvm_mm_walk,
+			&page);
+	return page;
+}
+
 /*
  * Requires current->mm->mmap_sem to be held
  */
Index: kvm.quilt/arch/x86/kvm/paging_tmpl.h
===================================================================
--- kvm.quilt.orig/arch/x86/kvm/paging_tmpl.h
+++ kvm.quilt/arch/x86/kvm/paging_tmpl.h
@@ -67,6 +67,7 @@ struct guest_walker {
 	gfn_t table_gfn[PT_MAX_FULL_LEVELS];
 	pt_element_t ptes[PT_MAX_FULL_LEVELS];
 	gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
+	struct page *page;
 	unsigned pt_access;
 	unsigned pte_access;
 	gfn_t gfn;
@@ -203,14 +204,18 @@ walk:
 		--walker->level;
 	}
 
+	walker->page = gfn_to_page(vcpu->kvm, walker->gfn);
+
 	if (write_fault && !is_dirty_pte(pte)) {
 		bool ret;
 
 		mark_page_dirty(vcpu->kvm, table_gfn);
 		ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
 			    pte|PT_DIRTY_MASK);
-		if (ret)
+		if (ret) {
+			kvm_release_page_clean(walker->page);
 			goto walk;
+		}
 		pte |= PT_DIRTY_MASK;
 		mutex_lock(&vcpu->kvm->lock);
 		kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte));
@@ -241,12 +246,13 @@ err:
 	return 0;
 }
 
-static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
+static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *spage,
 			      u64 *spte, const void *pte, int bytes,
 			      int offset_in_pte)
 {
 	pt_element_t gpte;
 	unsigned pte_access;
+	struct page *page;
 
 	gpte = *(const pt_element_t *)pte;
 	if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
@@ -256,10 +262,15 @@ static void FNAME(update_pte)(struct kvm
 	}
 	if (bytes < sizeof(pt_element_t))
 		return;
+
+	page = kvm_find_page(vcpu->kvm, gpte_to_gfn(gpte));
+	if (!page)
+		return;
+
 	pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte);
-	pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte);
-	mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
-		     gpte & PT_DIRTY_MASK, NULL, gpte_to_gfn(gpte));
+	pte_access = spage->role.access & FNAME(gpte_access)(vcpu, gpte);
+	mmu_set_spte(vcpu, spte, spage->role.access, pte_access, 0, 0,
+		     gpte & PT_DIRTY_MASK, NULL, gpte_to_gfn(gpte), page);
 }
 
 /*
@@ -323,8 +334,10 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
 			r = kvm_read_guest_atomic(vcpu->kvm,
 						  walker->pte_gpa[level - 2],
 				       		  &curr_pte, sizeof(curr_pte));
-			if (r || curr_pte != walker->ptes[level - 2])
+			if (r || curr_pte != walker->ptes[level - 2]) {
+				kvm_release_page_clean(walker->page);
 				return NULL;
+			}
 		}
 		shadow_addr = __pa(shadow_page->spt);
 		shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
@@ -335,7 +348,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
 	mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access,
 		     user_fault, write_fault,
 		     walker->ptes[walker->level-1] & PT_DIRTY_MASK,
-		     ptwrite, walker->gfn);
+		     ptwrite, walker->gfn, walker->page);
 
 	return shadow_ent;
 }
@@ -425,6 +438,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kv
 	if (r) {
 		gpa = gfn_to_gpa(walker.gfn);
 		gpa |= vaddr & ~PAGE_MASK;
+		kvm_release_page_clean(walker.page);
 	}
 
 	return gpa;

-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2005.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [patch 3/5] KVM: add kvm_follow_page()
  2007-12-23 20:15                                       ` Marcelo Tosatti
@ 2007-12-23 20:41                                         ` Andrew Morton
  2007-12-24  6:56                                         ` Avi Kivity
  1 sibling, 0 replies; 26+ messages in thread
From: Andrew Morton @ 2007-12-23 20:41 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm-devel, Avi Kivity, Matt Mackall

On Sun, 23 Dec 2007 15:15:25 -0500 Marcelo Tosatti <marcelo-Bw31MaZKKs3YtjvyW6yDsg@public.gmane.org> wrote:

> Are you guys OK with this ?
> 
> 
> Modular KVM needs walk_page_range(), and also vm_normal_page() to be 
> used on pagewalk callback.

I am.

-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2005.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [patch 3/5] KVM: add kvm_follow_page()
  2007-12-23 19:14                   ` Marcelo Tosatti
@ 2007-12-24  6:50                     ` Avi Kivity
       [not found]                       ` <476F56A5.2020607-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
  0 siblings, 1 reply; 26+ messages in thread
From: Avi Kivity @ 2007-12-24  6:50 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm-devel

Marcelo Tosatti wrote:
> On Sun, Dec 23, 2007 at 10:41:07AM +0200, Avi Kivity wrote:
>   
>> Avi Kivity wrote:
>>     
>>> Exactly.  But it is better to be explicit about it and pass the page
>>> directly like you did before.  I hate to make you go back-and-fourth,
>>> but I did not understand the issue completely before.
>>>
>>>       
>> btw, the call to gfn_to_page() can happen in page_fault() instead of
>> walk_addr(); that will reduce the amount of error handling, and will
>> simplify the callers to walk_addr() that don't need the page.
>>     
>
> But the gfn in question is only known at walk_addr() time, so thats not
> possible.
>   

It's just walker->gfn; certainly it is known in page_fault().

-- 
error compiling committee.c: too many arguments to function


-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2005.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [patch 3/5] KVM: add kvm_follow_page()
  2007-12-23 20:15                                       ` Marcelo Tosatti
  2007-12-23 20:41                                         ` Andrew Morton
@ 2007-12-24  6:56                                         ` Avi Kivity
       [not found]                                           ` <476F5801.2030002-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
  1 sibling, 1 reply; 26+ messages in thread
From: Avi Kivity @ 2007-12-24  6:56 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm-devel, Andrew Morton, Matt Mackall

Marcelo Tosatti wrote:
> It can acquire the pagetablelock in the callback handler. But then,
> vm_normal_page() must also be exported.
>
> Are you guys OK with this ?
>
>   

Seems to me that requires fairly detailed mucking in mm details, just to 
get at a page.

I believe that a new get_user_pages_inatomic() is more suitable; 
Andrew, I'll write it if you agree.  Alternatively, walk_page_range() 
should take the lock itself, otherwise it is only usable if you don't 
care about correctness?

>  
> +static int kvm_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
> +		           void *private)
> +{
> +	struct page **page = private;
> +	struct vm_area_struct *vma;
> +	pte_t *ptep, pte;
> +	spinlock_t *ptl;
> +	int err = -EFAULT;
> +
> +	vma = find_vma(current->mm, addr);
> +	if (!vma)
> +		return err;
> +
> +	ptep = pte_offset_map_lock(current->mm, pmd, addr, &ptl);
> +	pte = *ptep;
> +	if (!pte_present(pte))
> +		goto unlock;
> +
> +	*page = vm_normal_page(vma, addr, pte);
> +	if (!*page)
> +		goto unlock;
> +
> +	get_page(*page);
> +	err = 0;
> +unlock:
> +	pte_unmap_unlock(ptep, ptl);
> +	return err;
> +}
> +
>   

See what one has to write to get at a page?  Unreasonable IMO.  And 
that's without hugetlb support AFAICT (kvm will want that soon).

-- 
error compiling committee.c: too many arguments to function


-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2005.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [patch 3/5] KVM: add kvm_follow_page()
       [not found]                                           ` <476F5801.2030002-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
@ 2007-12-24 14:26                                             ` Marcelo Tosatti
  2007-12-24 14:36                                               ` Avi Kivity
  0 siblings, 1 reply; 26+ messages in thread
From: Marcelo Tosatti @ 2007-12-24 14:26 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, kvm-devel, Andrew Morton, Matt Mackall

On Mon, Dec 24, 2007 at 08:56:01AM +0200, Avi Kivity wrote:
> Marcelo Tosatti wrote:
> >It can acquire the pagetablelock in the callback handler. But then,
> >vm_normal_page() must also be exported.
> >
> >Are you guys OK with this ?
> >
> >  
> 
> Seems to me that requires fairly detailed mucking in mm details, just to 
> get at a page.

I agree.

> I believe that a new get_users_pages_inatomic() is more suitable; 
> Andrew, I'll write it if you agree.  Alternatively, walk_page_range() 
> should take the lock itself, otherwise it is only usable if you don't 
> care about correctness?

Why not just export follow_page() ? That is exactly what KVM needs.

> >+static int kvm_pte_range(pmd_t *pmd, unsigned long addr, unsigned long 
> >end,
> >+		           void *private)
> >+{
> >+	struct page **page = private;
> >+	struct vm_area_struct *vma;
> >+	pte_t *ptep, pte;
> >+	spinlock_t *ptl;
> >+	int err = -EFAULT;
> >+
> >+	vma = find_vma(current->mm, addr);
> >+	if (!vma)
> >+		return err;
> >+
> >+	ptep = pte_offset_map_lock(current->mm, pmd, addr, &ptl);
> >+	pte = *ptep;
> >+	if (!pte_present(pte))
> >+		goto unlock;
> >+
> >+	*page = vm_normal_page(vma, addr, pte);
> >+	if (!*page)
> >+		goto unlock;
> >+
> >+	get_page(*page);
> >+	err = 0;
> >+unlock:
> >+	pte_unmap_unlock(ptep, ptl);
> >+	return err;
> >+}
> >+
> >  
> 
> See what one has to write to get at a page?  Unreasonable IMO.  And 
> that's without hugetlb support AFAICT (kvm will want that soon).

Indeed... It's pretty much duplicating what follow_page() already does.


-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2005.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [patch 3/5] KVM: add kvm_follow_page()
       [not found]                       ` <476F56A5.2020607-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
@ 2007-12-24 14:28                         ` Marcelo Tosatti
  2007-12-24 14:34                           ` Avi Kivity
  0 siblings, 1 reply; 26+ messages in thread
From: Marcelo Tosatti @ 2007-12-24 14:28 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, kvm-devel

On Mon, Dec 24, 2007 at 08:50:13AM +0200, Avi Kivity wrote:
> Marcelo Tosatti wrote:
> >On Sun, Dec 23, 2007 at 10:41:07AM +0200, Avi Kivity wrote:
> >  
> >>Avi Kivity wrote:
> >>    
> >>>Exactly.  But it is better to be explicit about it and pass the page
> >>>directly like you did before.  I hate to make you go back-and-fourth,
> >>>but I did not understand the issue completely before.
> >>>
> >>>      
> >>btw, the call to gfn_to_page() can happen in page_fault() instead of
> >>walk_addr(); that will reduce the amount of error handling, and will
> >>simplify the callers to walk_addr() that don't need the page.
> >>    
> >
> >But the gfn in question is only known at walk_addr() time, so thats not
> >possible.
> >  
> 
> It's just walker->gfn; certainly it is known in page_fault().

Oh, you mean to grab walker->gfn before fetch() ?

Yeah, that makes sense.


-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2005.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [patch 3/5] KVM: add kvm_follow_page()
  2007-12-24 14:28                         ` Marcelo Tosatti
@ 2007-12-24 14:34                           ` Avi Kivity
  0 siblings, 0 replies; 26+ messages in thread
From: Avi Kivity @ 2007-12-24 14:34 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm-devel

Marcelo Tosatti wrote:

     

>>>> btw, the call to gfn_to_page() can happen in page_fault() instead of
>>>> walk_addr(); that will reduce the amount of error handling, and will
>>>> simplify the callers to walk_addr() that don't need the page.
>>>>    
>>>>         
>>> But the gfn in question is only known at walk_addr() time, so thats not
>>> possible.
>>>  
>>>       
>> It's just walker->gfn; certainly it is known in page_fault().
>>     
>
> Oh, you mean to grab walker->gfn before fetch() ?
>
>   

Yes.


-- 
error compiling committee.c: too many arguments to function


-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2005.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [patch 3/5] KVM: add kvm_follow_page()
  2007-12-24 14:26                                             ` Marcelo Tosatti
@ 2007-12-24 14:36                                               ` Avi Kivity
  0 siblings, 0 replies; 26+ messages in thread
From: Avi Kivity @ 2007-12-24 14:36 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm-devel, Andrew Morton, Matt Mackall

Marcelo Tosatti wrote:
>
>> I believe that a new get_users_pages_inatomic() is more suitable; 
>> Andrew, I'll write it if you agree.  Alternatively, walk_page_range() 
>> should take the lock itself, otherwise it is only usable if you don't 
>> care about correctness?
>>     
>
> Why not just export follow_page() ? That is exactly what KVM needs.
>
>   

Well, follow_page() needs the vma, and doesn't support large pages.  
It's too low-level to be a public API.

-- 
error compiling committee.c: too many arguments to function


-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2005.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/

^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2007-12-24 14:36 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-12-21  0:18 [patch 0/5] KVM shadow scalability enhancements Marcelo Tosatti
2007-12-21  0:18 ` [patch 1/5] KVM: concurrent guest walkers Marcelo Tosatti
2007-12-21  0:18 ` [patch 2/5] KVM: add kvm_read_guest_atomic() Marcelo Tosatti
2007-12-21  0:18 ` [patch 3/5] KVM: add kvm_follow_page() Marcelo Tosatti
     [not found]   ` <20071221002024.345682789-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2007-12-22 19:42     ` Avi Kivity
     [not found]       ` <476D68A3.3000200-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-12-23  5:25         ` Marcelo Tosatti
2007-12-23  7:03           ` Avi Kivity
     [not found]             ` <476E083B.5040600-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-12-23  8:41               ` Avi Kivity
     [not found]                 ` <476E1F23.3020609-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-12-23  8:59                   ` Avi Kivity
     [not found]                     ` <476E236A.9000800-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-12-23  9:26                       ` Andrew Morton
     [not found]                         ` <20071223012618.e8cf8320.akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org>
2007-12-23 10:35                           ` Avi Kivity
     [not found]                             ` <476E39F2.4030202-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-12-23 10:52                               ` Andrew Morton
     [not found]                                 ` <20071223025245.d839c86d.akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org>
2007-12-23 11:01                                   ` Avi Kivity
     [not found]                                     ` <476E4005.90003-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-12-23 20:15                                       ` Marcelo Tosatti
2007-12-23 20:41                                         ` Andrew Morton
2007-12-24  6:56                                         ` Avi Kivity
     [not found]                                           ` <476F5801.2030002-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-12-24 14:26                                             ` Marcelo Tosatti
2007-12-24 14:36                                               ` Avi Kivity
2007-12-23 19:14                   ` Marcelo Tosatti
2007-12-24  6:50                     ` Avi Kivity
     [not found]                       ` <476F56A5.2020607-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-12-24 14:28                         ` Marcelo Tosatti
2007-12-24 14:34                           ` Avi Kivity
2007-12-21  0:18 ` [patch 4/5] KVM: export follow_page() Marcelo Tosatti
     [not found]   ` <20071221002024.423895760-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2007-12-22 19:43     ` Avi Kivity
2007-12-21  0:18 ` [patch 5/5] KVM: switch to mmu spinlock Marcelo Tosatti
     [not found] ` <20071221001821.029994250-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2007-12-22 19:46   ` [patch 0/5] KVM shadow scalability enhancements Avi Kivity

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox