* [patch 0/5] KVM shadow scalability enhancements
@ 2007-12-21 0:18 Marcelo Tosatti
2007-12-21 0:18 ` [patch 1/5] KVM: concurrent guest walkers Marcelo Tosatti
` (5 more replies)
0 siblings, 6 replies; 26+ messages in thread
From: Marcelo Tosatti @ 2007-12-21 0:18 UTC (permalink / raw)
To: Avi Kivity; +Cc: kvm-devel, mtosatti-H+wXaHxf7aLQT0dZR+AlfA
The following patchset increases KVM shadow scalability by allowing concurrent
guest walking, allocation and instruction emulation.
--
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2005.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
^ permalink raw reply [flat|nested] 26+ messages in thread* [patch 1/5] KVM: concurrent guest walkers 2007-12-21 0:18 [patch 0/5] KVM shadow scalability enhancements Marcelo Tosatti @ 2007-12-21 0:18 ` Marcelo Tosatti 2007-12-21 0:18 ` [patch 2/5] KVM: add kvm_read_guest_atomic() Marcelo Tosatti ` (4 subsequent siblings) 5 siblings, 0 replies; 26+ messages in thread From: Marcelo Tosatti @ 2007-12-21 0:18 UTC (permalink / raw) To: Avi Kivity; +Cc: kvm-devel, mtosatti-H+wXaHxf7aLQT0dZR+AlfA [-- Attachment #1: use-mmap-sem --] [-- Type: text/plain, Size: 17795 bytes --] Do not hold kvm->lock mutex across the entire pagefault code, only acquire it in places where it is necessary, such as mmu hash list, active list, rmap and parent pte handling. Allow concurrent guest walkers by switching walk_addr() to use mmap_sem in read-mode. And get rid of the lockless __gfn_to_page. Signed-off-by: Marcelo Tosatti <mtosatti-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> Index: kvm.quilt/arch/x86/kvm/mmu.c =================================================================== --- kvm.quilt.orig/arch/x86/kvm/mmu.c +++ kvm.quilt/arch/x86/kvm/mmu.c @@ -1026,6 +1026,7 @@ static void mmu_free_roots(struct kvm_vc if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; + mutex_lock(&vcpu->kvm->lock); #ifdef CONFIG_X86_64 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { hpa_t root = vcpu->arch.mmu.root_hpa; @@ -1033,6 +1034,7 @@ static void mmu_free_roots(struct kvm_vc sp = page_header(root); --sp->root_count; vcpu->arch.mmu.root_hpa = INVALID_PAGE; + mutex_unlock(&vcpu->kvm->lock); return; } #endif @@ -1046,6 +1048,7 @@ static void mmu_free_roots(struct kvm_vc } vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; } + mutex_unlock(&vcpu->kvm->lock); vcpu->arch.mmu.root_hpa = INVALID_PAGE; } @@ -1245,15 +1248,15 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) { int r; - mutex_lock(&vcpu->kvm->lock); r = mmu_topup_memory_caches(vcpu); if (r) goto out; + mutex_lock(&vcpu->kvm->lock); mmu_alloc_roots(vcpu); + 
mutex_unlock(&vcpu->kvm->lock); kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); kvm_mmu_flush_tlb(vcpu); out: - mutex_unlock(&vcpu->kvm->lock); return r; } EXPORT_SYMBOL_GPL(kvm_mmu_load); @@ -1420,13 +1423,22 @@ void kvm_mmu_pte_write(struct kvm_vcpu * int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) { - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); + gpa_t gpa; + int r; + + down_read(&current->mm->mmap_sem); + gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); + up_read(&current->mm->mmap_sem); - return kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); + mutex_lock(&vcpu->kvm->lock); + r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); + mutex_unlock(&vcpu->kvm->lock); + return r; } void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) { + mutex_lock(&vcpu->kvm->lock); while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) { struct kvm_mmu_page *sp; @@ -1435,6 +1447,7 @@ void __kvm_mmu_free_some_pages(struct kv kvm_mmu_zap_page(vcpu->kvm, sp); ++vcpu->kvm->stat.mmu_recycled; } + mutex_unlock(&vcpu->kvm->lock); } int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) @@ -1442,7 +1455,6 @@ int kvm_mmu_page_fault(struct kvm_vcpu * int r; enum emulation_result er; - mutex_lock(&vcpu->kvm->lock); r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code); if (r < 0) goto out; @@ -1457,7 +1469,6 @@ int kvm_mmu_page_fault(struct kvm_vcpu * goto out; er = emulate_instruction(vcpu, vcpu->run, cr2, error_code, 0); - mutex_unlock(&vcpu->kvm->lock); switch (er) { case EMULATE_DONE: @@ -1472,7 +1483,6 @@ int kvm_mmu_page_fault(struct kvm_vcpu * BUG(); } out: - mutex_unlock(&vcpu->kvm->lock); return r; } EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); @@ -1569,8 +1579,10 @@ void kvm_mmu_zap_all(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; + mutex_lock(&kvm->lock); list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) kvm_mmu_zap_page(kvm, sp); + mutex_unlock(&kvm->lock); kvm_flush_remote_tlbs(kvm); } Index: 
kvm.quilt/arch/x86/kvm/paging_tmpl.h =================================================================== --- kvm.quilt.orig/arch/x86/kvm/paging_tmpl.h +++ kvm.quilt/arch/x86/kvm/paging_tmpl.h @@ -212,7 +212,9 @@ walk: if (ret) goto walk; pte |= PT_DIRTY_MASK; + mutex_lock(&vcpu->kvm->lock); kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte)); + mutex_unlock(&vcpu->kvm->lock); walker->ptes[walker->level - 1] = pte; } @@ -368,11 +370,13 @@ static int FNAME(page_fault)(struct kvm_ if (r) return r; + down_read(&current->mm->mmap_sem); /* * Look up the shadow pte for the faulting address. */ r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault, fetch_fault); + up_read(&current->mm->mmap_sem); /* * The page is not mapped by the guest. Let the guest handle it. */ @@ -384,6 +388,7 @@ static int FNAME(page_fault)(struct kvm_ return 0; } + mutex_lock(&vcpu->kvm->lock); shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, &write_pt); pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__, @@ -395,11 +400,14 @@ static int FNAME(page_fault)(struct kvm_ /* * mmio: emulate if accessible, otherwise its a guest fault. 
*/ - if (shadow_pte && is_io_pte(*shadow_pte)) + if (shadow_pte && is_io_pte(*shadow_pte)) { + mutex_unlock(&vcpu->kvm->lock); return 1; + } ++vcpu->stat.pf_fixed; kvm_mmu_audit(vcpu, "post page fault (fixed)"); + mutex_unlock(&vcpu->kvm->lock); return write_pt; } Index: kvm.quilt/arch/x86/kvm/vmx.c =================================================================== --- kvm.quilt.orig/arch/x86/kvm/vmx.c +++ kvm.quilt/arch/x86/kvm/vmx.c @@ -1431,27 +1431,34 @@ static int init_rmode_tss(struct kvm *kv { gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT; u16 data = 0; + int ret = 0; int r; + down_read(&current->mm->mmap_sem); r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); if (r < 0) - return 0; + goto out; data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; r = kvm_write_guest_page(kvm, fn++, &data, 0x66, sizeof(u16)); if (r < 0) - return 0; + goto out; r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE); if (r < 0) - return 0; + goto out; r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); if (r < 0) - return 0; + goto out; data = ~0; - r = kvm_write_guest_page(kvm, fn, &data, RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1, - sizeof(u8)); + r = kvm_write_guest_page(kvm, fn, &data, + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1, + sizeof(u8)); if (r < 0) - return 0; - return 1; + goto out; + + ret = 1; +out: + up_read(&current->mm->mmap_sem); + return ret; } static void seg_setup(int seg) @@ -1470,6 +1477,7 @@ static int alloc_apic_access_page(struct int r = 0; mutex_lock(&kvm->lock); + down_write(&current->mm->mmap_sem); if (kvm->arch.apic_access_page) goto out; kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT; @@ -1481,6 +1489,7 @@ static int alloc_apic_access_page(struct goto out; kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00); out: + up_write(&current->mm->mmap_sem); mutex_unlock(&kvm->lock); return r; } Index: kvm.quilt/arch/x86/kvm/x86.c =================================================================== --- kvm.quilt.orig/arch/x86/kvm/x86.c +++ kvm.quilt/arch/x86/kvm/x86.c @@ -180,7 +180,7 @@ int 
load_pdptrs(struct kvm_vcpu *vcpu, u int ret; u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; - mutex_lock(&vcpu->kvm->lock); + down_read(&current->mm->mmap_sem); ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte, offset * sizeof(u64), sizeof(pdpte)); if (ret < 0) { @@ -197,7 +197,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, u memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)); out: - mutex_unlock(&vcpu->kvm->lock); + up_read(&current->mm->mmap_sem); return ret; } @@ -211,13 +211,13 @@ static bool pdptrs_changed(struct kvm_vc if (is_long_mode(vcpu) || !is_pae(vcpu)) return false; - mutex_lock(&vcpu->kvm->lock); + down_read(&current->mm->mmap_sem); r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte)); if (r < 0) goto out; changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0; out: - mutex_unlock(&vcpu->kvm->lock); + up_read(&current->mm->mmap_sem); return changed; } @@ -277,9 +277,7 @@ void set_cr0(struct kvm_vcpu *vcpu, unsi kvm_x86_ops->set_cr0(vcpu, cr0); vcpu->arch.cr0 = cr0; - mutex_lock(&vcpu->kvm->lock); kvm_mmu_reset_context(vcpu); - mutex_unlock(&vcpu->kvm->lock); return; } EXPORT_SYMBOL_GPL(set_cr0); @@ -319,9 +317,7 @@ void set_cr4(struct kvm_vcpu *vcpu, unsi } kvm_x86_ops->set_cr4(vcpu, cr4); vcpu->arch.cr4 = cr4; - mutex_lock(&vcpu->kvm->lock); kvm_mmu_reset_context(vcpu); - mutex_unlock(&vcpu->kvm->lock); } EXPORT_SYMBOL_GPL(set_cr4); @@ -359,7 +355,7 @@ void set_cr3(struct kvm_vcpu *vcpu, unsi */ } - mutex_lock(&vcpu->kvm->lock); + down_read(&current->mm->mmap_sem); /* * Does the new cr3 value map to physical memory? 
(Note, we * catch an invalid cr3 even in real-mode, because it would @@ -375,7 +371,7 @@ void set_cr3(struct kvm_vcpu *vcpu, unsi vcpu->arch.cr3 = cr3; vcpu->arch.mmu.new_cr3(vcpu); } - mutex_unlock(&vcpu->kvm->lock); + up_read(&current->mm->mmap_sem); } EXPORT_SYMBOL_GPL(set_cr3); @@ -1170,12 +1166,12 @@ static int kvm_vm_ioctl_set_nr_mmu_pages if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES) return -EINVAL; - mutex_lock(&kvm->lock); + down_write(&current->mm->mmap_sem); kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; - mutex_unlock(&kvm->lock); + up_write(&current->mm->mmap_sem); return 0; } @@ -1224,7 +1220,7 @@ static int kvm_vm_ioctl_set_memory_alias < alias->target_phys_addr) goto out; - mutex_lock(&kvm->lock); + down_write(&current->mm->mmap_sem); p = &kvm->arch.aliases[alias->slot]; p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; @@ -1238,7 +1234,7 @@ static int kvm_vm_ioctl_set_memory_alias kvm_mmu_zap_all(kvm); - mutex_unlock(&kvm->lock); + up_write(&current->mm->mmap_sem); return 0; @@ -1314,7 +1310,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kv struct kvm_memory_slot *memslot; int is_dirty = 0; - mutex_lock(&kvm->lock); + down_write(&current->mm->mmap_sem); r = kvm_get_dirty_log(kvm, log, &is_dirty); if (r) @@ -1330,7 +1326,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kv } r = 0; out: - mutex_unlock(&kvm->lock); + up_write(&current->mm->mmap_sem); return r; } @@ -1524,25 +1520,32 @@ int emulator_read_std(unsigned long addr struct kvm_vcpu *vcpu) { void *data = val; + int r = X86EMUL_CONTINUE; + down_read(&current->mm->mmap_sem); while (bytes) { gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); unsigned offset = addr & (PAGE_SIZE-1); unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; - if (gpa == UNMAPPED_GVA) - return X86EMUL_PROPAGATE_FAULT; + if (gpa == UNMAPPED_GVA) { + r = X86EMUL_PROPAGATE_FAULT; + goto out; + } ret = kvm_read_guest(vcpu->kvm, gpa, data, tocopy); - if (ret < 0) - return X86EMUL_UNHANDLEABLE; + if (ret < 0) { 
+ r = X86EMUL_UNHANDLEABLE; + goto out; + } bytes -= tocopy; data += tocopy; addr += tocopy; } - - return X86EMUL_CONTINUE; +out: + up_read(&current->mm->mmap_sem); + return r; } EXPORT_SYMBOL_GPL(emulator_read_std); @@ -1560,7 +1563,9 @@ static int emulator_read_emulated(unsign return X86EMUL_CONTINUE; } + down_read(&current->mm->mmap_sem); gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + up_read(&current->mm->mmap_sem); /* For APIC access vmexit */ if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) @@ -1576,11 +1581,14 @@ mmio: /* * Is this MMIO handled locally? */ + mutex_lock(&vcpu->kvm->lock); mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); if (mmio_dev) { kvm_iodevice_read(mmio_dev, gpa, bytes, val); + mutex_unlock(&vcpu->kvm->lock); return X86EMUL_CONTINUE; } + mutex_unlock(&vcpu->kvm->lock); vcpu->mmio_needed = 1; vcpu->mmio_phys_addr = gpa; @@ -1595,10 +1603,16 @@ static int emulator_write_phys(struct kv { int ret; + down_read(&current->mm->mmap_sem); ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); - if (ret < 0) + if (ret < 0) { + up_read(&current->mm->mmap_sem); return 0; + } + mutex_lock(&vcpu->kvm->lock); kvm_mmu_pte_write(vcpu, gpa, val, bytes); + mutex_unlock(&vcpu->kvm->lock); + up_read(&current->mm->mmap_sem); return 1; } @@ -1608,7 +1622,11 @@ static int emulator_write_emulated_onepa struct kvm_vcpu *vcpu) { struct kvm_io_device *mmio_dev; - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + gpa_t gpa; + + down_read(&current->mm->mmap_sem); + gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + up_read(&current->mm->mmap_sem); if (gpa == UNMAPPED_GVA) { kvm_inject_page_fault(vcpu, addr, 2); @@ -1626,11 +1644,14 @@ mmio: /* * Is this MMIO handled locally? 
*/ + mutex_lock(&vcpu->kvm->lock); mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); if (mmio_dev) { kvm_iodevice_write(mmio_dev, gpa, bytes, val); + mutex_unlock(&vcpu->kvm->lock); return X86EMUL_CONTINUE; } + mutex_unlock(&vcpu->kvm->lock); vcpu->mmio_needed = 1; vcpu->mmio_phys_addr = gpa; @@ -1677,11 +1698,14 @@ static int emulator_cmpxchg_emulated(uns #ifndef CONFIG_X86_64 /* guests cmpxchg8b have to be emulated atomically */ if (bytes == 8) { - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + gpa_t gpa; struct page *page; char *addr; u64 *val; + down_read(&current->mm->mmap_sem); + gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + if (gpa == UNMAPPED_GVA || (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) goto emul_write; @@ -1697,6 +1721,7 @@ static int emulator_cmpxchg_emulated(uns kvm_release_page_dirty(page); } emul_write: + up_read(&current->mm->mmap_sem); #endif return emulator_write_emulated(addr, new, bytes, vcpu); @@ -2077,10 +2102,10 @@ int kvm_emulate_pio_string(struct kvm_vc kvm_x86_ops->skip_emulated_instruction(vcpu); for (i = 0; i < nr_pages; ++i) { - mutex_lock(&vcpu->kvm->lock); + down_read(&current->mm->mmap_sem); page = gva_to_page(vcpu, address + i * PAGE_SIZE); vcpu->arch.pio.guest_pages[i] = page; - mutex_unlock(&vcpu->kvm->lock); + up_read(&current->mm->mmap_sem); if (!page) { kvm_inject_gp(vcpu, 0); free_pio_guest_pages(vcpu); @@ -2203,7 +2228,6 @@ int kvm_fix_hypercall(struct kvm_vcpu *v char instruction[3]; int ret = 0; - mutex_lock(&vcpu->kvm->lock); /* * Blow out the MMU to ensure that no other VCPU has an active mapping @@ -2218,8 +2242,6 @@ int kvm_fix_hypercall(struct kvm_vcpu *v != X86EMUL_CONTINUE) ret = -EFAULT; - mutex_unlock(&vcpu->kvm->lock); - return ret; } @@ -2827,13 +2849,13 @@ int kvm_arch_vcpu_ioctl_translate(struct gpa_t gpa; vcpu_load(vcpu); - mutex_lock(&vcpu->kvm->lock); + down_read(&current->mm->mmap_sem); gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr); + up_read(&current->mm->mmap_sem); tr->physical_address = gpa; tr->valid = gpa != UNMAPPED_GVA; 
tr->writeable = 1; tr->usermode = 0; - mutex_unlock(&vcpu->kvm->lock); vcpu_put(vcpu); return 0; @@ -3102,13 +3124,11 @@ int kvm_arch_set_memory_region(struct kv */ if (!user_alloc) { if (npages && !old.rmap) { - down_write(&current->mm->mmap_sem); memslot->userspace_addr = do_mmap(NULL, 0, npages * PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, 0); - up_write(&current->mm->mmap_sem); if (IS_ERR((void *)memslot->userspace_addr)) return PTR_ERR((void *)memslot->userspace_addr); @@ -3116,10 +3136,8 @@ int kvm_arch_set_memory_region(struct kv if (!old.user_alloc && old.rmap) { int ret; - down_write(&current->mm->mmap_sem); ret = do_munmap(current->mm, old.userspace_addr, old.npages * PAGE_SIZE); - up_write(&current->mm->mmap_sem); if (ret < 0) printk(KERN_WARNING "kvm_vm_ioctl_set_memory_region: " Index: kvm.quilt/virt/kvm/kvm_main.c =================================================================== --- kvm.quilt.orig/virt/kvm/kvm_main.c +++ kvm.quilt/virt/kvm/kvm_main.c @@ -227,7 +227,7 @@ static int kvm_vm_release(struct inode * * * Discontiguous memory is allowed, mostly for framebuffers. * - * Must be called holding kvm->lock. + * Must be called holding mmap_sem for write. 
*/ int __kvm_set_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, @@ -338,9 +338,9 @@ int kvm_set_memory_region(struct kvm *kv { int r; - mutex_lock(&kvm->lock); + down_write(&current->mm->mmap_sem); r = __kvm_set_memory_region(kvm, mem, user_alloc); - mutex_unlock(&kvm->lock); + up_write(&current->mm->mmap_sem); return r; } EXPORT_SYMBOL_GPL(kvm_set_memory_region); @@ -456,7 +456,7 @@ static unsigned long gfn_to_hva(struct k /* * Requires current->mm->mmap_sem to be held */ -static struct page *__gfn_to_page(struct kvm *kvm, gfn_t gfn) +struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) { struct page *page[1]; unsigned long addr; @@ -481,17 +481,6 @@ static struct page *__gfn_to_page(struct return page[0]; } -struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) -{ - struct page *page; - - down_read(&current->mm->mmap_sem); - page = __gfn_to_page(kvm, gfn); - up_read(&current->mm->mmap_sem); - - return page; -} - EXPORT_SYMBOL_GPL(gfn_to_page); void kvm_release_page_clean(struct page *page) @@ -977,8 +966,7 @@ static int kvm_vm_fault(struct vm_area_s if (!kvm_is_visible_gfn(kvm, vmf->pgoff)) return VM_FAULT_SIGBUS; - /* current->mm->mmap_sem is already held so call lockless version */ - page = __gfn_to_page(kvm, vmf->pgoff); + page = gfn_to_page(kvm, vmf->pgoff); if (is_error_page(page)) { kvm_release_page_clean(page); return VM_FAULT_SIGBUS; -- ------------------------------------------------------------------------- This SF.net email is sponsored by: Microsoft Defy all challenges. Microsoft(R) Visual Studio 2005. http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/ ^ permalink raw reply [flat|nested] 26+ messages in thread
* [patch 2/5] KVM: add kvm_read_guest_atomic() 2007-12-21 0:18 [patch 0/5] KVM shadow scalability enhancements Marcelo Tosatti 2007-12-21 0:18 ` [patch 1/5] KVM: concurrent guest walkers Marcelo Tosatti @ 2007-12-21 0:18 ` Marcelo Tosatti 2007-12-21 0:18 ` [patch 3/5] KVM: add kvm_follow_page() Marcelo Tosatti ` (3 subsequent siblings) 5 siblings, 0 replies; 26+ messages in thread From: Marcelo Tosatti @ 2007-12-21 0:18 UTC (permalink / raw) To: Avi Kivity; +Cc: kvm-devel, mtosatti-H+wXaHxf7aLQT0dZR+AlfA [-- Attachment #1: mmu-atomicity --] [-- Type: text/plain, Size: 3849 bytes --] In preparation for a mmu spinlock, add kvm_read_guest_atomic() and use it in fetch() and prefetch_page(). Signed-off-by: Marcelo Tosatti <mtosatti-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> Index: kvm.quilt/arch/x86/kvm/paging_tmpl.h =================================================================== --- kvm.quilt.orig/arch/x86/kvm/paging_tmpl.h +++ kvm.quilt/arch/x86/kvm/paging_tmpl.h @@ -318,10 +318,12 @@ static u64 *FNAME(fetch)(struct kvm_vcpu metaphysical, access, shadow_ent, &new_page); if (new_page && !metaphysical) { + int r; pt_element_t curr_pte; - kvm_read_guest(vcpu->kvm, walker->pte_gpa[level - 2], - &curr_pte, sizeof(curr_pte)); - if (curr_pte != walker->ptes[level - 2]) + r = kvm_read_guest_atomic(vcpu->kvm, + walker->pte_gpa[level - 2], + &curr_pte, sizeof(curr_pte)); + if (r || curr_pte != walker->ptes[level - 2]) return NULL; } shadow_addr = __pa(shadow_page->spt); @@ -431,9 +433,8 @@ static gpa_t FNAME(gva_to_gpa)(struct kv static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { - int i, offset = 0; - pt_element_t *gpt; - struct page *page; + int i, offset = 0, r = 0; + pt_element_t pt; if (sp->role.metaphysical || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) { @@ -443,15 +444,18 @@ static void FNAME(prefetch_page)(struct if (PTTYPE == 32) offset = sp->role.quadrant << PT64_LEVEL_BITS; - page = gfn_to_page(vcpu->kvm, sp->gfn); - gpt 
= kmap_atomic(page, KM_USER0); - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) - if (is_present_pte(gpt[offset + i])) + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + gpa_t pte_gpa = gfn_to_gpa(sp->gfn); + pte_gpa += (i+offset) * sizeof(pt_element_t); + + r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &pt, + sizeof(pt_element_t)); + if (r || is_present_pte(pt)) sp->spt[i] = shadow_trap_nonpresent_pte; else sp->spt[i] = shadow_notrap_nonpresent_pte; - kunmap_atomic(gpt, KM_USER0); - kvm_release_page_clean(page); + } } #undef pt_element_t Index: kvm.quilt/include/linux/kvm_host.h =================================================================== --- kvm.quilt.orig/include/linux/kvm_host.h +++ kvm.quilt/include/linux/kvm_host.h @@ -167,6 +167,8 @@ void kvm_release_page_clean(struct page void kvm_release_page_dirty(struct page *page); int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, int len); +int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, + unsigned long len); int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len); int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, int offset, int len); Index: kvm.quilt/virt/kvm/kvm_main.c =================================================================== --- kvm.quilt.orig/virt/kvm/kvm_main.c +++ kvm.quilt/virt/kvm/kvm_main.c @@ -541,6 +541,26 @@ int kvm_read_guest(struct kvm *kvm, gpa_ } EXPORT_SYMBOL_GPL(kvm_read_guest); +int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, + unsigned long len) +{ + int r; + unsigned long addr; + gfn_t gfn = gpa >> PAGE_SHIFT; + int offset = offset_in_page(gpa); + + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) + return -EFAULT; + pagefault_disable(); + r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len); + pagefault_enable(); + if (r) + return -EFAULT; + return 0; +} +EXPORT_SYMBOL(kvm_read_guest_atomic); + int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void 
*data, int offset, int len) { -- ------------------------------------------------------------------------- This SF.net email is sponsored by: Microsoft Defy all challenges. Microsoft(R) Visual Studio 2005. http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/ ^ permalink raw reply [flat|nested] 26+ messages in thread
* [patch 3/5] KVM: add kvm_follow_page() 2007-12-21 0:18 [patch 0/5] KVM shadow scalability enhancements Marcelo Tosatti 2007-12-21 0:18 ` [patch 1/5] KVM: concurrent guest walkers Marcelo Tosatti 2007-12-21 0:18 ` [patch 2/5] KVM: add kvm_read_guest_atomic() Marcelo Tosatti @ 2007-12-21 0:18 ` Marcelo Tosatti [not found] ` <20071221002024.345682789-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> 2007-12-21 0:18 ` [patch 4/5] KVM: export follow_page() Marcelo Tosatti ` (2 subsequent siblings) 5 siblings, 1 reply; 26+ messages in thread From: Marcelo Tosatti @ 2007-12-21 0:18 UTC (permalink / raw) To: Avi Kivity; +Cc: kvm-devel, mtosatti-H+wXaHxf7aLQT0dZR+AlfA [-- Attachment #1: mmu-setpte-atomic --] [-- Type: text/plain, Size: 4557 bytes --] In preparation for a mmu spinlock, avoid schedules in mmu_set_spte() by using follow_page() instead of get_user_pages(). The fault handling work by get_user_pages() now happens outside the lock. Signed-off-by: Marcelo Tosatti <mtosatti-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> Index: kvm.quilt/arch/x86/kvm/mmu.c =================================================================== --- kvm.quilt.orig/arch/x86/kvm/mmu.c +++ kvm.quilt/arch/x86/kvm/mmu.c @@ -907,7 +907,9 @@ static void mmu_set_spte(struct kvm_vcpu if (!(pte_access & ACC_EXEC_MASK)) spte |= PT64_NX_MASK; - page = gfn_to_page(vcpu->kvm, gfn); + page = kvm_follow_page(vcpu->kvm, gfn); + if (!page) + return; spte |= PT_PRESENT_MASK; if (pte_access & ACC_USER_MASK) @@ -983,8 +985,11 @@ static int nonpaging_map(struct kvm_vcpu table = __va(table_addr); if (level == 1) { + struct page *page; + page = gfn_to_page(vcpu->kvm, gfn); mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL, 0, write, 1, &pt_write, gfn); + kvm_release_page_clean(page); return pt_write || is_io_pte(table[index]); } Index: kvm.quilt/include/linux/kvm_host.h =================================================================== --- kvm.quilt.orig/include/linux/kvm_host.h +++ kvm.quilt/include/linux/kvm_host.h @@ 
-163,6 +163,7 @@ int kvm_arch_set_memory_region(struct kv int user_alloc); gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn); struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn); +struct page *kvm_follow_page(struct kvm *kvm, gfn_t gfn); void kvm_release_page_clean(struct page *page); void kvm_release_page_dirty(struct page *page); int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, Index: kvm.quilt/virt/kvm/kvm_main.c =================================================================== --- kvm.quilt.orig/virt/kvm/kvm_main.c +++ kvm.quilt/virt/kvm/kvm_main.c @@ -453,6 +453,25 @@ static unsigned long gfn_to_hva(struct k return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE); } +struct page *kvm_follow_page(struct kvm *kvm, gfn_t gfn) +{ + unsigned long addr; + struct vm_area_struct *vma; + + addr = gfn_to_hva(kvm, gfn); + /* MMIO access */ + if (kvm_is_error_hva(addr)) { + get_page(bad_page); + return bad_page; + } + + vma = find_vma(current->mm, addr); + if (!vma) + return NULL; + + return follow_page(vma, addr, FOLL_GET|FOLL_TOUCH); +} + /* * Requires current->mm->mmap_sem to be held */ Index: kvm.quilt/arch/x86/kvm/paging_tmpl.h =================================================================== --- kvm.quilt.orig/arch/x86/kvm/paging_tmpl.h +++ kvm.quilt/arch/x86/kvm/paging_tmpl.h @@ -67,6 +67,7 @@ struct guest_walker { gfn_t table_gfn[PT_MAX_FULL_LEVELS]; pt_element_t ptes[PT_MAX_FULL_LEVELS]; gpa_t pte_gpa[PT_MAX_FULL_LEVELS]; + struct page *page; unsigned pt_access; unsigned pte_access; gfn_t gfn; @@ -203,14 +204,18 @@ walk: --walker->level; } + walker->page = gfn_to_page(vcpu->kvm, walker->gfn); + if (write_fault && !is_dirty_pte(pte)) { bool ret; mark_page_dirty(vcpu->kvm, table_gfn); ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte, pte|PT_DIRTY_MASK); - if (ret) + if (ret) { + kvm_release_page_clean(walker->page); goto walk; + } pte |= PT_DIRTY_MASK; mutex_lock(&vcpu->kvm->lock); kvm_mmu_pte_write(vcpu, pte_gpa, 
(u8 *)&pte, sizeof(pte)); @@ -323,8 +328,10 @@ static u64 *FNAME(fetch)(struct kvm_vcpu r = kvm_read_guest_atomic(vcpu->kvm, walker->pte_gpa[level - 2], &curr_pte, sizeof(curr_pte)); - if (r || curr_pte != walker->ptes[level - 2]) - return NULL; + if (r || curr_pte != walker->ptes[level - 2]) { + shadow_ent = NULL; + goto out; + } } shadow_addr = __pa(shadow_page->spt); shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK @@ -336,7 +343,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu user_fault, write_fault, walker->ptes[walker->level-1] & PT_DIRTY_MASK, ptwrite, walker->gfn); - +out: + kvm_release_page_clean(walker->page); return shadow_ent; } @@ -425,6 +433,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kv if (r) { gpa = gfn_to_gpa(walker.gfn); gpa |= vaddr & ~PAGE_MASK; + kvm_release_page_clean(walker.page); } return gpa; -- ------------------------------------------------------------------------- This SF.net email is sponsored by: Microsoft Defy all challenges. Microsoft(R) Visual Studio 2005. http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/ ^ permalink raw reply [flat|nested] 26+ messages in thread
[parent not found: <20071221002024.345682789-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>]
* Re: [patch 3/5] KVM: add kvm_follow_page() [not found] ` <20071221002024.345682789-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> @ 2007-12-22 19:42 ` Avi Kivity [not found] ` <476D68A3.3000200-atKUWr5tajBWk0Htik3J/w@public.gmane.org> 0 siblings, 1 reply; 26+ messages in thread From: Avi Kivity @ 2007-12-22 19:42 UTC (permalink / raw) To: Marcelo Tosatti; +Cc: kvm-devel Marcelo Tosatti wrote: > In preparation for a mmu spinlock, avoid schedules in mmu_set_spte() > by using follow_page() instead of get_user_pages(). > > The fault handling work by get_user_pages() now happens outside the lock. > > Signed-off-by: Marcelo Tosatti <mtosatti-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> > > Index: kvm.quilt/arch/x86/kvm/mmu.c > @@ -983,8 +985,11 @@ static int nonpaging_map(struct kvm_vcpu > table = __va(table_addr); > > if (level == 1) { > + struct page *page; > + page = gfn_to_page(vcpu->kvm, gfn); > mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL, > 0, write, 1, &pt_write, gfn); > + kvm_release_page_clean(page); > return pt_write || is_io_pte(table[index]); > } Is this hunk necessary? It will swap in the page in case it's swapped out, but not sure we care about performance in that case. > n.c > @@ -453,6 +453,25 @@ static unsigned long gfn_to_hva(struct k > return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE); > } > > +struct page *kvm_follow_page(struct kvm *kvm, gfn_t gfn) > +{ > + unsigned long addr; > + struct vm_area_struct *vma; > + > + addr = gfn_to_hva(kvm, gfn); > + /* MMIO access */ > + if (kvm_is_error_hva(addr)) { > + get_page(bad_page); > + return bad_page; > + } > + > + vma = find_vma(current->mm, addr); > + if (!vma) > + return NULL; > + > + return follow_page(vma, addr, FOLL_GET|FOLL_TOUCH); > +} It may be better to create get_user_page_inatomic() as the last four lines rather than exporting follow_page(). Maybe an mm maintainer can comment. 
> --- kvm.quilt.orig/arch/x86/kvm/paging_tmpl.h > +++ kvm.quilt/arch/x86/kvm/paging_tmpl.h > @@ -67,6 +67,7 @@ struct guest_walker { > gfn_t table_gfn[PT_MAX_FULL_LEVELS]; > pt_element_t ptes[PT_MAX_FULL_LEVELS]; > gpa_t pte_gpa[PT_MAX_FULL_LEVELS]; > + struct page *page; > unsigned pt_access; > unsigned pte_access; > gfn_t gfn; > @@ -203,14 +204,18 @@ walk: > --walker->level; > } > > + walker->page = gfn_to_page(vcpu->kvm, walker->gfn); > + > if (write_fault && !is_dirty_pte(pte)) { > bool ret; > > mark_page_dirty(vcpu->kvm, table_gfn); > ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte, > pte|PT_DIRTY_MASK); > - if (ret) > + if (ret) { > + kvm_release_page_clean(walker->page); > goto walk; > + } > pte |= PT_DIRTY_MASK; > mutex_lock(&vcpu->kvm->lock); > kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte)); > @@ -323,8 +328,10 @@ static u64 *FNAME(fetch)(struct kvm_vcpu > r = kvm_read_guest_atomic(vcpu->kvm, > walker->pte_gpa[level - 2], > &curr_pte, sizeof(curr_pte)); > - if (r || curr_pte != walker->ptes[level - 2]) > - return NULL; > + if (r || curr_pte != walker->ptes[level - 2]) { > + shadow_ent = NULL; > + goto out; > + } > } > shadow_addr = __pa(shadow_page->spt); > shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK > @@ -336,7 +343,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu > user_fault, write_fault, > walker->ptes[walker->level-1] & PT_DIRTY_MASK, > ptwrite, walker->gfn); > - > +out: > + kvm_release_page_clean(walker->page); > return shadow_ent; > } > walker->page is never used, so I think it can be completely dropped. mmu_set_spte() already does the right thing without its help. -- Do not meddle in the internals of kernels, for they are subtle and quick to panic. ------------------------------------------------------------------------- This SF.net email is sponsored by: Microsoft Defy all challenges. Microsoft(R) Visual Studio 2005. 
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/ ^ permalink raw reply [flat|nested] 26+ messages in thread
[parent not found: <476D68A3.3000200-atKUWr5tajBWk0Htik3J/w@public.gmane.org>]
* Re: [patch 3/5] KVM: add kvm_follow_page() [not found] ` <476D68A3.3000200-atKUWr5tajBWk0Htik3J/w@public.gmane.org> @ 2007-12-23 5:25 ` Marcelo Tosatti 2007-12-23 7:03 ` Avi Kivity 0 siblings, 1 reply; 26+ messages in thread From: Marcelo Tosatti @ 2007-12-23 5:25 UTC (permalink / raw) To: Avi Kivity, Andrew Morton; +Cc: kvm-devel Hi Avi, Andrew, please see the comments on the need for a atomic get_user_pages() below. On Sat, Dec 22, 2007 at 09:42:27PM +0200, Avi Kivity wrote: > Marcelo Tosatti wrote: > > In preparation for a mmu spinlock, avoid schedules in mmu_set_spte() > > by using follow_page() instead of get_user_pages(). > > > > The fault handling work by get_user_pages() now happens outside the lock. > > > > Signed-off-by: Marcelo Tosatti <mtosatti-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> > > > > Index: kvm.quilt/arch/x86/kvm/mmu.c > > @@ -983,8 +985,11 @@ static int nonpaging_map(struct kvm_vcpu > > table = __va(table_addr); > > > > if (level == 1) { > > + struct page *page; > > + page = gfn_to_page(vcpu->kvm, gfn); > > mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL, > > 0, write, 1, &pt_write, gfn); > > + kvm_release_page_clean(page); > > return pt_write || is_io_pte(table[index]); > > } > > Is this hunk necessary? It will swap in the page in case it's swapped > out, but not sure we care about performance in that case. The issue is that the mmap call done by QEMU won't instantiate the pte's of the guest memory area. So it needs to either do that get_user_pages()->handle_mm_fault() sequence outside mmu_set_spte() or mmap() has to pass MAP_POPULATE, prefaulting all guest pages (which is highly suboptimal). 
> > n.c > > @@ -453,6 +453,25 @@ static unsigned long gfn_to_hva(struct k > > return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE); > > } > > > > +struct page *kvm_follow_page(struct kvm *kvm, gfn_t gfn) > > +{ > > + unsigned long addr; > > + struct vm_area_struct *vma; > > + > > + addr = gfn_to_hva(kvm, gfn); > > + /* MMIO access */ > > + if (kvm_is_error_hva(addr)) { > > + get_page(bad_page); > > + return bad_page; > > + } > > + > > + vma = find_vma(current->mm, addr); > > + if (!vma) > > + return NULL; > > + > > + return follow_page(vma, addr, FOLL_GET|FOLL_TOUCH); > > +} > > It may be better to create get_user_page_inatomic() as the last four > lines rather than exporting follow_page(). Maybe an mm maintainer can > comment. Andrew, KVM needs to find the struct page which maps to a certain address in the QEMU process without sleeping since that lookup is performed with a spinlock held. So it needs to either export follow_page() or use a get_user_pages() variant which doesnt call cond_resched() or handle_mm_fault(), simply failing if the page is not pagetable mapped. What you think? 
> > --- kvm.quilt.orig/arch/x86/kvm/paging_tmpl.h > > +++ kvm.quilt/arch/x86/kvm/paging_tmpl.h > > @@ -67,6 +67,7 @@ struct guest_walker { > > gfn_t table_gfn[PT_MAX_FULL_LEVELS]; > > pt_element_t ptes[PT_MAX_FULL_LEVELS]; > > gpa_t pte_gpa[PT_MAX_FULL_LEVELS]; > > + struct page *page; > > unsigned pt_access; > > unsigned pte_access; > > gfn_t gfn; > > @@ -203,14 +204,18 @@ walk: > > --walker->level; > > } > > > > + walker->page = gfn_to_page(vcpu->kvm, walker->gfn); > > + > > if (write_fault && !is_dirty_pte(pte)) { > > bool ret; > > > > mark_page_dirty(vcpu->kvm, table_gfn); > > ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte, > > pte|PT_DIRTY_MASK); > > - if (ret) > > + if (ret) { > > + kvm_release_page_clean(walker->page); > > goto walk; > > + } > > pte |= PT_DIRTY_MASK; > > mutex_lock(&vcpu->kvm->lock); > > kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte)); > > @@ -323,8 +328,10 @@ static u64 *FNAME(fetch)(struct kvm_vcpu > > r = kvm_read_guest_atomic(vcpu->kvm, > > walker->pte_gpa[level - 2], > > &curr_pte, sizeof(curr_pte)); > > - if (r || curr_pte != walker->ptes[level - 2]) > > - return NULL; > > + if (r || curr_pte != walker->ptes[level - 2]) { > > + shadow_ent = NULL; > > + goto out; > > + } > > } > > shadow_addr = __pa(shadow_page->spt); > > shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK > > @@ -336,7 +343,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu > > user_fault, write_fault, > > walker->ptes[walker->level-1] & PT_DIRTY_MASK, > > ptwrite, walker->gfn); > > - > > +out: > > + kvm_release_page_clean(walker->page); > > return shadow_ent; > > } > > > > walker->page is never used, so I think it can be completely dropped. > mmu_set_spte() already does the right thing without its help. The page being mapped by the guest needs to be faulted in by someone... So while walker->page is unused, it guarantees that the handler will properly fault in the pagetables to know which physical address the page resides at. ie. 
it is the place which guarantees that the page gets mapped if it's already in memory or swapped in and then mapped. Thanks. > -- > Do not meddle in the internals of kernels, for they are subtle and quick to panic. ------------------------------------------------------------------------- This SF.net email is sponsored by: Microsoft Defy all challenges. Microsoft(R) Visual Studio 2005. http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/ ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [patch 3/5] KVM: add kvm_follow_page() 2007-12-23 5:25 ` Marcelo Tosatti @ 2007-12-23 7:03 ` Avi Kivity [not found] ` <476E083B.5040600-atKUWr5tajBWk0Htik3J/w@public.gmane.org> 0 siblings, 1 reply; 26+ messages in thread From: Avi Kivity @ 2007-12-23 7:03 UTC (permalink / raw) To: Marcelo Tosatti; +Cc: kvm-devel, Andrew Morton Marcelo Tosatti wrote: > Hi Avi, > > Andrew, please see the comments on the need for a atomic get_user_pages() > below. > > On Sat, Dec 22, 2007 at 09:42:27PM +0200, Avi Kivity wrote: > >> Marcelo Tosatti wrote: >> >>> In preparation for a mmu spinlock, avoid schedules in mmu_set_spte() >>> by using follow_page() instead of get_user_pages(). >>> >>> The fault handling work by get_user_pages() now happens outside the lock. >>> >>> Signed-off-by: Marcelo Tosatti <mtosatti-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> >>> >>> Index: kvm.quilt/arch/x86/kvm/mmu.c >>> @@ -983,8 +985,11 @@ static int nonpaging_map(struct kvm_vcpu >>> table = __va(table_addr); >>> >>> if (level == 1) { >>> + struct page *page; >>> + page = gfn_to_page(vcpu->kvm, gfn); >>> mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL, >>> 0, write, 1, &pt_write, gfn); >>> + kvm_release_page_clean(page); >>> return pt_write || is_io_pte(table[index]); >>> } >>> >> Is this hunk necessary? It will swap in the page in case it's swapped >> out, but not sure we care about performance in that case. >> > > The issue is that the mmap call done by QEMU won't instantiate > the pte's of the guest memory area. So it needs to either do that > get_user_pages()->handle_mm_fault() sequence outside mmu_set_spte() But that's only on the very first access to the page. Once the page has been accessed by the guest, it will be mapped in the host page table (unless it's subsequently swapped out). Oh, but the page will never be faulted in this way because the path that's supposed to map it is mmu_set_spte(). 
In that case, your original implementation of passing in page directly is better (instead of being forced to pass it indirectly via the page tables). Sorry about generating this confusion. > or > mmap() has to pass MAP_POPULATE, prefaulting all guest pages (which is > highly suboptimal). > Actually, MAP_POPULATE is optimal since it neatly batches all of the work. I don't think we should do it since it gives the appearance of being slower (esp. with lots of guests memory). > >>> n.c >>> @@ -453,6 +453,25 @@ static unsigned long gfn_to_hva(struct k >>> return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE); >>> } >>> >>> +struct page *kvm_follow_page(struct kvm *kvm, gfn_t gfn) >>> +{ >>> + unsigned long addr; >>> + struct vm_area_struct *vma; >>> + >>> + addr = gfn_to_hva(kvm, gfn); >>> + /* MMIO access */ >>> + if (kvm_is_error_hva(addr)) { >>> + get_page(bad_page); >>> + return bad_page; >>> + } >>> + >>> + vma = find_vma(current->mm, addr); >>> + if (!vma) >>> + return NULL; >>> + >>> + return follow_page(vma, addr, FOLL_GET|FOLL_TOUCH); >>> +} >>> >> It may be better to create get_user_page_inatomic() as the last four >> lines rather than exporting follow_page(). Maybe an mm maintainer can >> comment. >> > > Andrew, KVM needs to find the struct page which maps to a certain > address in the QEMU process without sleeping since that lookup is > performed with a spinlock held. > > So it needs to either export follow_page() or use a get_user_pages() variant > which doesnt call cond_resched() or handle_mm_fault(), simply failing > if the page is not pagetable mapped. > > What you think? 
> > [We need to pass FOLL_WRITE to follow_page()] >>> --- kvm.quilt.orig/arch/x86/kvm/paging_tmpl.h >>> +++ kvm.quilt/arch/x86/kvm/paging_tmpl.h >>> @@ -67,6 +67,7 @@ struct guest_walker { >>> gfn_t table_gfn[PT_MAX_FULL_LEVELS]; >>> pt_element_t ptes[PT_MAX_FULL_LEVELS]; >>> gpa_t pte_gpa[PT_MAX_FULL_LEVELS]; >>> + struct page *page; >>> unsigned pt_access; >>> unsigned pte_access; >>> gfn_t gfn; >>> @@ -203,14 +204,18 @@ walk: >>> --walker->level; >>> } >>> >>> + walker->page = gfn_to_page(vcpu->kvm, walker->gfn); >>> + >>> if (write_fault && !is_dirty_pte(pte)) { >>> bool ret; >>> >>> mark_page_dirty(vcpu->kvm, table_gfn); >>> ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte, >>> pte|PT_DIRTY_MASK); >>> - if (ret) >>> + if (ret) { >>> + kvm_release_page_clean(walker->page); >>> goto walk; >>> + } >>> pte |= PT_DIRTY_MASK; >>> mutex_lock(&vcpu->kvm->lock); >>> kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte)); >>> @@ -323,8 +328,10 @@ static u64 *FNAME(fetch)(struct kvm_vcpu >>> r = kvm_read_guest_atomic(vcpu->kvm, >>> walker->pte_gpa[level - 2], >>> &curr_pte, sizeof(curr_pte)); >>> - if (r || curr_pte != walker->ptes[level - 2]) >>> - return NULL; >>> + if (r || curr_pte != walker->ptes[level - 2]) { >>> + shadow_ent = NULL; >>> + goto out; >>> + } >>> } >>> shadow_addr = __pa(shadow_page->spt); >>> shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK >>> @@ -336,7 +343,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu >>> user_fault, write_fault, >>> walker->ptes[walker->level-1] & PT_DIRTY_MASK, >>> ptwrite, walker->gfn); >>> - >>> +out: >>> + kvm_release_page_clean(walker->page); >>> return shadow_ent; >>> } >>> >>> >> walker->page is never used, so I think it can be completely dropped. >> mmu_set_spte() already does the right thing without its help. >> > > The page being mapped by the guest needs to be faulted in by someone... 
> So while walker->page is unused, it guarantees that the handler will > properly fault in the pagetables to know which physical address the page > resides at. ie. it is the place which guarantees that the page gets > mapped if its already in memory or swapped in and then mapped. Exactly. But it is better to be explicit about it and pass the page directly like you did before. I hate to make you go back-and-forth, but I did not understand the issue completely before. -- Any sufficiently difficult bug is indistinguishable from a feature. ------------------------------------------------------------------------- This SF.net email is sponsored by: Microsoft Defy all challenges. Microsoft(R) Visual Studio 2005. http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/ ^ permalink raw reply [flat|nested] 26+ messages in thread
[parent not found: <476E083B.5040600-atKUWr5tajBWk0Htik3J/w@public.gmane.org>]
* Re: [patch 3/5] KVM: add kvm_follow_page() [not found] ` <476E083B.5040600-atKUWr5tajBWk0Htik3J/w@public.gmane.org> @ 2007-12-23 8:41 ` Avi Kivity [not found] ` <476E1F23.3020609-atKUWr5tajBWk0Htik3J/w@public.gmane.org> 0 siblings, 1 reply; 26+ messages in thread From: Avi Kivity @ 2007-12-23 8:41 UTC (permalink / raw) To: Marcelo Tosatti; +Cc: kvm-devel, Andrew Morton Avi Kivity wrote: > > Exactly. But it is better to be explicit about it and pass the page > directly like you did before. I hate to make you go back-and-fourth, > but I did not understand the issue completely before. > btw, the call to gfn_to_page() can happen in page_fault() instead of walk_addr(); that will reduce the amount of error handling, and will simplify the callers to walk_addr() that don't need the page. -- Do not meddle in the internals of kernels, for they are subtle and quick to panic. ------------------------------------------------------------------------- This SF.net email is sponsored by: Microsoft Defy all challenges. Microsoft(R) Visual Studio 2005. http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/ ^ permalink raw reply [flat|nested] 26+ messages in thread
[parent not found: <476E1F23.3020609-atKUWr5tajBWk0Htik3J/w@public.gmane.org>]
* Re: [patch 3/5] KVM: add kvm_follow_page() [not found] ` <476E1F23.3020609-atKUWr5tajBWk0Htik3J/w@public.gmane.org> @ 2007-12-23 8:59 ` Avi Kivity [not found] ` <476E236A.9000800-atKUWr5tajBWk0Htik3J/w@public.gmane.org> 2007-12-23 19:14 ` Marcelo Tosatti 1 sibling, 1 reply; 26+ messages in thread From: Avi Kivity @ 2007-12-23 8:59 UTC (permalink / raw) To: Marcelo Tosatti; +Cc: kvm-devel, Andrew Morton Avi Kivity wrote: > Avi Kivity wrote: > >> Exactly. But it is better to be explicit about it and pass the page >> directly like you did before. I hate to make you go back-and-fourth, >> but I did not understand the issue completely before. >> >> > > btw, the call to gfn_to_page() can happen in page_fault() instead of > walk_addr(); that will reduce the amount of error handling, and will > simplify the callers to walk_addr() that don't need the page. > > Note further that all this doesn't obviate the need for follow_page() (or get_user_pages_inatomic()); we still need something in update_pte() for the demand paging case. -- Do not meddle in the internals of kernels, for they are subtle and quick to panic. ------------------------------------------------------------------------- This SF.net email is sponsored by: Microsoft Defy all challenges. Microsoft(R) Visual Studio 2005. http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/ ^ permalink raw reply [flat|nested] 26+ messages in thread
[parent not found: <476E236A.9000800-atKUWr5tajBWk0Htik3J/w@public.gmane.org>]
* Re: [patch 3/5] KVM: add kvm_follow_page() [not found] ` <476E236A.9000800-atKUWr5tajBWk0Htik3J/w@public.gmane.org> @ 2007-12-23 9:26 ` Andrew Morton [not found] ` <20071223012618.e8cf8320.akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org> 0 siblings, 1 reply; 26+ messages in thread From: Andrew Morton @ 2007-12-23 9:26 UTC (permalink / raw) To: Avi Kivity; +Cc: Marcelo Tosatti, kvm-devel On Sun, 23 Dec 2007 10:59:22 +0200 Avi Kivity <avi-atKUWr5tajBWk0Htik3J/w@public.gmane.org> wrote: > Avi Kivity wrote: > > Avi Kivity wrote: > > > >> Exactly. But it is better to be explicit about it and pass the page > >> directly like you did before. I hate to make you go back-and-fourth, > >> but I did not understand the issue completely before. > >> > >> > > > > btw, the call to gfn_to_page() can happen in page_fault() instead of > > walk_addr(); that will reduce the amount of error handling, and will > > simplify the callers to walk_addr() that don't need the page. > > > > > > Note further that all this doesn't obviate the need for follow_page() > (or get_user_pages_inatomic()); we still need something in update_pte() > for the demand paging case. Please review -mm's mm/pagewalk.c for suitability. If is is unsuitable but repairable then please cc Matt Mackall <mpm-VDJrAJ4Gl5ZBDgjK7y7TUQ@public.gmane.org> on the review. ------------------------------------------------------------------------- This SF.net email is sponsored by: Microsoft Defy all challenges. Microsoft(R) Visual Studio 2005. http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/ ^ permalink raw reply [flat|nested] 26+ messages in thread
[parent not found: <20071223012618.e8cf8320.akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org>]
* Re: [patch 3/5] KVM: add kvm_follow_page() [not found] ` <20071223012618.e8cf8320.akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org> @ 2007-12-23 10:35 ` Avi Kivity [not found] ` <476E39F2.4030202-atKUWr5tajBWk0Htik3J/w@public.gmane.org> 0 siblings, 1 reply; 26+ messages in thread From: Avi Kivity @ 2007-12-23 10:35 UTC (permalink / raw) To: Andrew Morton; +Cc: Marcelo Tosatti, kvm-devel, Matt Mackall Andrew Morton wrote: > On Sun, 23 Dec 2007 10:59:22 +0200 Avi Kivity <avi-atKUWr5tajBWk0Htik3J/w@public.gmane.org> wrote: > > >> Avi Kivity wrote: >> >>> Avi Kivity wrote: >>> >>> >>>> Exactly. But it is better to be explicit about it and pass the page >>>> directly like you did before. I hate to make you go back-and-fourth, >>>> but I did not understand the issue completely before. >>>> >>>> >>>> >>> btw, the call to gfn_to_page() can happen in page_fault() instead of >>> walk_addr(); that will reduce the amount of error handling, and will >>> simplify the callers to walk_addr() that don't need the page. >>> >>> >>> >> Note further that all this doesn't obviate the need for follow_page() >> (or get_user_pages_inatomic()); we still need something in update_pte() >> for the demand paging case. >> > > Please review -mm's mm/pagewalk.c for suitability. > > If is is unsuitable but repairable then please cc Matt Mackall > <mpm-VDJrAJ4Gl5ZBDgjK7y7TUQ@public.gmane.org> on the review. > > The "no locks are taken" comment is very worrying. We need accurate results. Getting pte_t's in the callbacks is a little too low level for kvm's use (which wants struct page pointers) but of course that easily handled in a kvm wrapper. I'd prefer an atomic version of get_user_pages(), but if pagewalk is fixed to take the necessary locks, it will do. -- error compiling committee.c: too many arguments to function ------------------------------------------------------------------------- This SF.net email is sponsored by: Microsoft Defy all challenges. Microsoft(R) Visual Studio 2005. 
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/ ^ permalink raw reply [flat|nested] 26+ messages in thread
[parent not found: <476E39F2.4030202-atKUWr5tajBWk0Htik3J/w@public.gmane.org>]
* Re: [patch 3/5] KVM: add kvm_follow_page() [not found] ` <476E39F2.4030202-atKUWr5tajBWk0Htik3J/w@public.gmane.org> @ 2007-12-23 10:52 ` Andrew Morton [not found] ` <20071223025245.d839c86d.akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org> 0 siblings, 1 reply; 26+ messages in thread From: Andrew Morton @ 2007-12-23 10:52 UTC (permalink / raw) To: Avi Kivity; +Cc: Marcelo Tosatti, kvm-devel, Matt Mackall On Sun, 23 Dec 2007 12:35:30 +0200 Avi Kivity <avi-atKUWr5tajBWk0Htik3J/w@public.gmane.org> wrote: > Andrew Morton wrote: > > On Sun, 23 Dec 2007 10:59:22 +0200 Avi Kivity <avi-atKUWr5tajBWk0Htik3J/w@public.gmane.org> wrote: > > > > > >> Avi Kivity wrote: > >> > >>> Avi Kivity wrote: > >>> > >>> > >>>> Exactly. But it is better to be explicit about it and pass the page > >>>> directly like you did before. I hate to make you go back-and-fourth, > >>>> but I did not understand the issue completely before. > >>>> > >>>> > >>>> > >>> btw, the call to gfn_to_page() can happen in page_fault() instead of > >>> walk_addr(); that will reduce the amount of error handling, and will > >>> simplify the callers to walk_addr() that don't need the page. > >>> > >>> > >>> > >> Note further that all this doesn't obviate the need for follow_page() > >> (or get_user_pages_inatomic()); we still need something in update_pte() > >> for the demand paging case. > >> > > > > Please review -mm's mm/pagewalk.c for suitability. > > > > If is is unsuitable but repairable then please cc Matt Mackall > > <mpm-VDJrAJ4Gl5ZBDgjK7y7TUQ@public.gmane.org> on the review. > > > > > > The "no locks are taken" comment is very worrying. We need accurate > results. take down_read(mm->mmap_sem) before calling it.. You have to do that anyway for its results to be meaningful in the caller. Ditto get_user_pages(). > Getting pte_t's in the callbacks is a little too low level for kvm's use > (which wants struct page pointers) but of course that easily handled in > a kvm wrapper. 
> > I'd prefer an atomic version of get_user_pages(), but if pagewalk is > fixed to take the necessary locks, it will do. It isn't exported to modules at present, although I see no problem in changing that. ------------------------------------------------------------------------- This SF.net email is sponsored by: Microsoft Defy all challenges. Microsoft(R) Visual Studio 2005. http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/ ^ permalink raw reply [flat|nested] 26+ messages in thread
[parent not found: <20071223025245.d839c86d.akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org>]
* Re: [patch 3/5] KVM: add kvm_follow_page() [not found] ` <20071223025245.d839c86d.akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org> @ 2007-12-23 11:01 ` Avi Kivity [not found] ` <476E4005.90003-atKUWr5tajBWk0Htik3J/w@public.gmane.org> 0 siblings, 1 reply; 26+ messages in thread From: Avi Kivity @ 2007-12-23 11:01 UTC (permalink / raw) To: Andrew Morton; +Cc: Marcelo Tosatti, kvm-devel, Matt Mackall Andrew Morton wrote: > On Sun, 23 Dec 2007 12:35:30 +0200 Avi Kivity <avi-atKUWr5tajBWk0Htik3J/w@public.gmane.org> wrote: > > >> Andrew Morton wrote: >> >>> On Sun, 23 Dec 2007 10:59:22 +0200 Avi Kivity <avi-atKUWr5tajBWk0Htik3J/w@public.gmane.org> wrote: >>> >>> >>> >>>> Avi Kivity wrote: >>>> >>>> >>>>> Avi Kivity wrote: >>>>> >>>>> >>>>> >>>>>> Exactly. But it is better to be explicit about it and pass the page >>>>>> directly like you did before. I hate to make you go back-and-fourth, >>>>>> but I did not understand the issue completely before. >>>>>> >>>>>> >>>>>> >>>>>> >>>>> btw, the call to gfn_to_page() can happen in page_fault() instead of >>>>> walk_addr(); that will reduce the amount of error handling, and will >>>>> simplify the callers to walk_addr() that don't need the page. >>>>> >>>>> >>>>> >>>>> >>>> Note further that all this doesn't obviate the need for follow_page() >>>> (or get_user_pages_inatomic()); we still need something in update_pte() >>>> for the demand paging case. >>>> >>>> >>> Please review -mm's mm/pagewalk.c for suitability. >>> >>> If is is unsuitable but repairable then please cc Matt Mackall >>> <mpm-VDJrAJ4Gl5ZBDgjK7y7TUQ@public.gmane.org> on the review. >>> >>> >>> >> The "no locks are taken" comment is very worrying. We need accurate >> results. >> > > take down_read(mm->mmap_sem) before calling it.. > > You have to do that anyway for its results to be meaningful in the caller. > Ditto get_user_pages(). > > Yes, but what about the page table locks? follow_page() is much more thorough. 
-- error compiling committee.c: too many arguments to function ------------------------------------------------------------------------- This SF.net email is sponsored by: Microsoft Defy all challenges. Microsoft(R) Visual Studio 2005. http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/ ^ permalink raw reply [flat|nested] 26+ messages in thread
[parent not found: <476E4005.90003-atKUWr5tajBWk0Htik3J/w@public.gmane.org>]
* Re: [patch 3/5] KVM: add kvm_follow_page() [not found] ` <476E4005.90003-atKUWr5tajBWk0Htik3J/w@public.gmane.org> @ 2007-12-23 20:15 ` Marcelo Tosatti 2007-12-23 20:41 ` Andrew Morton 2007-12-24 6:56 ` Avi Kivity 0 siblings, 2 replies; 26+ messages in thread From: Marcelo Tosatti @ 2007-12-23 20:15 UTC (permalink / raw) To: Avi Kivity; +Cc: Marcelo Tosatti, kvm-devel, Andrew Morton, Matt Mackall On Sun, Dec 23, 2007 at 01:01:25PM +0200, Avi Kivity wrote: > Andrew Morton wrote: > >On Sun, 23 Dec 2007 12:35:30 +0200 Avi Kivity <avi-atKUWr5tajBWk0Htik3J/w@public.gmane.org> wrote: > > > > > >>Andrew Morton wrote: > >> > >>>On Sun, 23 Dec 2007 10:59:22 +0200 Avi Kivity <avi-atKUWr5tajBWk0Htik3J/w@public.gmane.org> wrote: > >>> > >>> > >>> > >>>>Avi Kivity wrote: > >>>> > >>>> > >>>>>Avi Kivity wrote: > >>>>> > >>>>> > >>>>> > >>>>>>Exactly. But it is better to be explicit about it and pass the page > >>>>>>directly like you did before. I hate to make you go back-and-fourth, > >>>>>>but I did not understand the issue completely before. > >>>>>> > >>>>>> > >>>>>> > >>>>>> > >>>>>btw, the call to gfn_to_page() can happen in page_fault() instead of > >>>>>walk_addr(); that will reduce the amount of error handling, and will > >>>>>simplify the callers to walk_addr() that don't need the page. > >>>>> > >>>>> > >>>>> > >>>>> > >>>>Note further that all this doesn't obviate the need for follow_page() > >>>>(or get_user_pages_inatomic()); we still need something in update_pte() > >>>>for the demand paging case. > >>>> > >>>> > >>>Please review -mm's mm/pagewalk.c for suitability. > >>> > >>>If is is unsuitable but repairable then please cc Matt Mackall > >>><mpm-VDJrAJ4Gl5ZBDgjK7y7TUQ@public.gmane.org> on the review. > >>> > >>> > >>> > >>The "no locks are taken" comment is very worrying. We need accurate > >>results. > >> > > > >take down_read(mm->mmap_sem) before calling it.. > > > >You have to do that anyway for its results to be meaningful in the caller. 
> >Ditto get_user_pages(). > > > > > > Yes, but what about the page table locks? > > follow_page() is much more thorough. It can acquire the pagetablelock in the callback handler. But then, vm_normal_page() must also be exported. Are you guys OK with this ? Modular KVM needs walk_page_range(), and also vm_normal_page() to be used on pagewalk callback. Signed-off-by: Marcelo Tosatti <mtosatti-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> Index: kvm.quilt/mm/pagewalk.c =================================================================== --- kvm.quilt.orig/mm/pagewalk.c +++ kvm.quilt/mm/pagewalk.c @@ -1,6 +1,7 @@ #include <linux/mm.h> #include <linux/highmem.h> #include <linux/sched.h> +#include <linux/module.h> static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, const struct mm_walk *walk, void *private) @@ -129,3 +130,4 @@ int walk_page_range(const struct mm_stru return err; } +EXPORT_SYMBOL(walk_page_range); Index: kvm.quilt/mm/memory.c =================================================================== --- kvm.quilt.orig/mm/memory.c +++ kvm.quilt/mm/memory.c @@ -412,6 +412,7 @@ struct page *vm_normal_page(struct vm_ar */ return pfn_to_page(pfn); } +EXPORT_SYMBOL(vm_normal_page); /* * copy one vm_area from one task to the other. Assumes the page tables [PATCH] KVM: add kvm_follow_page() In preparation for a mmu spinlock, avoid schedules in mmu_set_spte() by using walk_page_range() instead of get_user_pages(). The fault handling work by get_user_pages() now happens outside the lock. 
Signed-off-by: Marcelo Tosatti <mtosatti-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> Index: kvm.quilt/arch/x86/kvm/mmu.c =================================================================== --- kvm.quilt.orig/arch/x86/kvm/mmu.c +++ kvm.quilt/arch/x86/kvm/mmu.c @@ -882,14 +882,18 @@ struct page *gva_to_page(struct kvm_vcpu return gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); } +/* + * mmu_set_spte must be called with an additional reference on + * struct page *page, and it will release that if necessary (so + * the callers should not worry about it). + */ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, unsigned pt_access, unsigned pte_access, int user_fault, int write_fault, int dirty, - int *ptwrite, gfn_t gfn) + int *ptwrite, gfn_t gfn, struct page *page) { u64 spte; int was_rmapped = is_rmap_pte(*shadow_pte); - struct page *page; pgprintk("%s: spte %llx access %x write_fault %d" " user_fault %d gfn %lx\n", @@ -907,8 +911,6 @@ static void mmu_set_spte(struct kvm_vcpu if (!(pte_access & ACC_EXEC_MASK)) spte |= PT64_NX_MASK; - page = gfn_to_page(vcpu->kvm, gfn); - spte |= PT_PRESENT_MASK; if (pte_access & ACC_USER_MASK) spte |= PT_USER_MASK; @@ -983,8 +985,10 @@ static int nonpaging_map(struct kvm_vcpu table = __va(table_addr); if (level == 1) { + struct page *page; + page = gfn_to_page(vcpu->kvm, gfn); mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL, - 0, write, 1, &pt_write, gfn); + 0, write, 1, &pt_write, gfn, page); return pt_write || is_io_pte(table[index]); } Index: kvm.quilt/include/linux/kvm_host.h =================================================================== --- kvm.quilt.orig/include/linux/kvm_host.h +++ kvm.quilt/include/linux/kvm_host.h @@ -163,6 +163,7 @@ int kvm_arch_set_memory_region(struct kv int user_alloc); gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn); struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn); +struct page *kvm_find_page(struct kvm *kvm, gfn_t gfn); void kvm_release_page_clean(struct page *page); void 
kvm_release_page_dirty(struct page *page); int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, Index: kvm.quilt/virt/kvm/kvm_main.c =================================================================== --- kvm.quilt.orig/virt/kvm/kvm_main.c +++ kvm.quilt/virt/kvm/kvm_main.c @@ -453,6 +453,54 @@ static unsigned long gfn_to_hva(struct k return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE); } +static int kvm_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, + void *private) +{ + struct page **page = private; + struct vm_area_struct *vma; + pte_t *ptep, pte; + spinlock_t *ptl; + int err = -EFAULT; + + vma = find_vma(current->mm, addr); + if (!vma) + return err; + + ptep = pte_offset_map_lock(current->mm, pmd, addr, &ptl); + pte = *ptep; + if (!pte_present(pte)) + goto unlock; + + *page = vm_normal_page(vma, addr, pte); + if (!*page) + goto unlock; + + get_page(*page); + err = 0; +unlock: + pte_unmap_unlock(ptep, ptl); + return err; +} + +static struct mm_walk kvm_mm_walk = { .pmd_entry = kvm_pte_range }; + +struct page *kvm_find_page(struct kvm *kvm, gfn_t gfn) +{ + unsigned long addr; + struct page *page = NULL; + + addr = gfn_to_hva(kvm, gfn); + /* MMIO access */ + if (kvm_is_error_hva(addr)) { + get_page(bad_page); + return bad_page; + } + + walk_page_range(current->mm, addr, addr+PAGE_SIZE, &kvm_mm_walk, + &page); + return page; +} + /* * Requires current->mm->mmap_sem to be held */ Index: kvm.quilt/arch/x86/kvm/paging_tmpl.h =================================================================== --- kvm.quilt.orig/arch/x86/kvm/paging_tmpl.h +++ kvm.quilt/arch/x86/kvm/paging_tmpl.h @@ -67,6 +67,7 @@ struct guest_walker { gfn_t table_gfn[PT_MAX_FULL_LEVELS]; pt_element_t ptes[PT_MAX_FULL_LEVELS]; gpa_t pte_gpa[PT_MAX_FULL_LEVELS]; + struct page *page; unsigned pt_access; unsigned pte_access; gfn_t gfn; @@ -203,14 +204,18 @@ walk: --walker->level; } + walker->page = gfn_to_page(vcpu->kvm, walker->gfn); + if (write_fault && 
!is_dirty_pte(pte)) { bool ret; mark_page_dirty(vcpu->kvm, table_gfn); ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte, pte|PT_DIRTY_MASK); - if (ret) + if (ret) { + kvm_release_page_clean(walker->page); goto walk; + } pte |= PT_DIRTY_MASK; mutex_lock(&vcpu->kvm->lock); kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte)); @@ -241,12 +246,13 @@ err: return 0; } -static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, +static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *spage, u64 *spte, const void *pte, int bytes, int offset_in_pte) { pt_element_t gpte; unsigned pte_access; + struct page *page; gpte = *(const pt_element_t *)pte; if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { @@ -256,10 +262,15 @@ static void FNAME(update_pte)(struct kvm } if (bytes < sizeof(pt_element_t)) return; + + page = kvm_find_page(vcpu->kvm, gpte_to_gfn(gpte)); + if (!page) + return; + pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte); - pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte); - mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, - gpte & PT_DIRTY_MASK, NULL, gpte_to_gfn(gpte)); + pte_access = spage->role.access & FNAME(gpte_access)(vcpu, gpte); + mmu_set_spte(vcpu, spte, spage->role.access, pte_access, 0, 0, + gpte & PT_DIRTY_MASK, NULL, gpte_to_gfn(gpte), page); } /* @@ -323,8 +334,10 @@ static u64 *FNAME(fetch)(struct kvm_vcpu r = kvm_read_guest_atomic(vcpu->kvm, walker->pte_gpa[level - 2], &curr_pte, sizeof(curr_pte)); - if (r || curr_pte != walker->ptes[level - 2]) + if (r || curr_pte != walker->ptes[level - 2]) { + kvm_release_page_clean(walker->page); return NULL; + } } shadow_addr = __pa(shadow_page->spt); shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK @@ -335,7 +348,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access, user_fault, write_fault, walker->ptes[walker->level-1] & PT_DIRTY_MASK, - 
ptwrite, walker->gfn); + ptwrite, walker->gfn, walker->page); return shadow_ent; } @@ -425,6 +438,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kv if (r) { gpa = gfn_to_gpa(walker.gfn); gpa |= vaddr & ~PAGE_MASK; + kvm_release_page_clean(walker.page); } return gpa; ------------------------------------------------------------------------- This SF.net email is sponsored by: Microsoft Defy all challenges. Microsoft(R) Visual Studio 2005. http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/ ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [patch 3/5] KVM: add kvm_follow_page() 2007-12-23 20:15 ` Marcelo Tosatti @ 2007-12-23 20:41 ` Andrew Morton 2007-12-24 6:56 ` Avi Kivity 1 sibling, 0 replies; 26+ messages in thread From: Andrew Morton @ 2007-12-23 20:41 UTC (permalink / raw) To: Marcelo Tosatti; +Cc: kvm-devel, Avi Kivity, Matt Mackall On Sun, 23 Dec 2007 15:15:25 -0500 Marcelo Tosatti <marcelo-Bw31MaZKKs3YtjvyW6yDsg@public.gmane.org> wrote: > Are you guys OK with this ? > > > Modular KVM needs walk_page_range(), and also vm_normal_page() to be > used on pagewalk callback. I am. ------------------------------------------------------------------------- This SF.net email is sponsored by: Microsoft Defy all challenges. Microsoft(R) Visual Studio 2005. http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/ ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [patch 3/5] KVM: add kvm_follow_page() 2007-12-23 20:15 ` Marcelo Tosatti 2007-12-23 20:41 ` Andrew Morton @ 2007-12-24 6:56 ` Avi Kivity [not found] ` <476F5801.2030002-atKUWr5tajBWk0Htik3J/w@public.gmane.org> 1 sibling, 1 reply; 26+ messages in thread From: Avi Kivity @ 2007-12-24 6:56 UTC (permalink / raw) To: Marcelo Tosatti; +Cc: kvm-devel, Andrew Morton, Matt Mackall Marcelo Tosatti wrote: > It can acquire the pagetablelock in the callback handler. But then, > vm_normal_page() must also be exported. > > Are you guys OK with this ? > > Seems to me that requires fairly detailed mucking in mm details, just to get at a page. I believe that a new get_users_pages_inatomic() is more suitable; Andrew, I'll write it if you agree. Alternatively, walk_page_range() should take the lock itself, otherwise it is only usable if you don't care about correctness? > > +static int kvm_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, > + void *private) > +{ > + struct page **page = private; > + struct vm_area_struct *vma; > + pte_t *ptep, pte; > + spinlock_t *ptl; > + int err = -EFAULT; > + > + vma = find_vma(current->mm, addr); > + if (!vma) > + return err; > + > + ptep = pte_offset_map_lock(current->mm, pmd, addr, &ptl); > + pte = *ptep; > + if (!pte_present(pte)) > + goto unlock; > + > + *page = vm_normal_page(vma, addr, pte); > + if (!*page) > + goto unlock; > + > + get_page(*page); > + err = 0; > +unlock: > + pte_unmap_unlock(ptep, ptl); > + return err; > +} > + > See what one has to write to get at a page? Unreasonable IMO. And that's without hugetlb support AFAICT (kvm will want that soon). -- error compiling committee.c: too many arguments to function ------------------------------------------------------------------------- This SF.net email is sponsored by: Microsoft Defy all challenges. Microsoft(R) Visual Studio 2005. http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/ ^ permalink raw reply [flat|nested] 26+ messages in thread
[parent not found: <476F5801.2030002-atKUWr5tajBWk0Htik3J/w@public.gmane.org>]
* Re: [patch 3/5] KVM: add kvm_follow_page() [not found] ` <476F5801.2030002-atKUWr5tajBWk0Htik3J/w@public.gmane.org> @ 2007-12-24 14:26 ` Marcelo Tosatti 2007-12-24 14:36 ` Avi Kivity 0 siblings, 1 reply; 26+ messages in thread From: Marcelo Tosatti @ 2007-12-24 14:26 UTC (permalink / raw) To: Avi Kivity; +Cc: Marcelo Tosatti, kvm-devel, Andrew Morton, Matt Mackall On Mon, Dec 24, 2007 at 08:56:01AM +0200, Avi Kivity wrote: > Marcelo Tosatti wrote: > >It can acquire the pagetablelock in the callback handler. But then, > >vm_normal_page() must also be exported. > > > >Are you guys OK with this ? > > > > > > Seems to me that requires fairly detailed mucking in mm details, just to > get at a page. I agree. > I believe that a new get_users_pages_inatomic() is more suitable; > Andrew, I'll write it if you agree. Alternatively, walk_page_range() > should take the lock itself, otherwise it is only usable if you don't > care about correctness? Why not just export follow_page() ? That is exactly what KVM needs. > >+static int kvm_pte_range(pmd_t *pmd, unsigned long addr, unsigned long > >end, > >+ void *private) > >+{ > >+ struct page **page = private; > >+ struct vm_area_struct *vma; > >+ pte_t *ptep, pte; > >+ spinlock_t *ptl; > >+ int err = -EFAULT; > >+ > >+ vma = find_vma(current->mm, addr); > >+ if (!vma) > >+ return err; > >+ > >+ ptep = pte_offset_map_lock(current->mm, pmd, addr, &ptl); > >+ pte = *ptep; > >+ if (!pte_present(pte)) > >+ goto unlock; > >+ > >+ *page = vm_normal_page(vma, addr, pte); > >+ if (!*page) > >+ goto unlock; > >+ > >+ get_page(*page); > >+ err = 0; > >+unlock: > >+ pte_unmap_unlock(ptep, ptl); > >+ return err; > >+} > >+ > > > > See what one has to write to get at a page? Unreasonable IMO. And > that's without hugetlb support AFAICT (kvm will want that soon). Indeed... Its pretty much duplicating what follow_page() already does. 
------------------------------------------------------------------------- This SF.net email is sponsored by: Microsoft Defy all challenges. Microsoft(R) Visual Studio 2005. http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/ ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [patch 3/5] KVM: add kvm_follow_page() 2007-12-24 14:26 ` Marcelo Tosatti @ 2007-12-24 14:36 ` Avi Kivity 0 siblings, 0 replies; 26+ messages in thread From: Avi Kivity @ 2007-12-24 14:36 UTC (permalink / raw) To: Marcelo Tosatti; +Cc: kvm-devel, Andrew Morton, Matt Mackall Marcelo Tosatti wrote: > >> I believe that a new get_users_pages_inatomic() is more suitable; >> Andrew, I'll write it if you agree. Alternatively, walk_page_range() >> should take the lock itself, otherwise it is only usable if you don't >> care about correctness? >> > > Why not just export follow_page() ? That is exactly what KVM needs. > > Well, follow_page() needs the vma, and doesn't support large pages. It's too low-level to be a public API. -- error compiling committee.c: too many arguments to function ------------------------------------------------------------------------- This SF.net email is sponsored by: Microsoft Defy all challenges. Microsoft(R) Visual Studio 2005. http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/ ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [patch 3/5] KVM: add kvm_follow_page() [not found] ` <476E1F23.3020609-atKUWr5tajBWk0Htik3J/w@public.gmane.org> 2007-12-23 8:59 ` Avi Kivity @ 2007-12-23 19:14 ` Marcelo Tosatti 2007-12-24 6:50 ` Avi Kivity 1 sibling, 1 reply; 26+ messages in thread From: Marcelo Tosatti @ 2007-12-23 19:14 UTC (permalink / raw) To: Avi Kivity; +Cc: Marcelo Tosatti, kvm-devel On Sun, Dec 23, 2007 at 10:41:07AM +0200, Avi Kivity wrote: > Avi Kivity wrote: > > > > Exactly. But it is better to be explicit about it and pass the page > > directly like you did before. I hate to make you go back-and-forth, > > but I did not understand the issue completely before. > > > > btw, the call to gfn_to_page() can happen in page_fault() instead of > walk_addr(); that will reduce the amount of error handling, and will > simplify the callers to walk_addr() that don't need the page. But the gfn in question is only known at walk_addr() time, so that's not possible. ------------------------------------------------------------------------- This SF.net email is sponsored by: Microsoft Defy all challenges. Microsoft(R) Visual Studio 2005. http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/ ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [patch 3/5] KVM: add kvm_follow_page() 2007-12-23 19:14 ` Marcelo Tosatti @ 2007-12-24 6:50 ` Avi Kivity [not found] ` <476F56A5.2020607-atKUWr5tajBWk0Htik3J/w@public.gmane.org> 0 siblings, 1 reply; 26+ messages in thread From: Avi Kivity @ 2007-12-24 6:50 UTC (permalink / raw) To: Marcelo Tosatti; +Cc: kvm-devel Marcelo Tosatti wrote: > On Sun, Dec 23, 2007 at 10:41:07AM +0200, Avi Kivity wrote: > >> Avi Kivity wrote: >> >>> Exactly. But it is better to be explicit about it and pass the page >>> directly like you did before. I hate to make you go back-and-forth, >>> but I did not understand the issue completely before. >>> >>> >> btw, the call to gfn_to_page() can happen in page_fault() instead of >> walk_addr(); that will reduce the amount of error handling, and will >> simplify the callers to walk_addr() that don't need the page. >> > > But the gfn in question is only known at walk_addr() time, so that's not > possible. > It's just walker->gfn; certainly it is known in page_fault(). -- error compiling committee.c: too many arguments to function ------------------------------------------------------------------------- This SF.net email is sponsored by: Microsoft Defy all challenges. Microsoft(R) Visual Studio 2005. http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/ ^ permalink raw reply [flat|nested] 26+ messages in thread
[parent not found: <476F56A5.2020607-atKUWr5tajBWk0Htik3J/w@public.gmane.org>]
* Re: [patch 3/5] KVM: add kvm_follow_page() [not found] ` <476F56A5.2020607-atKUWr5tajBWk0Htik3J/w@public.gmane.org> @ 2007-12-24 14:28 ` Marcelo Tosatti 2007-12-24 14:34 ` Avi Kivity 0 siblings, 1 reply; 26+ messages in thread From: Marcelo Tosatti @ 2007-12-24 14:28 UTC (permalink / raw) To: Avi Kivity; +Cc: Marcelo Tosatti, kvm-devel On Mon, Dec 24, 2007 at 08:50:13AM +0200, Avi Kivity wrote: > Marcelo Tosatti wrote: > >On Sun, Dec 23, 2007 at 10:41:07AM +0200, Avi Kivity wrote: > > > >>Avi Kivity wrote: > >> > >>>Exactly. But it is better to be explicit about it and pass the page > >>>directly like you did before. I hate to make you go back-and-fourth, > >>>but I did not understand the issue completely before. > >>> > >>> > >>btw, the call to gfn_to_page() can happen in page_fault() instead of > >>walk_addr(); that will reduce the amount of error handling, and will > >>simplify the callers to walk_addr() that don't need the page. > >> > > > >But the gfn in question is only known at walk_addr() time, so thats not > >possible. > > > > It's just walker->gfn; certainly it is known in page_fault(). Oh, you mean to grab walker->gfn before fetch() ? Yeah, that makes sense. ------------------------------------------------------------------------- This SF.net email is sponsored by: Microsoft Defy all challenges. Microsoft(R) Visual Studio 2005. http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/ ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [patch 3/5] KVM: add kvm_follow_page() 2007-12-24 14:28 ` Marcelo Tosatti @ 2007-12-24 14:34 ` Avi Kivity 0 siblings, 0 replies; 26+ messages in thread From: Avi Kivity @ 2007-12-24 14:34 UTC (permalink / raw) To: Marcelo Tosatti; +Cc: kvm-devel Marcelo Tosatti wrote: >>>> btw, the call to gfn_to_page() can happen in page_fault() instead of >>>> walk_addr(); that will reduce the amount of error handling, and will >>>> simplify the callers to walk_addr() that don't need the page. >>>> >>>> >>> But the gfn in question is only known at walk_addr() time, so thats not >>> possible. >>> >>> >> It's just walker->gfn; certainly it is known in page_fault(). >> > > Oh, you mean to grab walker->gfn before fetch() ? > > Yes. -- error compiling committee.c: too many arguments to function ------------------------------------------------------------------------- This SF.net email is sponsored by: Microsoft Defy all challenges. Microsoft(R) Visual Studio 2005. http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/ ^ permalink raw reply [flat|nested] 26+ messages in thread
* [patch 4/5] KVM: export follow_page() 2007-12-21 0:18 [patch 0/5] KVM shadow scalability enhancements Marcelo Tosatti ` (2 preceding siblings ...) 2007-12-21 0:18 ` [patch 3/5] KVM: add kvm_follow_page() Marcelo Tosatti @ 2007-12-21 0:18 ` Marcelo Tosatti [not found] ` <20071221002024.423895760-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> 2007-12-21 0:18 ` [patch 5/5] KVM: switch to mmu spinlock Marcelo Tosatti [not found] ` <20071221001821.029994250-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> 5 siblings, 1 reply; 26+ messages in thread From: Marcelo Tosatti @ 2007-12-21 0:18 UTC (permalink / raw) To: Avi Kivity; +Cc: kvm-devel, mtosatti-H+wXaHxf7aLQT0dZR+AlfA [-- Attachment #1: export-follow-page --] [-- Type: text/plain, Size: 799 bytes --] follow_page() is required by KVM to find the struct page which maps to a given address in spinlock protected code. Signed-off-by: Marcelo Tosatti <mtosatti-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> Index: kvm.quilt/mm/memory.c =================================================================== --- kvm.quilt.orig/mm/memory.c +++ kvm.quilt/mm/memory.c @@ -970,6 +970,7 @@ no_page_table: } return page; } +EXPORT_SYMBOL(follow_page); int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, int write, int force, -- ------------------------------------------------------------------------- This SF.net email is sponsored by: Microsoft Defy all challenges. Microsoft(R) Visual Studio 2005. http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/ ^ permalink raw reply [flat|nested] 26+ messages in thread
[parent not found: <20071221002024.423895760-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>]
* Re: [patch 4/5] KVM: export follow_page() [not found] ` <20071221002024.423895760-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> @ 2007-12-22 19:43 ` Avi Kivity 0 siblings, 0 replies; 26+ messages in thread From: Avi Kivity @ 2007-12-22 19:43 UTC (permalink / raw) To: Marcelo Tosatti; +Cc: kvm-devel Marcelo Tosatti wrote: > follow_page() is required by KVM to find the struct page which maps > to a given address in spinlock protected code. > > Signed-off-by: Marcelo Tosatti <mtosatti-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> > > Index: kvm.quilt/mm/memory.c > =================================================================== > --- kvm.quilt.orig/mm/memory.c > +++ kvm.quilt/mm/memory.c > @@ -970,6 +970,7 @@ no_page_table: > } > return page; > } > +EXPORT_SYMBOL(follow_page); > This should come before its use in the patch series (and of course, it may get expanded into a new get_user_page_inatomic() if we're told that's the right thing). -- Do not meddle in the internals of kernels, for they are subtle and quick to panic. ------------------------------------------------------------------------- This SF.net email is sponsored by: Microsoft Defy all challenges. Microsoft(R) Visual Studio 2005. http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/ ^ permalink raw reply [flat|nested] 26+ messages in thread
* [patch 5/5] KVM: switch to mmu spinlock 2007-12-21 0:18 [patch 0/5] KVM shadow scalability enhancements Marcelo Tosatti ` (3 preceding siblings ...) 2007-12-21 0:18 ` [patch 4/5] KVM: export follow_page() Marcelo Tosatti @ 2007-12-21 0:18 ` Marcelo Tosatti [not found] ` <20071221001821.029994250-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> 5 siblings, 0 replies; 26+ messages in thread From: Marcelo Tosatti @ 2007-12-21 0:18 UTC (permalink / raw) To: Avi Kivity; +Cc: kvm-devel, mtosatti-H+wXaHxf7aLQT0dZR+AlfA [-- Attachment #1: convert-to-spinlock --] [-- Type: text/plain, Size: 7291 bytes --] Convert the synchronization of the shadow handling to a separate mmu_lock spinlock. Also guard fetch() by mmap_sem in read-mode to protect against alias and memslot changes. Signed-off-by: Marcelo Tosatti <mtosatti-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> Index: kvm.quilt/arch/x86/kvm/mmu.c =================================================================== --- kvm.quilt.orig/arch/x86/kvm/mmu.c +++ kvm.quilt/arch/x86/kvm/mmu.c @@ -1031,7 +1031,7 @@ static void mmu_free_roots(struct kvm_vc if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; - mutex_lock(&vcpu->kvm->lock); + spin_lock(&vcpu->kvm->mmu_lock); #ifdef CONFIG_X86_64 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { hpa_t root = vcpu->arch.mmu.root_hpa; @@ -1039,7 +1039,7 @@ static void mmu_free_roots(struct kvm_vc sp = page_header(root); --sp->root_count; vcpu->arch.mmu.root_hpa = INVALID_PAGE; - mutex_unlock(&vcpu->kvm->lock); + spin_unlock(&vcpu->kvm->mmu_lock); return; } #endif @@ -1053,7 +1053,7 @@ static void mmu_free_roots(struct kvm_vc } vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; } - mutex_unlock(&vcpu->kvm->lock); + spin_unlock(&vcpu->kvm->mmu_lock); vcpu->arch.mmu.root_hpa = INVALID_PAGE; } @@ -1256,9 +1256,9 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) r = mmu_topup_memory_caches(vcpu); if (r) goto out; - mutex_lock(&vcpu->kvm->lock); + spin_lock(&vcpu->kvm->mmu_lock); mmu_alloc_roots(vcpu); - 
mutex_unlock(&vcpu->kvm->lock); + spin_unlock(&vcpu->kvm->mmu_lock); kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); kvm_mmu_flush_tlb(vcpu); out: @@ -1435,15 +1435,15 @@ int kvm_mmu_unprotect_page_virt(struct k gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); up_read(¤t->mm->mmap_sem); - mutex_lock(&vcpu->kvm->lock); + spin_lock(&vcpu->kvm->mmu_lock); r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); - mutex_unlock(&vcpu->kvm->lock); + spin_unlock(&vcpu->kvm->mmu_lock); return r; } void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) { - mutex_lock(&vcpu->kvm->lock); + spin_lock(&vcpu->kvm->mmu_lock); while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) { struct kvm_mmu_page *sp; @@ -1452,7 +1452,7 @@ void __kvm_mmu_free_some_pages(struct kv kvm_mmu_zap_page(vcpu->kvm, sp); ++vcpu->kvm->stat.mmu_recycled; } - mutex_unlock(&vcpu->kvm->lock); + spin_unlock(&vcpu->kvm->mmu_lock); } int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) @@ -1584,10 +1584,10 @@ void kvm_mmu_zap_all(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; - mutex_lock(&kvm->lock); + spin_lock(&kvm->mmu_lock); list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) kvm_mmu_zap_page(kvm, sp); - mutex_unlock(&kvm->lock); + spin_unlock(&kvm->mmu_lock); kvm_flush_remote_tlbs(kvm); } Index: kvm.quilt/arch/x86/kvm/paging_tmpl.h =================================================================== --- kvm.quilt.orig/arch/x86/kvm/paging_tmpl.h +++ kvm.quilt/arch/x86/kvm/paging_tmpl.h @@ -217,9 +217,9 @@ walk: goto walk; } pte |= PT_DIRTY_MASK; - mutex_lock(&vcpu->kvm->lock); + spin_lock(&vcpu->kvm->mmu_lock); kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte)); - mutex_unlock(&vcpu->kvm->lock); + spin_unlock(&vcpu->kvm->mmu_lock); walker->ptes[walker->level - 1] = pte; } @@ -386,7 +386,6 @@ static int FNAME(page_fault)(struct kvm_ */ r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault, fetch_fault); - up_read(¤t->mm->mmap_sem); /* * The 
page is not mapped by the guest. Let the guest handle it. @@ -395,10 +394,11 @@ static int FNAME(page_fault)(struct kvm_ pgprintk("%s: guest page fault\n", __FUNCTION__); inject_page_fault(vcpu, addr, walker.error_code); vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ + up_read(¤t->mm->mmap_sem); return 0; } - mutex_lock(&vcpu->kvm->lock); + spin_lock(&vcpu->kvm->mmu_lock); shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, &write_pt); pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__, @@ -411,13 +411,15 @@ static int FNAME(page_fault)(struct kvm_ * mmio: emulate if accessible, otherwise its a guest fault. */ if (shadow_pte && is_io_pte(*shadow_pte)) { - mutex_unlock(&vcpu->kvm->lock); + spin_unlock(&vcpu->kvm->mmu_lock); + up_read(¤t->mm->mmap_sem); return 1; } ++vcpu->stat.pf_fixed; kvm_mmu_audit(vcpu, "post page fault (fixed)"); - mutex_unlock(&vcpu->kvm->lock); + spin_unlock(&vcpu->kvm->mmu_lock); + up_read(¤t->mm->mmap_sem); return write_pt; } Index: kvm.quilt/arch/x86/kvm/x86.c =================================================================== --- kvm.quilt.orig/arch/x86/kvm/x86.c +++ kvm.quilt/arch/x86/kvm/x86.c @@ -1609,9 +1609,9 @@ static int emulator_write_phys(struct kv up_read(¤t->mm->mmap_sem); return 0; } - mutex_lock(&vcpu->kvm->lock); + spin_lock(&vcpu->kvm->mmu_lock); kvm_mmu_pte_write(vcpu, gpa, val, bytes); - mutex_unlock(&vcpu->kvm->lock); + spin_unlock(&vcpu->kvm->mmu_lock); up_read(¤t->mm->mmap_sem); return 1; } Index: kvm.quilt/include/linux/kvm_host.h =================================================================== --- kvm.quilt.orig/include/linux/kvm_host.h +++ kvm.quilt/include/linux/kvm_host.h @@ -104,7 +104,8 @@ struct kvm_memory_slot { }; struct kvm { - struct mutex lock; /* protects everything except vcpus */ + struct mutex lock; /* protects the vcpus array and APIC accesses */ + spinlock_t mmu_lock; struct mm_struct *mm; /* userspace tied to this vm */ int nmemslots; struct 
kvm_memory_slot memslots[KVM_MEMORY_SLOTS + Index: kvm.quilt/virt/kvm/kvm_main.c =================================================================== --- kvm.quilt.orig/virt/kvm/kvm_main.c +++ kvm.quilt/virt/kvm/kvm_main.c @@ -165,6 +165,7 @@ static struct kvm *kvm_create_vm(void) kvm->mm = current->mm; atomic_inc(&kvm->mm->mm_count); + spin_lock_init(&kvm->mmu_lock); kvm_io_bus_init(&kvm->pio_bus); mutex_init(&kvm->lock); kvm_io_bus_init(&kvm->mmio_bus); @@ -571,9 +572,7 @@ int kvm_read_guest_atomic(struct kvm *kv addr = gfn_to_hva(kvm, gfn); if (kvm_is_error_hva(addr)) return -EFAULT; - pagefault_disable(); r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len); - pagefault_enable(); if (r) return -EFAULT; return 0; Index: kvm.quilt/arch/x86/kvm/vmx.c =================================================================== --- kvm.quilt.orig/arch/x86/kvm/vmx.c +++ kvm.quilt/arch/x86/kvm/vmx.c @@ -1476,7 +1476,6 @@ static int alloc_apic_access_page(struct struct kvm_userspace_memory_region kvm_userspace_mem; int r = 0; - mutex_lock(&kvm->lock); down_write(¤t->mm->mmap_sem); if (kvm->arch.apic_access_page) goto out; @@ -1490,7 +1489,6 @@ static int alloc_apic_access_page(struct kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00); out: up_write(¤t->mm->mmap_sem); - mutex_unlock(&kvm->lock); return r; } -- ------------------------------------------------------------------------- This SF.net email is sponsored by: Microsoft Defy all challenges. Microsoft(R) Visual Studio 2005. http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/ ^ permalink raw reply [flat|nested] 26+ messages in thread
[parent not found: <20071221001821.029994250-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>]
* Re: [patch 0/5] KVM shadow scalability enhancements [not found] ` <20071221001821.029994250-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> @ 2007-12-22 19:46 ` Avi Kivity 0 siblings, 0 replies; 26+ messages in thread From: Avi Kivity @ 2007-12-22 19:46 UTC (permalink / raw) To: Marcelo Tosatti; +Cc: kvm-devel Marcelo Tosatti wrote: > The following patchset increases KVM shadow scalability by allowing concurrent > guest walking, allocation and instruction emulation. > Thanks; the split patchset it _much_ easier to review, and hopefully to bisect in case the review misses something. I believe we can merge it at the next iteration. Have you tested a real workload (like kernel compiles), for stability and performance? I can test Windows in case you don't have a copy handy. -- Do not meddle in the internals of kernels, for they are subtle and quick to panic. ------------------------------------------------------------------------- This SF.net email is sponsored by: Microsoft Defy all challenges. Microsoft(R) Visual Studio 2005. http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/ ^ permalink raw reply [flat|nested] 26+ messages in thread
end of thread, other threads:[~2007-12-24 14:36 UTC | newest]
Thread overview: 26+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-12-21 0:18 [patch 0/5] KVM shadow scalability enhancements Marcelo Tosatti
2007-12-21 0:18 ` [patch 1/5] KVM: concurrent guest walkers Marcelo Tosatti
2007-12-21 0:18 ` [patch 2/5] KVM: add kvm_read_guest_atomic() Marcelo Tosatti
2007-12-21 0:18 ` [patch 3/5] KVM: add kvm_follow_page() Marcelo Tosatti
[not found] ` <20071221002024.345682789-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2007-12-22 19:42 ` Avi Kivity
[not found] ` <476D68A3.3000200-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-12-23 5:25 ` Marcelo Tosatti
2007-12-23 7:03 ` Avi Kivity
[not found] ` <476E083B.5040600-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-12-23 8:41 ` Avi Kivity
[not found] ` <476E1F23.3020609-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-12-23 8:59 ` Avi Kivity
[not found] ` <476E236A.9000800-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-12-23 9:26 ` Andrew Morton
[not found] ` <20071223012618.e8cf8320.akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org>
2007-12-23 10:35 ` Avi Kivity
[not found] ` <476E39F2.4030202-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-12-23 10:52 ` Andrew Morton
[not found] ` <20071223025245.d839c86d.akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org>
2007-12-23 11:01 ` Avi Kivity
[not found] ` <476E4005.90003-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-12-23 20:15 ` Marcelo Tosatti
2007-12-23 20:41 ` Andrew Morton
2007-12-24 6:56 ` Avi Kivity
[not found] ` <476F5801.2030002-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-12-24 14:26 ` Marcelo Tosatti
2007-12-24 14:36 ` Avi Kivity
2007-12-23 19:14 ` Marcelo Tosatti
2007-12-24 6:50 ` Avi Kivity
[not found] ` <476F56A5.2020607-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-12-24 14:28 ` Marcelo Tosatti
2007-12-24 14:34 ` Avi Kivity
2007-12-21 0:18 ` [patch 4/5] KVM: export follow_page() Marcelo Tosatti
[not found] ` <20071221002024.423895760-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2007-12-22 19:43 ` Avi Kivity
2007-12-21 0:18 ` [patch 5/5] KVM: switch to mmu spinlock Marcelo Tosatti
[not found] ` <20071221001821.029994250-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2007-12-22 19:46 ` [patch 0/5] KVM shadow scalability enhancements Avi Kivity
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox