* [PATCH 1/4] KVM: MMU: Concurrent guest walkers
[not found] ` <1199013439-2047-1-git-send-email-avi-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
@ 2007-12-30 11:17 ` Avi Kivity
[not found] ` <1199013439-2047-2-git-send-email-avi-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-12-30 11:17 ` [PATCH 2/4] KVM: Add kvm_read_guest_atomic() Avi Kivity
` (3 subsequent siblings)
4 siblings, 1 reply; 11+ messages in thread
From: Avi Kivity @ 2007-12-30 11:17 UTC
To: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f
Cc: mtosatti-H+wXaHxf7aLQT0dZR+AlfA
From: Marcelo Tosatti <mtosatti-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
Do not hold the kvm->lock mutex across the entire pagefault path;
acquire it only where it is necessary, such as for the mmu hash
list, active list, rmap and parent pte handling.
Allow concurrent guest walkers by switching walk_addr() to take
mmap_sem in read mode.
Also get rid of the lockless __gfn_to_page().
[avi: move kvm_mmu_pte_write() locking inside the function]
[avi: add locking for real mode]
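The resulting locking pattern in the page fault path looks roughly like
this (a condensed sketch of the FNAME(page_fault) hunk below, with error
handling and the mmio case omitted; not code from the patch itself): the
guest page table walk only reads guest memory, so it runs under mmap_sem
held for read and may proceed concurrently on several vcpus, while the
shadow page table update is still serialized by kvm->lock.

static int FNAME(page_fault_sketch)(struct kvm_vcpu *vcpu, gva_t addr,
                                    u32 error_code)
{
        struct guest_walker walker;
        int write_fault = error_code & PFERR_WRITE_MASK;
        int user_fault = error_code & PFERR_USER_MASK;
        int write_pt = 0;

        /* Guest walk: reads guest memory only, so a shared mmap_sem
         * suffices and walkers on other vcpus can run in parallel. */
        down_read(&current->mm->mmap_sem);
        FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault, 0);
        up_read(&current->mm->mmap_sem);

        /* Shadow update: still single threaded under kvm->lock. */
        mutex_lock(&vcpu->kvm->lock);
        FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
                     &write_pt);
        mutex_unlock(&vcpu->kvm->lock);

        return write_pt;
}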
Signed-off-by: Marcelo Tosatti <mtosatti-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
Signed-off-by: Avi Kivity <avi-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
---
arch/x86/kvm/mmu.c | 41 ++++++++++++++++----
arch/x86/kvm/paging_tmpl.h | 8 +++-
arch/x86/kvm/vmx.c | 25 ++++++++----
arch/x86/kvm/x86.c | 90 ++++++++++++++++++++++++++-----------------
virt/kvm/kvm_main.c | 22 ++--------
5 files changed, 116 insertions(+), 70 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 8f12ec5..3b91227 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -974,7 +974,7 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
{
}
-static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
+static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
{
int level = PT32E_ROOT_LEVEL;
hpa_t table_addr = vcpu->arch.mmu.root_hpa;
@@ -1015,6 +1015,17 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
}
}
+static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
+{
+ int r;
+
+ mutex_lock(&vcpu->kvm->lock);
+ r = __nonpaging_map(vcpu, v, write, gfn);
+ mutex_unlock(&vcpu->kvm->lock);
+ return r;
+}
+
+
static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp)
{
@@ -1031,6 +1042,7 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return;
+ mutex_lock(&vcpu->kvm->lock);
#ifdef CONFIG_X86_64
if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
hpa_t root = vcpu->arch.mmu.root_hpa;
@@ -1038,6 +1050,7 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
sp = page_header(root);
--sp->root_count;
vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+ mutex_unlock(&vcpu->kvm->lock);
return;
}
#endif
@@ -1051,6 +1064,7 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
}
vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
}
+ mutex_unlock(&vcpu->kvm->lock);
vcpu->arch.mmu.root_hpa = INVALID_PAGE;
}
@@ -1250,15 +1264,15 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
{
int r;
- mutex_lock(&vcpu->kvm->lock);
r = mmu_topup_memory_caches(vcpu);
if (r)
goto out;
+ mutex_lock(&vcpu->kvm->lock);
mmu_alloc_roots(vcpu);
+ mutex_unlock(&vcpu->kvm->lock);
kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
kvm_mmu_flush_tlb(vcpu);
out:
- mutex_unlock(&vcpu->kvm->lock);
return r;
}
EXPORT_SYMBOL_GPL(kvm_mmu_load);
@@ -1353,6 +1367,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
int npte;
pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes);
+ mutex_lock(&vcpu->kvm->lock);
++vcpu->kvm->stat.mmu_pte_write;
kvm_mmu_audit(vcpu, "pre pte write");
if (gfn == vcpu->arch.last_pt_write_gfn
@@ -1421,17 +1436,27 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
}
}
kvm_mmu_audit(vcpu, "post pte write");
+ mutex_unlock(&vcpu->kvm->lock);
}
int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
{
- gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
+ gpa_t gpa;
+ int r;
- return kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+ down_read(&current->mm->mmap_sem);
+ gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
+ up_read(&current->mm->mmap_sem);
+
+ mutex_lock(&vcpu->kvm->lock);
+ r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+ mutex_unlock(&vcpu->kvm->lock);
+ return r;
}
void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
{
+ mutex_lock(&vcpu->kvm->lock);
while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) {
struct kvm_mmu_page *sp;
@@ -1440,6 +1465,7 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
kvm_mmu_zap_page(vcpu->kvm, sp);
++vcpu->kvm->stat.mmu_recycled;
}
+ mutex_unlock(&vcpu->kvm->lock);
}
int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
@@ -1447,7 +1473,6 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
int r;
enum emulation_result er;
- mutex_lock(&vcpu->kvm->lock);
r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code);
if (r < 0)
goto out;
@@ -1462,7 +1487,6 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
goto out;
er = emulate_instruction(vcpu, vcpu->run, cr2, error_code, 0);
- mutex_unlock(&vcpu->kvm->lock);
switch (er) {
case EMULATE_DONE:
@@ -1477,7 +1501,6 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
BUG();
}
out:
- mutex_unlock(&vcpu->kvm->lock);
return r;
}
EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
@@ -1574,8 +1597,10 @@ void kvm_mmu_zap_all(struct kvm *kvm)
{
struct kvm_mmu_page *sp, *node;
+ mutex_lock(&kvm->lock);
list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
kvm_mmu_zap_page(kvm, sp);
+ mutex_unlock(&kvm->lock);
kvm_flush_remote_tlbs(kvm);
}
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 56b88f7..7f83f55 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -368,11 +368,13 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
if (r)
return r;
+ down_read(&current->mm->mmap_sem);
/*
* Look up the shadow pte for the faulting address.
*/
r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault,
fetch_fault);
+ up_read(&current->mm->mmap_sem);
/*
* The page is not mapped by the guest. Let the guest handle it.
@@ -384,6 +386,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
return 0;
}
+ mutex_lock(&vcpu->kvm->lock);
shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
&write_pt);
pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__,
@@ -395,11 +398,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
/*
* mmio: emulate if accessible, otherwise it's a guest fault.
*/
- if (shadow_pte && is_io_pte(*shadow_pte))
+ if (shadow_pte && is_io_pte(*shadow_pte)) {
+ mutex_unlock(&vcpu->kvm->lock);
return 1;
+ }
++vcpu->stat.pf_fixed;
kvm_mmu_audit(vcpu, "post page fault (fixed)");
+ mutex_unlock(&vcpu->kvm->lock);
return write_pt;
}
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index fa72f72..99fec63 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1431,27 +1431,34 @@ static int init_rmode_tss(struct kvm *kvm)
{
gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
u16 data = 0;
+ int ret = 0;
int r;
+ down_read(&current->mm->mmap_sem);
r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
if (r < 0)
- return 0;
+ goto out;
data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
r = kvm_write_guest_page(kvm, fn++, &data, 0x66, sizeof(u16));
if (r < 0)
- return 0;
+ goto out;
r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
if (r < 0)
- return 0;
+ goto out;
r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
if (r < 0)
- return 0;
+ goto out;
data = ~0;
- r = kvm_write_guest_page(kvm, fn, &data, RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
- sizeof(u8));
+ r = kvm_write_guest_page(kvm, fn, &data,
+ RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
+ sizeof(u8));
if (r < 0)
- return 0;
- return 1;
+ goto out;
+
+ ret = 1;
+out:
+ up_read(&current->mm->mmap_sem);
+ return ret;
}
static void seg_setup(int seg)
@@ -1470,6 +1477,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
int r = 0;
mutex_lock(&kvm->lock);
+ down_write(&current->mm->mmap_sem);
if (kvm->arch.apic_access_page)
goto out;
kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
@@ -1481,6 +1489,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
goto out;
kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
out:
+ up_write(&current->mm->mmap_sem);
mutex_unlock(&kvm->lock);
return r;
}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a58f882..0b11b7f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -181,7 +181,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
int ret;
u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
- mutex_lock(&vcpu->kvm->lock);
+ down_read(&current->mm->mmap_sem);
ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
offset * sizeof(u64), sizeof(pdpte));
if (ret < 0) {
@@ -198,7 +198,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
out:
- mutex_unlock(&vcpu->kvm->lock);
+ up_read(&current->mm->mmap_sem);
return ret;
}
@@ -212,13 +212,13 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu)
if (is_long_mode(vcpu) || !is_pae(vcpu))
return false;
- mutex_lock(&vcpu->kvm->lock);
+ down_read(&current->mm->mmap_sem);
r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
if (r < 0)
goto out;
changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
out:
- mutex_unlock(&vcpu->kvm->lock);
+ up_read(&current->mm->mmap_sem);
return changed;
}
@@ -278,9 +278,7 @@ void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
kvm_x86_ops->set_cr0(vcpu, cr0);
vcpu->arch.cr0 = cr0;
- mutex_lock(&vcpu->kvm->lock);
kvm_mmu_reset_context(vcpu);
- mutex_unlock(&vcpu->kvm->lock);
return;
}
EXPORT_SYMBOL_GPL(set_cr0);
@@ -320,9 +318,7 @@ void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
}
kvm_x86_ops->set_cr4(vcpu, cr4);
vcpu->arch.cr4 = cr4;
- mutex_lock(&vcpu->kvm->lock);
kvm_mmu_reset_context(vcpu);
- mutex_unlock(&vcpu->kvm->lock);
}
EXPORT_SYMBOL_GPL(set_cr4);
@@ -360,7 +356,7 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
*/
}
- mutex_lock(&vcpu->kvm->lock);
+ down_read(&current->mm->mmap_sem);
/*
* Does the new cr3 value map to physical memory? (Note, we
* catch an invalid cr3 even in real-mode, because it would
@@ -376,7 +372,7 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
vcpu->arch.cr3 = cr3;
vcpu->arch.mmu.new_cr3(vcpu);
}
- mutex_unlock(&vcpu->kvm->lock);
+ up_read(&current->mm->mmap_sem);
}
EXPORT_SYMBOL_GPL(set_cr3);
@@ -1211,12 +1207,12 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
return -EINVAL;
- mutex_lock(&kvm->lock);
+ down_write(&current->mm->mmap_sem);
kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
- mutex_unlock(&kvm->lock);
+ up_write(&current->mm->mmap_sem);
return 0;
}
@@ -1265,7 +1261,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
< alias->target_phys_addr)
goto out;
- mutex_lock(&kvm->lock);
+ down_write(&current->mm->mmap_sem);
p = &kvm->arch.aliases[alias->slot];
p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
@@ -1279,7 +1275,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
kvm_mmu_zap_all(kvm);
- mutex_unlock(&kvm->lock);
+ up_write(&current->mm->mmap_sem);
return 0;
@@ -1355,7 +1351,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
struct kvm_memory_slot *memslot;
int is_dirty = 0;
- mutex_lock(&kvm->lock);
+ down_write(&current->mm->mmap_sem);
r = kvm_get_dirty_log(kvm, log, &is_dirty);
if (r)
@@ -1371,7 +1367,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
}
r = 0;
out:
- mutex_unlock(&kvm->lock);
+ up_write(&current->mm->mmap_sem);
return r;
}
@@ -1565,25 +1561,32 @@ int emulator_read_std(unsigned long addr,
struct kvm_vcpu *vcpu)
{
void *data = val;
+ int r = X86EMUL_CONTINUE;
+ down_read(&current->mm->mmap_sem);
while (bytes) {
gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
unsigned offset = addr & (PAGE_SIZE-1);
unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
int ret;
- if (gpa == UNMAPPED_GVA)
- return X86EMUL_PROPAGATE_FAULT;
+ if (gpa == UNMAPPED_GVA) {
+ r = X86EMUL_PROPAGATE_FAULT;
+ goto out;
+ }
ret = kvm_read_guest(vcpu->kvm, gpa, data, tocopy);
- if (ret < 0)
- return X86EMUL_UNHANDLEABLE;
+ if (ret < 0) {
+ r = X86EMUL_UNHANDLEABLE;
+ goto out;
+ }
bytes -= tocopy;
data += tocopy;
addr += tocopy;
}
-
- return X86EMUL_CONTINUE;
+out:
+ up_read(&current->mm->mmap_sem);
+ return r;
}
EXPORT_SYMBOL_GPL(emulator_read_std);
@@ -1601,7 +1604,9 @@ static int emulator_read_emulated(unsigned long addr,
return X86EMUL_CONTINUE;
}
+ down_read(&current->mm->mmap_sem);
gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+ up_read(&current->mm->mmap_sem);
/* For APIC access vmexit */
if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
@@ -1617,11 +1622,14 @@ mmio:
/*
* Is this MMIO handled locally?
*/
+ mutex_lock(&vcpu->kvm->lock);
mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
if (mmio_dev) {
kvm_iodevice_read(mmio_dev, gpa, bytes, val);
+ mutex_unlock(&vcpu->kvm->lock);
return X86EMUL_CONTINUE;
}
+ mutex_unlock(&vcpu->kvm->lock);
vcpu->mmio_needed = 1;
vcpu->mmio_phys_addr = gpa;
@@ -1636,10 +1644,14 @@ static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
{
int ret;
+ down_read(&current->mm->mmap_sem);
ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
- if (ret < 0)
+ if (ret < 0) {
+ up_read(&current->mm->mmap_sem);
return 0;
+ }
kvm_mmu_pte_write(vcpu, gpa, val, bytes);
+ up_read(&current->mm->mmap_sem);
return 1;
}
@@ -1649,7 +1661,11 @@ static int emulator_write_emulated_onepage(unsigned long addr,
struct kvm_vcpu *vcpu)
{
struct kvm_io_device *mmio_dev;
- gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+ gpa_t gpa;
+
+ down_read(&current->mm->mmap_sem);
+ gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+ up_read(&current->mm->mmap_sem);
if (gpa == UNMAPPED_GVA) {
kvm_inject_page_fault(vcpu, addr, 2);
@@ -1667,11 +1683,14 @@ mmio:
/*
* Is this MMIO handled locally?
*/
+ mutex_lock(&vcpu->kvm->lock);
mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
if (mmio_dev) {
kvm_iodevice_write(mmio_dev, gpa, bytes, val);
+ mutex_unlock(&vcpu->kvm->lock);
return X86EMUL_CONTINUE;
}
+ mutex_unlock(&vcpu->kvm->lock);
vcpu->mmio_needed = 1;
vcpu->mmio_phys_addr = gpa;
@@ -1718,11 +1737,14 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
#ifndef CONFIG_X86_64
/* guests cmpxchg8b have to be emulated atomically */
if (bytes == 8) {
- gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+ gpa_t gpa;
struct page *page;
char *addr;
u64 val;
+ down_read(&current->mm->mmap_sem);
+ gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+
if (gpa == UNMAPPED_GVA ||
(gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
goto emul_write;
@@ -1738,6 +1760,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
kvm_release_page_dirty(page);
}
emul_write:
+ up_read(&current->mm->mmap_sem);
#endif
return emulator_write_emulated(addr, new, bytes, vcpu);
@@ -2118,10 +2141,10 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
kvm_x86_ops->skip_emulated_instruction(vcpu);
for (i = 0; i < nr_pages; ++i) {
- mutex_lock(&vcpu->kvm->lock);
+ down_read(&current->mm->mmap_sem);
page = gva_to_page(vcpu, address + i * PAGE_SIZE);
vcpu->arch.pio.guest_pages[i] = page;
- mutex_unlock(&vcpu->kvm->lock);
+ up_read(&current->mm->mmap_sem);
if (!page) {
kvm_inject_gp(vcpu, 0);
free_pio_guest_pages(vcpu);
@@ -2247,7 +2270,6 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
char instruction[3];
int ret = 0;
- mutex_lock(&vcpu->kvm->lock);
/*
* Blow out the MMU to ensure that no other VCPU has an active mapping
@@ -2262,8 +2284,6 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
!= X86EMUL_CONTINUE)
ret = -EFAULT;
- mutex_unlock(&vcpu->kvm->lock);
-
return ret;
}
@@ -2447,8 +2467,10 @@ static void vapic_enter(struct kvm_vcpu *vcpu)
if (!apic || !apic->vapic_addr)
return;
+ down_read(&current->mm->mmap_sem);
page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
vcpu->arch.apic->vapic_page = page;
+ up_read(&current->mm->mmap_sem);
}
static void vapic_exit(struct kvm_vcpu *vcpu)
@@ -2909,13 +2931,13 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
gpa_t gpa;
vcpu_load(vcpu);
- mutex_lock(&vcpu->kvm->lock);
+ down_read(&current->mm->mmap_sem);
gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr);
+ up_read(&current->mm->mmap_sem);
tr->physical_address = gpa;
tr->valid = gpa != UNMAPPED_GVA;
tr->writeable = 1;
tr->usermode = 0;
- mutex_unlock(&vcpu->kvm->lock);
vcpu_put(vcpu);
return 0;
@@ -3184,13 +3206,11 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
*/
if (!user_alloc) {
if (npages && !old.rmap) {
- down_write(&current->mm->mmap_sem);
memslot->userspace_addr = do_mmap(NULL, 0,
npages * PAGE_SIZE,
PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_ANONYMOUS,
0);
- up_write(&current->mm->mmap_sem);
if (IS_ERR((void *)memslot->userspace_addr))
return PTR_ERR((void *)memslot->userspace_addr);
@@ -3198,10 +3218,8 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
if (!old.user_alloc && old.rmap) {
int ret;
- down_write(&current->mm->mmap_sem);
ret = do_munmap(current->mm, old.userspace_addr,
old.npages * PAGE_SIZE);
- up_write(&current->mm->mmap_sem);
if (ret < 0)
printk(KERN_WARNING
"kvm_vm_ioctl_set_memory_region: "
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index a5ee518..396c619 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -227,7 +227,7 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
*
* Discontiguous memory is allowed, mostly for framebuffers.
*
- * Must be called holding kvm->lock.
+ * Must be called holding mmap_sem for write.
*/
int __kvm_set_memory_region(struct kvm *kvm,
struct kvm_userspace_memory_region *mem,
@@ -338,9 +338,9 @@ int kvm_set_memory_region(struct kvm *kvm,
{
int r;
- mutex_lock(&kvm->lock);
+ down_write(&current->mm->mmap_sem);
r = __kvm_set_memory_region(kvm, mem, user_alloc);
- mutex_unlock(&kvm->lock);
+ up_write(&current->mm->mmap_sem);
return r;
}
EXPORT_SYMBOL_GPL(kvm_set_memory_region);
@@ -456,7 +456,7 @@ static unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
/*
* Requires current->mm->mmap_sem to be held
*/
-static struct page *__gfn_to_page(struct kvm *kvm, gfn_t gfn)
+struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
{
struct page *page[1];
unsigned long addr;
@@ -481,17 +481,6 @@ static struct page *__gfn_to_page(struct kvm *kvm, gfn_t gfn)
return page[0];
}
-struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
-{
- struct page *page;
-
- down_read(&current->mm->mmap_sem);
- page = __gfn_to_page(kvm, gfn);
- up_read(&current->mm->mmap_sem);
-
- return page;
-}
-
EXPORT_SYMBOL_GPL(gfn_to_page);
void kvm_release_page_clean(struct page *page)
@@ -977,8 +966,7 @@ static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
if (!kvm_is_visible_gfn(kvm, vmf->pgoff))
return VM_FAULT_SIGBUS;
- /* current->mm->mmap_sem is already held so call lockless version */
- page = __gfn_to_page(kvm, vmf->pgoff);
+ page = gfn_to_page(kvm, vmf->pgoff);
if (is_error_page(page)) {
kvm_release_page_clean(page);
return VM_FAULT_SIGBUS;
--
1.5.3.7
* [PATCH 2/4] KVM: Add kvm_read_guest_atomic()
[not found] ` <1199013439-2047-1-git-send-email-avi-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-12-30 11:17 ` [PATCH 1/4] KVM: MMU: Concurrent guest walkers Avi Kivity
@ 2007-12-30 11:17 ` Avi Kivity
2007-12-30 11:17 ` [PATCH 3/4] KVM: MMU: Avoid calling gfn_to_page() in mmu_set_spte() Avi Kivity
` (2 subsequent siblings)
4 siblings, 0 replies; 11+ messages in thread
From: Avi Kivity @ 2007-12-30 11:17 UTC
To: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f
Cc: mtosatti-H+wXaHxf7aLQT0dZR+AlfA
From: Marcelo Tosatti <mtosatti-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
In preparation for an mmu spinlock, add kvm_read_guest_atomic()
and use it in fetch() and prefetch_page().
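Unlike kvm_read_guest(), the atomic variant copies with
__copy_from_user_inatomic() and returns -EFAULT instead of sleeping when
the guest page is not resident, so it is safe to call with a spinlock
held. A minimal usage sketch (reread_gpte() is an illustrative name, not
part of this patch):

static int reread_gpte(struct kvm_vcpu *vcpu, gpa_t pte_gpa,
                       pt_element_t expected)
{
        pt_element_t curr_pte;
        int r;

        /* Never sleeps: fails with -EFAULT rather than faulting the
         * guest page in, so a spinlock may be held here. */
        r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &curr_pte,
                                  sizeof(curr_pte));
        if (r || curr_pte != expected)
                return -EFAULT; /* unreadable, or changed under us */
        return 0;
}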
Signed-off-by: Marcelo Tosatti <mtosatti-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
Signed-off-by: Avi Kivity <avi-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
---
arch/x86/kvm/paging_tmpl.h | 28 ++++++++++++++++------------
include/linux/kvm_host.h | 2 ++
virt/kvm/kvm_main.c | 20 ++++++++++++++++++++
3 files changed, 38 insertions(+), 12 deletions(-)
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 7f83f55..136a65d 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -316,10 +316,12 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
metaphysical, access,
shadow_ent, &new_page);
if (new_page && !metaphysical) {
+ int r;
pt_element_t curr_pte;
- kvm_read_guest(vcpu->kvm, walker->pte_gpa[level - 2],
- &curr_pte, sizeof(curr_pte));
- if (curr_pte != walker->ptes[level - 2])
+ r = kvm_read_guest_atomic(vcpu->kvm,
+ walker->pte_gpa[level - 2],
+ &curr_pte, sizeof(curr_pte));
+ if (r || curr_pte != walker->ptes[level - 2])
return NULL;
}
shadow_addr = __pa(shadow_page->spt);
@@ -429,9 +431,8 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp)
{
- int i, offset = 0;
- pt_element_t *gpt;
- struct page *page;
+ int i, offset = 0, r = 0;
+ pt_element_t pt;
if (sp->role.metaphysical
|| (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) {
@@ -441,15 +442,18 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
if (PTTYPE == 32)
offset = sp->role.quadrant << PT64_LEVEL_BITS;
- page = gfn_to_page(vcpu->kvm, sp->gfn);
- gpt = kmap_atomic(page, KM_USER0);
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
- if (is_present_pte(gpt[offset + i]))
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+ gpa_t pte_gpa = gfn_to_gpa(sp->gfn);
+ pte_gpa += (i+offset) * sizeof(pt_element_t);
+
+ r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &pt,
+ sizeof(pt_element_t));
+ if (r || is_present_pte(pt))
sp->spt[i] = shadow_trap_nonpresent_pte;
else
sp->spt[i] = shadow_notrap_nonpresent_pte;
- kunmap_atomic(gpt, KM_USER0);
- kvm_release_page_clean(page);
+ }
}
#undef pt_element_t
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 9ff5904..a020fb2 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -167,6 +167,8 @@ void kvm_release_page_clean(struct page *page);
void kvm_release_page_dirty(struct page *page);
int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
int len);
+int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
+ unsigned long len);
int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len);
int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
int offset, int len);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 396c619..c462d7e 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -541,6 +541,26 @@ int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
}
EXPORT_SYMBOL_GPL(kvm_read_guest);
+int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
+ unsigned long len)
+{
+ int r;
+ unsigned long addr;
+ gfn_t gfn = gpa >> PAGE_SHIFT;
+ int offset = offset_in_page(gpa);
+
+ addr = gfn_to_hva(kvm, gfn);
+ if (kvm_is_error_hva(addr))
+ return -EFAULT;
+ pagefault_disable();
+ r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
+ pagefault_enable();
+ if (r)
+ return -EFAULT;
+ return 0;
+}
+EXPORT_SYMBOL(kvm_read_guest_atomic);
+
int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
int offset, int len)
{
--
1.5.3.7
* [PATCH 3/4] KVM: MMU: Avoid calling gfn_to_page() in mmu_set_spte()
[not found] ` <1199013439-2047-1-git-send-email-avi-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-12-30 11:17 ` [PATCH 1/4] KVM: MMU: Concurrent guest walkers Avi Kivity
2007-12-30 11:17 ` [PATCH 2/4] KVM: Add kvm_read_guest_atomic() Avi Kivity
@ 2007-12-30 11:17 ` Avi Kivity
[not found] ` <1199013439-2047-4-git-send-email-avi-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-12-30 11:17 ` [PATCH 4/4] KVM: MMU: Switch to mmu spinlock Avi Kivity
2007-12-31 13:39 ` [PATCH 0/4] Updated kvm mmu scaling patch Avi Kivity
4 siblings, 1 reply; 11+ messages in thread
From: Avi Kivity @ 2007-12-30 11:17 UTC
To: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f
Cc: mtosatti-H+wXaHxf7aLQT0dZR+AlfA
Since gfn_to_page() is a sleeping function and we want to make the core mmu
spinlocked, we need to pass the page from the walker context (which can sleep)
to the shadow context (which cannot).
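By the end of the series the hand-off looks roughly like this (a sketch
of the shape nonpaging_map() takes once patch 4 is applied, shown here
with kvm->lock, which is still a mutex at this point in the series):

static int nonpaging_map_sketch(struct kvm_vcpu *vcpu, gva_t v, int write,
                                gfn_t gfn)
{
        struct page *page;
        int r;

        /* Walker context: may sleep, so resolve gfn -> struct page
         * here, under mmap_sem. */
        down_read(&current->mm->mmap_sem);
        page = gfn_to_page(vcpu->kvm, gfn);
        up_read(&current->mm->mmap_sem);

        /* Shadow context: must not sleep; mmu_set_spte() now consumes
         * the page it is handed instead of calling gfn_to_page(). */
        mutex_lock(&vcpu->kvm->lock);
        r = __nonpaging_map(vcpu, v, write, gfn, page);
        mutex_unlock(&vcpu->kvm->lock);
        return r;
}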
Signed-off-by: Avi Kivity <avi-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
---
arch/x86/kvm/mmu.c | 58 ++++++++++++++++++++++++++++++++++++++++----
arch/x86/kvm/paging_tmpl.h | 25 +++++++++++++++----
include/asm-x86/kvm_host.h | 5 ++++
3 files changed, 78 insertions(+), 10 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 3b91227..1b68f07 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -890,11 +890,10 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
unsigned pt_access, unsigned pte_access,
int user_fault, int write_fault, int dirty,
- int *ptwrite, gfn_t gfn)
+ int *ptwrite, gfn_t gfn, struct page *page)
{
u64 spte;
int was_rmapped = is_rmap_pte(*shadow_pte);
- struct page *page;
pgprintk("%s: spte %llx access %x write_fault %d"
" user_fault %d gfn %lx\n",
@@ -912,8 +911,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
if (!(pte_access & ACC_EXEC_MASK))
spte |= PT64_NX_MASK;
- page = gfn_to_page(vcpu->kvm, gfn);
-
spte |= PT_PRESENT_MASK;
if (pte_access & ACC_USER_MASK)
spte |= PT_USER_MASK;
@@ -979,6 +976,11 @@ static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
int level = PT32E_ROOT_LEVEL;
hpa_t table_addr = vcpu->arch.mmu.root_hpa;
int pt_write = 0;
+ struct page *page;
+
+ down_read(&current->mm->mmap_sem);
+ page = gfn_to_page(vcpu->kvm, gfn);
+ up_read(&current->mm->mmap_sem);
for (; ; level--) {
u32 index = PT64_INDEX(v, level);
@@ -989,7 +991,7 @@ static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
if (level == 1) {
mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
- 0, write, 1, &pt_write, gfn);
+ 0, write, 1, &pt_write, gfn, page);
return pt_write || is_io_pte(table[index]);
}
@@ -1005,6 +1007,7 @@ static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
NULL);
if (!new_table) {
pgprintk("nonpaging_map: ENOMEM\n");
+ kvm_release_page_clean(page);
return -ENOMEM;
}
@@ -1347,6 +1350,46 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
return !!(spte && (*spte & PT_ACCESSED_MASK));
}
+static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
+ const u8 *new, int bytes)
+{
+ gfn_t gfn;
+ int r;
+ u64 gpte = 0;
+
+ if (bytes != 4 && bytes != 8)
+ return;
+
+ down_read(&current->mm->mmap_sem);
+ /*
+ * Assume that the pte write is on a page table of the same type
+ * as the current vcpu paging mode. This is nearly always true
+ * (might be false while changing modes). Note it is verified later
+ * by update_pte().
+ */
+ if (is_pae(vcpu)) {
+ /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
+ if ((bytes == 4) && (gpa % 4 == 0)) {
+ r = kvm_read_guest(vcpu->kvm, gpa & ~(u64)7, &gpte, 8);
+ if (r)
+ goto out;
+ memcpy((void *)&gpte + (gpa % 8), new, 4);
+ } else if ((bytes == 8) && (gpa % 8 == 0)) {
+ memcpy((void *)&gpte, new, 8);
+ }
+ } else {
+ if ((bytes == 4) && (gpa % 4 == 0))
+ memcpy((void *)&gpte, new, 4);
+ }
+ if (!is_present_pte(gpte))
+ goto out;
+ gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
+ vcpu->arch.update_pte.gfn = gfn;
+ vcpu->arch.update_pte.page = gfn_to_page(vcpu->kvm, gfn);
+out:
+ up_read(&current->mm->mmap_sem);
+}
+
void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
const u8 *new, int bytes)
{
@@ -1367,6 +1410,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
int npte;
pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes);
+ mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes);
mutex_lock(&vcpu->kvm->lock);
++vcpu->kvm->stat.mmu_pte_write;
kvm_mmu_audit(vcpu, "pre pte write");
@@ -1437,6 +1481,10 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
}
kvm_mmu_audit(vcpu, "post pte write");
mutex_unlock(&vcpu->kvm->lock);
+ if (vcpu->arch.update_pte.page) {
+ kvm_release_page_clean(vcpu->arch.update_pte.page);
+ vcpu->arch.update_pte.page = NULL;
+ }
}
int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 136a65d..5b3ee81 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -245,6 +245,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
{
pt_element_t gpte;
unsigned pte_access;
+ struct page *npage;
gpte = *(const pt_element_t *)pte;
if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
@@ -256,8 +257,14 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
return;
pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte);
pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte);
+ if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
+ return;
+ npage = vcpu->arch.update_pte.page;
+ if (!npage)
+ return;
+ get_page(npage);
mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
- gpte & PT_DIRTY_MASK, NULL, gpte_to_gfn(gpte));
+ gpte & PT_DIRTY_MASK, NULL, gpte_to_gfn(gpte), npage);
}
/*
@@ -265,7 +272,8 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
*/
static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
struct guest_walker *walker,
- int user_fault, int write_fault, int *ptwrite)
+ int user_fault, int write_fault, int *ptwrite,
+ struct page *page)
{
hpa_t shadow_addr;
int level;
@@ -321,8 +329,10 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
r = kvm_read_guest_atomic(vcpu->kvm,
walker->pte_gpa[level - 2],
&curr_pte, sizeof(curr_pte));
- if (r || curr_pte != walker->ptes[level - 2])
+ if (r || curr_pte != walker->ptes[level - 2]) {
+ kvm_release_page_clean(page);
return NULL;
+ }
}
shadow_addr = __pa(shadow_page->spt);
shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
@@ -333,7 +343,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access,
user_fault, write_fault,
walker->ptes[walker->level-1] & PT_DIRTY_MASK,
- ptwrite, walker->gfn);
+ ptwrite, walker->gfn, page);
return shadow_ent;
}
@@ -362,6 +372,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
u64 *shadow_pte;
int write_pt = 0;
int r;
+ struct page *page;
pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code);
kvm_mmu_audit(vcpu, "pre page fault");
@@ -388,9 +399,13 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
return 0;
}
+ down_read(&current->mm->mmap_sem);
+ page = gfn_to_page(vcpu->kvm, walker.gfn);
+ up_read(&current->mm->mmap_sem);
+
mutex_lock(&vcpu->kvm->lock);
shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
- &write_pt);
+ &write_pt, page);
pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__,
shadow_pte, *shadow_pte, write_pt);
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index 44b8925..20597bc 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -224,6 +224,11 @@ struct kvm_vcpu_arch {
int last_pt_write_count;
u64 *last_pte_updated;
+ struct {
+ gfn_t gfn; /* presumed gfn during guest pte update */
+ struct page *page; /* page corresponding to that gfn */
+ } update_pte;
+
struct i387_fxsave_struct host_fx_image;
struct i387_fxsave_struct guest_fx_image;
--
1.5.3.7
* [PATCH 4/4] KVM: MMU: Switch to mmu spinlock
[not found] ` <1199013439-2047-1-git-send-email-avi-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
` (2 preceding siblings ...)
2007-12-30 11:17 ` [PATCH 3/4] KVM: MMU: Avoid calling gfn_to_page() in mmu_set_spte() Avi Kivity
@ 2007-12-30 11:17 ` Avi Kivity
[not found] ` <1199013439-2047-5-git-send-email-avi-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-12-31 13:39 ` [PATCH 0/4] Updated kvm mmu scaling patch Avi Kivity
4 siblings, 1 reply; 11+ messages in thread
From: Avi Kivity @ 2007-12-30 11:17 UTC
To: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f
Cc: mtosatti-H+wXaHxf7aLQT0dZR+AlfA
From: Marcelo Tosatti <mtosatti-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
Convert the synchronization of the shadow handling to a separate mmu_lock
spinlock.
Also guard fetch() by mmap_sem in read mode to protect against alias
and memslot changes.
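The resulting lock ordering in the page fault path, condensed (a sketch;
fetch() itself is elided): mmap_sem is taken first, for read, and held
across the shadow update so aliases and memslots stay stable; mmu_lock
nests inside it and covers only the non-sleeping shadow manipulation.

static int lock_order_sketch(struct kvm_vcpu *vcpu, gva_t addr, gfn_t gfn)
{
        struct page *page;

        down_read(&current->mm->mmap_sem);      /* outer: may sleep */
        page = gfn_to_page(vcpu->kvm, gfn);     /* aliases/memslots stable */

        spin_lock(&vcpu->kvm->mmu_lock);        /* inner: atomic section */
        /* ... fetch(): shadow page table manipulation, no sleeping ... */
        spin_unlock(&vcpu->kvm->mmu_lock);

        up_read(&current->mm->mmap_sem);
        kvm_release_page_clean(page);   /* fetch() elided, so drop the page */
        return 0;
}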
Signed-off-by: Marcelo Tosatti <mtosatti-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
Signed-off-by: Avi Kivity <avi-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
---
arch/x86/kvm/mmu.c | 46 ++++++++++++++++++++++---------------------
arch/x86/kvm/paging_tmpl.h | 10 +++++---
arch/x86/kvm/vmx.c | 2 -
include/linux/kvm_host.h | 3 +-
virt/kvm/kvm_main.c | 3 +-
5 files changed, 33 insertions(+), 31 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 1b68f07..356e361 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -971,16 +971,12 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
{
}
-static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
+static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write,
+ gfn_t gfn, struct page *page)
{
int level = PT32E_ROOT_LEVEL;
hpa_t table_addr = vcpu->arch.mmu.root_hpa;
int pt_write = 0;
- struct page *page;
-
- down_read(&current->mm->mmap_sem);
- page = gfn_to_page(vcpu->kvm, gfn);
- up_read(&current->mm->mmap_sem);
for (; ; level--) {
u32 index = PT64_INDEX(v, level);
@@ -1022,9 +1018,15 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
{
int r;
- mutex_lock(&vcpu->kvm->lock);
- r = __nonpaging_map(vcpu, v, write, gfn);
- mutex_unlock(&vcpu->kvm->lock);
+ struct page *page;
+
+ down_read(&current->mm->mmap_sem);
+ page = gfn_to_page(vcpu->kvm, gfn);
+ up_read(&current->mm->mmap_sem);
+
+ spin_lock(&vcpu->kvm->mmu_lock);
+ r = __nonpaging_map(vcpu, v, write, gfn, page);
+ spin_unlock(&vcpu->kvm->mmu_lock);
return r;
}
@@ -1045,7 +1047,7 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return;
- mutex_lock(&vcpu->kvm->lock);
+ spin_lock(&vcpu->kvm->mmu_lock);
#ifdef CONFIG_X86_64
if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
hpa_t root = vcpu->arch.mmu.root_hpa;
@@ -1053,7 +1055,7 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
sp = page_header(root);
--sp->root_count;
vcpu->arch.mmu.root_hpa = INVALID_PAGE;
- mutex_unlock(&vcpu->kvm->lock);
+ spin_unlock(&vcpu->kvm->mmu_lock);
return;
}
#endif
@@ -1067,7 +1069,7 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
}
vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
}
- mutex_unlock(&vcpu->kvm->lock);
+ spin_unlock(&vcpu->kvm->mmu_lock);
vcpu->arch.mmu.root_hpa = INVALID_PAGE;
}
@@ -1270,9 +1272,9 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
r = mmu_topup_memory_caches(vcpu);
if (r)
goto out;
- mutex_lock(&vcpu->kvm->lock);
+ spin_lock(&vcpu->kvm->mmu_lock);
mmu_alloc_roots(vcpu);
- mutex_unlock(&vcpu->kvm->lock);
+ spin_unlock(&vcpu->kvm->mmu_lock);
kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
kvm_mmu_flush_tlb(vcpu);
out:
@@ -1411,7 +1413,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes);
mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes);
- mutex_lock(&vcpu->kvm->lock);
+ spin_lock(&vcpu->kvm->mmu_lock);
++vcpu->kvm->stat.mmu_pte_write;
kvm_mmu_audit(vcpu, "pre pte write");
if (gfn == vcpu->arch.last_pt_write_gfn
@@ -1480,7 +1482,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
}
}
kvm_mmu_audit(vcpu, "post pte write");
- mutex_unlock(&vcpu->kvm->lock);
+ spin_unlock(&vcpu->kvm->mmu_lock);
if (vcpu->arch.update_pte.page) {
kvm_release_page_clean(vcpu->arch.update_pte.page);
vcpu->arch.update_pte.page = NULL;
@@ -1496,15 +1498,15 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
up_read(&current->mm->mmap_sem);
- mutex_lock(&vcpu->kvm->lock);
+ spin_lock(&vcpu->kvm->mmu_lock);
r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
- mutex_unlock(&vcpu->kvm->lock);
+ spin_unlock(&vcpu->kvm->mmu_lock);
return r;
}
void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
{
- mutex_lock(&vcpu->kvm->lock);
+ spin_lock(&vcpu->kvm->mmu_lock);
while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) {
struct kvm_mmu_page *sp;
@@ -1513,7 +1515,7 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
kvm_mmu_zap_page(vcpu->kvm, sp);
++vcpu->kvm->stat.mmu_recycled;
}
- mutex_unlock(&vcpu->kvm->lock);
+ spin_unlock(&vcpu->kvm->mmu_lock);
}
int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
@@ -1645,10 +1647,10 @@ void kvm_mmu_zap_all(struct kvm *kvm)
{
struct kvm_mmu_page *sp, *node;
- mutex_lock(&kvm->lock);
+ spin_lock(&kvm->mmu_lock);
list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
kvm_mmu_zap_page(kvm, sp);
- mutex_unlock(&kvm->lock);
+ spin_unlock(&kvm->mmu_lock);
kvm_flush_remote_tlbs(kvm);
}
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 5b3ee81..73ff66b 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -387,7 +387,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
*/
r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault,
fetch_fault);
- up_read(&current->mm->mmap_sem);
/*
* The page is not mapped by the guest. Let the guest handle it.
@@ -396,6 +395,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
pgprintk("%s: guest page fault\n", __FUNCTION__);
inject_page_fault(vcpu, addr, walker.error_code);
vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
+ up_read(&current->mm->mmap_sem);
return 0;
}
@@ -403,7 +403,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
page = gfn_to_page(vcpu->kvm, walker.gfn);
up_read(&current->mm->mmap_sem);
- mutex_lock(&vcpu->kvm->lock);
+ spin_lock(&vcpu->kvm->mmu_lock);
shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
&write_pt, page);
pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__,
@@ -416,13 +416,15 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
* mmio: emulate if accessible, otherwise it's a guest fault.
*/
if (shadow_pte && is_io_pte(*shadow_pte)) {
- mutex_unlock(&vcpu->kvm->lock);
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ up_read(&current->mm->mmap_sem);
return 1;
}
++vcpu->stat.pf_fixed;
kvm_mmu_audit(vcpu, "post page fault (fixed)");
- mutex_unlock(&vcpu->kvm->lock);
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ up_read(&current->mm->mmap_sem);
return write_pt;
}
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 99fec63..4741806 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1476,7 +1476,6 @@ static int alloc_apic_access_page(struct kvm *kvm)
struct kvm_userspace_memory_region kvm_userspace_mem;
int r = 0;
- mutex_lock(&kvm->lock);
down_write(&current->mm->mmap_sem);
if (kvm->arch.apic_access_page)
goto out;
@@ -1490,7 +1489,6 @@ static int alloc_apic_access_page(struct kvm *kvm)
kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
out:
up_write(&current->mm->mmap_sem);
- mutex_unlock(&kvm->lock);
return r;
}
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index a020fb2..2714068 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -104,7 +104,8 @@ struct kvm_memory_slot {
};
struct kvm {
- struct mutex lock; /* protects everything except vcpus */
+ struct mutex lock; /* protects the vcpus array and APIC accesses */
+ spinlock_t mmu_lock;
struct mm_struct *mm; /* userspace tied to this vm */
int nmemslots;
struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS +
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index c462d7e..4295623 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -165,6 +165,7 @@ static struct kvm *kvm_create_vm(void)
kvm->mm = current->mm;
atomic_inc(&kvm->mm->mm_count);
+ spin_lock_init(&kvm->mmu_lock);
kvm_io_bus_init(&kvm->pio_bus);
mutex_init(&kvm->lock);
kvm_io_bus_init(&kvm->mmio_bus);
@@ -552,9 +553,7 @@ int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
addr = gfn_to_hva(kvm, gfn);
if (kvm_is_error_hva(addr))
return -EFAULT;
- pagefault_disable();
r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
- pagefault_enable();
if (r)
return -EFAULT;
return 0;
--
1.5.3.7
* Re: [PATCH 0/4] Updated kvm mmu scaling patch
[not found] ` <1199013439-2047-1-git-send-email-avi-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
` (3 preceding siblings ...)
2007-12-30 11:17 ` [PATCH 4/4] KVM: MMU: Switch to mmu spinlock Avi Kivity
@ 2007-12-31 13:39 ` Avi Kivity
4 siblings, 0 replies; 11+ messages in thread
From: Avi Kivity @ 2007-12-31 13:39 UTC
To: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f
Cc: mtosatti-H+wXaHxf7aLQT0dZR+AlfA
[-- Attachment #1: Type: text/plain, Size: 583 bytes --]
Avi Kivity wrote:
> The following patchset, based on Marcelo's original mmu scaling patch,
> allows the kvm guest walker to run concurrently. Shadow pagetable
> manipulation is still single threaded.
>
> Handling pte writes is similar to an early version of the patchset: the page
> is derived from the guest pte being written. Unlike the original version,
> we only "guess" the gfn (based on the current paging mode), and if we are
> wrong, we just throw the page away.
>
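Condensed, the guess-and-verify shape described above looks like this (a
sketch based on patch 3's mmu_guess_page_from_pte_write() and
FNAME(update_pte); the spte installation itself is elided):

static void update_pte_sketch(struct kvm_vcpu *vcpu, u64 *spte,
                              pt_element_t gpte)
{
        /* The gfn and page were guessed and pinned earlier, in a
         * sleeping context, from the pte value being written. */
        if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
                return;         /* guessed wrong: just throw the page away */
        if (!vcpu->arch.update_pte.page)
                return;         /* nothing was pinned */
        get_page(vcpu->arch.update_pte.page);
        /* ... install the spte using the pinned page ... */
}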
The attached patch is needed as well.
--
error compiling committee.c: too many arguments to function
[-- Attachment #2: 0001-KVM-MMU-Move-kvm_free_some_pages-into-critical-s.patch --]
[-- Type: text/x-patch, Size: 3446 bytes --]
From 53015fc9df345a58f96375e082083a108eaaf1da Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
Date: Mon, 31 Dec 2007 15:27:49 +0200
Subject: [PATCH] KVM: MMU: Move kvm_free_some_pages() into critical section
If some other cpu steals mmu pages between our check and an attempt to
allocate, we can run out of mmu pages. Fix by moving the check into the
same critical section as the allocation.
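The fixed pattern, as it ends up in nonpaging_map() (a condensed sketch
of the hunk below): the refill check and the allocation that relies on it
now run in one mmu_lock critical section, so no other vcpu can consume
pages in between.

static int nonpaging_map_fixed(struct kvm_vcpu *vcpu, gva_t v, int write,
                               gfn_t gfn)
{
        struct page *page;
        int r;

        down_read(&current->mm->mmap_sem);
        page = gfn_to_page(vcpu->kvm, gfn);
        up_read(&current->mm->mmap_sem);

        spin_lock(&vcpu->kvm->mmu_lock);
        /* Check and refill under the same lock as the allocation:
         * the free-page count can no longer go stale in between. */
        kvm_mmu_free_some_pages(vcpu);
        r = __nonpaging_map(vcpu, v, write, gfn, page);
        spin_unlock(&vcpu->kvm->mmu_lock);
        return r;
}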
Signed-off-by: Avi Kivity <avi-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
---
arch/x86/kvm/mmu.c | 9 +++------
arch/x86/kvm/paging_tmpl.h | 1 +
2 files changed, 4 insertions(+), 6 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 356e361..1e6d928 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -291,7 +291,6 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
{
int r;
- kvm_mmu_free_some_pages(vcpu);
r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache,
pte_chain_cache, 4);
if (r)
@@ -569,9 +568,6 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
{
struct kvm_mmu_page *sp;
- if (!vcpu->kvm->arch.n_free_mmu_pages)
- return NULL;
-
sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
@@ -1025,6 +1021,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
up_read(&current->mm->mmap_sem);
spin_lock(&vcpu->kvm->mmu_lock);
+ kvm_mmu_free_some_pages(vcpu);
r = __nonpaging_map(vcpu, v, write, gfn, page);
spin_unlock(&vcpu->kvm->mmu_lock);
return r;
@@ -1273,6 +1270,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
if (r)
goto out;
spin_lock(&vcpu->kvm->mmu_lock);
+ kvm_mmu_free_some_pages(vcpu);
mmu_alloc_roots(vcpu);
spin_unlock(&vcpu->kvm->mmu_lock);
kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
@@ -1414,6 +1412,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes);
mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes);
spin_lock(&vcpu->kvm->mmu_lock);
+ kvm_mmu_free_some_pages(vcpu);
++vcpu->kvm->stat.mmu_pte_write;
kvm_mmu_audit(vcpu, "pre pte write");
if (gfn == vcpu->arch.last_pt_write_gfn
@@ -1506,7 +1505,6 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
{
- spin_lock(&vcpu->kvm->mmu_lock);
while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) {
struct kvm_mmu_page *sp;
@@ -1515,7 +1513,6 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
kvm_mmu_zap_page(vcpu->kvm, sp);
++vcpu->kvm->stat.mmu_recycled;
}
- spin_unlock(&vcpu->kvm->mmu_lock);
}
int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 73ff66b..6208325 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -404,6 +404,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
up_read(&current->mm->mmap_sem);
spin_lock(&vcpu->kvm->mmu_lock);
+ kvm_mmu_free_some_pages(vcpu);
shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
&write_pt, page);
pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__,
--
1.5.3.7