From: Izik Eidus <izike@qumranet.com>
To: kvm-devel <kvm-devel@lists.sourceforge.net>,
Avi Kivity <avi@qumranet.com>,
mtosatti@redhat.com
Subject: [PATCH] replace the slots lock from the mmap_sem to private kvm lock
Date: Mon, 11 Feb 2008 17:12:42 +0200 [thread overview]
Message-ID: <47B065EA.7010805@qumranet.com> (raw)
[-- Attachment #1: Type: text/plain, Size: 410 bytes --]
right now kvm take the down_read(mmap_sem) lock to make sure that no
slots will be
removed while trying to get them / make the dirty log safe,
but in some cases when the down_read(mmap_sem) is called and after that
kvm_read_guest() is called
copy_from_user can result in a page fault that will lead to recursivly
taking the mmap_sem,
this patch remove the mmap_sem, with new kvm private lock.
--
woof.
[-- Attachment #2: 0004-KVM-remove-the-usage-of-the-mmap_sem-for-the-protec.patch --]
[-- Type: text/x-patch, Size: 14845 bytes --]
>From 17ab59f76968731a1dc1067fc07d637b5c52e3da Mon Sep 17 00:00:00 2001
From: Izik Eidus <izike@qumranet.com>
Date: Sun, 10 Feb 2008 18:04:15 +0200
Subject: [PATCH] KVM: remove the usage of the mmap_sem for the protection of the memory slots.
this patch replace the mmap_sem lock for the memory slots with a new
kvm private lock,
it is needed beacuse untill now there were cases where the down_read for the
mmap_sem can be called recursivly.
(for example when doing copy_to_user() unpresent memory that will result in
pagefault)
Signed-off-by: Izik Eidus <izike@qumranet.com>
---
arch/x86/kvm/mmu.c | 24 +++++++++++++---
arch/x86/kvm/paging_tmpl.h | 13 ++++++---
arch/x86/kvm/vmx.c | 7 +++-
arch/x86/kvm/x86.c | 65 +++++++++++++++++++++++++-------------------
include/linux/kvm_host.h | 1 +
virt/kvm/kvm_main.c | 7 +++-
6 files changed, 76 insertions(+), 41 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 635e70c..461a8de 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -868,11 +868,18 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
{
+ struct page *page;
+
gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
if (gpa == UNMAPPED_GVA)
return NULL;
- return gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+
+ down_read(¤t->mm->mmap_sem);
+ page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+ up_read(¤t->mm->mmap_sem);
+
+ return page;
}
static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
@@ -1005,13 +1012,16 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
struct page *page;
+ down_read(&vcpu->kvm->slots_lock);
+
down_read(¤t->mm->mmap_sem);
page = gfn_to_page(vcpu->kvm, gfn);
+ up_read(¤t->mm->mmap_sem);
/* mmio */
if (is_error_page(page)) {
kvm_release_page_clean(page);
- up_read(¤t->mm->mmap_sem);
+ up_read(&vcpu->kvm->slots_lock);
return 1;
}
@@ -1020,7 +1030,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
r = __nonpaging_map(vcpu, v, write, gfn, page);
spin_unlock(&vcpu->kvm->mmu_lock);
- up_read(¤t->mm->mmap_sem);
+ up_read(&vcpu->kvm->slots_lock);
return r;
}
@@ -1381,7 +1391,11 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
if (!is_present_pte(gpte))
return;
gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
+
+ down_read(¤t->mm->mmap_sem);
page = gfn_to_page(vcpu->kvm, gfn);
+ up_read(¤t->mm->mmap_sem);
+
if (is_error_page(page)) {
kvm_release_page_clean(page);
return;
@@ -1503,9 +1517,9 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
gpa_t gpa;
int r;
- down_read(¤t->mm->mmap_sem);
+ down_read(&vcpu->kvm->slots_lock);
gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
- up_read(¤t->mm->mmap_sem);
+ up_read(&vcpu->kvm->slots_lock);
spin_lock(&vcpu->kvm->mmu_lock);
r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index b13e823..f58c143 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -91,7 +91,10 @@ static bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
pt_element_t *table;
struct page *page;
+ down_read(¤t->mm->mmap_sem);
page = gfn_to_page(kvm, table_gfn);
+ up_read(¤t->mm->mmap_sem);
+
table = kmap_atomic(page, KM_USER0);
ret = CMPXCHG(&table[index], orig_pte, new_pte);
@@ -375,7 +378,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
if (r)
return r;
- down_read(¤t->mm->mmap_sem);
+ down_read(&vcpu->kvm->slots_lock);
/*
* Look up the shadow pte for the faulting address.
*/
@@ -389,17 +392,19 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
pgprintk("%s: guest page fault\n", __FUNCTION__);
inject_page_fault(vcpu, addr, walker.error_code);
vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
- up_read(¤t->mm->mmap_sem);
+ up_read(&vcpu->kvm->slots_lock);
return 0;
}
+ down_read(¤t->mm->mmap_sem);
page = gfn_to_page(vcpu->kvm, walker.gfn);
+ up_read(¤t->mm->mmap_sem);
/* mmio */
if (is_error_page(page)) {
pgprintk("gfn %x is mmio\n", walker.gfn);
kvm_release_page_clean(page);
- up_read(¤t->mm->mmap_sem);
+ up_read(&vcpu->kvm->slots_lock);
return 1;
}
@@ -416,7 +421,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
++vcpu->stat.pf_fixed;
kvm_mmu_audit(vcpu, "post page fault (fixed)");
spin_unlock(&vcpu->kvm->mmu_lock);
- up_read(¤t->mm->mmap_sem);
+ up_read(&vcpu->kvm->slots_lock);
return write_pt;
}
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 8496dbe..e83c415 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1519,7 +1519,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
struct kvm_userspace_memory_region kvm_userspace_mem;
int r = 0;
- down_write(¤t->mm->mmap_sem);
+ down_write(&kvm->slots_lock);
if (kvm->arch.apic_access_page)
goto out;
kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
@@ -1529,9 +1529,12 @@ static int alloc_apic_access_page(struct kvm *kvm)
r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
if (r)
goto out;
+
+ down_read(¤t->mm->mmap_sem);
kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
+ up_read(¤t->mm->mmap_sem);
out:
- up_write(¤t->mm->mmap_sem);
+ up_write(&kvm->slots_lock);
return r;
}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 08e1edc..93425bb 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -189,7 +189,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
int ret;
u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
- down_read(¤t->mm->mmap_sem);
+ down_read(&vcpu->kvm->slots_lock);
ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
offset * sizeof(u64), sizeof(pdpte));
if (ret < 0) {
@@ -206,7 +206,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
out:
- up_read(¤t->mm->mmap_sem);
+ up_read(&vcpu->kvm->slots_lock);
return ret;
}
@@ -220,13 +220,13 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu)
if (is_long_mode(vcpu) || !is_pae(vcpu))
return false;
- down_read(¤t->mm->mmap_sem);
+ down_read(&vcpu->kvm->slots_lock);
r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
if (r < 0)
goto out;
changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
out:
- up_read(¤t->mm->mmap_sem);
+ up_read(&vcpu->kvm->slots_lock);
return changed;
}
@@ -364,7 +364,7 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
*/
}
- down_read(¤t->mm->mmap_sem);
+ down_read(&vcpu->kvm->slots_lock);
/*
* Does the new cr3 value map to physical memory? (Note, we
* catch an invalid cr3 even in real-mode, because it would
@@ -380,7 +380,7 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
vcpu->arch.cr3 = cr3;
vcpu->arch.mmu.new_cr3(vcpu);
}
- up_read(¤t->mm->mmap_sem);
+ up_read(&vcpu->kvm->slots_lock);
}
EXPORT_SYMBOL_GPL(set_cr3);
@@ -1214,12 +1214,12 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
return -EINVAL;
- down_write(¤t->mm->mmap_sem);
+ down_write(&kvm->slots_lock);
kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
- up_write(¤t->mm->mmap_sem);
+ up_write(&kvm->slots_lock);
return 0;
}
@@ -1268,7 +1268,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
< alias->target_phys_addr)
goto out;
- down_write(¤t->mm->mmap_sem);
+ down_write(&kvm->slots_lock);
p = &kvm->arch.aliases[alias->slot];
p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
@@ -1282,7 +1282,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
kvm_mmu_zap_all(kvm);
- up_write(¤t->mm->mmap_sem);
+ up_write(&kvm->slots_lock);
return 0;
@@ -1358,7 +1358,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
struct kvm_memory_slot *memslot;
int is_dirty = 0;
- down_write(¤t->mm->mmap_sem);
+ down_write(&kvm->slots_lock);
r = kvm_get_dirty_log(kvm, log, &is_dirty);
if (r)
@@ -1374,7 +1374,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
}
r = 0;
out:
- up_write(¤t->mm->mmap_sem);
+ up_write(&kvm->slots_lock);
return r;
}
@@ -1570,7 +1570,7 @@ int emulator_read_std(unsigned long addr,
void *data = val;
int r = X86EMUL_CONTINUE;
- down_read(¤t->mm->mmap_sem);
+ down_read(&vcpu->kvm->slots_lock);
while (bytes) {
gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
unsigned offset = addr & (PAGE_SIZE-1);
@@ -1592,7 +1592,7 @@ int emulator_read_std(unsigned long addr,
addr += tocopy;
}
out:
- up_read(¤t->mm->mmap_sem);
+ up_read(&vcpu->kvm->slots_lock);
return r;
}
EXPORT_SYMBOL_GPL(emulator_read_std);
@@ -1611,9 +1611,9 @@ static int emulator_read_emulated(unsigned long addr,
return X86EMUL_CONTINUE;
}
- down_read(¤t->mm->mmap_sem);
+ down_read(&vcpu->kvm->slots_lock);
gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
- up_read(¤t->mm->mmap_sem);
+ up_read(&vcpu->kvm->slots_lock);
/* For APIC access vmexit */
if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
@@ -1651,14 +1651,14 @@ static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
{
int ret;
- down_read(¤t->mm->mmap_sem);
+ down_read(&vcpu->kvm->slots_lock);
ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
if (ret < 0) {
- up_read(¤t->mm->mmap_sem);
+ up_read(&vcpu->kvm->slots_lock);
return 0;
}
kvm_mmu_pte_write(vcpu, gpa, val, bytes);
- up_read(¤t->mm->mmap_sem);
+ up_read(&vcpu->kvm->slots_lock);
return 1;
}
@@ -1670,9 +1670,9 @@ static int emulator_write_emulated_onepage(unsigned long addr,
struct kvm_io_device *mmio_dev;
gpa_t gpa;
- down_read(¤t->mm->mmap_sem);
+ down_read(&vcpu->kvm->slots_lock);
gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
- up_read(¤t->mm->mmap_sem);
+ up_read(&vcpu->kvm->slots_lock);
if (gpa == UNMAPPED_GVA) {
kvm_inject_page_fault(vcpu, addr, 2);
@@ -1749,7 +1749,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
char *kaddr;
u64 val;
- down_read(¤t->mm->mmap_sem);
+ down_read(&kvm->slots_lock);
gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
if (gpa == UNMAPPED_GVA ||
@@ -1760,13 +1760,17 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
goto emul_write;
val = *(u64 *)new;
+
+ down_read(¤t->mm->mmap_sem);
page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+ up_read(¤t->mm->mmap_sem);
+
kaddr = kmap_atomic(page, KM_USER0);
set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
kunmap_atomic(kaddr, KM_USER0);
kvm_release_page_dirty(page);
emul_write:
- up_read(¤t->mm->mmap_sem);
+ up_read(&kvm->slots_lock);
}
#endif
@@ -2159,10 +2163,10 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
kvm_x86_ops->skip_emulated_instruction(vcpu);
for (i = 0; i < nr_pages; ++i) {
- down_read(¤t->mm->mmap_sem);
+ down_read(&vcpu->kvm->slots_lock);
page = gva_to_page(vcpu, address + i * PAGE_SIZE);
vcpu->arch.pio.guest_pages[i] = page;
- up_read(¤t->mm->mmap_sem);
+ up_read(&vcpu->kvm->slots_lock);
if (!page) {
kvm_inject_gp(vcpu, 0);
free_pio_guest_pages(vcpu);
@@ -2485,8 +2489,9 @@ static void vapic_enter(struct kvm_vcpu *vcpu)
down_read(¤t->mm->mmap_sem);
page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
- vcpu->arch.apic->vapic_page = page;
up_read(¤t->mm->mmap_sem);
+
+ vcpu->arch.apic->vapic_page = page;
}
static void vapic_exit(struct kvm_vcpu *vcpu)
@@ -2957,9 +2962,9 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
gpa_t gpa;
vcpu_load(vcpu);
- down_read(¤t->mm->mmap_sem);
+ down_read(&vcpu->kvm->slots_lock);
gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr);
- up_read(¤t->mm->mmap_sem);
+ up_read(&vcpu->kvm->slots_lock);
tr->physical_address = gpa;
tr->valid = gpa != UNMAPPED_GVA;
tr->writeable = 1;
@@ -3232,11 +3237,13 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
*/
if (!user_alloc) {
if (npages && !old.rmap) {
+ down_read(¤t->mm->mmap_sem);
memslot->userspace_addr = do_mmap(NULL, 0,
npages * PAGE_SIZE,
PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_ANONYMOUS,
0);
+ up_read(¤t->mm->mmap_sem);
if (IS_ERR((void *)memslot->userspace_addr))
return PTR_ERR((void *)memslot->userspace_addr);
@@ -3244,8 +3251,10 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
if (!old.user_alloc && old.rmap) {
int ret;
+ down_read(¤t->mm->mmap_sem);
ret = do_munmap(current->mm, old.userspace_addr,
old.npages * PAGE_SIZE);
+ up_read(¤t->mm->mmap_sem);
if (ret < 0)
printk(KERN_WARNING
"kvm_vm_ioctl_set_memory_region: "
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index a84c912..b90ca36 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -109,6 +109,7 @@ struct kvm_memory_slot {
struct kvm {
struct mutex lock; /* protects the vcpus array and APIC accesses */
spinlock_t mmu_lock;
+ struct rw_semaphore slots_lock;
struct mm_struct *mm; /* userspace tied to this vm */
int nmemslots;
struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS +
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 4da308e..8ea19d5 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -169,6 +169,7 @@ static struct kvm *kvm_create_vm(void)
kvm_io_bus_init(&kvm->pio_bus);
mutex_init(&kvm->lock);
kvm_io_bus_init(&kvm->mmio_bus);
+ init_rwsem(&kvm->slots_lock);
spin_lock(&kvm_lock);
list_add(&kvm->vm_list, &vm_list);
spin_unlock(&kvm_lock);
@@ -339,9 +340,9 @@ int kvm_set_memory_region(struct kvm *kvm,
{
int r;
- down_write(¤t->mm->mmap_sem);
+ down_write(&kvm->slots_lock);
r = __kvm_set_memory_region(kvm, mem, user_alloc);
- up_write(¤t->mm->mmap_sem);
+ up_write(&kvm->slots_lock);
return r;
}
EXPORT_SYMBOL_GPL(kvm_set_memory_region);
@@ -989,7 +990,9 @@ static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
if (!kvm_is_visible_gfn(kvm, vmf->pgoff))
return VM_FAULT_SIGBUS;
+
page = gfn_to_page(kvm, vmf->pgoff);
+
if (is_error_page(page)) {
kvm_release_page_clean(page);
return VM_FAULT_SIGBUS;
--
1.5.3.6
[-- Attachment #3: Type: text/plain, Size: 228 bytes --]
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
[-- Attachment #4: Type: text/plain, Size: 158 bytes --]
_______________________________________________
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel
next reply other threads:[~2008-02-11 15:12 UTC|newest]
Thread overview: 3+ messages / expand[flat|nested] mbox.gz Atom feed top
2008-02-11 15:12 Izik Eidus [this message]
2008-02-11 17:08 ` [PATCH] replace the slots lock from the mmap_sem to private kvm lock Marcelo Tosatti
2008-02-11 17:26 ` Izik Eidus
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=47B065EA.7010805@qumranet.com \
--to=izike@qumranet.com \
--cc=avi@qumranet.com \
--cc=kvm-devel@lists.sourceforge.net \
--cc=mtosatti@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.