From mboxrd@z Thu Jan 1 00:00:00 1970
From: Marcelo Tosatti
Subject: Re: KVM: MMU: nuke shadowed pgtable pages and pte's on memslot destruction
Date: Mon, 7 Jul 2008 16:58:22 -0300
Message-ID: <20080707195822.GA16787@dmt.cnet>
References: <20080704010618.GA25834@dmt.cnet> <486FAE8A.2050308@qumranet.com> <20080705192344.GA16596@dmt.cnet> <486FE48C.7030002@qumranet.com> <20080707173155.GB10372@dmt.cnet>
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Cc: "Richard W.M. Jones" , kvm-devel
To: Avi Kivity
Return-path:
Received: from mx1.redhat.com ([66.187.233.31]:51425 "EHLO mx1.redhat.com"
	rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP
	id S1755970AbYGGT66 (ORCPT ); Mon, 7 Jul 2008 15:58:58 -0400
Content-Disposition: inline
In-Reply-To: <20080707173155.GB10372@dmt.cnet>
Sender: kvm-owner@vger.kernel.org
List-ID:

On Mon, Jul 07, 2008 at 02:31:55PM -0300, Marcelo Tosatti wrote:
> On Sun, Jul 06, 2008 at 12:15:56AM +0300, Avi Kivity wrote:
> > Marcelo Tosatti wrote:
> >> On Sat, Jul 05, 2008 at 08:25:30PM +0300, Avi Kivity wrote:
> >>
> >>>> @@ -1955,6 +1955,22 @@ void kvm_mmu_slot_remove_write_access(st
> >>>>  	}
> >>>>  }
> >>>> +int kvm_mmu_slot_has_shadowed_page(struct kvm *kvm, int slot)
> >>>> +{
> >>>> +	struct kvm_mmu_page *sp;
> >>>> +	int ret = 0;
> >>>> +
> >>>> +	spin_lock(&kvm->mmu_lock);
> >>>> +	list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
> >>>> +		if (test_bit(slot, &sp->slot_bitmap)) {
> >>>> +			ret = -EINVAL;
> >>>> +			break;
> >>>> +		}
> >>>> +	}
> >>>> +	spin_unlock(&kvm->mmu_lock);
> >>>> +	return ret;
> >>>> +}
> >>>> +
> >>>>
> >>> I don't like the guest influencing host actions in this way. It's
> >>> just a guest.
> >>>
> >>> But I think it's unneeded. kvm_mmu_zap_page() will mark a root
> >>> shadow page invalid and force all vcpus to reload it, so all that's
> >>> needed is to keep the mmu spinlock held while removing the slot.
> >>>
> >>
> >> You're still keeping a shadowed page around with sp->gfn pointing to
> >> a non-existent memslot. The code generally makes the assumption that
> >> gfn_to_memslot(gfn) on shadowed info will not fail.
> >>
> >> kvm_mmu_zap_page -> unaccount_shadowed, for example.
> >>
> >
> > The page has already been zapped, so we might as well
> > unaccount_shadowed() on the first run. It needs to be moved until after
> > the reload_remote_mmus() call, though.

Oops, the previous patch was unaccounting multiple times for invalid
pages. This should be better:

During RH6.2 graphical installation the following oops is triggered:

BUG: unable to handle kernel NULL pointer dereference at 0000000000000000
IP: [] :kvm:gfn_to_rmap+0x3e/0x61
Pid: 4559, comm: qemu-system-x86 Not tainted

The problem is that KVM allows shadow pagetable entries that point to a
removed memslot to exist. In this case the Cirrus VRAM mapping was
removed, and the NULL dereference happened during
kvm_set_memory_alias()'s zap_all_pages().

So nuke all shadowed pages before memslot removal.
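For reference, the oops location fits: gfn_to_rmap() assumes that
gfn_to_memslot() always finds a slot for a shadowed gfn. A simplified
sketch of that path (paraphrased from arch/x86/kvm/mmu.c of this era,
not the exact code):

static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_memory_slot *slot;

	/* Returns NULL once the slot has been removed. */
	slot = gfn_to_memslot(kvm, gfn);

	/* base_gfn sits at offset 0 in struct kvm_memory_slot, so
	 * this read faults at address 0 when slot == NULL, matching
	 * the oops above. */
	return &slot->rmap[gfn - slot->base_gfn];
}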
Signed-off-by: Marcelo Tosatti

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index a4cf4a2..76259da 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1455,6 +1455,10 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
 	return 0;
 }
 
+int kvm_arch_destroy_memory_region(struct kvm *kvm, int slot)
+{
+	return 0;
+}
 
 long kvm_arch_dev_ioctl(struct file *filp,
 			unsigned int ioctl, unsigned long arg)
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index b850d24..07d69cf 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -170,6 +170,11 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
 	return 0;
 }
 
+int kvm_arch_destroy_memory_region(struct kvm *kvm, int slot)
+{
+	return 0;
+}
+
 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
 {
 	struct kvm_vcpu *vcpu;
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 399acf3..201a7e1 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -675,6 +675,11 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
 	return 0;
 }
 
+int kvm_arch_destroy_memory_region(struct kvm *kvm, int slot)
+{
+	return 0;
+}
+
 gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
 {
 	return gfn;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 1fd8e3b..89cd1cf 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -930,14 +930,17 @@ static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 	}
 	kvm_mmu_page_unlink_children(kvm, sp);
 	if (!sp->root_count) {
-		if (!sp->role.metaphysical)
+		if (!sp->role.metaphysical && !sp->role.invalid)
 			unaccount_shadowed(kvm, sp->gfn);
 		hlist_del(&sp->hash_link);
 		kvm_mmu_free_page(kvm, sp);
 	} else {
+		int invalid = sp->role.invalid;
 		list_move(&sp->link, &kvm->arch.active_mmu_pages);
 		sp->role.invalid = 1;
 		kvm_reload_remote_mmus(kvm);
+		if (!sp->role.metaphysical && !invalid)
+			unaccount_shadowed(kvm, sp->gfn);
 	}
 	kvm_mmu_reset_last_pte_updated(kvm);
 }
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5a83c3b..8815f1d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4040,6 +4040,12 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
 	return 0;
 }
 
+int kvm_arch_destroy_memory_region(struct kvm *kvm, int slot)
+{
+	kvm_mmu_zap_all(kvm);
+	return 0;
+}
+
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 {
 	return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index d220b49..0fc7ddc 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -168,6 +168,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
 				struct kvm_userspace_memory_region *mem,
 				struct kvm_memory_slot old,
 				int user_alloc);
+int kvm_arch_destroy_memory_region(struct kvm *kvm, int slot);
 gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn);
 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index b90da0b..5ef3a5e 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -405,6 +405,12 @@ int __kvm_set_memory_region(struct kvm *kvm,
 	if (mem->slot >= kvm->nmemslots)
 		kvm->nmemslots = mem->slot + 1;
 
+	if (!npages) {
+		r = kvm_arch_destroy_memory_region(kvm, mem->slot);
+		if (r)
+			goto out_free;
+	}
+
 	*memslot = new;
 
 	r = kvm_arch_set_memory_region(kvm, mem, old, user_alloc);
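
For completeness, the path that reaches the new hook is slot deletion
from userspace: KVM_SET_USER_MEMORY_REGION with a zero memory_size,
which makes npages == 0 in __kvm_set_memory_region() above. A minimal
userspace sketch (delete_memslot is a hypothetical helper for
illustration, not part of the patch):

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Zero memory_size deletes the slot; with this patch,
 * __kvm_set_memory_region() zaps all shadow pages via
 * kvm_arch_destroy_memory_region() before the slot is replaced. */
static int delete_memslot(int vm_fd, __u32 slot)
{
	struct kvm_userspace_memory_region region = {
		.slot = slot,
		.flags = 0,
		.guest_phys_addr = 0,
		.memory_size = 0,
		.userspace_addr = 0,
	};

	return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
}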