public inbox for kvm@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH] KVM: MMU: Fix rmap_remove() race
@ 2008-03-26 15:02 Avi Kivity
  2008-03-26 15:15 ` Avi Kivity
                   ` (3 more replies)
  0 siblings, 4 replies; 21+ messages in thread
From: Avi Kivity @ 2008-03-26 15:02 UTC (permalink / raw)
  To: Marcelo Tosatti, Andrea Arcangeli; +Cc: kvm-devel

Andrea notes that freeing the page before flushing the tlb is a race, as the
guest can sneak in one last write before the tlb is flushed, writing to a
page that may belong to someone else.

Fix by reversing the order of freeing and flushing the tlb.  Since the tlb
flush is expensive, queue the pages to be freed so we need to flush just once.

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/mmu.c         |   66 +++++++++++++++++++++++++++++++++++++++----
 include/asm-x86/kvm_host.h |    8 +++++
 2 files changed, 67 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 95c12bc..2033879 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -536,6 +536,52 @@ static void rmap_desc_remove_entry(unsigned long *rmapp,
 	mmu_free_rmap_desc(desc);
 }
 
+static void rmap_remove_complete(struct kvm *kvm)
+{
+	int i;
+	struct page *page;
+
+	if (!kvm->arch.nr_page_release_queue)
+		return;
+	kvm_flush_remote_tlbs(kvm);
+	for (i = 0; i < kvm->arch.nr_page_release_queue; ++i) {
+		page = kvm->arch.page_release_queue[i].page;
+		if (kvm->arch.page_release_queue[i].dirty)
+			kvm_release_page_dirty(page);
+		else
+			kvm_release_page_clean(page);
+	}
+	kvm->arch.nr_page_release_queue = 0;
+}
+
+static void kvm_release_page_deferred(struct kvm *kvm,
+				      struct page *page,
+				      bool dirty)
+{
+	int i;
+
+	i = kvm->arch.nr_page_release_queue;
+	if (i == KVM_MAX_PAGE_RELEASE_QUEUE) {
+		rmap_remove_complete(kvm);
+		i = 0;
+	}
+	kvm->arch.page_release_queue[i].page = page;
+	kvm->arch.page_release_queue[i].dirty = dirty;
+	kvm->arch.nr_page_release_queue = i + 1;
+}
+
+static void kvm_release_page_dirty_deferred(struct kvm *kvm,
+					    struct page *page)
+{
+	kvm_release_page_deferred(kvm, page, true);
+}
+
+static void kvm_release_page_clean_deferred(struct kvm *kvm,
+					    struct page *page)
+{
+	kvm_release_page_deferred(kvm, page, false);
+}
+
 static void rmap_remove(struct kvm *kvm, u64 *spte)
 {
 	struct kvm_rmap_desc *desc;
@@ -551,9 +597,9 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
 	page = spte_to_page(*spte);
 	mark_page_accessed(page);
 	if (is_writeble_pte(*spte))
-		kvm_release_page_dirty(page);
+		kvm_release_page_dirty_deferred(kvm, page);
 	else
-		kvm_release_page_clean(page);
+		kvm_release_page_clean_deferred(kvm, page);
 	rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], is_large_pte(*spte));
 	if (!*rmapp) {
 		printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
@@ -585,6 +631,12 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
 	}
 }
 
+static void rmap_remove_one(struct kvm *kvm, u64 *spte)
+{
+	rmap_remove(kvm, spte);
+	rmap_remove_complete(kvm);
+}
+
 static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
 {
 	struct kvm_rmap_desc *desc;
@@ -650,7 +702,7 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
 		BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
 		pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
 		if (is_writeble_pte(*spte)) {
-			rmap_remove(kvm, spte);
+			rmap_remove_one(kvm, spte);
 			--kvm->stat.lpages;
 			set_shadow_pte(spte, shadow_trap_nonpresent_pte);
 			write_protected = 1;
@@ -872,7 +924,7 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
 				rmap_remove(kvm, &pt[i]);
 			pt[i] = shadow_trap_nonpresent_pte;
 		}
-		kvm_flush_remote_tlbs(kvm);
+		rmap_remove_complete(kvm);
 		return;
 	}
 
@@ -891,7 +943,7 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
 		}
 		pt[i] = shadow_trap_nonpresent_pte;
 	}
-	kvm_flush_remote_tlbs(kvm);
+	rmap_remove_complete(kvm);
 }
 
 static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
@@ -1048,7 +1100,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 
 	if (is_rmap_pte(*shadow_pte)) {
 		if (page != spte_to_page(*shadow_pte))
-			rmap_remove(vcpu->kvm, shadow_pte);
+			rmap_remove_one(vcpu->kvm, shadow_pte);
 		else
 			was_rmapped = 1;
 	}
@@ -1583,7 +1635,7 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
 	if (is_shadow_present_pte(pte)) {
 		if (sp->role.level == PT_PAGE_TABLE_LEVEL ||
 		    is_large_pte(pte))
-			rmap_remove(vcpu->kvm, spte);
+			rmap_remove_one(vcpu->kvm, spte);
 		else {
 			child = page_header(pte & PT64_BASE_ADDR_MASK);
 			mmu_page_remove_parent_pte(child, spte);
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index 4382ca0..8a71616 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -77,6 +77,8 @@
 #define KVM_REFILL_PAGES 25
 #define KVM_MAX_CPUID_ENTRIES 40
 
+#define KVM_MAX_PAGE_RELEASE_QUEUE 16
+
 extern spinlock_t kvm_lock;
 extern struct list_head vm_list;
 
@@ -312,6 +314,12 @@ struct kvm_arch{
 	struct kvm_ioapic *vioapic;
 	struct kvm_pit *vpit;
 
+	int nr_page_release_queue;
+	struct {
+		struct page *page;
+		bool dirty;
+	} page_release_queue[KVM_MAX_PAGE_RELEASE_QUEUE];
+
 	int round_robin_prev_vcpu;
 	unsigned int tss_addr;
 	struct page *apic_access_page;
-- 
1.5.4.2


-------------------------------------------------------------------------
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH] KVM: MMU: Fix rmap_remove() race
  2008-03-26 15:02 [PATCH] KVM: MMU: Fix rmap_remove() race Avi Kivity
@ 2008-03-26 15:15 ` Avi Kivity
  2008-03-26 17:51 ` Marcelo Tosatti
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 21+ messages in thread
From: Avi Kivity @ 2008-03-26 15:15 UTC (permalink / raw)
  To: Marcelo Tosatti, Andrea Arcangeli; +Cc: kvm-devel

Avi Kivity wrote:
> Andrea notes that freeing the page before flushing the tlb is a race, as the
> guest can sneak in one last write before the tlb is flushed, writing to a
> page that may belong to someone else.
>
> Fix be reversing the order of freeing and flushing the tlb.  Since the tlb
> flush is expensive, queue the pages to be freed so we need to flush just once.
>
>   

This was missing a conversion of rmap_remove() to rmap_remove_one() in 
paging_tmpl.h.  I fixed it in my tree.

But, a review would still be appreciated.

-- 
error compiling committee.c: too many arguments to function


-------------------------------------------------------------------------
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] KVM: MMU: Fix rmap_remove() race
  2008-03-26 15:02 [PATCH] KVM: MMU: Fix rmap_remove() race Avi Kivity
  2008-03-26 15:15 ` Avi Kivity
@ 2008-03-26 17:51 ` Marcelo Tosatti
  2008-03-26 18:12   ` Andrea Arcangeli
  2008-03-26 19:22 ` Andrea Arcangeli
  2008-03-27 15:26 ` Andi Kleen
  3 siblings, 1 reply; 21+ messages in thread
From: Marcelo Tosatti @ 2008-03-26 17:51 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel, Andrea Arcangeli

On Wed, Mar 26, 2008 at 05:02:53PM +0200, Avi Kivity wrote:
> Andrea notes that freeing the page before flushing the tlb is a race, as the
> guest can sneak in one last write before the tlb is flushed, writing to a
> page that may belong to someone else.
> 
> Fix be reversing the order of freeing and flushing the tlb.  Since the tlb
> flush is expensive, queue the pages to be freed so we need to flush just once.


Andrea previously wrote (which I somehow missed):

>> This case is safe because the path that frees a pte and subsequently
>> a page will take care of flushing the TLB of any remote CPU's that
>> possibly have it cached (before freeing the page, of course).
>> ptep_clear_flush->flush_tlb_page.
>>
>> Am I missing something?

> flush_tlb_page only takes care of accesses to the page through qemu
> task when outside vmx/svm mode. If the other physical cpu, is running
> some code of some guest vcpu in vmx/svm mode after the race window is
> open and before it is closed the race will trigger.

Nope. If a physical CPU has page translations cached it _must_ be
running in the context of a qemu thread (does not matter if its in
userspace or executing guest code). The bit corresponding to such CPU's
will be set in mm->cpu_vm_mask and flush_tlb_page() will take care of
flushing the TLBs appropriately.

That is, in the perspective of the generic TLB flushing code, there is
no distinction between page translations cached via the qemu process
mappings or "shadow guest pages" (you can view shadow guest pages as a
subset of qemu process translations in this context).

So the patch is unnecessary and adds additional IPI TLB flushes
(especially in mmu_pte_write_zap_pte path where the need_remote_flush
optimization essentially vanishes).

> Signed-off-by: Avi Kivity <avi@qumranet.com>
> ---
>  arch/x86/kvm/mmu.c         |   66 +++++++++++++++++++++++++++++++++++++++----
>  include/asm-x86/kvm_host.h |    8 +++++
>  2 files changed, 67 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 95c12bc..2033879 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -536,6 +536,52 @@ static void rmap_desc_remove_entry(unsigned long *rmapp,
>  	mmu_free_rmap_desc(desc);
>  }
>  
> +static void rmap_remove_complete(struct kvm *kvm)
> +{
> +	int i;
> +	struct page *page;
> +
> +	if (!kvm->arch.nr_page_release_queue)
> +		return;
> +	kvm_flush_remote_tlbs(kvm);
> +	for (i = 0; i < kvm->arch.nr_page_release_queue; ++i) {
> +		page = kvm->arch.page_release_queue[i].page;
> +		if (kvm->arch.page_release_queue[i].dirty)
> +			kvm_release_page_dirty(page);
> +		else
> +			kvm_release_page_clean(page);
> +	}
> +	kvm->arch.nr_page_release_queue = 0;
> +}
> +
> +static void kvm_release_page_deferred(struct kvm *kvm,
> +				      struct page *page,
> +				      bool dirty)
> +{
> +	int i;
> +
> +	i = kvm->arch.nr_page_release_queue;
> +	if (i == KVM_MAX_PAGE_RELEASE_QUEUE) {
> +		rmap_remove_complete(kvm);
> +		i = 0;
> +	}
> +	kvm->arch.page_release_queue[i].page = page;
> +	kvm->arch.page_release_queue[i].dirty = dirty;
> +	kvm->arch.nr_page_release_queue = i + 1;
> +}
> +
> +static void kvm_release_page_dirty_deferred(struct kvm *kvm,
> +					    struct page *page)
> +{
> +	kvm_release_page_deferred(kvm, page, true);
> +}
> +
> +static void kvm_release_page_clean_deferred(struct kvm *kvm,
> +					    struct page *page)
> +{
> +	kvm_release_page_deferred(kvm, page, true);
> +}
> +
>  static void rmap_remove(struct kvm *kvm, u64 *spte)
>  {
>  	struct kvm_rmap_desc *desc;
> @@ -551,9 +597,9 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
>  	page = spte_to_page(*spte);
>  	mark_page_accessed(page);
>  	if (is_writeble_pte(*spte))
> -		kvm_release_page_dirty(page);
> +		kvm_release_page_dirty_deferred(kvm, page);
>  	else
> -		kvm_release_page_clean(page);
> +		kvm_release_page_clean_deferred(kvm, page);
>  	rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], is_large_pte(*spte));
>  	if (!*rmapp) {
>  		printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
> @@ -585,6 +631,12 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
>  	}
>  }
>  
> +static void rmap_remove_one(struct kvm *kvm, u64 *spte)
> +{
> +	rmap_remove(kvm, spte);
> +	rmap_remove_complete(kvm);
> +}
> +
>  static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
>  {
>  	struct kvm_rmap_desc *desc;
> @@ -650,7 +702,7 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
>  		BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
>  		pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
>  		if (is_writeble_pte(*spte)) {
> -			rmap_remove(kvm, spte);
> +			rmap_remove_one(kvm, spte);
>  			--kvm->stat.lpages;
>  			set_shadow_pte(spte, shadow_trap_nonpresent_pte);
>  			write_protected = 1;
> @@ -872,7 +924,7 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
>  				rmap_remove(kvm, &pt[i]);
>  			pt[i] = shadow_trap_nonpresent_pte;
>  		}
> -		kvm_flush_remote_tlbs(kvm);
> +		rmap_remove_complete(kvm);
>  		return;
>  	}
>  
> @@ -891,7 +943,7 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
>  		}
>  		pt[i] = shadow_trap_nonpresent_pte;
>  	}
> -	kvm_flush_remote_tlbs(kvm);
> +	rmap_remove_complete(kvm);
>  }
>  
>  static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
> @@ -1048,7 +1100,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
>  
>  	if (is_rmap_pte(*shadow_pte)) {
>  		if (page != spte_to_page(*shadow_pte))
> -			rmap_remove(vcpu->kvm, shadow_pte);
> +			rmap_remove_one(vcpu->kvm, shadow_pte);
>  		else
>  			was_rmapped = 1;
>  	}
> @@ -1583,7 +1635,7 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
>  	if (is_shadow_present_pte(pte)) {
>  		if (sp->role.level == PT_PAGE_TABLE_LEVEL ||
>  		    is_large_pte(pte))
> -			rmap_remove(vcpu->kvm, spte);
> +			rmap_remove_one(vcpu->kvm, spte);
>  		else {
>  			child = page_header(pte & PT64_BASE_ADDR_MASK);
>  			mmu_page_remove_parent_pte(child, spte);
> diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
> index 4382ca0..8a71616 100644
> --- a/include/asm-x86/kvm_host.h
> +++ b/include/asm-x86/kvm_host.h
> @@ -77,6 +77,8 @@
>  #define KVM_REFILL_PAGES 25
>  #define KVM_MAX_CPUID_ENTRIES 40
>  
> +#define KVM_MAX_PAGE_RELEASE_QUEUE 16
> +
>  extern spinlock_t kvm_lock;
>  extern struct list_head vm_list;
>  
> @@ -312,6 +314,12 @@ struct kvm_arch{
>  	struct kvm_ioapic *vioapic;
>  	struct kvm_pit *vpit;
>  
> +	int nr_page_release_queue;
> +	struct {
> +		struct page *page;
> +		bool dirty;
> +	} page_release_queue[KVM_MAX_PAGE_RELEASE_QUEUE];
> +
>  	int round_robin_prev_vcpu;
>  	unsigned int tss_addr;
>  	struct page *apic_access_page;
> -- 
> 1.5.4.2

-------------------------------------------------------------------------
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] KVM: MMU: Fix rmap_remove() race
  2008-03-26 17:51 ` Marcelo Tosatti
@ 2008-03-26 18:12   ` Andrea Arcangeli
  2008-03-26 19:01     ` Marcelo Tosatti
  2008-03-27  8:01     ` Avi Kivity
  0 siblings, 2 replies; 21+ messages in thread
From: Andrea Arcangeli @ 2008-03-26 18:12 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm-devel, Avi Kivity

On Wed, Mar 26, 2008 at 02:51:28PM -0300, Marcelo Tosatti wrote:
> Nope. If a physical CPU has page translations cached it _must_ be
> running in the context of a qemu thread (does not matter if its in
> userspace or executing guest code). The bit corresponding to such CPU's
> will be set in mm->vm_cpu_mask and flush_tlb_page() will take care of
> flushing the TLBs appropriately.

That would require the tlb to lookup any SVM/VMX virtualized address
that could point to the same physical page that is pointed by the
invlpg linux virtual address. So it might be feasible if the hardware
is using a physical tag on each spte translation, but it'd require the
tlb to do a lot more work than we need it to do, so I hope you're
wrong on the hardware side of things. That would be a hardware
complexity or slowdown that would provide no benefit to software.

But regardless, even if this was the case and you're right that
invalidating a linux userland address invalidates atomically all other
virtualized addresses in the svm/vmx tlbs (all asn included, not just
one), the spte is still instantiated when flush_tlb_page runs on all
cpus. So just after the global tlb flush, a guest spte tlb-miss (no
page fault required, no get_user_pages required) can happen that will
re-instantiate the spte contents inside the tlb before flush_tlb_page
returns.

	CPU0					CPU 1
	pte_clear() inside ptep_clear_flush
	flush_tlb_page inside ptep_clear_flush inside rmap
	page_count = 1
						guest tlb miss
						tlb entry is back!
						ioctl()
						mark spte nonpresent
						rmap_remove -> put_page
						tlb flush

Any tlb flush happening before clearing the shadow-pte entry is
totally useless.

Avi's patch is a great fix, and it will need further changes to properly
fix this race, because many set_shadow_pte/and pt[i] = nonpresent, are
executed _after_ rmap_remove (they must be executed first, in case the
array is full and we have flush tlb and free the page).

-------------------------------------------------------------------------
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] KVM: MMU: Fix rmap_remove() race
  2008-03-26 18:12   ` Andrea Arcangeli
@ 2008-03-26 19:01     ` Marcelo Tosatti
  2008-03-27  8:01     ` Avi Kivity
  1 sibling, 0 replies; 21+ messages in thread
From: Marcelo Tosatti @ 2008-03-26 19:01 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Marcelo Tosatti, kvm-devel, Avi Kivity

On Wed, Mar 26, 2008 at 07:12:25PM +0100, Andrea Arcangeli wrote:
> On Wed, Mar 26, 2008 at 02:51:28PM -0300, Marcelo Tosatti wrote:
> > Nope. If a physical CPU has page translations cached it _must_ be
> > running in the context of a qemu thread (does not matter if its in
> > userspace or executing guest code). The bit corresponding to such CPU's
> > will be set in mm->vm_cpu_mask and flush_tlb_page() will take care of
> > flushing the TLBs appropriately.
> 
> That would require the tlb to lookup any SVM/VMX virtualized address
> that could point to the same physical page that is pointed by the
> invlpg linux virtual address. So it might be feasible if the hardware
> is using a physical tag on each spte translation, but it'd require the
> tlb to do a lot more work than we need it to do, so I hope you're
> wrong on the hardware side of things. That would be an hardware
> complexity or slowdown that would provide no benefit to software.
> 
> But regardless, even if this was the case and you're right that
> invalidating a linux userland address invalidates atomically all other
> virtualized addresses in the svm/vmx tlbs (all asn included, not just
> one), the spte is still instantiated when flush_tlb_page runs on all
> cpus. 

You're right, there is no guarantee that a full TLB flush will happen,
and invlpg for the qemu task virtual address is not enough.

> So just after the global tlb flush, a guest spte tlb-miss (no page
> fault required, no get_user_pages required) can happen that will
> re-instantiate the spte contents inside the tlb before flush_tlb_page
> returns.
> 
> 	CPU0					CPU 1
> 	pte_clear() inside ptep_clear_flush
> 	flush_tlb_page inside ptep_clear_flush inside rmap
> 	page_count = 1
> 						guest tlb miss
> 						tlb entry is back!
> 						ioctl()
> 						mark spte nonpresent
> 						rmap_remove -> put_page
> 						tlb flush
> 
> Any tlb flush happening before clearing the shadow-pte entry is
> totally useless.
> 
> Avi patch is great fix, and it will need furtherx changes to properly
> fix this race, because many set_shadow_pte/and pt[i] = nonpresent, are
> executed _after_ rmap_remove (they must be executed first, in case the
> array is full and we have flush tlb and free the page).

One comment on the patch:

+static void kvm_release_page_clean_deferred(struct kvm *kvm,
+                                           struct page *page)
+{
+       kvm_release_page_deferred(kvm, page, true);
+}

                                             false



-------------------------------------------------------------------------
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] KVM: MMU: Fix rmap_remove() race
  2008-03-26 15:02 [PATCH] KVM: MMU: Fix rmap_remove() race Avi Kivity
  2008-03-26 15:15 ` Avi Kivity
  2008-03-26 17:51 ` Marcelo Tosatti
@ 2008-03-26 19:22 ` Andrea Arcangeli
  2008-03-26 19:27   ` Andrea Arcangeli
  2008-03-27 15:26 ` Andi Kleen
  3 siblings, 1 reply; 21+ messages in thread
From: Andrea Arcangeli @ 2008-03-26 19:22 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel, Marcelo Tosatti

On Wed, Mar 26, 2008 at 05:02:53PM +0200, Avi Kivity wrote:
> Andrea notes that freeing the page before flushing the tlb is a race, as the
> guest can sneak in one last write before the tlb is flushed, writing to a
> page that may belong to someone else.
> 
> Fix be reversing the order of freeing and flushing the tlb.  Since the tlb
> flush is expensive, queue the pages to be freed so we need to flush just once.

Problem is the *spte = nonpresent, must now happen _before_
rmap_remove for this race to be fixed. So we either remove the
is_rmap_pte from the top of rmap_remove and we reorder all the
callers, or we apply this.

I'm not sure anymore the mmu notifiers are enough to fix this, because
what happens if invalidate_page runs after rmap_remove is returned
(the spte isn't visible anymore by the rmap code and in turn by
invalidate_page) but before the set_shadow_pte(nonpresent) runs.

Index: arch/x86/kvm/mmu.c
--- arch/x86/kvm/mmu.c.~1~	2008-03-26 16:55:14.000000000 +0100
+++ arch/x86/kvm/mmu.c	2008-03-26 19:57:53.000000000 +0100
@@ -582,7 +582,7 @@ static void kvm_release_page_clean_defer
 	kvm_release_page_deferred(kvm, page, true);
 }
 
-static void rmap_remove(struct kvm *kvm, u64 *spte)
+static void rmap_remove(struct kvm *kvm, u64 *sptep)
 {
 	struct kvm_rmap_desc *desc;
 	struct kvm_rmap_desc *prev_desc;
@@ -590,35 +590,37 @@ static void rmap_remove(struct kvm *kvm,
 	struct page *page;
 	unsigned long *rmapp;
 	int i;
+	u64 spte = *sptep;
 
-	if (!is_rmap_pte(*spte))
+	if (!is_rmap_pte(spte))
 		return;
-	sp = page_header(__pa(spte));
-	page = spte_to_page(*spte);
+	sp = page_header(__pa(sptep));
+	page = spte_to_page(spte);
 	mark_page_accessed(page);
-	if (is_writeble_pte(*spte))
+	set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
+	if (is_writeble_pte(spte))
 		kvm_release_page_dirty_deferred(kvm, page);
 	else
 		kvm_release_page_clean_deferred(kvm, page);
-	rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], is_large_pte(*spte));
+	rmapp = gfn_to_rmap(kvm, sp->gfns[sptep - sp->spt], is_large_pte(spte));
 	if (!*rmapp) {
-		printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
+		printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", sptep, spte);
 		BUG();
 	} else if (!(*rmapp & 1)) {
-		rmap_printk("rmap_remove:  %p %llx 1->0\n", spte, *spte);
-		if ((u64 *)*rmapp != spte) {
+		rmap_printk("rmap_remove:  %p %llx 1->0\n", sptep, spte);
+		if ((u64 *)*rmapp != sptep) {
 			printk(KERN_ERR "rmap_remove:  %p %llx 1->BUG\n",
-			       spte, *spte);
+			       sptep, spte);
 			BUG();
 		}
 		*rmapp = 0;
 	} else {
-		rmap_printk("rmap_remove:  %p %llx many->many\n", spte, *spte);
+		rmap_printk("rmap_remove:  %p %llx many->many\n", sptep, spte);
 		desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
 		prev_desc = NULL;
 		while (desc) {
 			for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
-				if (desc->shadow_ptes[i] == spte) {
+				if (desc->shadow_ptes[i] == sptep) {
 					rmap_desc_remove_entry(rmapp,
 							       desc, i,
 							       prev_desc);


-------------------------------------------------------------------------
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] KVM: MMU: Fix rmap_remove() race
  2008-03-26 19:22 ` Andrea Arcangeli
@ 2008-03-26 19:27   ` Andrea Arcangeli
  2008-03-27  8:06     ` Avi Kivity
  2008-03-27  8:11     ` Avi Kivity
  0 siblings, 2 replies; 21+ messages in thread
From: Andrea Arcangeli @ 2008-03-26 19:27 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel, Marcelo Tosatti

On Wed, Mar 26, 2008 at 08:22:31PM +0100, Andrea Arcangeli wrote:
> what happens if invalidate_page runs after rmap_remove is returned
> (the spte isn't visible anymore by the rmap code and in turn by
> invalidate_page) but before the set_shadow_pte(nonpresent) runs.

Thinking some more the mmu_lock is meant to prevent this. So
invalidate_page should wait. As long as the kvm tlb flush happens
inside the mmu lock we should be safe.

Fixing it with mmu notifiers is the higher performance way too. This
would be the patch if we decide to do that.

Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 95c12bc..80cf172 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -550,6 +550,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
 	sp = page_header(__pa(spte));
 	page = spte_to_page(*spte);
 	mark_page_accessed(page);
+	BUG_ON(page_count(page) <= 1);
 	if (is_writeble_pte(*spte))
 		kvm_release_page_dirty(page);
 	else
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 30bf832..a49987c 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -741,6 +741,10 @@ static struct vm_operations_struct kvm_vcpu_vm_ops = {
 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	vma->vm_ops = &kvm_vcpu_vm_ops;
+#ifndef CONFIG_MMU_NOTIFIER
+	/* prevent the VM from releasing pages under spte mappings */
+	vma->vm_flags |= VM_LOCKED;
+#endif
 	return 0;
 }
 

-------------------------------------------------------------------------
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH] KVM: MMU: Fix rmap_remove() race
  2008-03-26 18:12   ` Andrea Arcangeli
  2008-03-26 19:01     ` Marcelo Tosatti
@ 2008-03-27  8:01     ` Avi Kivity
  1 sibling, 0 replies; 21+ messages in thread
From: Avi Kivity @ 2008-03-27  8:01 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Marcelo Tosatti, kvm-devel

Andrea Arcangeli wrote:
> Any tlb flush happening before clearing the shadow-pte entry is
> totally useless.
>
>   

Doh!  Well that's easy to fix, by moving the spte clear to rmap_remove().

-- 
error compiling committee.c: too many arguments to function


-------------------------------------------------------------------------
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] KVM: MMU: Fix rmap_remove() race
  2008-03-26 19:27   ` Andrea Arcangeli
@ 2008-03-27  8:06     ` Avi Kivity
  2008-03-27  8:11     ` Avi Kivity
  1 sibling, 0 replies; 21+ messages in thread
From: Avi Kivity @ 2008-03-27  8:06 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: kvm-devel, Marcelo Tosatti

Andrea Arcangeli wrote:
> On Wed, Mar 26, 2008 at 08:22:31PM +0100, Andrea Arcangeli wrote:
>   
>> what happens if invalidate_page runs after rmap_remove is returned
>> (the spte isn't visible anymore by the rmap code and in turn by
>> invalidate_page) but before the set_shadow_pte(nonpresent) runs.
>>     
>
> Thinking some more the mmu_lock is meant to prevent this. So
> invalidate_page should wait. As long as the kvm tlb flush happens
> inside the mmu lock we should be safe.
>
> Fixing it with mmu notifiers is the higher performance way too. This
> would be the patch if we decide to do that.
>
>   

Well, obviously mmu notifiers is the future and we should code for that, 
instead of increasing code complexity.


> Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>
>
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 95c12bc..80cf172 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -550,6 +550,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
>  	sp = page_header(__pa(spte));
>  	page = spte_to_page(*spte);
>  	mark_page_accessed(page);
> +	BUG_ON(page_count(page) <= 1);
>  	if (is_writeble_pte(*spte))
>  		kvm_release_page_dirty(page);
>  	else
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 30bf832..a49987c 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -741,6 +741,10 @@ static struct vm_operations_struct kvm_vcpu_vm_ops = {
>  static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
>  {
>  	vma->vm_ops = &kvm_vcpu_vm_ops;
> +#ifndef CONFIG_MMU_NOTIFIER
> +	/* prevent the VM to release pages under sptes mappings */
> +	vma->vm_flags |= VM_LOCKED;
> +#endif
>  	return 0;
>  }
>  
>   

That's sad, but I guess the only safe and simple option is to queue this 
for 2.6.25 and remove it in 2.6.26.

-- 
error compiling committee.c: too many arguments to function


-------------------------------------------------------------------------
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] KVM: MMU: Fix rmap_remove() race
  2008-03-26 19:27   ` Andrea Arcangeli
  2008-03-27  8:06     ` Avi Kivity
@ 2008-03-27  8:11     ` Avi Kivity
  2008-03-27 13:52       ` Andrea Arcangeli
  1 sibling, 1 reply; 21+ messages in thread
From: Avi Kivity @ 2008-03-27  8:11 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: kvm-devel, Marcelo Tosatti

Andrea Arcangeli wrote:
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 30bf832..a49987c 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -741,6 +741,10 @@ static struct vm_operations_struct kvm_vcpu_vm_ops = {
>  static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
>  {
>  	vma->vm_ops = &kvm_vcpu_vm_ops;
> +#ifndef CONFIG_MMU_NOTIFIER
> +	/* prevent the VM to release pages under sptes mappings */
> +	vma->vm_flags |= VM_LOCKED;
> +#endif
>  	return 0;
>  }
>  
>   

Erm I don't think this means what you think it means.  This is the 
kernel/user communication area, used to pass exit data to userspace.  
It's not the memslot vma.

-- 
error compiling committee.c: too many arguments to function


-------------------------------------------------------------------------
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] KVM: MMU: Fix rmap_remove() race
  2008-03-27  8:11     ` Avi Kivity
@ 2008-03-27 13:52       ` Andrea Arcangeli
  2008-03-27 13:56         ` Avi Kivity
  0 siblings, 1 reply; 21+ messages in thread
From: Andrea Arcangeli @ 2008-03-27 13:52 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel, Marcelo Tosatti

On Thu, Mar 27, 2008 at 10:11:42AM +0200, Avi Kivity wrote:
> Erm I don't think this means what you think it means.  This is the 
> kernel/user communication area, used to pass exit data to userspace.  It's 
> not the memslot vma.

Yep... only the kvm_vm_vm_ops can run gfn_to_page, and I assume that
was used by the old userland and not relevant anymore (smaps don't
show it either), or the pages could not be unmapped. So btw, that
kvm-vm anon inode must be disabled when mmu notifiers are configured
in to guarantee that access to /dev/kvm won't lead to mlock behavior.

-------------------------------------------------------------------------
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] KVM: MMU: Fix rmap_remove() race
  2008-03-27 13:52       ` Andrea Arcangeli
@ 2008-03-27 13:56         ` Avi Kivity
  2008-03-27 14:26           ` Andrea Arcangeli
  0 siblings, 1 reply; 21+ messages in thread
From: Avi Kivity @ 2008-03-27 13:56 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: kvm-devel, Marcelo Tosatti

Andrea Arcangeli wrote:
> On Thu, Mar 27, 2008 at 10:11:42AM +0200, Avi Kivity wrote:
>   
>> Erm I don't think this means what you think it means.  This is the 
>> kernel/user communication area, used to pass exit data to userspace.  It's 
>> not the memslot vma.
>>     
>
> Yep... only the kvm_vm_vm_ops can run gfn_to_page, and I assume that
> was used by the old userland and not relevant anymore (smaps don't
> show it either), or the pages could not be unmapped. So btw, that
> kvm-vm anon inode must be disabled when mmu notifiers are configured
> in to guarantee that access to /dev/kvm won't lead to mlock behavior.
>   

That's not good.  We need to support the older userspace, for a while yet.

Why is there a problem? IIRC it's just anonymous memory.

-- 
error compiling committee.c: too many arguments to function


-------------------------------------------------------------------------
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] KVM: MMU: Fix rmap_remove() race
  2008-03-27 13:56         ` Avi Kivity
@ 2008-03-27 14:26           ` Andrea Arcangeli
  2008-03-27 14:35             ` Avi Kivity
  0 siblings, 1 reply; 21+ messages in thread
From: Andrea Arcangeli @ 2008-03-27 14:26 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel, Marcelo Tosatti

On Thu, Mar 27, 2008 at 03:56:56PM +0200, Avi Kivity wrote:
> That's not good.  We need to support the older userspace, for a while yet.
>
> Why is there a problem? IIRC it's just anonymous memory.

Problem is that for it to be unmapped __do_fault must call
page_add_new_anon_rmap on it. Even anon would be 1, that would mean
cow and that's clearly not what you want as it would lose the
visibility on the future guest writes. anon will be 0 if qemu is
reading for example, leaving pinned anonymous memory that mmu
notifiers won't be able to swap anymore. This is about providing the
guarantee to the admin, that if he enables the mmu notifiers in the
kernel, giving access to /dev/kvm won't give full mlock privileges
too. Not sure if anybody will care, but it's a bit like removing the
limit on tmpfs that only half ram can be pinned.

-------------------------------------------------------------------------
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] KVM: MMU: Fix rmap_remove() race
  2008-03-27 14:26           ` Andrea Arcangeli
@ 2008-03-27 14:35             ` Avi Kivity
  2008-03-27 14:50               ` Andrea Arcangeli
  0 siblings, 1 reply; 21+ messages in thread
From: Avi Kivity @ 2008-03-27 14:35 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: kvm-devel, Marcelo Tosatti

Andrea Arcangeli wrote:
> On Thu, Mar 27, 2008 at 03:56:56PM +0200, Avi Kivity wrote:
>   
>> That's not good.  We need to support the older userspace, for a while yet.
>>
>> Why is there a problem? IIRC it's just anonymous memory.
>>     
>
> Problem is that for it to be unmapped __do_fault must call
> page_add_new_anon_rmap on it. Even anon would be 1, that would mean
> cow and that's clearly not what you want as it would lose the
> visibility on the future guest writes. anon will be 0 if qemu is
> reading for example, leaving pinned anonymous memory that mmu
> notifiers won't be able to swap anymore. This is about providing the
> guarantee to the admin, that if he enables the mmu notifiers in the
> kernel, giving access to /dev/kvm won't give full mlock privileges
> too. Not sure if anybody will care, but it's a bit like removing the
> limit on tmpfs that only half ram can be pinned.
>   

We can put it under a Kconfig option.

-- 
error compiling committee.c: too many arguments to function


-------------------------------------------------------------------------
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] KVM: MMU: Fix rmap_remove() race
  2008-03-27 14:35             ` Avi Kivity
@ 2008-03-27 14:50               ` Andrea Arcangeli
  2008-03-27 14:56                 ` Avi Kivity
  2008-03-28 14:01                 ` Andrea Arcangeli
  0 siblings, 2 replies; 21+ messages in thread
From: Andrea Arcangeli @ 2008-03-27 14:50 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel, Marcelo Tosatti

On Thu, Mar 27, 2008 at 04:35:55PM +0200, Avi Kivity wrote:
> We can put it under a Kconfig option.

Yes, I thought the MMU_NOTIFIER knob would be enough but we can of
course add more fine-grained options if we think it worth to allow
unprivileged users to own /dev/kvm.

-------------------------------------------------------------------------
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] KVM: MMU: Fix rmap_remove() race
  2008-03-27 14:50               ` Andrea Arcangeli
@ 2008-03-27 14:56                 ` Avi Kivity
  2008-03-28 14:01                 ` Andrea Arcangeli
  1 sibling, 0 replies; 21+ messages in thread
From: Avi Kivity @ 2008-03-27 14:56 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: kvm-devel, Marcelo Tosatti

Andrea Arcangeli wrote:
> On Thu, Mar 27, 2008 at 04:35:55PM +0200, Avi Kivity wrote:
>   
>> We can put it under a Kconfig option.
>>     
>
> Yes, I thought the MMU_NOTIFIER knob would be enough but we can of
> course add more fine-grained options if we think it worth to allow
> unprivileged users to own /dev/kvm.
>   

With mmu notifiers and the shrinker, there is no reason not to allow 
ordinary users access to /dev/kvm, so I think it's worthwhile.

-- 
error compiling committee.c: too many arguments to function


-------------------------------------------------------------------------
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] KVM: MMU: Fix rmap_remove() race
  2008-03-26 15:02 [PATCH] KVM: MMU: Fix rmap_remove() race Avi Kivity
                   ` (2 preceding siblings ...)
  2008-03-26 19:22 ` Andrea Arcangeli
@ 2008-03-27 15:26 ` Andi Kleen
  3 siblings, 0 replies; 21+ messages in thread
From: Andi Kleen @ 2008-03-27 15:26 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel, Marcelo Tosatti, Andrea Arcangeli

Avi Kivity <avi@qumranet.com> writes:

(thought I sent a reply before, but I don't see it now. Sorry if you
see it twice)

> Andrea notes that freeing the page before flushing the tlb is a race, as the
> guest can sneak in one last write before the tlb is flushed, writing to a
> page that may belong to someone else.
> 
> Fix be reversing the order of freeing and flushing the tlb.  Since the tlb
> flush is expensive, queue the pages to be freed so we need to flush just once.

You have to do the same for the page tables too, because several modern
CPUs cache the higher level of the page tables and only invalidate the
cache on any TLB flush. Strictly it is only needed for the higher levels,
but doing it for all is safer.

-Andi

-------------------------------------------------------------------------
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] KVM: MMU: Fix rmap_remove() race
  2008-03-27 14:50               ` Andrea Arcangeli
  2008-03-27 14:56                 ` Avi Kivity
@ 2008-03-28 14:01                 ` Andrea Arcangeli
  2008-03-28 20:07                   ` Andrea Arcangeli
  2008-03-31  6:35                   ` Avi Kivity
  1 sibling, 2 replies; 21+ messages in thread
From: Andrea Arcangeli @ 2008-03-28 14:01 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel, Marcelo Tosatti

I thought some more about this.

BTW, for completeness: normally (with exception of vm_destroy) the
put_page run by rmap_remove won't be the last one, but still the page
can go in the freelist a moment after put_page runs (leading to the
same problem). The VM is prevented to free the page while it's pinned,
but the VM can do the final free on the page before rmap_remove
returns. And w/o mmu notifiers there's no serialization that makes the
core VM stop on the mmu_lock to wait the tlb flush to run, before the
VM finally executes the last free of the page. mmu notifiers fixes
this race for regular swapping as the core VM will block on the
mmu_lock waiting the tlb flush (for this to work the tlb flush must
always happen inside the mmu_lock unless the order is exactly "spte =
nonpresent; tlbflush; put_page"). A VM_LOCKED on the vmas backing the
anonymous memory will fix this for regular swapping too (I did
something like this in a patch at the end as a band-aid).

But thinking more the moment we pretend to allow anybody to randomly
__munmap__ any part of the guest physical address space like for
ballooning while the guest runs (think unprivileged user owning
/dev/kvm and running munmap at will), not even VM_LOCKED (ignored by
munmap) and not even the mmu notifiers, can prevent the page to be
queued in the kernel freelists immediately after rmap_remove returns,
this is because rmap_remove may run in a different host-cpu in between
unmap_vmas and invalidate_range_end.

Running the ioctl before munmap won't help to prevent the race as the
guest can still re-instantiate the sptes with page faults between the
ioctl and munmap.

However we've invalidate_range_begin. If we invalidate all sptes in
invalidate_range_begin and we hold off the page faults in between
_begin/_end, then we can fix this with the mmu notifiers.

So I think I can allow munmap safely (to unprivileged user too) by
using _range_begin somehow. For this to work any relevant tlb flush
must happen inside the _same_ mmu_lock critical section where
spte=nonpresent and rmap_remove run too (thanks to the mmu_lock the
ordering of those operations won't matter anymore, and no queue will
be needed).

-------

Temporary band-aid to prevent the VM to do the final free on the page
immediately after put_page run inside rmap_remove, and before the tlb
flush (mmu_lock shall be released always after the tlb flush for
future approaches to be safe). This still can't allow safe (safe as
guest not able to memory corrupt the host) munmap in the middle of the
guest physical memory. Only a combination of both _range_begin and
_range_end mmu notifier will allow that, unless rmap_remove callers
are all changed to do safe "spte=nonpresent; tlbflush; put_page"
ordering, then it'll always be safe, but it'll be slower as there will
be more tlb flushes than needed.

Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index caa9f94..5343216 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3887,8 +3887,11 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
 			memslot->userspace_addr = do_mmap(NULL, 0,
 						     npages * PAGE_SIZE,
 						     PROT_READ | PROT_WRITE,
-						     MAP_SHARED | MAP_ANONYMOUS,
-						     0);
+						     MAP_SHARED | MAP_ANONYMOUS
+#ifndef CONFIG_MMU_NOTIFIER
+						     |MAP_LOCKED
+#endif
+						     , 0);
 			up_write(&current->mm->mmap_sem);
 
 			if (IS_ERR((void *)memslot->userspace_addr))
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 30bf832..ed65e29 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -250,6 +250,27 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
+#ifndef CONFIG_MMU_NOTIFIER
+static int memslot_mlock(unsigned long start, unsigned long len)
+{
+	struct vm_area_struct * vma;
+	unsigned long end = start+len;
+
+	vma = find_vma(current->mm, start);
+	if (!vma || vma->vm_start > start)
+		return -ENOMEM;
+
+	/* go simple and don't split vmas */
+	for (;vma && vma->vm_start < end; vma = vma->vm_next) {
+		vma->vm_flags |= VM_LOCKED;
+		start = vma->vm_end;
+	}
+	if (start < end)
+		return -ENOMEM;
+	return 0;
+}
+#endif
+
 /*
  * Allocate some memory and give it an address in the guest physical address
  * space.
@@ -271,8 +292,12 @@ int __kvm_set_memory_region(struct kvm *kvm,
 
 	r = -EINVAL;
 	/* General sanity checks */
+	if (mem->userspace_addr & (PAGE_SIZE - 1))
+		goto out;
 	if (mem->memory_size & (PAGE_SIZE - 1))
 		goto out;
+	if (mem->userspace_addr + mem->memory_size < mem->userspace_addr)
+		goto out;
 	if (mem->guest_phys_addr & (PAGE_SIZE - 1))
 		goto out;
 	if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
@@ -318,6 +343,12 @@ int __kvm_set_memory_region(struct kvm *kvm,
 
 	/* Allocate if a slot is being created */
 	if (npages && !new.rmap) {
+#ifndef CONFIG_MMU_NOTIFIER
+		if (user_alloc &&
+		    memslot_mlock(mem->userspace_addr, mem->memory_size))
+			goto out_free;
+#endif
+
 		new.rmap = vmalloc(npages * sizeof(struct page *));
 
 		if (!new.rmap)

-------------------------------------------------------------------------
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH] KVM: MMU: Fix rmap_remove() race
  2008-03-28 14:01                 ` Andrea Arcangeli
@ 2008-03-28 20:07                   ` Andrea Arcangeli
  2008-03-31  6:35                   ` Avi Kivity
  1 sibling, 0 replies; 21+ messages in thread
From: Andrea Arcangeli @ 2008-03-28 20:07 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel, Marcelo Tosatti

On Fri, Mar 28, 2008 at 03:01:13PM +0100, Andrea Arcangeli wrote:
> @@ -271,8 +292,12 @@ int __kvm_set_memory_region(struct kvm *kvm,
>  
>  	r = -EINVAL;
>  	/* General sanity checks */
> +	if (mem->userspace_addr & (PAGE_SIZE - 1))
> +		goto out;
>  	if (mem->memory_size & (PAGE_SIZE - 1))
>  		goto out;
> +	if (mem->userspace_addr + mem->memory_size < mem->userspace_addr)
> +		goto out;

Those above (I suppose they're needed even if not strictly related to
the VM_LOCKED) broke vmx (only my laptop is vmx so I noticed it after
submission sorry!, the testing I did on svm test system before
submission worked fine and kvm was oom killed w/o swapping as
expected). I figured out the above likely broke older userland for the
same reason it broke vmx, so a new patch follows that should works for
vmx and should allow older userspace to still work and in VM_LOCKED
safe mode.

I'll try to make a new mmu notifier patch soon that will allow munmap
on a guest live gphys address space for the first time so ballooning
will be safe. Swapping is already safe and allowed with the current
kvm-swapping patch using the mmu notifiers, thanks to the mmu_lock
taken by invalidate_page that serializes the VM against
"spin_lock(mmu_lock); rmap_remove; tlbflush; spin_unlock(mmu_lock)".

Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 30bf832..8ece406 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -250,6 +250,24 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
+#ifndef CONFIG_MMU_NOTIFIER
+static void memslot_mlock(unsigned long start, unsigned long len)
+{
+	struct vm_area_struct * vma;
+	unsigned long end = start+len;
+
+	vma = find_vma(current->mm, start);
+	if (!vma || vma->vm_start > start)
+		return;
+
+	/* go simple and don't split vmas */
+	for (;vma && vma->vm_start < end; vma = vma->vm_next) {
+		vma->vm_flags |= VM_LOCKED;
+		start = vma->vm_end;
+	}
+}
+#endif
+
 /*
  * Allocate some memory and give it an address in the guest physical address
  * space.
@@ -273,6 +291,10 @@ int __kvm_set_memory_region(struct kvm *kvm,
 	/* General sanity checks */
 	if (mem->memory_size & (PAGE_SIZE - 1))
 		goto out;
+	if (user_alloc &&
+	    ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
+	     (mem->userspace_addr + mem->memory_size < mem->userspace_addr)))
+		goto out;
 	if (mem->guest_phys_addr & (PAGE_SIZE - 1))
 		goto out;
 	if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
@@ -366,6 +388,10 @@ int __kvm_set_memory_region(struct kvm *kvm,
 		goto out_free;
 	}
 
+#ifndef CONFIG_MMU_NOTIFIER
+	memslot_mlock(mem->userspace_addr, mem->memory_size);
+#endif
+
 	kvm_free_physmem_slot(&old, &new);
 	return 0;
 

-------------------------------------------------------------------------
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH] KVM: MMU: Fix rmap_remove() race
  2008-03-28 14:01                 ` Andrea Arcangeli
  2008-03-28 20:07                   ` Andrea Arcangeli
@ 2008-03-31  6:35                   ` Avi Kivity
  2008-03-31  9:25                     ` Andrea Arcangeli
  1 sibling, 1 reply; 21+ messages in thread
From: Avi Kivity @ 2008-03-31  6:35 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: kvm-devel, Marcelo Tosatti

Andrea Arcangeli wrote:
> I thought some more about this.
>
> BTW, for completeness: normally (with exception of vm_destroy) the
> put_page run by rmap_remove won't be the last one, but still the page
> can go in the freelist a moment after put_page runs (leading to the
> same problem). The VM is prevented to free the page while it's pinned,
> but the VM can do the final free on the page before rmap_remove
> returns. And w/o mmu notifiers there's no serialization that makes the
> core VM stop on the mmu_lock to wait the tlb flush to run, before the
> VM finally executes the last free of the page. mmu notifiers fixes
> this race for regular swapping as the core VM will block on the
> mmu_lock waiting the tlb flush (for this to work the tlb flush must
> always happen inside the mmu_lock unless the order is exactly "spte =
> nonpresent; tlbflush; put_page"). A VM_LOCKED on the vmas backing the
> anonymous memory will fix this for regular swapping too (I did
> something like this in a patch at the end as a band-aid).
>
> But thinking more the moment we pretend to allow anybody to randomly
> __munmap__ any part of the guest physical address space like for
> ballooning while the guest runs (think unprivileged user owning
> /dev/kvm and running munmap at will), not even VM_LOCKED (ignored by
> munmap) and not even the mmu notifiers, can prevent the page to be
> queued in the kernel freelists immediately after rmap_remove returns,
> this is because rmap_remove may run in a different host-cpu in between
> unmap_vmas and invalidate_range_end.
>
> Running the ioctl before munmap won't help to prevent the race as the
> guest can still re-instantiate the sptes with page faults between the
> ioctl and munmap.
>
> However we've invalidate_range_begin. If we invalidate all sptes in
> invalidate_range_begin and we hold off the page faults in between
> _begin/_end, then we can fix this with the mmu notifiers.
>   

This can be done by taking mmu_lock in _begin and releasing it in _end, 
unless there's a lock dependency issue.

> So I think I can allow munmap safely (to unprivileged user too) by
> using _range_begin somehow. For this to work any relevant tlb flush
> must happen inside the _same_ mmu_lock critical section where
> spte=nonpresent and rmap_remove run too (thanks to the mmu_lock the
> ordering of those operations won't matter anymore, and no queue will
> be needed).
>
>   

I don't understand your conclusion: you prove that mlock() is not good 
enough, then post a patch to do it?

I'll take another shot at fixing rmap_remove(), I don't like to cripple 
swapping for 2.6.25 (though it will only be really dependable in .26).

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.


-------------------------------------------------------------------------
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] KVM: MMU: Fix rmap_remove() race
  2008-03-31  6:35                   ` Avi Kivity
@ 2008-03-31  9:25                     ` Andrea Arcangeli
  0 siblings, 0 replies; 21+ messages in thread
From: Andrea Arcangeli @ 2008-03-31  9:25 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel, Marcelo Tosatti

On Mon, Mar 31, 2008 at 09:35:00AM +0300, Avi Kivity wrote:
> This can be done by taking mmu_lock in _begin and releasing it in _end, 
> unless there's a lock dependency issue.

The main problem is if we want to be able to co-exist with XPMEM methods
registered in the same notifier chain for the same MM with the KVM
methods. The ideal would be to solve the race with a non-blocking lock
like seqlock.

> I don't understand your conclusion: you prove that mlock() is not good 
> enough, then post a patch to do it?

mlock isn't good enough to allow munmap/madvise(don't need). So mlock
fixes the race in the current kvm code, but only unless you use
ballooning. This is because VM_LOCKED should be ignored by
madvise(don't need). But at least this is only a trouble for smp
guest. It'd require rmap_remove to run on a different physical cpu
while another qemu thread runs madvise. So supposedly with an up
guest, the guest won't run rmap_remove while madvise runs. To better
explain the race, if we could take the mmu_lock around madvise that
would fix it for smp guest too (however currently it's userland
calling into madvise so that's not feasible with the current model).

> I'll take another shot at fixing rmap_remove(), I don't like to cripple 
> swapping for 2.6.25 (though it will only be really dependable in .26).

Ok! Clearly it would look more robust if rmap_remove is capable of
doing the last free on the page and it won't relay on the page not
to be freed until mmu_lock is released.

-------------------------------------------------------------------------
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace

^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2008-03-31  9:25 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-03-26 15:02 [PATCH] KVM: MMU: Fix rmap_remove() race Avi Kivity
2008-03-26 15:15 ` Avi Kivity
2008-03-26 17:51 ` Marcelo Tosatti
2008-03-26 18:12   ` Andrea Arcangeli
2008-03-26 19:01     ` Marcelo Tosatti
2008-03-27  8:01     ` Avi Kivity
2008-03-26 19:22 ` Andrea Arcangeli
2008-03-26 19:27   ` Andrea Arcangeli
2008-03-27  8:06     ` Avi Kivity
2008-03-27  8:11     ` Avi Kivity
2008-03-27 13:52       ` Andrea Arcangeli
2008-03-27 13:56         ` Avi Kivity
2008-03-27 14:26           ` Andrea Arcangeli
2008-03-27 14:35             ` Avi Kivity
2008-03-27 14:50               ` Andrea Arcangeli
2008-03-27 14:56                 ` Avi Kivity
2008-03-28 14:01                 ` Andrea Arcangeli
2008-03-28 20:07                   ` Andrea Arcangeli
2008-03-31  6:35                   ` Avi Kivity
2008-03-31  9:25                     ` Andrea Arcangeli
2008-03-27 15:26 ` Andi Kleen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox