public inbox for kvm@vger.kernel.org
 help / color / mirror / Atom feed
* KVM: MMU: add KVM_ZAP_GFN ioctl
@ 2008-03-12 22:29 Marcelo Tosatti
  2008-03-20 12:09 ` Avi Kivity
  0 siblings, 1 reply; 10+ messages in thread
From: Marcelo Tosatti @ 2008-03-12 22:29 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel


Add an ioctl to zap all mappings to a given gfn. This allows userspace
to remove the QEMU process mappings and the page without causing
inconsistency.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>


diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index f0cdfba..c41464f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -642,6 +642,67 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
 	account_shadowed(kvm, gfn);
 }
 
+static void rmap_nuke(struct kvm *kvm, u64 gfn)
+{
+	unsigned long *rmapp;
+	u64 *spte;
+	int nuked = 0;
+
+	gfn = unalias_gfn(kvm, gfn);
+	rmapp = gfn_to_rmap(kvm, gfn, 0);
+
+	spte = rmap_next(kvm, rmapp, NULL);
+	while (spte) {
+		BUG_ON(!spte);
+		BUG_ON(!(*spte & PT_PRESENT_MASK));
+		rmap_printk("rmap_nuke: spte %p %llx\n", spte, *spte);
+		rmap_remove(kvm, spte);
+		set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+                nuked = 1;
+		spte = rmap_next(kvm, rmapp, spte);
+	}
+	/* check for huge page mappings */
+	rmapp = gfn_to_rmap(kvm, gfn, 1);
+	spte = rmap_next(kvm, rmapp, NULL);
+	while (spte) {
+		BUG_ON(!spte);
+		BUG_ON(!(*spte & PT_PRESENT_MASK));
+		BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
+		pgprintk("rmap_nuke(large): spte %p %llx %lld\n", spte, *spte, gfn);
+		rmap_remove(kvm, spte);
+		--kvm->stat.lpages;
+		set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+		nuked = 1;
+		spte = rmap_next(kvm, rmapp, spte);
+	}
+
+	if (nuked)
+		kvm_flush_remote_tlbs(kvm);
+}
+
+int kvm_zap_single_gfn(struct kvm *kvm, gfn_t gfn)
+{
+	unsigned long addr;
+	int have_mmu_notifiers = 0;
+
+	down_read(&kvm->slots_lock);
+	addr = gfn_to_hva(kvm, gfn);
+
+	if (kvm_is_error_hva(addr)) {
+		up_read(&kvm->slots_lock);
+		return -EINVAL;
+	}
+
+	if (!have_mmu_notifiers) {
+		spin_lock(&kvm->mmu_lock);
+		rmap_nuke(kvm, gfn);
+		spin_unlock(&kvm->mmu_lock);
+	}
+	up_read(&kvm->slots_lock);
+
+	return 0;
+}
+
 #ifdef MMU_DEBUG
 static int is_empty_shadow_page(u64 *spt)
 {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e65a9d6..d982ca1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -816,6 +816,9 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_NR_MEMSLOTS:
 		r = KVM_MEMORY_SLOTS;
 		break;
+	case KVM_CAP_ZAP_GFN:
+		r = 1;
+		break;
 	default:
 		r = 0;
 		break;
@@ -1636,6 +1639,15 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		r = 0;
 		break;
 	}
+	case KVM_ZAP_GFN: {
+		gfn_t gfn;
+
+		r = -EFAULT;
+		if (copy_from_user(&gfn, argp, sizeof gfn))
+			goto out;
+		r = kvm_zap_single_gfn(kvm, gfn);
+		break;
+        } 
 	default:
 		;
 	}
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index 024b57c..4e45bd2 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -425,6 +425,7 @@ void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
 void kvm_mmu_zap_all(struct kvm *kvm);
+int  kvm_zap_single_gfn(struct kvm *kvm, gfn_t gfn);
 unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
 
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index e92e703..9ea714f 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -236,6 +236,7 @@ struct kvm_vapic_addr {
 #define KVM_CAP_CLOCKSOURCE 8
 #define KVM_CAP_NR_VCPUS 9       /* returns max vcpus per vm */
 #define KVM_CAP_NR_MEMSLOTS 10   /* returns max memory slots per vm */
+#define KVM_CAP_ZAP_GFN	11
 
 /*
  * ioctls for VM fds
@@ -258,6 +259,7 @@ struct kvm_vapic_addr {
 #define KVM_IRQ_LINE		  _IOW(KVMIO, 0x61, struct kvm_irq_level)
 #define KVM_GET_IRQCHIP		  _IOWR(KVMIO, 0x62, struct kvm_irqchip)
 #define KVM_SET_IRQCHIP		  _IOR(KVMIO,  0x63, struct kvm_irqchip)
+#define KVM_ZAP_GFN	   	  _IOR(KVMIO,  0x64, unsigned long)
 
 /*
  * ioctls for vcpu fds

-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: KVM: MMU: add KVM_ZAP_GFN ioctl
  2008-03-12 22:29 KVM: MMU: add KVM_ZAP_GFN ioctl Marcelo Tosatti
@ 2008-03-20 12:09 ` Avi Kivity
  2008-03-21 11:42   ` Andrea Arcangeli
  0 siblings, 1 reply; 10+ messages in thread
From: Avi Kivity @ 2008-03-20 12:09 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm-devel, Andrea Arcangeli

Marcelo Tosatti wrote:
> Add an ioctl to zap all mappings to a given gfn. This allows userspace
> remove the QEMU process mappings and the page without causing
> inconsistency.
>
>   

I'm thinking of committing rmap_nuke() to kvm.git, and the rest to the 
external module, since this is only needed on kernels without mmu notifiers.

Andrea, is rmap_nuke() suitable for the mmu notifiers pte clear callback?


Oh, and a single gfn may have multiple hvas, so we need to iterate over 
something here.

> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
>
>
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index f0cdfba..c41464f 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -642,6 +642,67 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
>  	account_shadowed(kvm, gfn);
>  }
>  
> +static void rmap_nuke(struct kvm *kvm, u64 gfn)
> +{
> +	unsigned long *rmapp;
> +	u64 *spte;
> +	int nuked = 0;
> +
> +	gfn = unalias_gfn(kvm, gfn);
> +	rmapp = gfn_to_rmap(kvm, gfn, 0);
> +
> +	spte = rmap_next(kvm, rmapp, NULL);
> +	while (spte) {
> +		BUG_ON(!spte);
> +		BUG_ON(!(*spte & PT_PRESENT_MASK));
> +		rmap_printk("rmap_nuke: spte %p %llx\n", spte, *spte);
> +		rmap_remove(kvm, spte);
> +		set_shadow_pte(spte, shadow_trap_nonpresent_pte);
> +                nuked = 1;
> +		spte = rmap_next(kvm, rmapp, spte);
> +	}
> +	/* check for huge page mappings */
> +	rmapp = gfn_to_rmap(kvm, gfn, 1);
> +	spte = rmap_next(kvm, rmapp, NULL);
> +	while (spte) {
> +		BUG_ON(!spte);
> +		BUG_ON(!(*spte & PT_PRESENT_MASK));
> +		BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
> +		pgprintk("rmap_nuke(large): spte %p %llx %lld\n", spte, *spte, gfn);
> +		rmap_remove(kvm, spte);
> +		--kvm->stat.lpages;
> +		set_shadow_pte(spte, shadow_trap_nonpresent_pte);
> +		nuked = 1;
> +		spte = rmap_next(kvm, rmapp, spte);
> +	}
> +
> +	if (nuked)
> +		kvm_flush_remote_tlbs(kvm);
> +}
> +
> +int kvm_zap_single_gfn(struct kvm *kvm, gfn_t gfn)
> +{
> +	unsigned long addr;
> +	int have_mmu_notifiers = 0;
> +
> +	down_read(&kvm->slots_lock);
> +	addr = gfn_to_hva(kvm, gfn);
> +
> +	if (kvm_is_error_hva(addr)) {
> +		up_read(&kvm->slots_lock);
> +		return -EINVAL;
> +	}
> +
> +	if (!have_mmu_notifiers) {
> +		spin_lock(&kvm->mmu_lock);
> +		rmap_nuke(kvm, gfn);
> +		spin_unlock(&kvm->mmu_lock);
> +	}
> +	up_read(&kvm->slots_lock);
> +
> +	return 0;
> +}
> +
>  #ifdef MMU_DEBUG
>  static int is_empty_shadow_page(u64 *spt)
>  {
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index e65a9d6..d982ca1 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -816,6 +816,9 @@ int kvm_dev_ioctl_check_extension(long ext)
>  	case KVM_CAP_NR_MEMSLOTS:
>  		r = KVM_MEMORY_SLOTS;
>  		break;
> +	case KVM_CAP_ZAP_GFN:
> +		r = 1;
> +		break;
>  	default:
>  		r = 0;
>  		break;
> @@ -1636,6 +1639,15 @@ long kvm_arch_vm_ioctl(struct file *filp,
>  		r = 0;
>  		break;
>  	}
> +	case KVM_ZAP_GFN: {
> +		gfn_t gfn;
> +
> +		r = -EFAULT;
> +		if (copy_from_user(&gfn, argp, sizeof gfn))
> +			goto out;
> +		r = kvm_zap_single_gfn(kvm, gfn);
> +		break;
> +        } 
>  	default:
>  		;
>  	}
> diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
> index 024b57c..4e45bd2 100644
> --- a/include/asm-x86/kvm_host.h
> +++ b/include/asm-x86/kvm_host.h
> @@ -425,6 +425,7 @@ void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
>  int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
>  void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
>  void kvm_mmu_zap_all(struct kvm *kvm);
> +int  kvm_zap_single_gfn(struct kvm *kvm, gfn_t gfn);
>  unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
>  void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
>  
> diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> index e92e703..9ea714f 100644
> --- a/include/linux/kvm.h
> +++ b/include/linux/kvm.h
> @@ -236,6 +236,7 @@ struct kvm_vapic_addr {
>  #define KVM_CAP_CLOCKSOURCE 8
>  #define KVM_CAP_NR_VCPUS 9       /* returns max vcpus per vm */
>  #define KVM_CAP_NR_MEMSLOTS 10   /* returns max memory slots per vm */
> +#define KVM_CAP_ZAP_GFN	11
>  
>  /*
>   * ioctls for VM fds
> @@ -258,6 +259,7 @@ struct kvm_vapic_addr {
>  #define KVM_IRQ_LINE		  _IOW(KVMIO, 0x61, struct kvm_irq_level)
>  #define KVM_GET_IRQCHIP		  _IOWR(KVMIO, 0x62, struct kvm_irqchip)
>  #define KVM_SET_IRQCHIP		  _IOR(KVMIO,  0x63, struct kvm_irqchip)
> +#define KVM_ZAP_GFN	   	  _IOR(KVMIO,  0x64, unsigned long)
>  
>  /*
>   * ioctls for vcpu fds
>
> -------------------------------------------------------------------------
> This SF.net email is sponsored by: Microsoft
> Defy all challenges. Microsoft(R) Visual Studio 2008.
> http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
> _______________________________________________
> kvm-devel mailing list
> kvm-devel@lists.sourceforge.net
> https://lists.sourceforge.net/lists/listinfo/kvm-devel
>   


-- 
error compiling committee.c: too many arguments to function


-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: KVM: MMU: add KVM_ZAP_GFN ioctl
  2008-03-20 12:09 ` Avi Kivity
@ 2008-03-21 11:42   ` Andrea Arcangeli
  2008-03-21 13:37     ` Marcelo Tosatti
  0 siblings, 1 reply; 10+ messages in thread
From: Andrea Arcangeli @ 2008-03-21 11:42 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, kvm-devel

On Thu, Mar 20, 2008 at 02:09:15PM +0200, Avi Kivity wrote:
> Marcelo Tosatti wrote:
> > Add an ioctl to zap all mappings to a given gfn. This allows userspace
> > remove the QEMU process mappings and the page without causing
> > inconsistency.
> >
> >   
> 
> I'm thinking of comitting rmap_nuke() to kvm.git, and the rest to the 
> external module, since this is only needed on kernels without mmu notifiers.
> 
> Andrea, is rmap_nuke() suitable for the mmu notifiers pte clear callback?

There's the usual smp race condition. The tlb must be flushed before
the final put_page in rmap_remove. And it can't be safe to call this
ioctl before sys_munmap(), so this would be the final put_page.

My kvm_unmap_hva takes care of that.

-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: KVM: MMU: add KVM_ZAP_GFN ioctl
  2008-03-21 11:42   ` Andrea Arcangeli
@ 2008-03-21 13:37     ` Marcelo Tosatti
  2008-03-21 15:56       ` Andrea Arcangeli
  2008-03-23  8:50       ` Avi Kivity
  0 siblings, 2 replies; 10+ messages in thread
From: Marcelo Tosatti @ 2008-03-21 13:37 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Marcelo Tosatti, kvm-devel, Avi Kivity

On Fri, Mar 21, 2008 at 12:42:14PM +0100, Andrea Arcangeli wrote:
> On Thu, Mar 20, 2008 at 02:09:15PM +0200, Avi Kivity wrote:
> > Marcelo Tosatti wrote:
> > > Add an ioctl to zap all mappings to a given gfn. This allows userspace
> > > remove the QEMU process mappings and the page without causing
> > > inconsistency.
> > >
> > >   
> > 
> > I'm thinking of comitting rmap_nuke() to kvm.git, and the rest to the 
> > external module, since this is only needed on kernels without mmu notifiers.
> > 
> > Andrea, is rmap_nuke() suitable for the mmu notifiers pte clear callback?
> 
> There's the usual smp race condition. The tlb must be flushed before
> the final put_page in rmap_remove. And it can't be safe to call this
> ioctl before sys_munmap(), so this would be the final put_page.
> 
> My kvm_unmap_hva takes care of that.

This is not the final put_page().

Remote TLB's are flushed here, after rmap_remove:

+       if (nuked)
+               kvm_flush_remote_tlbs(kvm);

This ioctl is called before zap_page_range() is executed through
sys_madvise(MADV_DONTNEED) to remove the page in question.

We know that the guest will not attempt to fault in the gfn because
the virtio balloon driver is synchronous (it will only attempt to
release that page back to the guest OS once rmap_nuke+zap_page_range has
finished).

Can you be more verbose?

By the way, I don't see invalidate_begin/invalidate_end hooks in the KVM 
part of MMU notifiers V9 patch? (meaning that zap_page_range will not zap
the spte's for the pages in question).

Thanks

-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: KVM: MMU: add KVM_ZAP_GFN ioctl
  2008-03-21 13:37     ` Marcelo Tosatti
@ 2008-03-21 15:56       ` Andrea Arcangeli
  2008-03-21 21:23         ` Marcelo Tosatti
  2008-03-23  8:50       ` Avi Kivity
  1 sibling, 1 reply; 10+ messages in thread
From: Andrea Arcangeli @ 2008-03-21 15:56 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm-devel, Avi Kivity

On Fri, Mar 21, 2008 at 10:37:00AM -0300, Marcelo Tosatti wrote:
> This is not the final put_page().
> 
> Remote TLB's are flushed here, after rmap_remove:
> 
> +       if (nuked)
> +               kvm_flush_remote_tlbs(kvm);
> 
> This ioctl is called before zap_page_range() is executed through
> sys_madvise(MADV_DONTNEED) to remove the page in question.
> 
> We know that the guest will not attempt to fault in the gfn because
> the virtio balloon driver is synchronous (it will only attempt to
> release that page back to the guest OS once rmap_nuke+zap_page_range has
> finished).
> 
> Can you be more verbose?

Sure.

1) even if you run madvise(MADV_DONTNEED) after KVM_ZAP_GFN, the anon
   page can be released by the VM at any time without any kvm-aware
   lock (there will be a swap reference to it, and no more page_count
   references, leading to memory corruption in the host in the presence
   of memory pressure). This is purely theoretical of course, not sure
   if timings or probabilities allow for reproducing this in real life.

2) not sure what you mean with synchronous, do you mean single
   threaded? I can't see how it can be single threaded (does
   ballooning stop all other vcpus?). Why are you taking the mmu_lock
   around rmap_nuke if no other vcpu can take any page fault and call
   into get_user_pages in between KVM_ZAP_GFN and madvise? As far as I
   can tell the only possible safe ordering is madvise; KVM_ZAP_GFN,
   which is emulating the mmu notifier behavior incidentally.

Note that the rmap_remove smp race (also note here smp race means
smp-host race, it will trigger even if guest is UP) might be a generic
issue with the rmap_remove logic. I didn't analyze all the possible
rmap_remove callers yet (this was in my todo list), I just made sure
that my code would be smp safe.

> By the way, I don't see invalidate_begin/invalidate_end hooks in the KVM 
> part of MMU notifiers V9 patch? (meaning that zap_page_range will not zap
> the spte's for the pages in question).

range_begin isn't needed. range_begin is needed only by secondary mmu
drivers that aren't reference counting the pages. The _end callback is
below. It could be improved to skip the whole range in a single browse
of the memslots instead of browsing it for each page in the range. The
mmu notifiers aren't merged and this code may still require changes in
terms of API if EMM is merged instead of #v9 (hope not), so I tried to
keep it simple.

+void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
+				      struct mm_struct *mm,
+				      unsigned long address)
+{
+	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+	BUG_ON(mm != kvm->mm);
+	write_seqlock(&kvm->arch.mmu_notifier_invalidate_lock);
+	kvm_unmap_hva(kvm, address);
+	write_sequnlock(&kvm->arch.mmu_notifier_invalidate_lock);
+}
+
+void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
+					   struct mm_struct *mm,
+					   unsigned long start,
+					   unsigned long end)
+{
+	for (; start < end; start += PAGE_SIZE)
+		kvm_mmu_notifier_invalidate_page(mn, mm, start);
+}
+
+int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
+				       struct mm_struct *mm,
+				       unsigned long address)
+{
+	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+	BUG_ON(mm != kvm->mm);
+	return kvm_age_hva(kvm, address);
+}
+
+static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
+	.invalidate_page	= kvm_mmu_notifier_invalidate_page,
+	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
+	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
+};
+
 struct  kvm *kvm_arch_create_vm(void)
 {
 	struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);

-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: KVM: MMU: add KVM_ZAP_GFN ioctl
  2008-03-21 15:56       ` Andrea Arcangeli
@ 2008-03-21 21:23         ` Marcelo Tosatti
  2008-03-24  6:54           ` Andrea Arcangeli
  0 siblings, 1 reply; 10+ messages in thread
From: Marcelo Tosatti @ 2008-03-21 21:23 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Marcelo Tosatti, kvm-devel, Avi Kivity

On Fri, Mar 21, 2008 at 04:56:50PM +0100, Andrea Arcangeli wrote:
> On Fri, Mar 21, 2008 at 10:37:00AM -0300, Marcelo Tosatti wrote:
> > This is not the final put_page().
> > 
> > Remote TLB's are flushed here, after rmap_remove:
> > 
> > +       if (nuked)
> > +               kvm_flush_remote_tlbs(kvm);
> > 
> > This ioctl is called before zap_page_range() is executed through
> > sys_madvise(MADV_DONTNEED) to remove the page in question.
> > 
> > We know that the guest will not attempt to fault in the gfn because
> > the virtio balloon driver is synchronous (it will only attempt to
> > release that page back to the guest OS once rmap_nuke+zap_page_range has
> > finished).
> > 
> > Can you be more verbose?
> 
> Sure.
> 
> 1) even if you run madvise(MADV_DONTNEED) after KVM_ZAP_GFN, the anon
>    page can be released by the VM at any time without any kvm-aware
>    lock (there will be a swap reference to it, no any more page_count
>    references leading to memory corruption in the host in presence of
>    memory pressure). This is purely theoretical of course, not sure if
>    timings or probabilities allows for reproducing this in real life.

If there are any active shadow mappings to a page there is a guarantee
that there is a valid linux pte mapping pointing at it. So page_count ==
1 + nr_sptes.

So the theoretical race you're talking about is:


    CPU0							CPU1

    spte = rmap_next(kvm, rmapp, NULL);
    while (spte) {
            BUG_ON(!spte);
            BUG_ON(!(*spte & PT_PRESENT_MASK));
            rmap_printk("rmap_nuke: spte %p %llx\n", spte, *spte);
            rmap_remove(kvm, spte);
            set_shadow_pte(spte, shadow_trap_nonpresent_pte);
            nuked = 1;
            spte = rmap_next(kvm, rmapp, spte);
    }
    							<---------- try_to_unmap_one()
								    page is now free
								    page allocated for other
								    purposes
   if (nuked)
       kvm_flush_remote_tlbs(kvm);

And some other VCPU with the TLB cached writes to the now freed (and
possibly allocated to another purpose) page.

This case is safe because the path that frees a pte and subsequently
a page will take care of flushing the TLB of any remote CPU's that
possibly have it cached (before freeing the page, of course).
ptep_clear_flush->flush_tlb_page.

Am I missing something?

> 2) not sure what you mean with synchronous, do you mean single
>    threaded? I can't see how it can be single threaded (does
>    ballooning stops all other vcpus?). 

No, I mean synchronous as in that no other vcpu will attempt to fault
that _particular gfn_ in between KVM_ZAP_GFN and madvise.

>  Why are you taking the mmu_lock around rmap_nuke if no other vcpu
>  can take any page fault and call into get_user_pages in between
>  KVM_ZAP_GFN and madvise?

Other vcpu's can take page faults and call into get_user_pages, but not
for the gfn KVM_ZAP_GFN is operating on, because it has been allocated
by the balloon driver.

So we need mmu_lock to protect against concurrent shadow page and rmap
operations.

> As far as I
>    can tell the only possible safe ordering is madvise; KVM_ZAP_GFN,
>    which is emulating the mmu notifier behavior incidentally.
> 
> Note that the rmap_remove smp race (also note here smp race means
> smp-host race, it will trigger even if guest is UP) might be a generic
> issue with the rmap_remove logic. I didn't analyze all the possible
> rmap_remove callers yet (this was in my todo list), I just made sure
> that my code would be smp safe.

As detailed above, we have a guarantee that there is a live linux pte
by the time rmap_remove() nukes a shadow pte.

> > By the way, I don't see invalidate_begin/invalidate_end hooks in the KVM 
> > part of MMU notifiers V9 patch? (meaning that zap_page_range will not zap
> > the spte's for the pages in question).
> 
> range_begin isn't needed. range_begin is needed only by secondary mmu
> drivers that aren't reference counting the pages. The _end callback is
> below. It could be improved to skip the whole range in a single browse
> of the memslots instead of browsing it for each page in the range. The
> mmu notifiers aren't merged and this code may still require changes in
> terms of API if EMM is merged instead of #v9 (hope not), so I tried to
> keep it simple.

Oh, I missed that. Nice.

-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: KVM: MMU: add KVM_ZAP_GFN ioctl
  2008-03-21 13:37     ` Marcelo Tosatti
  2008-03-21 15:56       ` Andrea Arcangeli
@ 2008-03-23  8:50       ` Avi Kivity
  2008-03-23 20:23         ` Andrea Arcangeli
  1 sibling, 1 reply; 10+ messages in thread
From: Avi Kivity @ 2008-03-23  8:50 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm-devel, Andrea Arcangeli

Marcelo Tosatti wrote:
> On Fri, Mar 21, 2008 at 12:42:14PM +0100, Andrea Arcangeli wrote:
>   
>> On Thu, Mar 20, 2008 at 02:09:15PM +0200, Avi Kivity wrote:
>>     
>>> Marcelo Tosatti wrote:
>>>       
>>>> Add an ioctl to zap all mappings to a given gfn. This allows userspace
>>>> remove the QEMU process mappings and the page without causing
>>>> inconsistency.
>>>>
>>>>   
>>>>         
>>> I'm thinking of comitting rmap_nuke() to kvm.git, and the rest to the 
>>> external module, since this is only needed on kernels without mmu notifiers.
>>>
>>> Andrea, is rmap_nuke() suitable for the mmu notifiers pte clear callback?
>>>       
>> There's the usual smp race condition. The tlb must be flushed before
>> the final put_page in rmap_remove. And it can't be safe to call this
>> ioctl before sys_munmap(), so this would be the final put_page.
>>
>> My kvm_unmap_hva takes care of that.
>>     
>
> This is not the final put_page().
>
>   

I think it may be, when this is used as an mmu notifier callback.

btw, when we nuke an spte, don't we lose dirty bit information?  That 
doesn't matter with madvise(), but it does when removing a pte for other 
reasons, say swapping.  Don't we need to clear the spte with cmpxchg(), 
to make sure the dirty bit is what we think it is?



-- 
error compiling committee.c: too many arguments to function


-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: KVM: MMU: add KVM_ZAP_GFN ioctl
  2008-03-23  8:50       ` Avi Kivity
@ 2008-03-23 20:23         ` Andrea Arcangeli
  0 siblings, 0 replies; 10+ messages in thread
From: Andrea Arcangeli @ 2008-03-23 20:23 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, kvm-devel

On Sun, Mar 23, 2008 at 10:50:18AM +0200, Avi Kivity wrote:
> btw, when we nuke an spte, don't we lose dirty bit information?  That 
> doesn't matter with madvise(), but it does when removing a pte for other 
> reasons, say swapping.  Don't we need to clear the spte with cmpxchg(), to 
> make sure the dirty bit is what we think it is?

get_user_pages is always called with dirty=1, so we know PG_dirty will
be set on the page_t when the pte is cleared. The invalidate_page
method is called by the rmap code just after clearing the pte while
the page_t is locked, and while the page is locked PG_dirty shouldn't
disappear. So as long as we only map anonymous memory we should be
safe. (hugetlbfs wasn't allowed as guest physical memory yet when I
wrote that code)

But if we want to also call set_page_dirty and check the spte dirty
bit, that's sure safe addition to make it less dependent on mmu
notifier invocation details (notably PG_lock being set).

-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: KVM: MMU: add KVM_ZAP_GFN ioctl
  2008-03-21 21:23         ` Marcelo Tosatti
@ 2008-03-24  6:54           ` Andrea Arcangeli
  2008-03-26 12:31             ` Andrea Arcangeli
  0 siblings, 1 reply; 10+ messages in thread
From: Andrea Arcangeli @ 2008-03-24  6:54 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm-devel, Avi Kivity

On Fri, Mar 21, 2008 at 06:23:41PM -0300, Marcelo Tosatti wrote:
> If there are any active shadow mappings to a page there is a guarantee
> that there is a valid linux pte mapping pointing at it. So page_count ==
             ^^ was

> 1 + nr_sptes.

Yes.

> So the theoretical race you're talking about is:
> 
> 
>     CPU0							CPU1
> 
>     spte = rmap_next(kvm, rmapp, NULL);
>     while (spte) {
>             BUG_ON(!spte);
>             BUG_ON(!(*spte & PT_PRESENT_MASK));
>             rmap_printk("rmap_nuke: spte %p %llx\n", spte, *spte);
>             rmap_remove(kvm, spte);
>             set_shadow_pte(spte, shadow_trap_nonpresent_pte);
>             nuked = 1;
>             spte = rmap_next(kvm, rmapp, spte);
>     }
>     							<---------- try_to_unmap_one()
> 								    page is now free
> 								    page allocated for other
> 								    purposes
>    if (nuked)
>        kvm_flush_remote_tlbs(kvm);

I'd more accurately describe the race as this:


     CPU0							CPU1
 
     spte = rmap_next(kvm, rmapp, NULL);
     while (spte) {
             BUG_ON(!spte);
             BUG_ON(!(*spte & PT_PRESENT_MASK));
             rmap_printk("rmap_nuke: spte %p %llx\n", spte, *spte);
							            page_count = 2
     							<---------- try_to_unmap_one()
 								    page_count = 1
             rmap_remove(kvm, spte);   <- page is freed, race window opened
             set_shadow_pte(spte, shadow_trap_nonpresent_pte);
             nuked = 1;
             spte = rmap_next(kvm, rmapp, spte);
     }
    if (nuked)
        kvm_flush_remote_tlbs(kvm); <- race window closed

> And some other VCPU with the TLB cached writes to the now freed (and
> possibly allocated to another purpose) page.

The other VCPU could also read, not just write. If it reads the guest
will crash, if it writes the host will crash (host fs corruption
etc...) too.

> This case is safe because the path that frees a pte and subsequently
> a page will take care of flushing the TLB of any remote CPU's that
> possibly have it cached (before freeing the page, of course).
> ptep_clear_flush->flush_tlb_page.
> 
> Am I missing something?

flush_tlb_page only takes care of accesses to the page through qemu
task when outside vmx/svm mode. If the other physical cpu, is running
some code of some guest vcpu in vmx/svm mode after the race window is
open and before it is closed the race will trigger.

> > 2) not sure what you mean with synchronous, do you mean single
> >    threaded? I can't see how it can be single threaded (does
> >    ballooning stops all other vcpus?). 
> 
> No, I mean synchronous as in that no other vcpu will attempt to fault
> that _particular gfn_ in between KVM_ZAP_GFN and madvise.

The other vcpu can't fault any gfn, the other vcpu will fault only
_after_ the kvm_flush_remote_tlbs. Otherwise there would be no
problem.

The fix is exactly to make sure the other vcpu faults that particular
gfn before the page is in the freelist, and to do that the tlb flush
must happen _before_ rmap_remove calls put_page.

And madvise should happen before KVM_ZAP_GFN to prevent a gfn_to_page
in a parallel kvm page fault triggered by some other vcpu to
re-instantiate the spte after KVM_ZAP_GFN run.

> >  Why are you taking the mmu_lock around rmap_nuke if no other vcpu
> >  can take any page fault and call into get_user_pages in between
> >  KVM_ZAP_GFN and madvise?
> 
> Other vcpu's can take page faults and call into get_user_pages, but not
> for the gfn KVM_ZAP_GFN is operating on, because it has been allocated
> by the balloon driver.

With regard to the ordering of ioctl vs madvise, we better not trust
the guest. If the guest tries to access ram ranges that has been
inflated in the balloon qemu should behave like if the guest is
accessing ram outside the virtualized e820 map instead of leaking ram
to the buggy guest. There is no difference in the ordering, other than
doing madvise first is more robust in detecting buggy guest. I can't
see any benefit in doing the ioctl first. The memory corrupting race
window will happen regardless because the host VM rmap code is the one
releasing the page_count.

> So we need mmu_lock to protect against concurrent shadow page and rmap
> operations.

Ok.

> As detailed above, we have a guarantee that there is a live linux pte
> by the time rmap_remove() nukes a shadow pte.

I can't see how. I mean right now with current kvm.git, not with some
patch floating around. Which is the whole point of adding the
KVM_ZAP_GFN because the mainline folks didn't merge the mmu notifiers
in time for .25. My mmu notifier code already taken care of doing all
of this race free but I sure agree with this approach for the short
term, the less we have to wait the better.

Thanks!
Andrea

-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: KVM: MMU: add KVM_ZAP_GFN ioctl
  2008-03-24  6:54           ` Andrea Arcangeli
@ 2008-03-26 12:31             ` Andrea Arcangeli
  0 siblings, 0 replies; 10+ messages in thread
From: Andrea Arcangeli @ 2008-03-26 12:31 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm-devel, Avi Kivity

On Mon, Mar 24, 2008 at 07:54:27AM +0100, Andrea Arcangeli wrote:
> I'd more accurately describe the race as this:
> 
> 
>      CPU0							CPU1
>  
>      spte = rmap_next(kvm, rmapp, NULL);
>      while (spte) {
>              BUG_ON(!spte);
>              BUG_ON(!(*spte & PT_PRESENT_MASK));
>              rmap_printk("rmap_nuke: spte %p %llx\n", spte, *spte);
> 							            page_count = 2
>      							<---------- try_to_unmap_one()
>  								    page_count = 1
>              rmap_remove(kvm, spte);   <- page is freed, race window opened
>              set_shadow_pte(spte, shadow_trap_nonpresent_pte);
>              nuked = 1;
>              spte = rmap_next(kvm, rmapp, spte);
>      }
>     if (nuked)
>         kvm_flush_remote_tlbs(kvm); <- race window closed

As I suspected, it seems this race can happen even when sptes are
removed from the cache. The VM may move the page_count to swapcount
while rmap_remove runs. Nobody has ever reproduced it, but it has to
be fixed nevertheless.

My mmu notifier patch closed this race condition against the linux VM,
by doing the last free with the right ordering under the PG_lock, in
turn ensuring when the VM removes the page from anonymous memory, the
tlb of the secondary mmu is flushed before the final put_page.

We discussed with Avi to do the right ordering all the time in
rmap_remove so there's no dependency on the Linux VM locking when
calling mmu notifier methods, and to be more robust in general (so
unmap all sptes, queue all the pages to unpin in a global array inside
the kvm_arch, flush the tlb once, put_page all the pages in the
array). As Avi pointed out we can't use page->lru for the queue.

Then perhaps your KVM ioctl will just work safe.

Marcelo, if you're interested to fix this let us know, so we don't
duplicate effort, or Avi or me can do the fix on rmap_remove.

Thanks!
Andrea

-------------------------------------------------------------------------
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2008-03-26 12:31 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-03-12 22:29 KVM: MMU: add KVM_ZAP_GFN ioctl Marcelo Tosatti
2008-03-20 12:09 ` Avi Kivity
2008-03-21 11:42   ` Andrea Arcangeli
2008-03-21 13:37     ` Marcelo Tosatti
2008-03-21 15:56       ` Andrea Arcangeli
2008-03-21 21:23         ` Marcelo Tosatti
2008-03-24  6:54           ` Andrea Arcangeli
2008-03-26 12:31             ` Andrea Arcangeli
2008-03-23  8:50       ` Avi Kivity
2008-03-23 20:23         ` Andrea Arcangeli

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox