public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH 5/6] kvm, x86: use ro page and don't copy shared page
@ 2010-07-16  2:13 Lai Jiangshan
  2010-07-16  7:19 ` Gleb Natapov
  0 siblings, 1 reply; 10+ messages in thread
From: Lai Jiangshan @ 2010-07-16  2:13 UTC (permalink / raw)
  To: LKML, kvm, Avi Kivity, Marcelo Tosatti, Nick Piggin

When page fault, we always call get_user_pages(write=1).

Actually, we don't need to do this when it is not write fault.
get_user_pages(write=1) will cause shared page(ksm) copied.
If this page is not modified in the future, this copying and the copied page
are just wasted. Ksm may scan and merge them and may cause thrashing.

In this patch, if the page is RO for the host VMM and it is not a write fault for the guest,
we will use the RO page, otherwise we use a writable page.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
---
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 8ba9b0d..6382140 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1832,6 +1832,45 @@ static void kvm_unsync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
 	}
 }
 
+/* get a current mapped page fast, and test whether the page is writable. */
+static struct page *get_user_page_and_protection(unsigned long addr,
+	int *writable)
+{
+	struct page *page[1];
+
+	if (__get_user_pages_fast(addr, 1, 1, page) == 1) {
+		*writable = 1;
+		return page[0];
+	}
+	if (__get_user_pages_fast(addr, 1, 0, page) == 1) {
+		*writable = 0;
+		return page[0];
+	}
+	return NULL;
+}
+
+static pfn_t kvm_get_pfn_for_page_fault(struct kvm *kvm, gfn_t gfn,
+		int write_fault, int *host_writable)
+{
+	unsigned long addr;
+	struct page *page;
+
+	if (!write_fault) {
+		addr = gfn_to_hva(kvm, gfn);
+		if (kvm_is_error_hva(addr)) {
+			get_page(bad_page);
+			return page_to_pfn(bad_page);
+		}
+
+		page = get_user_page_and_protection(addr, host_writable);
+		if (page)
+			return page_to_pfn(page);
+	}
+
+	*host_writable = 1;
+	return kvm_get_pfn_for_gfn(kvm, gfn);
+}
+
 static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
 				  bool can_unsync)
 {
@@ -2085,6 +2124,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 	int level;
 	pfn_t pfn;
 	unsigned long mmu_seq;
+	int host_writable;
 
 	level = mapping_level(vcpu, gfn);
 
@@ -2099,7 +2139,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();
-	pfn = kvm_get_pfn_for_gfn(vcpu->kvm, gfn);
+	pfn = kvm_get_pfn_for_page_fault(vcpu->kvm, gfn, write, &host_writable);
 
 	/* mmio */
 	if (is_error_pfn(pfn))
@@ -2109,7 +2149,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 	if (mmu_notifier_retry(vcpu, mmu_seq))
 		goto out_unlock;
 	kvm_mmu_free_some_pages(vcpu);
-	r = __direct_map(vcpu, v, write, level, gfn, pfn, true);
+	r = __direct_map(vcpu, v, write, level, gfn, pfn, host_writable);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 
 
@@ -2307,6 +2347,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
 	int level;
 	gfn_t gfn = gpa >> PAGE_SHIFT;
 	unsigned long mmu_seq;
+	int write_fault = error_code & PFERR_WRITE_MASK;
+	int host_writable;
 
 	ASSERT(vcpu);
 	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
@@ -2321,15 +2363,16 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
 
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();
-	pfn = kvm_get_pfn_for_gfn(vcpu->kvm, gfn);
+	pfn = kvm_get_pfn_for_page_fault(vcpu->kvm, gfn, write_fault,
+			&host_writable);
 	if (is_error_pfn(pfn))
 		return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
 	spin_lock(&vcpu->kvm->mmu_lock);
 	if (mmu_notifier_retry(vcpu, mmu_seq))
 		goto out_unlock;
 	kvm_mmu_free_some_pages(vcpu);
-	r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
-			 level, gfn, pfn, true);
+	r = __direct_map(vcpu, gpa, write_fault,
+			 level, gfn, pfn, host_writable);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 
 	return r;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index a9dbaa0..1874f51 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -430,6 +430,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	pfn_t pfn;
 	int level = PT_PAGE_TABLE_LEVEL;
 	unsigned long mmu_seq;
+	int host_writable;
 
 	pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
 	kvm_mmu_audit(vcpu, "pre page fault");
@@ -461,7 +462,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();
-	pfn = kvm_get_pfn_for_gfn(vcpu->kvm, walker.gfn);
+	pfn = kvm_get_pfn_for_page_fault(vcpu->kvm, walker.gfn, write_fault,
+			&host_writable);
 
 	/* mmio */
 	if (is_error_pfn(pfn))
@@ -472,7 +474,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 		goto out_unlock;
 	kvm_mmu_free_some_pages(vcpu);
 	sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
-			     level, &write_pt, pfn, true);
+			     level, &write_pt, pfn, host_writable);
 	(void)sptep;
 	pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
 		 sptep, *sptep, write_pt);
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 738e659..a4ce19f 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -8,6 +8,7 @@
 #include <linux/mm.h>
 #include <linux/vmstat.h>
 #include <linux/highmem.h>
+#include <linux/module.h>
 
 #include <asm/pgtable.h>
 
@@ -274,6 +275,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
 
 	return nr;
 }
+EXPORT_SYMBOL_GPL(__get_user_pages_fast);
 
 /**
  * get_user_pages_fast() - pin user pages in memory

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH 5/6] kvm, x86: use ro page and don't copy shared page
  2010-07-16  2:13 [PATCH 5/6] kvm, x86: use ro page and don't copy shared page Lai Jiangshan
@ 2010-07-16  7:19 ` Gleb Natapov
  2010-07-16 23:26   ` Marcelo Tosatti
  2010-07-29  2:15   ` Lai Jiangshan
  0 siblings, 2 replies; 10+ messages in thread
From: Gleb Natapov @ 2010-07-16  7:19 UTC (permalink / raw)
  To: Lai Jiangshan; +Cc: LKML, kvm, Avi Kivity, Marcelo Tosatti, Nick Piggin

On Fri, Jul 16, 2010 at 10:13:07AM +0800, Lai Jiangshan wrote:
> When page fault, we always call get_user_pages(write=1).
> 
> Actually, we don't need to do this when it is not write fault.
> get_user_pages(write=1) will cause shared page(ksm) copied.
> If this page is not modified in future, this copying and the copied page
> are just wasted. Ksm may scan and merge them and may cause thrash.
> 
But if the page is written into afterwards we will get another page fault.

> In this patch, if the page is RO for host VMM and it not write fault for guest,
> we will use RO page, otherwise we use a writable page.
> 
Currently pages allocated for guest memory are required to be RW, so after your series
behaviour will remain exactly the same as before.

> Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
> ---
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 8ba9b0d..6382140 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -1832,6 +1832,45 @@ static void kvm_unsync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
>  	}
>  }
>  
> +/* get a current mapped page fast, and test whether the page is writable. */
> +static struct page *get_user_page_and_protection(unsigned long addr,
> +	int *writable)
> +{
> +	struct page *page[1];
> +
> +	if (__get_user_pages_fast(addr, 1, 1, page) == 1) {
> +		*writable = 1;
> +		return page[0];
> +	}
> +	if (__get_user_pages_fast(addr, 1, 0, page) == 1) {
> +		*writable = 0;
> +		return page[0];
> +	}
> +	return NULL;
> +}
> +
> +static pfn_t kvm_get_pfn_for_page_fault(struct kvm *kvm, gfn_t gfn,
> +		int write_fault, int *host_writable)
> +{
> +	unsigned long addr;
> +	struct page *page;
> +
> +	if (!write_fault) {
> +		addr = gfn_to_hva(kvm, gfn);
> +		if (kvm_is_error_hva(addr)) {
> +			get_page(bad_page);
> +			return page_to_pfn(bad_page);
> +		}
> +
> +		page = get_user_page_and_protection(addr, host_writable);
> +		if (page)
> +			return page_to_pfn(page);
> +	}
> +
> +	*host_writable = 1;
> +	return kvm_get_pfn_for_gfn(kvm, gfn);
> +}
> +
kvm_get_pfn_for_gfn() returns fault_page if page is mapped RO, so caller
of kvm_get_pfn_for_page_fault() and kvm_get_pfn_for_gfn() will get
different results when called on the same page. Not good.
kvm_get_pfn_for_page_fault() logic should be folded into
kvm_get_pfn_for_gfn().

>  static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
>  				  bool can_unsync)
>  {
> @@ -2085,6 +2124,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
>  	int level;
>  	pfn_t pfn;
>  	unsigned long mmu_seq;
> +	int host_writable;
>  
>  	level = mapping_level(vcpu, gfn);
>  
> @@ -2099,7 +2139,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
>  
>  	mmu_seq = vcpu->kvm->mmu_notifier_seq;
>  	smp_rmb();
> -	pfn = kvm_get_pfn_for_gfn(vcpu->kvm, gfn);
> +	pfn = kvm_get_pfn_for_page_fault(vcpu->kvm, gfn, write, &host_writable);
>  
>  	/* mmio */
>  	if (is_error_pfn(pfn))
> @@ -2109,7 +2149,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
>  	if (mmu_notifier_retry(vcpu, mmu_seq))
>  		goto out_unlock;
>  	kvm_mmu_free_some_pages(vcpu);
> -	r = __direct_map(vcpu, v, write, level, gfn, pfn, true);
> +	r = __direct_map(vcpu, v, write, level, gfn, pfn, host_writable);
>  	spin_unlock(&vcpu->kvm->mmu_lock);
>  
>  
> @@ -2307,6 +2347,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
>  	int level;
>  	gfn_t gfn = gpa >> PAGE_SHIFT;
>  	unsigned long mmu_seq;
> +	int write_fault = error_code & PFERR_WRITE_MASK;
> +	int host_writable;
>  
>  	ASSERT(vcpu);
>  	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
> @@ -2321,15 +2363,16 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
>  
>  	mmu_seq = vcpu->kvm->mmu_notifier_seq;
>  	smp_rmb();
> -	pfn = kvm_get_pfn_for_gfn(vcpu->kvm, gfn);
> +	pfn = kvm_get_pfn_for_page_fault(vcpu->kvm, gfn, write_fault,
> +			&host_writable);
>  	if (is_error_pfn(pfn))
>  		return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
>  	spin_lock(&vcpu->kvm->mmu_lock);
>  	if (mmu_notifier_retry(vcpu, mmu_seq))
>  		goto out_unlock;
>  	kvm_mmu_free_some_pages(vcpu);
> -	r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
> -			 level, gfn, pfn, true);
> +	r = __direct_map(vcpu, gpa, write_fault,
> +			 level, gfn, pfn, host_writable);
>  	spin_unlock(&vcpu->kvm->mmu_lock);
>  
>  	return r;
> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> index a9dbaa0..1874f51 100644
> --- a/arch/x86/kvm/paging_tmpl.h
> +++ b/arch/x86/kvm/paging_tmpl.h
> @@ -430,6 +430,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
>  	pfn_t pfn;
>  	int level = PT_PAGE_TABLE_LEVEL;
>  	unsigned long mmu_seq;
> +	int host_writable;
>  
>  	pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
>  	kvm_mmu_audit(vcpu, "pre page fault");
> @@ -461,7 +462,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
>  
>  	mmu_seq = vcpu->kvm->mmu_notifier_seq;
>  	smp_rmb();
> -	pfn = kvm_get_pfn_for_gfn(vcpu->kvm, walker.gfn);
> +	pfn = kvm_get_pfn_for_page_fault(vcpu->kvm, walker.gfn, write_fault,
> +			&host_writable);
>  
>  	/* mmio */
>  	if (is_error_pfn(pfn))
> @@ -472,7 +474,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
>  		goto out_unlock;
>  	kvm_mmu_free_some_pages(vcpu);
>  	sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
> -			     level, &write_pt, pfn, true);
> +			     level, &write_pt, pfn, host_writable);
>  	(void)sptep;
>  	pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
>  		 sptep, *sptep, write_pt);
> diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
> index 738e659..a4ce19f 100644
> --- a/arch/x86/mm/gup.c
> +++ b/arch/x86/mm/gup.c
> @@ -8,6 +8,7 @@
>  #include <linux/mm.h>
>  #include <linux/vmstat.h>
>  #include <linux/highmem.h>
> +#include <linux/module.h>
>  
>  #include <asm/pgtable.h>
>  
> @@ -274,6 +275,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
>  
>  	return nr;
>  }
> +EXPORT_SYMBOL_GPL(__get_user_pages_fast);
>  
>  /**
>   * get_user_pages_fast() - pin user pages in memory
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
			Gleb.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 5/6] kvm, x86: use ro page and don't copy shared page
  2010-07-16  7:19 ` Gleb Natapov
@ 2010-07-16 23:26   ` Marcelo Tosatti
  2010-07-17  4:31     ` Gleb Natapov
  2010-07-29  2:19     ` Lai Jiangshan
  2010-07-29  2:15   ` Lai Jiangshan
  1 sibling, 2 replies; 10+ messages in thread
From: Marcelo Tosatti @ 2010-07-16 23:26 UTC (permalink / raw)
  To: Gleb Natapov; +Cc: Lai Jiangshan, LKML, kvm, Avi Kivity, Nick Piggin

On Fri, Jul 16, 2010 at 10:19:36AM +0300, Gleb Natapov wrote:
> On Fri, Jul 16, 2010 at 10:13:07AM +0800, Lai Jiangshan wrote:
> > When page fault, we always call get_user_pages(write=1).
> > 
> > Actually, we don't need to do this when it is not write fault.
> > get_user_pages(write=1) will cause shared page(ksm) copied.
> > If this page is not modified in future, this copying and the copied page
> > are just wasted. Ksm may scan and merge them and may cause thrash.
> > 
> But is page is written into afterwords we will get another page fault.
> 
> > In this patch, if the page is RO for host VMM and it not write fault for guest,
> > we will use RO page, otherwise we use a writable page.
> > 
> Currently pages allocated for guest memory are required to be RW, so after your series
> behaviour will remain exactly the same as before.

Except KSM pages.

> > Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
> > ---
> > diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> > index 8ba9b0d..6382140 100644
> > --- a/arch/x86/kvm/mmu.c
> > +++ b/arch/x86/kvm/mmu.c
> > @@ -1832,6 +1832,45 @@ static void kvm_unsync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
> >  	}
> >  }
> >  
> > +/* get a current mapped page fast, and test whether the page is writable. */
> > +static struct page *get_user_page_and_protection(unsigned long addr,
> > +	int *writable)
> > +{
> > +	struct page *page[1];
> > +
> > +	if (__get_user_pages_fast(addr, 1, 1, page) == 1) {
> > +		*writable = 1;
> > +		return page[0];
> > +	}
> > +	if (__get_user_pages_fast(addr, 1, 0, page) == 1) {
> > +		*writable = 0;
> > +		return page[0];
> > +	}
> > +	return NULL;
> > +}
> > +
> > +static pfn_t kvm_get_pfn_for_page_fault(struct kvm *kvm, gfn_t gfn,
> > +		int write_fault, int *host_writable)
> > +{
> > +	unsigned long addr;
> > +	struct page *page;
> > +
> > +	if (!write_fault) {
> > +		addr = gfn_to_hva(kvm, gfn);
> > +		if (kvm_is_error_hva(addr)) {
> > +			get_page(bad_page);
> > +			return page_to_pfn(bad_page);
> > +		}
> > +
> > +		page = get_user_page_and_protection(addr, host_writable);
> > +		if (page)
> > +			return page_to_pfn(page);
> > +	}
> > +
> > +	*host_writable = 1;
> > +	return kvm_get_pfn_for_gfn(kvm, gfn);
> > +}
> > +
> kvm_get_pfn_for_gfn() returns fault_page if page is mapped RO, so caller
> of kvm_get_pfn_for_page_fault() and kvm_get_pfn_for_gfn() will get
> different results when called on the same page. Not good.
> kvm_get_pfn_for_page_fault() logic should be folded into
> kvm_get_pfn_for_gfn().

Agreed. Please keep gfn_to_pfn related code in virt/kvm/kvm_main.c.


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 5/6] kvm, x86: use ro page and don't copy shared page
  2010-07-16 23:26   ` Marcelo Tosatti
@ 2010-07-17  4:31     ` Gleb Natapov
  2010-07-18 15:14       ` Avi Kivity
  2010-07-29  2:19     ` Lai Jiangshan
  1 sibling, 1 reply; 10+ messages in thread
From: Gleb Natapov @ 2010-07-17  4:31 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Lai Jiangshan, LKML, kvm, Avi Kivity, Nick Piggin

On Fri, Jul 16, 2010 at 08:26:12PM -0300, Marcelo Tosatti wrote:
> On Fri, Jul 16, 2010 at 10:19:36AM +0300, Gleb Natapov wrote:
> > On Fri, Jul 16, 2010 at 10:13:07AM +0800, Lai Jiangshan wrote:
> > > When page fault, we always call get_user_pages(write=1).
> > > 
> > > Actually, we don't need to do this when it is not write fault.
> > > get_user_pages(write=1) will cause shared page(ksm) copied.
> > > If this page is not modified in future, this copying and the copied page
> > > are just wasted. Ksm may scan and merge them and may cause thrash.
> > > 
> > But is page is written into afterwords we will get another page fault.
> > 
> > > In this patch, if the page is RO for host VMM and it not write fault for guest,
> > > we will use RO page, otherwise we use a writable page.
> > > 
> > Currently pages allocated for guest memory are required to be RW, so after your series
> > behaviour will remain exactly the same as before.
> 
> Except KSM pages.
> 
KSM page will be COWed by __get_user_pages_fast(addr, 1, 1, page) in
get_user_page_and_protection() just like it COWed now, no?
 
> > > Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
> > > ---
> > > diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> > > index 8ba9b0d..6382140 100644
> > > --- a/arch/x86/kvm/mmu.c
> > > +++ b/arch/x86/kvm/mmu.c
> > > @@ -1832,6 +1832,45 @@ static void kvm_unsync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
> > >  	}
> > >  }
> > >  
> > > +/* get a current mapped page fast, and test whether the page is writable. */
> > > +static struct page *get_user_page_and_protection(unsigned long addr,
> > > +	int *writable)
> > > +{
> > > +	struct page *page[1];
> > > +
> > > +	if (__get_user_pages_fast(addr, 1, 1, page) == 1) {
> > > +		*writable = 1;
> > > +		return page[0];
> > > +	}
> > > +	if (__get_user_pages_fast(addr, 1, 0, page) == 1) {
> > > +		*writable = 0;
> > > +		return page[0];
> > > +	}
> > > +	return NULL;
> > > +}
> > > +
> > > +static pfn_t kvm_get_pfn_for_page_fault(struct kvm *kvm, gfn_t gfn,
> > > +		int write_fault, int *host_writable)
> > > +{
> > > +	unsigned long addr;
> > > +	struct page *page;
> > > +
> > > +	if (!write_fault) {
> > > +		addr = gfn_to_hva(kvm, gfn);
> > > +		if (kvm_is_error_hva(addr)) {
> > > +			get_page(bad_page);
> > > +			return page_to_pfn(bad_page);
> > > +		}
> > > +
> > > +		page = get_user_page_and_protection(addr, host_writable);
> > > +		if (page)
> > > +			return page_to_pfn(page);
> > > +	}
> > > +
> > > +	*host_writable = 1;
> > > +	return kvm_get_pfn_for_gfn(kvm, gfn);
> > > +}
> > > +
> > kvm_get_pfn_for_gfn() returns fault_page if page is mapped RO, so caller
> > of kvm_get_pfn_for_page_fault() and kvm_get_pfn_for_gfn() will get
> > different results when called on the same page. Not good.
> > kvm_get_pfn_for_page_fault() logic should be folded into
> > kvm_get_pfn_for_gfn().
> 
> Agreed. Please keep gfn_to_pfn related code in virt/kvm/kvm_main.c.

--
			Gleb.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 5/6] kvm, x86: use ro page and don't copy shared page
  2010-07-17  4:31     ` Gleb Natapov
@ 2010-07-18 15:14       ` Avi Kivity
  2010-07-18 15:23         ` Gleb Natapov
  0 siblings, 1 reply; 10+ messages in thread
From: Avi Kivity @ 2010-07-18 15:14 UTC (permalink / raw)
  To: Gleb Natapov; +Cc: Marcelo Tosatti, Lai Jiangshan, LKML, kvm, Nick Piggin

On 07/17/2010 07:31 AM, Gleb Natapov wrote:
>>>
>>> Currently pages allocated for guest memory are required to be RW, so after your series
>>> behaviour will remain exactly the same as before.
>>>        
>> Except KSM pages.
>>
>>      
> KSM page will be COWed by __get_user_pages_fast(addr, 1, 1, page) in
> get_user_page_and_protection() just like it COWed now, no?
>    

Well, we don't want to COW it on write faults.

The optimal behaviour is:

- write faults: COW and instantiate a writeable spte
- read faults: instantiate a readable spte; if likely(page is 
writeable), make it a writeable spte; if likely(page is dirty) make it a 
dirty spte
- speculative spte instantiations: if likely(page is present) 
instantiate a pte; if accessed, mark it accessed, if writeable, mark it 
writeable; if dirty, mark it dirty

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 5/6] kvm, x86: use ro page and don't copy shared page
  2010-07-18 15:14       ` Avi Kivity
@ 2010-07-18 15:23         ` Gleb Natapov
  2010-07-18 15:31           ` Avi Kivity
  0 siblings, 1 reply; 10+ messages in thread
From: Gleb Natapov @ 2010-07-18 15:23 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, Lai Jiangshan, LKML, kvm, Nick Piggin

On Sun, Jul 18, 2010 at 06:14:11PM +0300, Avi Kivity wrote:
> On 07/17/2010 07:31 AM, Gleb Natapov wrote:
> >>>
> >>>Currently pages allocated for guest memory are required to be RW, so after your series
> >>>behaviour will remain exactly the same as before.
> >>Except KSM pages.
> >>
> >KSM page will be COWed by __get_user_pages_fast(addr, 1, 1, page) in
> >get_user_page_and_protection() just like it COWed now, no?
> 
> Well, we don't want to COW it on write faults.
> 
> The optimal behaviour is:
> 
> - write faults: COW and instantiate a writeable spte
So do we or don't we want to COW on write faults?

--
			Gleb.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 5/6] kvm, x86: use ro page and don't copy shared page
  2010-07-18 15:23         ` Gleb Natapov
@ 2010-07-18 15:31           ` Avi Kivity
  0 siblings, 0 replies; 10+ messages in thread
From: Avi Kivity @ 2010-07-18 15:31 UTC (permalink / raw)
  To: Gleb Natapov; +Cc: Marcelo Tosatti, Lai Jiangshan, LKML, kvm, Nick Piggin

On 07/18/2010 06:23 PM, Gleb Natapov wrote:
> On Sun, Jul 18, 2010 at 06:14:11PM +0300, Avi Kivity wrote:
>    
>> On 07/17/2010 07:31 AM, Gleb Natapov wrote:
>>      
>>>>> Currently pages allocated for guest memory are required to be RW, so after your series
>>>>> behaviour will remain exactly the same as before.
>>>>>            
>>>> Except KSM pages.
>>>>
>>>>          
>>> KSM page will be COWed by __get_user_pages_fast(addr, 1, 1, page) in
>>> get_user_page_and_protection() just like it COWed now, no?
>>>        
>> Well, we don't want to COW it on write faults.
>>      

I meant read faults here.

>> The optimal behaviour is:
>>
>> - write faults: COW and instantiate a writeable spte
>>      
> So do we or don't we want to COW on write faults?
>    

We do (no choice).

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 5/6] kvm, x86: use ro page and don't copy shared page
  2010-07-16  7:19 ` Gleb Natapov
  2010-07-16 23:26   ` Marcelo Tosatti
@ 2010-07-29  2:15   ` Lai Jiangshan
  2010-07-29  5:56     ` Gleb Natapov
  1 sibling, 1 reply; 10+ messages in thread
From: Lai Jiangshan @ 2010-07-29  2:15 UTC (permalink / raw)
  To: Gleb Natapov; +Cc: LKML, kvm, Avi Kivity, Marcelo Tosatti, Nick Piggin

On 07/16/2010 03:19 PM, Gleb Natapov wrote:

>> +/* get a current mapped page fast, and test whether the page is writable. */
>> +static struct page *get_user_page_and_protection(unsigned long addr,
>> +	int *writable)
>> +{
>> +	struct page *page[1];
>> +
>> +	if (__get_user_pages_fast(addr, 1, 1, page) == 1) {
>> +		*writable = 1;
>> +		return page[0];
>> +	}
>> +	if (__get_user_pages_fast(addr, 1, 0, page) == 1) {
>> +		*writable = 0;
>> +		return page[0];
>> +	}
>> +	return NULL;
>> +}
>> +
>> +static pfn_t kvm_get_pfn_for_page_fault(struct kvm *kvm, gfn_t gfn,
>> +		int write_fault, int *host_writable)
>> +{
>> +	unsigned long addr;
>> +	struct page *page;
>> +
>> +	if (!write_fault) {
>> +		addr = gfn_to_hva(kvm, gfn);
>> +		if (kvm_is_error_hva(addr)) {
>> +			get_page(bad_page);
>> +			return page_to_pfn(bad_page);
>> +		}
>> +
>> +		page = get_user_page_and_protection(addr, host_writable);
>> +		if (page)
>> +			return page_to_pfn(page);
>> +	}
>> +
>> +	*host_writable = 1;
>> +	return kvm_get_pfn_for_gfn(kvm, gfn);
>> +}
>> +
> kvm_get_pfn_for_gfn() returns fault_page if page is mapped RO, so caller
> of kvm_get_pfn_for_page_fault() and kvm_get_pfn_for_gfn() will get
> different results when called on the same page. Not good.
> kvm_get_pfn_for_page_fault() logic should be folded into
> kvm_get_pfn_for_gfn().
> 


The different results are exactly what we need.
We don't want to copy-on-write a page which is mapped RO when
it is only a read fault.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 5/6] kvm, x86: use ro page and don't copy shared page
  2010-07-16 23:26   ` Marcelo Tosatti
  2010-07-17  4:31     ` Gleb Natapov
@ 2010-07-29  2:19     ` Lai Jiangshan
  1 sibling, 0 replies; 10+ messages in thread
From: Lai Jiangshan @ 2010-07-29  2:19 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Gleb Natapov, LKML, kvm, Avi Kivity, Nick Piggin

On 07/17/2010 07:26 AM, Marcelo Tosatti wrote:
> On Fri, Jul 16, 2010 at 10:19:36AM +0300, Gleb Natapov wrote:
>> On Fri, Jul 16, 2010 at 10:13:07AM +0800, Lai Jiangshan wrote:
>>> When page fault, we always call get_user_pages(write=1).
>>>
>>> Actually, we don't need to do this when it is not write fault.
>>> get_user_pages(write=1) will cause shared page(ksm) copied.
>>> If this page is not modified in future, this copying and the copied page
>>> are just wasted. Ksm may scan and merge them and may cause thrash.
>>>
>> But is page is written into afterwords we will get another page fault.
>>
>>> In this patch, if the page is RO for host VMM and it not write fault for guest,
>>> we will use RO page, otherwise we use a writable page.
>>>
>> Currently pages allocated for guest memory are required to be RW, so after your series
>> behaviour will remain exactly the same as before.
> 
> Except KSM pages.
> 
>>> Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
>>> ---
>>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
>>> index 8ba9b0d..6382140 100644
>>> --- a/arch/x86/kvm/mmu.c
>>> +++ b/arch/x86/kvm/mmu.c
>>> @@ -1832,6 +1832,45 @@ static void kvm_unsync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
>>>  	}
>>>  }
>>>  
>>> +/* get a current mapped page fast, and test whether the page is writable. */
>>> +static struct page *get_user_page_and_protection(unsigned long addr,
>>> +	int *writable)
>>> +{
>>> +	struct page *page[1];
>>> +
>>> +	if (__get_user_pages_fast(addr, 1, 1, page) == 1) {
>>> +		*writable = 1;
>>> +		return page[0];
>>> +	}
>>> +	if (__get_user_pages_fast(addr, 1, 0, page) == 1) {
>>> +		*writable = 0;
>>> +		return page[0];
>>> +	}
>>> +	return NULL;
>>> +}
>>> +
>>> +static pfn_t kvm_get_pfn_for_page_fault(struct kvm *kvm, gfn_t gfn,
>>> +		int write_fault, int *host_writable)
>>> +{
>>> +	unsigned long addr;
>>> +	struct page *page;
>>> +
>>> +	if (!write_fault) {
>>> +		addr = gfn_to_hva(kvm, gfn);
>>> +		if (kvm_is_error_hva(addr)) {
>>> +			get_page(bad_page);
>>> +			return page_to_pfn(bad_page);
>>> +		}
>>> +
>>> +		page = get_user_page_and_protection(addr, host_writable);
>>> +		if (page)
>>> +			return page_to_pfn(page);
>>> +	}
>>> +
>>> +	*host_writable = 1;
>>> +	return kvm_get_pfn_for_gfn(kvm, gfn);
>>> +}
>>> +
>> kvm_get_pfn_for_gfn() returns fault_page if page is mapped RO, so caller
>> of kvm_get_pfn_for_page_fault() and kvm_get_pfn_for_gfn() will get
>> different results when called on the same page. Not good.
>> kvm_get_pfn_for_page_fault() logic should be folded into
>> kvm_get_pfn_for_gfn().
> 
> Agreed. Please keep gfn_to_pfn related code in virt/kvm/kvm_main.c.
> 
> 

Pass write_fault parameter to kvm_get_pfn_for_gfn()?
But only X86 use this parameter currently, I think it is OK to
keep these code in arch/x86/kvm/mmu.c

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 5/6] kvm, x86: use ro page and don't copy shared page
  2010-07-29  2:15   ` Lai Jiangshan
@ 2010-07-29  5:56     ` Gleb Natapov
  0 siblings, 0 replies; 10+ messages in thread
From: Gleb Natapov @ 2010-07-29  5:56 UTC (permalink / raw)
  To: Lai Jiangshan; +Cc: LKML, kvm, Avi Kivity, Marcelo Tosatti, Nick Piggin

On Thu, Jul 29, 2010 at 10:15:22AM +0800, Lai Jiangshan wrote:
> On 07/16/2010 03:19 PM, Gleb Natapov wrote:
> 
> >> +/* get a current mapped page fast, and test whether the page is writable. */
> >> +static struct page *get_user_page_and_protection(unsigned long addr,
> >> +	int *writable)
> >> +{
> >> +	struct page *page[1];
> >> +
> >> +	if (__get_user_pages_fast(addr, 1, 1, page) == 1) {
> >> +		*writable = 1;
> >> +		return page[0];
> >> +	}
> >> +	if (__get_user_pages_fast(addr, 1, 0, page) == 1) {
> >> +		*writable = 0;
> >> +		return page[0];
> >> +	}
> >> +	return NULL;
> >> +}
> >> +
> >> +static pfn_t kvm_get_pfn_for_page_fault(struct kvm *kvm, gfn_t gfn,
> >> +		int write_fault, int *host_writable)
> >> +{
> >> +	unsigned long addr;
> >> +	struct page *page;
> >> +
> >> +	if (!write_fault) {
> >> +		addr = gfn_to_hva(kvm, gfn);
> >> +		if (kvm_is_error_hva(addr)) {
> >> +			get_page(bad_page);
> >> +			return page_to_pfn(bad_page);
> >> +		}
> >> +
> >> +		page = get_user_page_and_protection(addr, host_writable);
> >> +		if (page)
> >> +			return page_to_pfn(page);
> >> +	}
> >> +
> >> +	*host_writable = 1;
> >> +	return kvm_get_pfn_for_gfn(kvm, gfn);
> >> +}
> >> +
> > kvm_get_pfn_for_gfn() returns fault_page if page is mapped RO, so caller
> > of kvm_get_pfn_for_page_fault() and kvm_get_pfn_for_gfn() will get
> > different results when called on the same page. Not good.
> > kvm_get_pfn_for_page_fault() logic should be folded into
> > kvm_get_pfn_for_gfn().
> > 
> 
> 
> The different results are the things we just need.
How so? Users of kvm_get_pfn_for_gfn() will think that the page is invalid
and may report misconfiguration to userspace, while users of
kvm_get_pfn_for_page_fault() will think that the access to the page is OK.
There are not many users of kvm_get_pfn_for_gfn(), and maybe your patch
replaces all of them with kvm_get_pfn_for_page_fault(), but this just
strengthens the point that they should be merged.

> We don't want to copy and write a page which is mapped RO when
> only read fault.
I don't see how returning inconsistent results helps us achieving that.

BTW, since kvm_get_pfn_for_gfn() will never map an RO page,
get_user_page_and_protection() will never find any RO pages. Looks like
kvm_get_pfn_for_page_fault() is equivalent to kvm_get_pfn_for_gfn(),
since the !write_fault section will at best find a mapped RW page.

--
			Gleb.

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2010-07-29  5:56 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-07-16  2:13 [PATCH 5/6] kvm, x86: use ro page and don't copy shared page Lai Jiangshan
2010-07-16  7:19 ` Gleb Natapov
2010-07-16 23:26   ` Marcelo Tosatti
2010-07-17  4:31     ` Gleb Natapov
2010-07-18 15:14       ` Avi Kivity
2010-07-18 15:23         ` Gleb Natapov
2010-07-18 15:31           ` Avi Kivity
2010-07-29  2:19     ` Lai Jiangshan
2010-07-29  2:15   ` Lai Jiangshan
2010-07-29  5:56     ` Gleb Natapov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox