* [patch 01/10] KVM: MMU: split mmu_set_spte
2008-09-18 21:27 [patch 00/10] out of sync shadow v2 Marcelo Tosatti
@ 2008-09-18 21:27 ` Marcelo Tosatti
2008-09-18 21:27 ` [patch 02/10] KVM: MMU: move local TLB flush to mmu_set_spte Marcelo Tosatti
` (9 subsequent siblings)
10 siblings, 0 replies; 34+ messages in thread
From: Marcelo Tosatti @ 2008-09-18 21:27 UTC (permalink / raw)
To: Avi Kivity; +Cc: kvm, David S. Ahern, Marcelo Tosatti
[-- Attachment #1: mmu-set-spte --]
[-- Type: text/plain, Size: 4394 bytes --]
Split the spte entry creation code into a new set_spte function.
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Index: kvm/arch/x86/kvm/mmu.c
===================================================================
--- kvm.orig/arch/x86/kvm/mmu.c
+++ kvm/arch/x86/kvm/mmu.c
@@ -1148,44 +1148,13 @@ struct page *gva_to_page(struct kvm_vcpu
return page;
}
-static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
- unsigned pt_access, unsigned pte_access,
- int user_fault, int write_fault, int dirty,
- int *ptwrite, int largepage, gfn_t gfn,
- pfn_t pfn, bool speculative)
+static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
+ unsigned pte_access, int user_fault,
+ int write_fault, int dirty, int largepage,
+ gfn_t gfn, pfn_t pfn, bool speculative)
{
u64 spte;
- int was_rmapped = 0;
- int was_writeble = is_writeble_pte(*shadow_pte);
-
- pgprintk("%s: spte %llx access %x write_fault %d"
- " user_fault %d gfn %lx\n",
- __func__, *shadow_pte, pt_access,
- write_fault, user_fault, gfn);
-
- if (is_rmap_pte(*shadow_pte)) {
- /*
- * If we overwrite a PTE page pointer with a 2MB PMD, unlink
- * the parent of the now unreachable PTE.
- */
- if (largepage && !is_large_pte(*shadow_pte)) {
- struct kvm_mmu_page *child;
- u64 pte = *shadow_pte;
-
- child = page_header(pte & PT64_BASE_ADDR_MASK);
- mmu_page_remove_parent_pte(child, shadow_pte);
- } else if (pfn != spte_to_pfn(*shadow_pte)) {
- pgprintk("hfn old %lx new %lx\n",
- spte_to_pfn(*shadow_pte), pfn);
- rmap_remove(vcpu->kvm, shadow_pte);
- } else {
- if (largepage)
- was_rmapped = is_large_pte(*shadow_pte);
- else
- was_rmapped = 1;
- }
- }
-
+ int ret = 0;
/*
* We don't set the accessed bit, since we sometimes want to see
* whether the guest actually used the pte (in order to detect
@@ -1218,26 +1187,70 @@ static void mmu_set_spte(struct kvm_vcpu
(largepage && has_wrprotected_page(vcpu->kvm, gfn))) {
pgprintk("%s: found shadow page for %lx, marking ro\n",
__func__, gfn);
+ ret = 1;
pte_access &= ~ACC_WRITE_MASK;
if (is_writeble_pte(spte)) {
spte &= ~PT_WRITABLE_MASK;
kvm_x86_ops->tlb_flush(vcpu);
}
- if (write_fault)
- *ptwrite = 1;
}
}
if (pte_access & ACC_WRITE_MASK)
mark_page_dirty(vcpu->kvm, gfn);
- pgprintk("%s: setting spte %llx\n", __func__, spte);
- pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
- (spte&PT_PAGE_SIZE_MASK)? "2MB" : "4kB",
- (spte&PT_WRITABLE_MASK)?"RW":"R", gfn, spte, shadow_pte);
set_shadow_pte(shadow_pte, spte);
- if (!was_rmapped && (spte & PT_PAGE_SIZE_MASK)
- && (spte & PT_PRESENT_MASK))
+ return ret;
+}
+
+
+static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
+ unsigned pt_access, unsigned pte_access,
+ int user_fault, int write_fault, int dirty,
+ int *ptwrite, int largepage, gfn_t gfn,
+ pfn_t pfn, bool speculative)
+{
+ int was_rmapped = 0;
+ int was_writeble = is_writeble_pte(*shadow_pte);
+
+ pgprintk("%s: spte %llx access %x write_fault %d"
+ " user_fault %d gfn %lx\n",
+ __func__, *shadow_pte, pt_access,
+ write_fault, user_fault, gfn);
+
+ if (is_rmap_pte(*shadow_pte)) {
+ /*
+ * If we overwrite a PTE page pointer with a 2MB PMD, unlink
+ * the parent of the now unreachable PTE.
+ */
+ if (largepage && !is_large_pte(*shadow_pte)) {
+ struct kvm_mmu_page *child;
+ u64 pte = *shadow_pte;
+
+ child = page_header(pte & PT64_BASE_ADDR_MASK);
+ mmu_page_remove_parent_pte(child, shadow_pte);
+ } else if (pfn != spte_to_pfn(*shadow_pte)) {
+ pgprintk("hfn old %lx new %lx\n",
+ spte_to_pfn(*shadow_pte), pfn);
+ rmap_remove(vcpu->kvm, shadow_pte);
+ } else {
+ if (largepage)
+ was_rmapped = is_large_pte(*shadow_pte);
+ else
+ was_rmapped = 1;
+ }
+ }
+ if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault,
+ dirty, largepage, gfn, pfn, speculative))
+ if (write_fault)
+ *ptwrite = 1;
+
+ pgprintk("%s: setting spte %llx\n", __func__, *shadow_pte);
+ pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
+ is_large_pte(*shadow_pte)? "2MB" : "4kB",
+ is_present_pte(*shadow_pte)?"RW":"R", gfn,
+ *shadow_pte, shadow_pte);
+ if (!was_rmapped && is_large_pte(*shadow_pte))
++vcpu->kvm->stat.lpages;
page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
--
^ permalink raw reply [flat|nested] 34+ messages in thread* [patch 02/10] KVM: MMU: move local TLB flush to mmu_set_spte
2008-09-18 21:27 [patch 00/10] out of sync shadow v2 Marcelo Tosatti
2008-09-18 21:27 ` [patch 01/10] KVM: MMU: split mmu_set_spte Marcelo Tosatti
@ 2008-09-18 21:27 ` Marcelo Tosatti
2008-09-20 0:21 ` Avi Kivity
2008-09-18 21:27 ` [patch 03/10] KVM: MMU: do not write-protect large mappings Marcelo Tosatti
` (8 subsequent siblings)
10 siblings, 1 reply; 34+ messages in thread
From: Marcelo Tosatti @ 2008-09-18 21:27 UTC (permalink / raw)
To: Avi Kivity; +Cc: kvm, David S. Ahern, Marcelo Tosatti
[-- Attachment #1: mmu-set-spte-tlb-flush --]
[-- Type: text/plain, Size: 1069 bytes --]
Since the sync page path can collapse flushes.
Also only flush if the spte was writable before.
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Index: kvm/arch/x86/kvm/mmu.c
===================================================================
--- kvm.orig/arch/x86/kvm/mmu.c
+++ kvm/arch/x86/kvm/mmu.c
@@ -1189,10 +1189,8 @@ static int set_spte(struct kvm_vcpu *vcp
__func__, gfn);
ret = 1;
pte_access &= ~ACC_WRITE_MASK;
- if (is_writeble_pte(spte)) {
+ if (is_writeble_pte(spte))
spte &= ~PT_WRITABLE_MASK;
- kvm_x86_ops->tlb_flush(vcpu);
- }
}
}
@@ -1241,9 +1239,12 @@ static void mmu_set_spte(struct kvm_vcpu
}
}
if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault,
- dirty, largepage, gfn, pfn, speculative))
+ dirty, largepage, gfn, pfn, speculative)) {
if (write_fault)
*ptwrite = 1;
+ if (was_writeble)
+ kvm_x86_ops->tlb_flush(vcpu);
+ }
pgprintk("%s: setting spte %llx\n", __func__, *shadow_pte);
pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
--
^ permalink raw reply [flat|nested] 34+ messages in thread* Re: [patch 02/10] KVM: MMU: move local TLB flush to mmu_set_spte
2008-09-18 21:27 ` [patch 02/10] KVM: MMU: move local TLB flush to mmu_set_spte Marcelo Tosatti
@ 2008-09-20 0:21 ` Avi Kivity
0 siblings, 0 replies; 34+ messages in thread
From: Avi Kivity @ 2008-09-20 0:21 UTC (permalink / raw)
To: Marcelo Tosatti; +Cc: Avi Kivity, kvm, David S. Ahern
Marcelo Tosatti wrote:
> Since the sync page path can collapse flushes.
>
> Also only flush if the spte was writable before.
>
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
>
> @@ -1241,9 +1239,12 @@ static void mmu_set_spte(struct kvm_vcpu
> }
> }
> if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault,
> - dirty, largepage, gfn, pfn, speculative))
> + dirty, largepage, gfn, pfn, speculative)) {
> if (write_fault)
> *ptwrite = 1;
> + if (was_writeble)
> + kvm_x86_ops->tlb_flush(vcpu);
> + }
>
>
I think we had cases where the spte.pfn contents changed, for example
when a large page was replaced by a normal page, and also:
} else if (pfn != spte_to_pfn(*shadow_pte)) {
--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.
^ permalink raw reply [flat|nested] 34+ messages in thread
* [patch 03/10] KVM: MMU: do not write-protect large mappings
2008-09-18 21:27 [patch 00/10] out of sync shadow v2 Marcelo Tosatti
2008-09-18 21:27 ` [patch 01/10] KVM: MMU: split mmu_set_spte Marcelo Tosatti
2008-09-18 21:27 ` [patch 02/10] KVM: MMU: move local TLB flush to mmu_set_spte Marcelo Tosatti
@ 2008-09-18 21:27 ` Marcelo Tosatti
2008-09-20 0:29 ` Avi Kivity
2008-09-18 21:27 ` [patch 04/10] KVM: MMU: mode specific sync_page Marcelo Tosatti
` (7 subsequent siblings)
10 siblings, 1 reply; 34+ messages in thread
From: Marcelo Tosatti @ 2008-09-18 21:27 UTC (permalink / raw)
To: Avi Kivity; +Cc: kvm, David S. Ahern, Marcelo Tosatti
[-- Attachment #1: no-wrprotect-large-parge --]
[-- Type: text/plain, Size: 1897 bytes --]
There is not much point in write protecting large mappings. This
can only happen when a page is shadowed during the window between
is_largepage_backed and mmu_lock acquisition. Zap the entry instead, so
the next pagefault will find a shadowed page via is_largepage_backed and
fall back to 4k translations.
Simplifies out of sync shadow.
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Index: kvm/arch/x86/kvm/mmu.c
===================================================================
--- kvm.orig/arch/x86/kvm/mmu.c
+++ kvm/arch/x86/kvm/mmu.c
@@ -1180,11 +1180,16 @@ static int set_spte(struct kvm_vcpu *vcp
|| (write_fault && !is_write_protection(vcpu) && !user_fault)) {
struct kvm_mmu_page *shadow;
+ if (largepage && has_wrprotected_page(vcpu->kvm, gfn)) {
+ ret = 1;
+ spte = shadow_trap_nonpresent_pte;
+ goto set_pte;
+ }
+
spte |= PT_WRITABLE_MASK;
shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
- if (shadow ||
- (largepage && has_wrprotected_page(vcpu->kvm, gfn))) {
+ if (shadow) {
pgprintk("%s: found shadow page for %lx, marking ro\n",
__func__, gfn);
ret = 1;
@@ -1197,6 +1202,7 @@ static int set_spte(struct kvm_vcpu *vcp
if (pte_access & ACC_WRITE_MASK)
mark_page_dirty(vcpu->kvm, gfn);
+set_pte:
set_shadow_pte(shadow_pte, spte);
return ret;
}
Index: kvm/arch/x86/kvm/paging_tmpl.h
===================================================================
--- kvm.orig/arch/x86/kvm/paging_tmpl.h
+++ kvm/arch/x86/kvm/paging_tmpl.h
@@ -307,11 +307,10 @@ static int FNAME(shadow_walk_entry)(stru
return 1;
}
- if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep))
+ if (is_shadow_present_pte(*sptep))
return 0;
- if (is_large_pte(*sptep))
- rmap_remove(vcpu->kvm, sptep);
+ WARN_ON (is_large_pte(*sptep));
if (level == PT_DIRECTORY_LEVEL && gw->level == PT_DIRECTORY_LEVEL) {
metaphysical = 1;
--
^ permalink raw reply [flat|nested] 34+ messages in thread* Re: [patch 03/10] KVM: MMU: do not write-protect large mappings
2008-09-18 21:27 ` [patch 03/10] KVM: MMU: do not write-protect large mappings Marcelo Tosatti
@ 2008-09-20 0:29 ` Avi Kivity
2008-09-21 0:41 ` Marcelo Tosatti
0 siblings, 1 reply; 34+ messages in thread
From: Avi Kivity @ 2008-09-20 0:29 UTC (permalink / raw)
To: Marcelo Tosatti; +Cc: Avi Kivity, kvm, David S. Ahern
Marcelo Tosatti wrote:
> There is not much point in write protecting large mappings. This
> can only happen when a page is shadowed during the window between
> is_largepage_backed and mmu_lock acquision. Zap the entry instead, so
> the next pagefault will find a shadowed page via is_largepage_backed and
> fallback to 4k translations.
>
> Simplifies out of sync shadow.
>
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
>
> Index: kvm/arch/x86/kvm/mmu.c
> ===================================================================
> --- kvm.orig/arch/x86/kvm/mmu.c
> +++ kvm/arch/x86/kvm/mmu.c
> @@ -1180,11 +1180,16 @@ static int set_spte(struct kvm_vcpu *vcp
> || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
> struct kvm_mmu_page *shadow;
>
> + if (largepage && has_wrprotected_page(vcpu->kvm, gfn)) {
> + ret = 1;
> + spte = shadow_trap_nonpresent_pte;
> + goto set_pte;
> + }
> +
> spte |= PT_WRITABLE_MASK;
>
> shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
> - if (shadow ||
> - (largepage && has_wrprotected_page(vcpu->kvm, gfn))) {
> + if (shadow) {
> pgprintk("%s: found shadow page for %lx, marking ro\n",
> __func__, gfn);
> ret = 1;
> @@ -1197,6 +1202,7 @@ static int set_spte(struct kvm_vcpu *vcp
> if (pte_access & ACC_WRITE_MASK)
> mark_page_dirty(vcpu->kvm, gfn);
>
> +set_pte:
> set_shadow_pte(shadow_pte, spte);
> return ret;
> }
>
Don't we need to drop a reference to the page?
--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.
^ permalink raw reply [flat|nested] 34+ messages in thread* Re: [patch 03/10] KVM: MMU: do not write-protect large mappings
2008-09-20 0:29 ` Avi Kivity
@ 2008-09-21 0:41 ` Marcelo Tosatti
0 siblings, 0 replies; 34+ messages in thread
From: Marcelo Tosatti @ 2008-09-21 0:41 UTC (permalink / raw)
To: Avi Kivity; +Cc: Avi Kivity, kvm, David S. Ahern
On Fri, Sep 19, 2008 at 05:29:28PM -0700, Avi Kivity wrote:
> Marcelo Tosatti wrote:
>> There is not much point in write protecting large mappings. This
>> can only happen when a page is shadowed during the window between
>> is_largepage_backed and mmu_lock acquision. Zap the entry instead, so
>> the next pagefault will find a shadowed page via is_largepage_backed and
>> fallback to 4k translations.
>>
>> Simplifies out of sync shadow.
>>
>> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
>>
>> Index: kvm/arch/x86/kvm/mmu.c
>> ===================================================================
>> --- kvm.orig/arch/x86/kvm/mmu.c
>> +++ kvm/arch/x86/kvm/mmu.c
>> @@ -1180,11 +1180,16 @@ static int set_spte(struct kvm_vcpu *vcp
>> || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
>> struct kvm_mmu_page *shadow;
>> + if (largepage && has_wrprotected_page(vcpu->kvm, gfn)) {
>> + ret = 1;
>> + spte = shadow_trap_nonpresent_pte;
>> + goto set_pte;
>> + }
>
> Don't we need to drop a reference to the page?
mmu_set_spte will do it if necessary:
if (!was_rmapped) {
rmap_add(vcpu, shadow_pte, gfn, largepage);
if (!is_rmap_pte(*shadow_pte))
kvm_release_pfn_clean(pfn);
But as noted, this part is wrong:
@@ -307,11 +307,10 @@ static int FNAME(shadow_walk_entry)(stru
return 1;
}
- if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep))
+ if (is_shadow_present_pte(*sptep))
return 0;
- if (is_large_pte(*sptep))
- rmap_remove(vcpu->kvm, sptep);
+ WARN_ON (is_large_pte(*sptep));
Since it might still be necessary to replace a read-only large mapping
(which rmap_write_protect will not nuke) with a normal pte pointer.
^ permalink raw reply [flat|nested] 34+ messages in thread
* [patch 04/10] KVM: MMU: mode specific sync_page
2008-09-18 21:27 [patch 00/10] out of sync shadow v2 Marcelo Tosatti
` (2 preceding siblings ...)
2008-09-18 21:27 ` [patch 03/10] KVM: MMU: do not write-protect large mappings Marcelo Tosatti
@ 2008-09-18 21:27 ` Marcelo Tosatti
2008-09-20 0:44 ` Avi Kivity
2008-09-18 21:27 ` [patch 05/10] KVM: MMU: sync roots on mmu reload Marcelo Tosatti
` (6 subsequent siblings)
10 siblings, 1 reply; 34+ messages in thread
From: Marcelo Tosatti @ 2008-09-18 21:27 UTC (permalink / raw)
To: Avi Kivity; +Cc: kvm, David S. Ahern, Marcelo Tosatti
[-- Attachment #1: kvm-oos-sync-page --]
[-- Type: text/plain, Size: 4393 bytes --]
Examine guest pagetable and bring the shadow back in sync. Caller is responsible
for local TLB flush before re-entering guest mode.
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Index: kvm/arch/x86/kvm/mmu.c
===================================================================
--- kvm.orig/arch/x86/kvm/mmu.c
+++ kvm/arch/x86/kvm/mmu.c
@@ -871,6 +871,12 @@ static void nonpaging_prefetch_page(stru
sp->spt[i] = shadow_trap_nonpresent_pte;
}
+static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp)
+{
+ return 1;
+}
+
static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
{
unsigned index;
@@ -1548,6 +1554,7 @@ static int nonpaging_init_context(struct
context->gva_to_gpa = nonpaging_gva_to_gpa;
context->free = nonpaging_free;
context->prefetch_page = nonpaging_prefetch_page;
+ context->sync_page = nonpaging_sync_page;
context->root_level = 0;
context->shadow_root_level = PT32E_ROOT_LEVEL;
context->root_hpa = INVALID_PAGE;
@@ -1595,6 +1602,7 @@ static int paging64_init_context_common(
context->page_fault = paging64_page_fault;
context->gva_to_gpa = paging64_gva_to_gpa;
context->prefetch_page = paging64_prefetch_page;
+ context->sync_page = paging64_sync_page;
context->free = paging_free;
context->root_level = level;
context->shadow_root_level = level;
@@ -1616,6 +1624,7 @@ static int paging32_init_context(struct
context->gva_to_gpa = paging32_gva_to_gpa;
context->free = paging_free;
context->prefetch_page = paging32_prefetch_page;
+ context->sync_page = paging32_sync_page;
context->root_level = PT32_ROOT_LEVEL;
context->shadow_root_level = PT32E_ROOT_LEVEL;
context->root_hpa = INVALID_PAGE;
@@ -1635,6 +1644,7 @@ static int init_kvm_tdp_mmu(struct kvm_v
context->page_fault = tdp_page_fault;
context->free = nonpaging_free;
context->prefetch_page = nonpaging_prefetch_page;
+ context->sync_page = nonpaging_sync_page;
context->shadow_root_level = kvm_x86_ops->get_tdp_level();
context->root_hpa = INVALID_PAGE;
Index: kvm/arch/x86/kvm/paging_tmpl.h
===================================================================
--- kvm.orig/arch/x86/kvm/paging_tmpl.h
+++ kvm/arch/x86/kvm/paging_tmpl.h
@@ -503,6 +503,61 @@ static void FNAME(prefetch_page)(struct
}
}
+/*
+ * Using the cached information from sp->gfns is safe because:
+ * - The spte has a reference to the struct page, so the pfn for a given gfn
+ * can't change unless all sptes pointing to it are nuked first.
+ * - Alias changes zap the entire shadow cache.
+ */
+static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+ int i, offset, nr_present;
+
+ offset = nr_present = 0;
+
+ if (PTTYPE == 32)
+ offset = sp->role.quadrant << PT64_LEVEL_BITS;
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
+ if (is_shadow_present_pte(sp->spt[i])) {
+ unsigned pte_access;
+ pt_element_t gpte;
+ gpa_t pte_gpa;
+ gfn_t gfn = sp->gfns[i];
+
+ pte_gpa = gfn_to_gpa(sp->gfn);
+ pte_gpa += (i+offset) * sizeof(pt_element_t);
+
+ if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
+ sizeof(pt_element_t)))
+ return -EINVAL;
+
+ if (gpte_to_gfn(gpte) != gfn || !(gpte & PT_ACCESSED_MASK)) {
+ rmap_remove(vcpu->kvm, &sp->spt[i]);
+ if (is_present_pte(gpte))
+ sp->spt[i] = shadow_trap_nonpresent_pte;
+ else
+ sp->spt[i] = shadow_notrap_nonpresent_pte;
+ continue;
+ }
+
+ if (!is_present_pte(gpte)) {
+ rmap_remove(vcpu->kvm, &sp->spt[i]);
+ sp->spt[i] = shadow_notrap_nonpresent_pte;
+ continue;
+ }
+
+ nr_present++;
+ pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+ set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
+ is_dirty_pte(gpte), 0, gfn,
+ spte_to_pfn(sp->spt[i]), true);
+ }
+ }
+
+ return !nr_present;
+}
+
#undef pt_element_t
#undef guest_walker
#undef shadow_walker
Index: kvm/include/asm-x86/kvm_host.h
===================================================================
--- kvm.orig/include/asm-x86/kvm_host.h
+++ kvm/include/asm-x86/kvm_host.h
@@ -220,6 +220,8 @@ struct kvm_mmu {
gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva);
void (*prefetch_page)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *page);
+ int (*sync_page)(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp);
hpa_t root_hpa;
int root_level;
int shadow_root_level;
--
^ permalink raw reply [flat|nested] 34+ messages in thread* Re: [patch 04/10] KVM: MMU: mode specific sync_page
2008-09-18 21:27 ` [patch 04/10] KVM: MMU: mode specific sync_page Marcelo Tosatti
@ 2008-09-20 0:44 ` Avi Kivity
0 siblings, 0 replies; 34+ messages in thread
From: Avi Kivity @ 2008-09-20 0:44 UTC (permalink / raw)
To: Marcelo Tosatti; +Cc: Avi Kivity, kvm, David S. Ahern
Marcelo Tosatti wrote:
> Examine guest pagetable and bring the shadow back in sync. Caller is responsible
> for local TLB flush before re-entering guest mode.
>
>
Neat! We had a gpte snapshot, and I forgot all about it.
> + for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
> + if (is_shadow_present_pte(sp->spt[i])) {
>
if (!is_..())
continue;
to reduce indentation.
> + pte_gpa += (i+offset) * sizeof(pt_element_t);
> +
> + if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
> + sizeof(pt_element_t)))
> + return -EINVAL;
>
I guess we want a kvm_map_guest_page_atomic() to speed this up. Can be
done later as an optimization, of course.
> +
> + if (gpte_to_gfn(gpte) != gfn || !(gpte & PT_ACCESSED_MASK)) {
> + rmap_remove(vcpu->kvm, &sp->spt[i]);
> + if (is_present_pte(gpte))
> + sp->spt[i] = shadow_trap_nonpresent_pte;
> + else
> + sp->spt[i] = shadow_notrap_nonpresent_pte;
>
set_shadow_pte()
> + continue;
> + }
> +
> + if (!is_present_pte(gpte)) {
> + rmap_remove(vcpu->kvm, &sp->spt[i]);
> + sp->spt[i] = shadow_notrap_nonpresent_pte;
> + continue;
> + }
>
Merge with previous block?
--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.
^ permalink raw reply [flat|nested] 34+ messages in thread
* [patch 05/10] KVM: MMU: sync roots on mmu reload
2008-09-18 21:27 [patch 00/10] out of sync shadow v2 Marcelo Tosatti
` (3 preceding siblings ...)
2008-09-18 21:27 ` [patch 04/10] KVM: MMU: mode specific sync_page Marcelo Tosatti
@ 2008-09-18 21:27 ` Marcelo Tosatti
2008-09-18 21:27 ` [patch 06/10] KVM: x86: trap invlpg Marcelo Tosatti
` (5 subsequent siblings)
10 siblings, 0 replies; 34+ messages in thread
From: Marcelo Tosatti @ 2008-09-18 21:27 UTC (permalink / raw)
To: Avi Kivity; +Cc: kvm, David S. Ahern, Marcelo Tosatti
[-- Attachment #1: mmu-sync-roots --]
[-- Type: text/plain, Size: 2381 bytes --]
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Index: kvm/arch/x86/kvm/mmu.c
===================================================================
--- kvm.orig/arch/x86/kvm/mmu.c
+++ kvm/arch/x86/kvm/mmu.c
@@ -1472,6 +1472,41 @@ static void mmu_alloc_roots(struct kvm_v
vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
}
+static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+}
+
+static void mmu_sync_roots(struct kvm_vcpu *vcpu)
+{
+ int i;
+ struct kvm_mmu_page *sp;
+
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ return;
+ if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+ hpa_t root = vcpu->arch.mmu.root_hpa;
+ sp = page_header(root);
+ mmu_sync_children(vcpu, sp);
+ return;
+ }
+ for (i = 0; i < 4; ++i) {
+ hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+ if (root) {
+ root &= PT64_BASE_ADDR_MASK;
+ sp = page_header(root);
+ mmu_sync_children(vcpu, sp);
+ }
+ }
+}
+
+void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
+{
+ spin_lock(&vcpu->kvm->mmu_lock);
+ mmu_sync_roots(vcpu);
+ spin_unlock(&vcpu->kvm->mmu_lock);
+}
+
static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
{
return vaddr;
@@ -1716,6 +1751,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
spin_lock(&vcpu->kvm->mmu_lock);
kvm_mmu_free_some_pages(vcpu);
mmu_alloc_roots(vcpu);
+ mmu_sync_roots(vcpu);
spin_unlock(&vcpu->kvm->mmu_lock);
kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
kvm_mmu_flush_tlb(vcpu);
Index: kvm/arch/x86/kvm/x86.c
===================================================================
--- kvm.orig/arch/x86/kvm/x86.c
+++ kvm/arch/x86/kvm/x86.c
@@ -594,6 +594,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4);
void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
+ kvm_mmu_sync_roots(vcpu);
kvm_mmu_flush_tlb(vcpu);
return;
}
Index: kvm/include/asm-x86/kvm_host.h
===================================================================
--- kvm.orig/include/asm-x86/kvm_host.h
+++ kvm/include/asm-x86/kvm_host.h
@@ -584,6 +584,7 @@ int kvm_mmu_unprotect_page_virt(struct k
void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
int kvm_mmu_load(struct kvm_vcpu *vcpu);
void kvm_mmu_unload(struct kvm_vcpu *vcpu);
+void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
--
^ permalink raw reply [flat|nested] 34+ messages in thread* [patch 06/10] KVM: x86: trap invlpg
2008-09-18 21:27 [patch 00/10] out of sync shadow v2 Marcelo Tosatti
` (4 preceding siblings ...)
2008-09-18 21:27 ` [patch 05/10] KVM: MMU: sync roots on mmu reload Marcelo Tosatti
@ 2008-09-18 21:27 ` Marcelo Tosatti
2008-09-20 0:53 ` Avi Kivity
2008-09-18 21:27 ` [patch 07/10] KVM: MMU: mmu_parent_walk Marcelo Tosatti
` (4 subsequent siblings)
10 siblings, 1 reply; 34+ messages in thread
From: Marcelo Tosatti @ 2008-09-18 21:27 UTC (permalink / raw)
To: Avi Kivity; +Cc: kvm, David S. Ahern, Marcelo Tosatti
[-- Attachment #1: mmu-invlpg --]
[-- Type: text/plain, Size: 8119 bytes --]
With pages out of sync invlpg needs to be trapped. For now simply nuke
the entry.
Untested on AMD.
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Index: kvm/arch/x86/kvm/mmu.c
===================================================================
--- kvm.orig/arch/x86/kvm/mmu.c
+++ kvm/arch/x86/kvm/mmu.c
@@ -877,6 +877,10 @@ static int nonpaging_sync_page(struct kv
return 1;
}
+static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
+{
+}
+
static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
{
unsigned index;
@@ -1590,6 +1594,7 @@ static int nonpaging_init_context(struct
context->free = nonpaging_free;
context->prefetch_page = nonpaging_prefetch_page;
context->sync_page = nonpaging_sync_page;
+ context->invlpg = nonpaging_invlpg;
context->root_level = 0;
context->shadow_root_level = PT32E_ROOT_LEVEL;
context->root_hpa = INVALID_PAGE;
@@ -1638,6 +1643,7 @@ static int paging64_init_context_common(
context->gva_to_gpa = paging64_gva_to_gpa;
context->prefetch_page = paging64_prefetch_page;
context->sync_page = paging64_sync_page;
+ context->invlpg = paging64_invlpg;
context->free = paging_free;
context->root_level = level;
context->shadow_root_level = level;
@@ -1660,6 +1666,7 @@ static int paging32_init_context(struct
context->free = paging_free;
context->prefetch_page = paging32_prefetch_page;
context->sync_page = paging32_sync_page;
+ context->invlpg = paging32_invlpg;
context->root_level = PT32_ROOT_LEVEL;
context->shadow_root_level = PT32E_ROOT_LEVEL;
context->root_hpa = INVALID_PAGE;
@@ -1680,6 +1687,7 @@ static int init_kvm_tdp_mmu(struct kvm_v
context->free = nonpaging_free;
context->prefetch_page = nonpaging_prefetch_page;
context->sync_page = nonpaging_sync_page;
+ context->invlpg = nonpaging_invlpg;
context->shadow_root_level = kvm_x86_ops->get_tdp_level();
context->root_hpa = INVALID_PAGE;
@@ -2072,6 +2080,15 @@ out:
}
EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
+void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
+{
+ spin_lock(&vcpu->kvm->mmu_lock);
+ vcpu->arch.mmu.invlpg(vcpu, gva);
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ kvm_mmu_flush_tlb(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
+
void kvm_enable_tdp(void)
{
tdp_enabled = true;
Index: kvm/arch/x86/kvm/paging_tmpl.h
===================================================================
--- kvm.orig/arch/x86/kvm/paging_tmpl.h
+++ kvm/arch/x86/kvm/paging_tmpl.h
@@ -457,6 +457,31 @@ out_unlock:
return 0;
}
+static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw,
+ struct kvm_vcpu *vcpu, u64 addr,
+ u64 *sptep, int level)
+{
+
+ if (level == PT_PAGE_TABLE_LEVEL) {
+ if (is_shadow_present_pte(*sptep))
+ rmap_remove(vcpu->kvm, sptep);
+ set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
+ return 1;
+ }
+ if (!is_shadow_present_pte(*sptep))
+ return 1;
+ return 0;
+}
+
+static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
+{
+ struct shadow_walker walker = {
+ .walker = { .entry = FNAME(shadow_invlpg_entry), },
+ };
+
+ walk_shadow(&walker.walker, vcpu, gva);
+}
+
static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
{
struct guest_walker walker;
Index: kvm/arch/x86/kvm/svm.c
===================================================================
--- kvm.orig/arch/x86/kvm/svm.c
+++ kvm/arch/x86/kvm/svm.c
@@ -525,6 +525,7 @@ static void init_vmcb(struct vcpu_svm *s
(1ULL << INTERCEPT_CPUID) |
(1ULL << INTERCEPT_INVD) |
(1ULL << INTERCEPT_HLT) |
+ (1ULL << INTERCEPT_INVLPG) |
(1ULL << INTERCEPT_INVLPGA) |
(1ULL << INTERCEPT_IOIO_PROT) |
(1ULL << INTERCEPT_MSR_PROT) |
@@ -589,7 +590,8 @@ static void init_vmcb(struct vcpu_svm *s
if (npt_enabled) {
/* Setup VMCB for Nested Paging */
control->nested_ctl = 1;
- control->intercept &= ~(1ULL << INTERCEPT_TASK_SWITCH);
+ control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) |
+ (1ULL << INTERCEPT_INVLPG));
control->intercept_exceptions &= ~(1 << PF_VECTOR);
control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK|
INTERCEPT_CR3_MASK);
@@ -1164,6 +1166,13 @@ static int cpuid_interception(struct vcp
return 1;
}
+static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+ if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE)
+ pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
+ return 1;
+}
+
static int emulate_on_interception(struct vcpu_svm *svm,
struct kvm_run *kvm_run)
{
@@ -1417,7 +1426,7 @@ static int (*svm_exit_handlers[])(struct
[SVM_EXIT_CPUID] = cpuid_interception,
[SVM_EXIT_INVD] = emulate_on_interception,
[SVM_EXIT_HLT] = halt_interception,
- [SVM_EXIT_INVLPG] = emulate_on_interception,
+ [SVM_EXIT_INVLPG] = invlpg_interception,
[SVM_EXIT_INVLPGA] = invalid_op_interception,
[SVM_EXIT_IOIO] = io_interception,
[SVM_EXIT_MSR] = msr_interception,
Index: kvm/arch/x86/kvm/vmx.c
===================================================================
--- kvm.orig/arch/x86/kvm/vmx.c
+++ kvm/arch/x86/kvm/vmx.c
@@ -1130,7 +1130,8 @@ static __init int setup_vmcs_config(stru
CPU_BASED_CR3_STORE_EXITING |
CPU_BASED_USE_IO_BITMAPS |
CPU_BASED_MOV_DR_EXITING |
- CPU_BASED_USE_TSC_OFFSETING;
+ CPU_BASED_USE_TSC_OFFSETING |
+ CPU_BASED_INVLPG_EXITING;
opt = CPU_BASED_TPR_SHADOW |
CPU_BASED_USE_MSR_BITMAPS |
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
@@ -1159,9 +1160,11 @@ static __init int setup_vmcs_config(stru
_cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
#endif
if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
- /* CR3 accesses don't need to cause VM Exits when EPT enabled */
+ /* CR3 accesses and invlpg don't need to cause VM Exits when EPT
+ enabled */
min &= ~(CPU_BASED_CR3_LOAD_EXITING |
- CPU_BASED_CR3_STORE_EXITING);
+ CPU_BASED_CR3_STORE_EXITING |
+ CPU_BASED_INVLPG_EXITING);
if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
&_cpu_based_exec_control) < 0)
return -EIO;
@@ -2790,6 +2793,15 @@ static int handle_vmcall(struct kvm_vcpu
return 1;
}
+static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+ u64 exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
+
+ kvm_mmu_invlpg(vcpu, exit_qualification);
+ skip_emulated_instruction(vcpu);
+ return 1;
+}
+
static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
skip_emulated_instruction(vcpu);
@@ -2958,6 +2970,7 @@ static int (*kvm_vmx_exit_handlers[])(st
[EXIT_REASON_MSR_WRITE] = handle_wrmsr,
[EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
[EXIT_REASON_HLT] = handle_halt,
+ [EXIT_REASON_INVLPG] = handle_invlpg,
[EXIT_REASON_VMCALL] = handle_vmcall,
[EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
[EXIT_REASON_APIC_ACCESS] = handle_apic_access,
Index: kvm/include/asm-x86/kvm_host.h
===================================================================
--- kvm.orig/include/asm-x86/kvm_host.h
+++ kvm/include/asm-x86/kvm_host.h
@@ -222,6 +222,7 @@ struct kvm_mmu {
struct kvm_mmu_page *page);
int (*sync_page)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp);
+ void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
hpa_t root_hpa;
int root_level;
int shadow_root_level;
@@ -591,6 +592,7 @@ int kvm_emulate_hypercall(struct kvm_vcp
int kvm_fix_hypercall(struct kvm_vcpu *vcpu);
int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code);
+void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
void kvm_enable_tdp(void);
void kvm_disable_tdp(void);
Index: kvm/arch/x86/kvm/x86.c
===================================================================
--- kvm.orig/arch/x86/kvm/x86.c
+++ kvm/arch/x86/kvm/x86.c
@@ -2341,6 +2341,7 @@ static unsigned long get_segment_base(st
int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
{
+ kvm_mmu_invlpg(vcpu, address);
return X86EMUL_CONTINUE;
}
--
^ permalink raw reply [flat|nested] 34+ messages in thread* Re: [patch 06/10] KVM: x86: trap invlpg
2008-09-18 21:27 ` [patch 06/10] KVM: x86: trap invlpg Marcelo Tosatti
@ 2008-09-20 0:53 ` Avi Kivity
2008-09-21 0:43 ` Marcelo Tosatti
0 siblings, 1 reply; 34+ messages in thread
From: Avi Kivity @ 2008-09-20 0:53 UTC (permalink / raw)
To: Marcelo Tosatti; +Cc: Avi Kivity, kvm, David S. Ahern
Marcelo Tosatti wrote:
>
> +static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw,
> + struct kvm_vcpu *vcpu, u64 addr,
> + u64 *sptep, int level)
> +{
> +
> + if (level == PT_PAGE_TABLE_LEVEL) {
> + if (is_shadow_present_pte(*sptep))
> + rmap_remove(vcpu->kvm, sptep);
> + set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
>
Need to flush the real tlb as well.
--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.
^ permalink raw reply [flat|nested] 34+ messages in thread* Re: [patch 06/10] KVM: x86: trap invlpg
2008-09-20 0:53 ` Avi Kivity
@ 2008-09-21 0:43 ` Marcelo Tosatti
0 siblings, 0 replies; 34+ messages in thread
From: Marcelo Tosatti @ 2008-09-21 0:43 UTC (permalink / raw)
To: Avi Kivity; +Cc: Avi Kivity, kvm, David S. Ahern
On Fri, Sep 19, 2008 at 05:53:22PM -0700, Avi Kivity wrote:
> Marcelo Tosatti wrote:
>> +static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw,
>> + struct kvm_vcpu *vcpu, u64 addr,
>> + u64 *sptep, int level)
>> +{
>> +
>> + if (level == PT_PAGE_TABLE_LEVEL) {
>> + if (is_shadow_present_pte(*sptep))
>> + rmap_remove(vcpu->kvm, sptep);
>> + set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
>>
>
> Need to flush the real tlb as well.
The local TLB you mean?
+void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
+{
+ spin_lock(&vcpu->kvm->mmu_lock);
+ vcpu->arch.mmu.invlpg(vcpu, gva);
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ kvm_mmu_flush_tlb(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
^ permalink raw reply [flat|nested] 34+ messages in thread
* [patch 07/10] KVM: MMU: mmu_parent_walk
2008-09-18 21:27 [patch 00/10] out of sync shadow v2 Marcelo Tosatti
` (5 preceding siblings ...)
2008-09-18 21:27 ` [patch 06/10] KVM: x86: trap invlpg Marcelo Tosatti
@ 2008-09-18 21:27 ` Marcelo Tosatti
2008-09-20 0:56 ` Avi Kivity
2008-09-18 21:27 ` [patch 08/10] KVM: MMU: awareness of new kvm_mmu_zap_page behaviour Marcelo Tosatti
` (3 subsequent siblings)
10 siblings, 1 reply; 34+ messages in thread
From: Marcelo Tosatti @ 2008-09-18 21:27 UTC (permalink / raw)
To: Avi Kivity; +Cc: kvm, David S. Ahern, Marcelo Tosatti
[-- Attachment #1: walk-parent --]
[-- Type: text/plain, Size: 2128 bytes --]
Introduce a function to walk all parents of a given page, invoking a handler.
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Index: kvm/arch/x86/kvm/mmu.c
===================================================================
--- kvm.orig/arch/x86/kvm/mmu.c
+++ kvm/arch/x86/kvm/mmu.c
@@ -147,6 +147,8 @@ struct kvm_shadow_walk {
u64 addr, u64 *spte, int level);
};
+typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp);
+
static struct kmem_cache *pte_chain_cache;
static struct kmem_cache *rmap_desc_cache;
static struct kmem_cache *mmu_page_header_cache;
@@ -862,6 +864,65 @@ static void mmu_page_remove_parent_pte(s
BUG();
}
+struct mmu_parent_walk {
+ struct hlist_node *node;
+ int i;
+};
+
+static struct kvm_mmu_page *mmu_parent_next(struct kvm_mmu_page *sp,
+ struct mmu_parent_walk *walk)
+{
+ struct kvm_pte_chain *pte_chain;
+ struct hlist_head *h;
+
+ if (!walk->node) {
+ if (!sp || !sp->parent_pte)
+ return NULL;
+ if (!sp->multimapped)
+ return page_header(__pa(sp->parent_pte));
+ h = &sp->parent_ptes;
+ walk->node = h->first;
+ walk->i = 0;
+ }
+
+ while (walk->node) {
+ pte_chain = hlist_entry(walk->node, struct kvm_pte_chain, link);
+ while (walk->i < NR_PTE_CHAIN_ENTRIES) {
+ int i = walk->i++;
+ if (!pte_chain->parent_ptes[i])
+ break;
+ return page_header(__pa(pte_chain->parent_ptes[i]));
+ }
+ walk->node = walk->node->next;
+ walk->i = 0;
+ }
+
+ return NULL;
+}
+
+static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ mmu_parent_walk_fn fn)
+{
+ int level, start_level;
+ struct mmu_parent_walk walk[PT64_ROOT_LEVEL];
+
+ memset(&walk, 0, sizeof(walk));
+ level = start_level = sp->role.level;
+
+ do {
+ sp = mmu_parent_next(sp, &walk[level-1]);
+ if (sp) {
+ if (sp->role.level > start_level)
+ fn(vcpu, sp);
+ if (level != sp->role.level)
+ ++level;
+ WARN_ON (level > PT64_ROOT_LEVEL);
+ continue;
+ }
+ --level;
+ } while (level > start_level-1);
+}
+
static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp)
{
--
^ permalink raw reply [flat|nested] 34+ messages in thread* Re: [patch 07/10] KVM: MMU: mmu_parent_walk
2008-09-18 21:27 ` [patch 07/10] KVM: MMU: mmu_parent_walk Marcelo Tosatti
@ 2008-09-20 0:56 ` Avi Kivity
2008-09-21 0:44 ` Marcelo Tosatti
0 siblings, 1 reply; 34+ messages in thread
From: Avi Kivity @ 2008-09-20 0:56 UTC (permalink / raw)
To: Marcelo Tosatti; +Cc: Avi Kivity, kvm, David S. Ahern
Marcelo Tosatti wrote:
> Introduce a function to walk all parents of a given page, invoking a handler.
>
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
>
>
> Index: kvm/arch/x86/kvm/mmu.c
> ===================================================================
> --- kvm.orig/arch/x86/kvm/mmu.c
> +++ kvm/arch/x86/kvm/mmu.c
> @@ -147,6 +147,8 @@ struct kvm_shadow_walk {
> u64 addr, u64 *spte, int level);
> };
>
> +typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp);
> +
> static struct kmem_cache *pte_chain_cache;
> static struct kmem_cache *rmap_desc_cache;
> static struct kmem_cache *mmu_page_header_cache;
> @@ -862,6 +864,65 @@ static void mmu_page_remove_parent_pte(s
> BUG();
> }
>
> +struct mmu_parent_walk {
> + struct hlist_node *node;
> + int i;
> +};
> +
> +static struct kvm_mmu_page *mmu_parent_next(struct kvm_mmu_page *sp,
> + struct mmu_parent_walk *walk)
> +{
> + struct kvm_pte_chain *pte_chain;
> + struct hlist_head *h;
> +
> + if (!walk->node) {
> + if (!sp || !sp->parent_pte)
> + return NULL;
> + if (!sp->multimapped)
> + return page_header(__pa(sp->parent_pte));
> + h = &sp->parent_ptes;
> + walk->node = h->first;
> + walk->i = 0;
> + }
> +
> + while (walk->node) {
> + pte_chain = hlist_entry(walk->node, struct kvm_pte_chain, link);
> + while (walk->i < NR_PTE_CHAIN_ENTRIES) {
> + int i = walk->i++;
> + if (!pte_chain->parent_ptes[i])
> + break;
> + return page_header(__pa(pte_chain->parent_ptes[i]));
> + }
> + walk->node = walk->node->next;
> + walk->i = 0;
> + }
> +
> + return NULL;
> +}
> +
> +static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
> + mmu_parent_walk_fn fn)
> +{
> + int level, start_level;
> + struct mmu_parent_walk walk[PT64_ROOT_LEVEL];
> +
> + memset(&walk, 0, sizeof(walk));
> + level = start_level = sp->role.level;
> +
> + do {
> + sp = mmu_parent_next(sp, &walk[level-1]);
> + if (sp) {
> + if (sp->role.level > start_level)
> + fn(vcpu, sp);
> + if (level != sp->role.level)
> + ++level;
> + WARN_ON (level > PT64_ROOT_LEVEL);
> + continue;
> + }
> + --level;
> + } while (level > start_level-1);
> +}
> +
Could be much simplified with recursion, no? As the depth is limited to
4, there's no stack overflow problem.
--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.
^ permalink raw reply [flat|nested] 34+ messages in thread* Re: [patch 07/10] KVM: MMU: mmu_parent_walk
2008-09-20 0:56 ` Avi Kivity
@ 2008-09-21 0:44 ` Marcelo Tosatti
2008-09-22 20:30 ` Avi Kivity
0 siblings, 1 reply; 34+ messages in thread
From: Marcelo Tosatti @ 2008-09-21 0:44 UTC (permalink / raw)
To: Avi Kivity; +Cc: Avi Kivity, kvm, David S. Ahern
On Fri, Sep 19, 2008 at 05:56:46PM -0700, Avi Kivity wrote:
>> + } while (level > start_level-1);
>> +}
>> +
>
> Could be much simplified with recursion, no? As the depth is limited to
> 4, there's no stack overflow problem.
The early version was recursive, but since it's a generic helper I
preferred a non-recursive function.
^ permalink raw reply [flat|nested] 34+ messages in thread
* Re: [patch 07/10] KVM: MMU: mmu_parent_walk
2008-09-21 0:44 ` Marcelo Tosatti
@ 2008-09-22 20:30 ` Avi Kivity
2008-09-22 22:04 ` Marcelo Tosatti
0 siblings, 1 reply; 34+ messages in thread
From: Avi Kivity @ 2008-09-22 20:30 UTC (permalink / raw)
To: Marcelo Tosatti; +Cc: kvm, David S. Ahern
Marcelo Tosatti wrote:
> On Fri, Sep 19, 2008 at 05:56:46PM -0700, Avi Kivity wrote:
>
>>> + } while (level > start_level-1);
>>> +}
>>> +
>>>
>> Could be much simplified with recursion, no? As the depth is limited to
>> 4, there's no stack overflow problem.
>>
>
> The early version was recursive, but since it's a generic helper I
> preferred a non-recursive function.
>
Let's start with a super-simple recursive version. When the code has
seen some debugging, we can add complexity. But for the initial phase,
simpler is better.
The non-recursive version has the advantage that it can be converted to
a kvm_for_each_parent() later, but still, we can do that later.
--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.
^ permalink raw reply [flat|nested] 34+ messages in thread
* Re: [patch 07/10] KVM: MMU: mmu_parent_walk
2008-09-22 20:30 ` Avi Kivity
@ 2008-09-22 22:04 ` Marcelo Tosatti
0 siblings, 0 replies; 34+ messages in thread
From: Marcelo Tosatti @ 2008-09-22 22:04 UTC (permalink / raw)
To: Avi Kivity; +Cc: kvm, David S. Ahern
On Mon, Sep 22, 2008 at 11:30:51PM +0300, Avi Kivity wrote:
> Marcelo Tosatti wrote:
>> On Fri, Sep 19, 2008 at 05:56:46PM -0700, Avi Kivity wrote:
>>
>>>> + } while (level > start_level-1);
>>>> +}
>>>> +
>>>>
>>> Could be much simplified with recursion, no? As the depth is limited
>>> to 4, there's no stack overflow problem.
>>>
>>
>> The early version was recursive, but since it's a generic helper I
>> preferred a non-recursive function.
>>
>
> Let's start with a super-simple recursive version. When the code has
> seen some debugging, we can add complexity. But for the initial phase,
> simpler is better.
>
> The non-recursive version has the advantage that it can be converted to
> a kvm_for_each_parent() later, but still, we can do that later.
OK, this is the earlier version. I'll resend the patchset with it
instead, anything else? (hoping you're ok with 32-bit nonpae being done
as optimization later).
Index: kvm.oos2/arch/x86/kvm/mmu.c
===================================================================
--- kvm.oos2.orig/arch/x86/kvm/mmu.c
+++ kvm.oos2/arch/x86/kvm/mmu.c
@@ -922,6 +922,31 @@ static void mmu_page_remove_parent_pte(s
BUG();
}
+static int mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ int (*fn) (struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp))
+{
+ struct kvm_pte_chain *pte_chain;
+ struct hlist_node *node;
+ struct kvm_mmu_page *parent_sp;
+ int i;
+
+ fn(vcpu, sp);
+
+ if (!sp->multimapped && sp->parent_pte) {
+ parent_sp = page_header(__pa(sp->parent_pte));
+ mmu_parent_walk(vcpu, parent_sp, fn);
+ return;
+ }
+ hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
+ for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
+ if (!pte_chain->parent_ptes[i])
+ break;
+ parent_sp = page_header(__pa(pte_chain->parent_ptes[i]));
+ mmu_parent_walk(vcpu, parent_sp, fn);
+ }
+}
+
static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp)
{
^ permalink raw reply [flat|nested] 34+ messages in thread
* [patch 08/10] KVM: MMU: awareness of new kvm_mmu_zap_page behaviour
2008-09-18 21:27 [patch 00/10] out of sync shadow v2 Marcelo Tosatti
` (6 preceding siblings ...)
2008-09-18 21:27 ` [patch 07/10] KVM: MMU: mmu_parent_walk Marcelo Tosatti
@ 2008-09-18 21:27 ` Marcelo Tosatti
2008-09-18 21:27 ` [patch 09/10] KVM: MMU: out of sync shadow core v2 Marcelo Tosatti
` (2 subsequent siblings)
10 siblings, 0 replies; 34+ messages in thread
From: Marcelo Tosatti @ 2008-09-18 21:27 UTC (permalink / raw)
To: Avi Kivity; +Cc: kvm, David S. Ahern, Marcelo Tosatti
[-- Attachment #1: mmu-zap-ret --]
[-- Type: text/plain, Size: 1777 bytes --]
kvm_mmu_zap_page will soon zap the unsynced children of a page. Restart
list walk in such case.
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Index: kvm/arch/x86/kvm/mmu.c
===================================================================
--- kvm.orig/arch/x86/kvm/mmu.c
+++ kvm/arch/x86/kvm/mmu.c
@@ -1112,7 +1112,7 @@ static void kvm_mmu_unlink_parents(struc
}
}
-static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
++kvm->stat.mmu_shadow_zapped;
kvm_mmu_page_unlink_children(kvm, sp);
@@ -1129,6 +1129,7 @@ static void kvm_mmu_zap_page(struct kvm
kvm_reload_remote_mmus(kvm);
}
kvm_mmu_reset_last_pte_updated(kvm);
+ return 0;
}
/*
@@ -1181,8 +1182,9 @@ static int kvm_mmu_unprotect_page(struct
if (sp->gfn == gfn && !sp->role.metaphysical) {
pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
sp->role.word);
- kvm_mmu_zap_page(kvm, sp);
r = 1;
+ if (kvm_mmu_zap_page(kvm, sp))
+ n = bucket->first;
}
return r;
}
@@ -2027,7 +2029,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *
*/
pgprintk("misaligned: gpa %llx bytes %d role %x\n",
gpa, bytes, sp->role.word);
- kvm_mmu_zap_page(vcpu->kvm, sp);
+ if (kvm_mmu_zap_page(vcpu->kvm, sp))
+ n = bucket->first;
++vcpu->kvm->stat.mmu_flooded;
continue;
}
@@ -2260,7 +2263,9 @@ void kvm_mmu_zap_all(struct kvm *kvm)
spin_lock(&kvm->mmu_lock);
list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
- kvm_mmu_zap_page(kvm, sp);
+ if (kvm_mmu_zap_page(kvm, sp))
+ node = container_of(kvm->arch.active_mmu_pages.next,
+ struct kvm_mmu_page, link);
spin_unlock(&kvm->mmu_lock);
kvm_flush_remote_tlbs(kvm);
--
^ permalink raw reply [flat|nested] 34+ messages in thread* [patch 09/10] KVM: MMU: out of sync shadow core v2
2008-09-18 21:27 [patch 00/10] out of sync shadow v2 Marcelo Tosatti
` (7 preceding siblings ...)
2008-09-18 21:27 ` [patch 08/10] KVM: MMU: awareness of new kvm_mmu_zap_page behaviour Marcelo Tosatti
@ 2008-09-18 21:27 ` Marcelo Tosatti
2008-09-20 1:22 ` Avi Kivity
2008-09-18 21:27 ` [patch 10/10] KVM: MMU: speed up mmu_unsync_walk Marcelo Tosatti
2008-09-18 22:36 ` [patch 00/10] out of sync shadow v2 Marcelo Tosatti
10 siblings, 1 reply; 34+ messages in thread
From: Marcelo Tosatti @ 2008-09-18 21:27 UTC (permalink / raw)
To: Avi Kivity; +Cc: kvm, David S. Ahern, Marcelo Tosatti
[-- Attachment #1: kvm-oos-core --]
[-- Type: text/plain, Size: 9878 bytes --]
Allow guest pagetables to go out of sync.
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Index: kvm/arch/x86/kvm/mmu.c
===================================================================
--- kvm.orig/arch/x86/kvm/mmu.c
+++ kvm/arch/x86/kvm/mmu.c
@@ -148,6 +148,7 @@ struct kvm_shadow_walk {
};
typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp);
+typedef int (*mmu_unsync_fn) (struct kvm_mmu_page *sp, void *priv);
static struct kmem_cache *pte_chain_cache;
static struct kmem_cache *rmap_desc_cache;
@@ -942,6 +943,39 @@ static void nonpaging_invlpg(struct kvm_
{
}
+static int mmu_unsync_walk(struct kvm_mmu_page *parent, mmu_unsync_fn fn,
+ void *priv)
+{
+ int i, ret;
+ struct kvm_mmu_page *sp = parent;
+
+ while (parent->unsync_children) {
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+ u64 ent = sp->spt[i];
+
+ if (is_shadow_present_pte(ent)) {
+ struct kvm_mmu_page *child;
+ child = page_header(ent & PT64_BASE_ADDR_MASK);
+
+ if (child->unsync_children) {
+ sp = child;
+ break;
+ }
+ if (child->unsync) {
+ ret = fn(child, priv);
+ if (ret)
+ return ret;
+ }
+ }
+ }
+ if (i == PT64_ENT_PER_PAGE) {
+ sp->unsync_children = 0;
+ sp = parent;
+ }
+ }
+ return 0;
+}
+
static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
{
unsigned index;
@@ -962,6 +996,47 @@ static struct kvm_mmu_page *kvm_mmu_look
return NULL;
}
+static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ WARN_ON(!sp->unsync);
+ sp->unsync = 0;
+ --kvm->stat.mmu_unsync;
+}
+
+static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp);
+
+static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+ if (sp->role.glevels != vcpu->arch.mmu.root_level) {
+ kvm_mmu_zap_page(vcpu->kvm, sp);
+ return 1;
+ }
+
+ rmap_write_protect(vcpu->kvm, sp->gfn);
+ if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
+ kvm_mmu_zap_page(vcpu->kvm, sp);
+ return 1;
+ }
+
+ kvm_mmu_flush_tlb(vcpu);
+ kvm_unlink_unsync_page(vcpu->kvm, sp);
+ return 0;
+}
+
+static int mmu_sync_fn(struct kvm_mmu_page *sp, void *priv)
+{
+ struct kvm_vcpu *vcpu = priv;
+
+ kvm_sync_page(vcpu, sp);
+ return (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock));
+}
+
+static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+ while (mmu_unsync_walk(sp, mmu_sync_fn, vcpu))
+ cond_resched_lock(&vcpu->kvm->mmu_lock);
+}
+
static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
gfn_t gfn,
gva_t gaddr,
@@ -975,7 +1050,7 @@ static struct kvm_mmu_page *kvm_mmu_get_
unsigned quadrant;
struct hlist_head *bucket;
struct kvm_mmu_page *sp;
- struct hlist_node *node;
+ struct hlist_node *node, *tmp;
role.word = 0;
role.glevels = vcpu->arch.mmu.root_level;
@@ -991,8 +1066,18 @@ static struct kvm_mmu_page *kvm_mmu_get_
gfn, role.word);
index = kvm_page_table_hashfn(gfn);
bucket = &vcpu->kvm->arch.mmu_page_hash[index];
- hlist_for_each_entry(sp, node, bucket, hash_link)
- if (sp->gfn == gfn && sp->role.word == role.word) {
+ hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link)
+ if (sp->gfn == gfn) {
+ if (sp->unsync)
+ if (kvm_sync_page(vcpu, sp))
+ continue;
+
+ if (sp->role.word != role.word)
+ continue;
+
+ if (sp->unsync_children)
+ vcpu->arch.mmu.need_root_sync = 1;
+
mmu_page_add_parent_pte(vcpu, sp, parent_pte);
pgprintk("%s: found\n", __func__);
return sp;
@@ -1112,14 +1197,45 @@ static void kvm_mmu_unlink_parents(struc
}
}
+struct mmu_zap_walk {
+ struct kvm *kvm;
+ int zapped;
+};
+
+static int mmu_zap_fn(struct kvm_mmu_page *sp, void *private)
+{
+ struct mmu_zap_walk *zap_walk = private;
+
+ kvm_mmu_zap_page(zap_walk->kvm, sp);
+ zap_walk->zapped = 1;
+ return 0;
+}
+
+static int mmu_zap_unsync_children(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ struct mmu_zap_walk mmu_zap_walk = {
+ .kvm = kvm,
+ .zapped = 0,
+ };
+
+ if (sp->role.level == PT_PAGE_TABLE_LEVEL)
+ return 0;
+ mmu_unsync_walk(sp, mmu_zap_fn, &mmu_zap_walk);
+ return mmu_zap_walk.zapped;
+}
+
static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
+ int ret;
++kvm->stat.mmu_shadow_zapped;
+ ret = mmu_zap_unsync_children(kvm, sp);
kvm_mmu_page_unlink_children(kvm, sp);
kvm_mmu_unlink_parents(kvm, sp);
kvm_flush_remote_tlbs(kvm);
if (!sp->role.invalid && !sp->role.metaphysical)
unaccount_shadowed(kvm, sp->gfn);
+ if (sp->unsync)
+ kvm_unlink_unsync_page(kvm, sp);
if (!sp->root_count) {
hlist_del(&sp->hash_link);
kvm_mmu_free_page(kvm, sp);
@@ -1129,7 +1245,7 @@ static int kvm_mmu_zap_page(struct kvm *
kvm_reload_remote_mmus(kvm);
}
kvm_mmu_reset_last_pte_updated(kvm);
- return 0;
+ return ret;
}
/*
@@ -1221,10 +1337,57 @@ struct page *gva_to_page(struct kvm_vcpu
return page;
}
+static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+ sp->unsync_children = 1;
+ return 1;
+}
+
+static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+ unsigned index;
+ struct hlist_head *bucket;
+ struct kvm_mmu_page *s;
+ struct hlist_node *node, *n;
+
+ index = kvm_page_table_hashfn(sp->gfn);
+ bucket = &vcpu->kvm->arch.mmu_page_hash[index];
+ /* don't unsync if pagetable is shadowed with multiple roles */
+ hlist_for_each_entry_safe(s, node, n, bucket, hash_link) {
+ if (s->gfn != sp->gfn || s->role.metaphysical)
+ continue;
+ if (s->role.word != sp->role.word)
+ return 1;
+ }
+ mmu_parent_walk(vcpu, sp, unsync_walk_fn);
+ ++vcpu->kvm->stat.mmu_unsync;
+ sp->unsync = 1;
+ return 0;
+}
+
+static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
+ bool can_unsync)
+{
+ struct kvm_mmu_page *shadow;
+
+ shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
+ if (shadow) {
+ if (shadow->role.level != PT_PAGE_TABLE_LEVEL)
+ return 1;
+ if (shadow->unsync)
+ return 0;
+ if (can_unsync)
+ return kvm_unsync_page(vcpu, shadow);
+ return 1;
+ }
+ return 0;
+}
+
static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
unsigned pte_access, int user_fault,
int write_fault, int dirty, int largepage,
- gfn_t gfn, pfn_t pfn, bool speculative)
+ gfn_t gfn, pfn_t pfn, bool speculative,
+ bool can_unsync)
{
u64 spte;
int ret = 0;
@@ -1251,7 +1414,6 @@ static int set_spte(struct kvm_vcpu *vcp
if ((pte_access & ACC_WRITE_MASK)
|| (write_fault && !is_write_protection(vcpu) && !user_fault)) {
- struct kvm_mmu_page *shadow;
if (largepage && has_wrprotected_page(vcpu->kvm, gfn)) {
ret = 1;
@@ -1261,8 +1423,7 @@ static int set_spte(struct kvm_vcpu *vcp
spte |= PT_WRITABLE_MASK;
- shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
- if (shadow) {
+ if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
pgprintk("%s: found shadow page for %lx, marking ro\n",
__func__, gfn);
ret = 1;
@@ -1280,7 +1441,6 @@ set_pte:
return ret;
}
-
static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
unsigned pt_access, unsigned pte_access,
int user_fault, int write_fault, int dirty,
@@ -1318,7 +1478,7 @@ static void mmu_set_spte(struct kvm_vcpu
}
}
if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault,
- dirty, largepage, gfn, pfn, speculative)) {
+ dirty, largepage, gfn, pfn, speculative, true)) {
if (write_fault)
*ptwrite = 1;
if (was_writeble)
@@ -1539,10 +1699,6 @@ static void mmu_alloc_roots(struct kvm_v
vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
}
-static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
-{
-}
-
static void mmu_sync_roots(struct kvm_vcpu *vcpu)
{
int i;
Index: kvm/include/asm-x86/kvm_host.h
===================================================================
--- kvm.orig/include/asm-x86/kvm_host.h
+++ kvm/include/asm-x86/kvm_host.h
@@ -195,6 +195,8 @@ struct kvm_mmu_page {
*/
int multimapped; /* More than one parent_pte? */
int root_count; /* Currently serving as active root */
+ bool unsync;
+ bool unsync_children;
union {
u64 *parent_pte; /* !multimapped */
struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
@@ -226,6 +228,7 @@ struct kvm_mmu {
hpa_t root_hpa;
int root_level;
int shadow_root_level;
+ bool need_root_sync;
u64 *pae_root;
};
@@ -371,6 +374,7 @@ struct kvm_vm_stat {
u32 mmu_flooded;
u32 mmu_recycled;
u32 mmu_cache_miss;
+ u32 mmu_unsync;
u32 remote_tlb_flush;
u32 lpages;
};
Index: kvm/arch/x86/kvm/x86.c
===================================================================
--- kvm.orig/arch/x86/kvm/x86.c
+++ kvm/arch/x86/kvm/x86.c
@@ -101,6 +101,7 @@ struct kvm_stats_debugfs_item debugfs_en
{ "mmu_flooded", VM_STAT(mmu_flooded) },
{ "mmu_recycled", VM_STAT(mmu_recycled) },
{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
+ { "mmu_unsync", VM_STAT(mmu_unsync) },
{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
{ "largepages", VM_STAT(lpages) },
{ NULL }
Index: kvm/arch/x86/kvm/paging_tmpl.h
===================================================================
--- kvm.orig/arch/x86/kvm/paging_tmpl.h
+++ kvm/arch/x86/kvm/paging_tmpl.h
@@ -449,6 +449,11 @@ static int FNAME(page_fault)(struct kvm_
kvm_mmu_audit(vcpu, "post page fault (fixed)");
spin_unlock(&vcpu->kvm->mmu_lock);
+ if (vcpu->arch.mmu.need_root_sync) {
+ kvm_mmu_sync_roots(vcpu);
+ vcpu->arch.mmu.need_root_sync = 0;
+ }
+
return write_pt;
out_unlock:
@@ -576,7 +581,7 @@ static int FNAME(sync_page)(struct kvm_v
pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
is_dirty_pte(gpte), 0, gfn,
- spte_to_pfn(sp->spt[i]), true);
+ spte_to_pfn(sp->spt[i]), true, false);
}
}
--
^ permalink raw reply [flat|nested] 34+ messages in thread* Re: [patch 09/10] KVM: MMU: out of sync shadow core v2
2008-09-18 21:27 ` [patch 09/10] KVM: MMU: out of sync shadow core v2 Marcelo Tosatti
@ 2008-09-20 1:22 ` Avi Kivity
2008-09-21 0:45 ` Marcelo Tosatti
0 siblings, 1 reply; 34+ messages in thread
From: Avi Kivity @ 2008-09-20 1:22 UTC (permalink / raw)
To: Marcelo Tosatti; +Cc: Avi Kivity, kvm, David S. Ahern
Marcelo Tosatti wrote:
> static struct kmem_cache *rmap_desc_cache;
> @@ -942,6 +943,39 @@ static void nonpaging_invlpg(struct kvm_
> {
> }
>
> +static int mmu_unsync_walk(struct kvm_mmu_page *parent, mmu_unsync_fn fn,
> + void *priv)
>
Instead of private, have an object contain both callback and private
data, and use container_of(). Reduces the chance of type errors.
> +{
> + int i, ret;
> + struct kvm_mmu_page *sp = parent;
> +
> + while (parent->unsync_children) {
> + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
> + u64 ent = sp->spt[i];
> +
> + if (is_shadow_present_pte(ent)) {
> + struct kvm_mmu_page *child;
> + child = page_header(ent & PT64_BASE_ADDR_MASK);
> +
> + if (child->unsync_children) {
> + sp = child;
> + break;
> + }
> + if (child->unsync) {
> + ret = fn(child, priv);
> + if (ret)
> + return ret;
> + }
> + }
> + }
> + if (i == PT64_ENT_PER_PAGE) {
> + sp->unsync_children = 0;
> + sp = parent;
> + }
> + }
> + return 0;
> +}
>
What does this do?
> +static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
> +{
> + if (sp->role.glevels != vcpu->arch.mmu.root_level) {
> + kvm_mmu_zap_page(vcpu->kvm, sp);
> + return 1;
> + }
>
Suppose we switch to real mode, touch a pte, switch back. Is this handled?
> @@ -991,8 +1066,18 @@ static struct kvm_mmu_page *kvm_mmu_get_
> gfn, role.word);
> index = kvm_page_table_hashfn(gfn);
> bucket = &vcpu->kvm->arch.mmu_page_hash[index];
> - hlist_for_each_entry(sp, node, bucket, hash_link)
> - if (sp->gfn == gfn && sp->role.word == role.word) {
> + hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link)
> + if (sp->gfn == gfn) {
> + if (sp->unsync)
> + if (kvm_sync_page(vcpu, sp))
> + continue;
> +
> + if (sp->role.word != role.word)
> + continue;
> +
> + if (sp->unsync_children)
> + vcpu->arch.mmu.need_root_sync = 1;
>
mmu_reload() maybe?
> static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
> {
> + int ret;
> ++kvm->stat.mmu_shadow_zapped;
> + ret = mmu_zap_unsync_children(kvm, sp);
> kvm_mmu_page_unlink_children(kvm, sp);
> kvm_mmu_unlink_parents(kvm, sp);
> kvm_flush_remote_tlbs(kvm);
> if (!sp->role.invalid && !sp->role.metaphysical)
> unaccount_shadowed(kvm, sp->gfn);
> + if (sp->unsync)
> + kvm_unlink_unsync_page(kvm, sp);
> if (!sp->root_count) {
> hlist_del(&sp->hash_link);
> kvm_mmu_free_page(kvm, sp);
> @@ -1129,7 +1245,7 @@ static int kvm_mmu_zap_page(struct kvm *
> kvm_reload_remote_mmus(kvm);
> }
> kvm_mmu_reset_last_pte_updated(kvm);
> - return 0;
> + return ret;
> }
>
Why does the caller care if zap also zapped some other random pages? To
restart walking the list?
>
> +
> +static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
> +{
> + unsigned index;
> + struct hlist_head *bucket;
> + struct kvm_mmu_page *s;
> + struct hlist_node *node, *n;
> +
> + index = kvm_page_table_hashfn(sp->gfn);
> + bucket = &vcpu->kvm->arch.mmu_page_hash[index];
> + /* don't unsync if pagetable is shadowed with multiple roles */
> + hlist_for_each_entry_safe(s, node, n, bucket, hash_link) {
> + if (s->gfn != sp->gfn || s->role.metaphysical)
> + continue;
> + if (s->role.word != sp->role.word)
> + return 1;
> + }
>
This will happen for nonpae paging. But why not allow it? Zap all
unsynced pages on mode switch.
Oh, if a page is both a page directory and page table, yes. So to allow
nonpae oos, check the level instead.
--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.
^ permalink raw reply [flat|nested] 34+ messages in thread* Re: [patch 09/10] KVM: MMU: out of sync shadow core v2
2008-09-20 1:22 ` Avi Kivity
@ 2008-09-21 0:45 ` Marcelo Tosatti
2008-09-22 20:41 ` Avi Kivity
0 siblings, 1 reply; 34+ messages in thread
From: Marcelo Tosatti @ 2008-09-21 0:45 UTC (permalink / raw)
To: Avi Kivity; +Cc: Avi Kivity, kvm, David S. Ahern
On Fri, Sep 19, 2008 at 06:22:52PM -0700, Avi Kivity wrote:
> Instead of private, have an object contain both callback and private
> data, and use container_of(). Reduces the chance of type errors.
OK.
>> + while (parent->unsync_children) {
>> + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
>> + u64 ent = sp->spt[i];
>> +
>> + if (is_shadow_present_pte(ent)) {
>> + struct kvm_mmu_page *child;
>> + child = page_header(ent & PT64_BASE_ADDR_MASK);
>
> What does this do?
Walks all children of a given page with no efficiency. It's replaced later
by the bitmap version.
>> +static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
>> +{
>> + if (sp->role.glevels != vcpu->arch.mmu.root_level) {
>> + kvm_mmu_zap_page(vcpu->kvm, sp);
>> + return 1;
>> + }
>>
>
> Suppose we switch to real mode, touch a pte, switch back. Is this handled?
The shadow page will go unsync on pte touch and resynced as soon as its
visible (after return to paging).
Or, while still in real mode, it might be zapped by
kvm_mmu_get_page->kvm_sync_page.
Am I missing something?
>> @@ -991,8 +1066,18 @@ static struct kvm_mmu_page *kvm_mmu_get_
>> gfn, role.word);
>> index = kvm_page_table_hashfn(gfn);
>> bucket = &vcpu->kvm->arch.mmu_page_hash[index];
>> - hlist_for_each_entry(sp, node, bucket, hash_link)
>> - if (sp->gfn == gfn && sp->role.word == role.word) {
>> + hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link)
>> + if (sp->gfn == gfn) {
>> + if (sp->unsync)
>> + if (kvm_sync_page(vcpu, sp))
>> + continue;
>> +
>> + if (sp->role.word != role.word)
>> + continue;
>> +
>> + if (sp->unsync_children)
>> + vcpu->arch.mmu.need_root_sync = 1;
>>
>
> mmu_reload() maybe?
Hum, will think about it.
>> static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
>> - return 0;
>> + return ret;
>> }
>>
>
> Why does the caller care if zap also zapped some other random pages? To
> restart walking the list?
Yes. The next element for_each_entry_safe saved could have been zapped.
>> + /* don't unsync if pagetable is shadowed with multiple roles */
>> + hlist_for_each_entry_safe(s, node, n, bucket, hash_link) {
>> + if (s->gfn != sp->gfn || s->role.metaphysical)
>> + continue;
>> + if (s->role.word != sp->role.word)
>> + return 1;
>> + }
>>
>
> This will happen for nonpae paging. But why not allow it? Zap all
> unsynced pages on mode switch.
>
> Oh, if a page is both a page directory and page table, yes.
Yes.
> So to allow nonpae oos, check the level instead.
Windows 2008 64-bit has all sorts of sharing a pagetable at multiple
levels too.
^ permalink raw reply [flat|nested] 34+ messages in thread* Re: [patch 09/10] KVM: MMU: out of sync shadow core v2
2008-09-21 0:45 ` Marcelo Tosatti
@ 2008-09-22 20:41 ` Avi Kivity
2008-09-22 21:55 ` Marcelo Tosatti
0 siblings, 1 reply; 34+ messages in thread
From: Avi Kivity @ 2008-09-22 20:41 UTC (permalink / raw)
To: Marcelo Tosatti; +Cc: kvm, David S. Ahern
Marcelo Tosatti wrote:
>>> + while (parent->unsync_children) {
>>> + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
>>> + u64 ent = sp->spt[i];
>>> +
>>> + if (is_shadow_present_pte(ent)) {
>>> + struct kvm_mmu_page *child;
>>> + child = page_header(ent & PT64_BASE_ADDR_MASK);
>>>
>> What does this do?
>>
>
> Walks all children of the given page with no efficiency. It's replaced later
> by the bitmap version.
>
>
I don't understand how the variables sp, child, and parent interact. You
either need recursion or an explicit stack?
>>> +static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
>>> +{
>>> + if (sp->role.glevels != vcpu->arch.mmu.root_level) {
>>> + kvm_mmu_zap_page(vcpu->kvm, sp);
>>> + return 1;
>>> + }
>>>
>>>
>> Suppose we switch to real mode, touch a pte, switch back. Is this handled?
>>
>
> The shadow page will go unsync on pte touch and resynced as soon as its
> visible (after return to paging).
>
> Or, while still in real mode, it might be zapped by
> kvm_mmu_get_page->kvm_sync_page.
>
> Am I missing something?
>
>
I guess I was. Ok.
>
>>> static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
>>> - return 0;
>>> + return ret;
>>> }
>>>
>>>
>> Why does the caller care if zap also zapped some other random pages? To
>> restart walking the list?
>>
>
> Yes. The next element for_each_entry_safe saved could have been zapped.
>
>
Ouch. Ouch.
I hate doing this. Can see no alternative though.
>>> + /* don't unsync if pagetable is shadowed with multiple roles */
>>> + hlist_for_each_entry_safe(s, node, n, bucket, hash_link) {
>>> + if (s->gfn != sp->gfn || s->role.metaphysical)
>>> + continue;
>>> + if (s->role.word != sp->role.word)
>>> + return 1;
>>> + }
>>>
>>>
>> This will happen for nonpae paging. But why not allow it? Zap all
>> unsynced pages on mode switch.
>>
>> Oh, if a page is both a page directory and page table, yes.
>>
>
> Yes.
>
>
>> So to allow nonpae oos, check the level instead.
>>
>
> Windows 2008 64-bit has all sorts of sharing a pagetable at multiple
> levels too.
>
>
We still want to allow oos for the two quadrants of a nonpae shadow page.
--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.
^ permalink raw reply [flat|nested] 34+ messages in thread* Re: [patch 09/10] KVM: MMU: out of sync shadow core v2
2008-09-22 20:41 ` Avi Kivity
@ 2008-09-22 21:55 ` Marcelo Tosatti
2008-09-22 22:51 ` Marcelo Tosatti
2008-09-23 10:46 ` Avi Kivity
0 siblings, 2 replies; 34+ messages in thread
From: Marcelo Tosatti @ 2008-09-22 21:55 UTC (permalink / raw)
To: Avi Kivity; +Cc: kvm, David S. Ahern
On Mon, Sep 22, 2008 at 11:41:14PM +0300, Avi Kivity wrote:
> Marcelo Tosatti wrote:
>>>> + while (parent->unsync_children) {
>>>> + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
>>>> + u64 ent = sp->spt[i];
>>>> +
>>>> + if (is_shadow_present_pte(ent)) {
>>>> + struct kvm_mmu_page *child;
>>>> + child = page_header(ent & PT64_BASE_ADDR_MASK);
>>>>
>>> What does this do?
>>>
>>
>> Walks all children of given page with no efficiency. Its replaced later
>> by the bitmap version.
>>
>>
>
> I don't understand how the variables sp, child, and parent interact. You
> either need recursion or an explicit stack?
It restarts at parent level whenever finishing any children:
+ if (i == PT64_ENT_PER_PAGE) {
+ sp->unsync_children = 0;
+ sp = parent;
+ }
No efficiency.
>> Yes. The next element for_each_entry_safe saved could have been zapped.
>>
>>
>
> Ouch. Ouch.
>
> I hate doing this. Can see no alternative though.
Me neither.
>>>> + /* don't unsync if pagetable is shadowed with multiple roles */
>>>> + hlist_for_each_entry_safe(s, node, n, bucket, hash_link) {
>>>> + if (s->gfn != sp->gfn || s->role.metaphysical)
>>>> + continue;
>>>> + if (s->role.word != sp->role.word)
>>>> + return 1;
>>>> + }
>>>>
>>> This will happen for nonpae paging. But why not allow it? Zap all
>>> unsynced pages on mode switch.
>>>
>>> Oh, if a page is both a page directory and page table, yes.
>>
>> Yes.
>>
>>
>>> So to allow nonpae oos, check the level instead.
>>>
>>
>> Windows 2008 64-bit has all sorts of sharing a pagetable at multiple
>> levels too.
>>
>>
>
> We still want to allow oos for the two quadrants of a nonpae shadow page.
Sure, can be an optimization step later?
^ permalink raw reply [flat|nested] 34+ messages in thread* Re: [patch 09/10] KVM: MMU: out of sync shadow core v2
2008-09-22 21:55 ` Marcelo Tosatti
@ 2008-09-22 22:51 ` Marcelo Tosatti
2008-09-23 10:46 ` Avi Kivity
2008-09-23 10:46 ` Avi Kivity
1 sibling, 1 reply; 34+ messages in thread
From: Marcelo Tosatti @ 2008-09-22 22:51 UTC (permalink / raw)
To: Avi Kivity; +Cc: kvm, David S. Ahern
On Mon, Sep 22, 2008 at 06:55:03PM -0300, Marcelo Tosatti wrote:
> On Mon, Sep 22, 2008 at 11:41:14PM +0300, Avi Kivity wrote:
> > Marcelo Tosatti wrote:
> >>>> + while (parent->unsync_children) {
> >>>> + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
> >>>> + u64 ent = sp->spt[i];
> >>>> +
> >>>> + if (is_shadow_present_pte(ent)) {
> >>>> + struct kvm_mmu_page *child;
> >>>> + child = page_header(ent & PT64_BASE_ADDR_MASK);
> >>>>
> >>> What does this do?
> >>>
> >>
> >> Walks all children of given page with no efficiency. Its replaced later
> >> by the bitmap version.
> >>
> >>
> >
> > I don't understand how the variables sp, child, and parent interact. You
> > either need recursion or an explicit stack?
>
> It restarts at parent level whenever finishing any children:
>
> + if (i == PT64_ENT_PER_PAGE) {
> + sp->unsync_children = 0;
> + sp = parent;
> + }
>
> No efficiency.
Do you prefer a recursive version for this one too?
^ permalink raw reply [flat|nested] 34+ messages in thread* Re: [patch 09/10] KVM: MMU: out of sync shadow core v2
2008-09-22 22:51 ` Marcelo Tosatti
@ 2008-09-23 10:46 ` Avi Kivity
0 siblings, 0 replies; 34+ messages in thread
From: Avi Kivity @ 2008-09-23 10:46 UTC (permalink / raw)
To: Marcelo Tosatti; +Cc: kvm, David S. Ahern
Marcelo Tosatti wrote:
>> It restarts at parent level whenever finishing any children:
>>
>> + if (i == PT64_ENT_PER_PAGE) {
>> + sp->unsync_children = 0;
>> + sp = parent;
>> + }
>>
>> No efficiency.
>>
>
> Do you prefer a recursive version for this one too?
>
>
Yes please.
--
error compiling committee.c: too many arguments to function
^ permalink raw reply [flat|nested] 34+ messages in thread
* Re: [patch 09/10] KVM: MMU: out of sync shadow core v2
2008-09-22 21:55 ` Marcelo Tosatti
2008-09-22 22:51 ` Marcelo Tosatti
@ 2008-09-23 10:46 ` Avi Kivity
2008-09-23 13:17 ` Marcelo Tosatti
1 sibling, 1 reply; 34+ messages in thread
From: Avi Kivity @ 2008-09-23 10:46 UTC (permalink / raw)
To: Marcelo Tosatti; +Cc: kvm, David S. Ahern
Marcelo Tosatti wrote:
>>>
>>>
>> I don't understand how the variables sp, child, and parent interact. You
>> either need recursion or an explicit stack?
>>
>
> It restarts at parent level whenever finishing any children:
>
> + if (i == PT64_ENT_PER_PAGE) {
> + sp->unsync_children = 0;
> + sp = parent;
> + }
>
> No efficiency.
>
>
Oh okay. 'parent' is never assigned to. Lack of concentration.
>>> Yes. The next element for_each_entry_safe saved could have been zapped.
>>>
>>>
>>>
>> Ouch. Ouch.
>>
>> I hate doing this. Can see no alternative though.
>>
>
> Me neither.
>
>
Well. But I don't see kvm_mmu_zap_page()'s return value used anywhere.
Actually, I think I see an alternative: set the invalid flag on these
pages and queue them in a list, like we do for roots in use. Flush the
list on some cleanup path.
>>> Windows 2008 64-bit has all sorts of sharing a pagetable at multiple
>>> levels too.
>>>
>>>
>>>
>> We still want to allow oos for the two quadrants of a nonpae shadow page.
>>
>
> Sure, can be an optimization step later?
>
I'd like to reexamine this from another angle: what if we allow oos of
any level?
This will simplify the can_unsync path (always true) and remove a
special case. The cost is implementing invlpg and resync for non-leaf
pages (invlpg has to resync the pte for every level). Are there other
problems with this?
--
error compiling committee.c: too many arguments to function
^ permalink raw reply [flat|nested] 34+ messages in thread* Re: [patch 09/10] KVM: MMU: out of sync shadow core v2
2008-09-23 10:46 ` Avi Kivity
@ 2008-09-23 13:17 ` Marcelo Tosatti
0 siblings, 0 replies; 34+ messages in thread
From: Marcelo Tosatti @ 2008-09-23 13:17 UTC (permalink / raw)
To: Avi Kivity; +Cc: kvm, David S. Ahern
On Tue, Sep 23, 2008 at 01:46:23PM +0300, Avi Kivity wrote:
> Marcelo Tosatti wrote:
>>>>
>>> I don't understand how the variables sp, child, and parent interact.
>>> You either need recursion or an explicit stack?
>>>
>>
>> It restarts at parent level whenever finishing any children:
>>
>> + if (i == PT64_ENT_PER_PAGE) {
>> + sp->unsync_children = 0;
>> + sp = parent;
>> + }
>>
>> No efficiency.
>>
>>
>
> Oh okay. 'parent' is never assigned to. Lack of concentration.
>
>>>> Yes. The next element for_each_entry_safe saved could have been zapped.
>>>>
>>>>
>>> Ouch. Ouch.
>>>
>>> I hate doing this. Can see no alternative though.
>>>
>>
>> Me neither.
>>
>>
>
> Well. But I don't see kvm_mmu_zap_page()'s return value used anywhere.
It is. List walk becomes unsafe otherwise.
> Actually, I think I see an alternative: set the invalid flag on these
> pages and queue them in a list, like we do for roots in use. Flush the
> list on some cleanup path.
Yes, it is an alternative. But then you would have to test for the
invalid flag on all those paths that currently test for the kvm_mmu_zap_page
return value. I'm not sure if that's any better?
>>>> Windows 2008 64-bit has all sorts of sharing a pagetable at multiple
>>>> levels too.
>>>>
>>>>
>>> We still want to allow oos for the two quadrants of a nonpae shadow page.
>>>
>>
>> Sure, can be an optimization step later?
>>
>
> I'd like to reexamine this from another angle: what if we allow oos of
> any level?
>
> This will simplify the can_unsync path (always true)
The can_unsync flag is there to avoid the resync path
(mmu_unsync_walk->kvm_sync_page) from unsyncing pages of the root being
synced. Say, if at every resync you end up unsyncing two pages (unlikely
but possible).
However, we can probably get rid of it with the bitmap walk (which won't
restart the walk from the beginning).
> and remove a special case. The cost is implementing invlpg and resync
> for non-leaf pages (invlpg has to resync the pte for every level). Are
> there other problems with this?
There is no gfn cache for non-leaf pages, so you either need to
introduce it or go for gfn_to_page_atomic-like functionality
(expensive).
I was hoping to look into non-leaf unsync to be another "for later"
optimization step, if found to be worthwhile.
^ permalink raw reply [flat|nested] 34+ messages in thread
* [patch 10/10] KVM: MMU: speed up mmu_unsync_walk
2008-09-18 21:27 [patch 00/10] out of sync shadow v2 Marcelo Tosatti
` (8 preceding siblings ...)
2008-09-18 21:27 ` [patch 09/10] KVM: MMU: out of sync shadow core v2 Marcelo Tosatti
@ 2008-09-18 21:27 ` Marcelo Tosatti
2008-09-20 1:26 ` Avi Kivity
2008-09-18 22:36 ` [patch 00/10] out of sync shadow v2 Marcelo Tosatti
10 siblings, 1 reply; 34+ messages in thread
From: Marcelo Tosatti @ 2008-09-18 21:27 UTC (permalink / raw)
To: Avi Kivity; +Cc: kvm, David S. Ahern
[-- Attachment #1: kvm-oos-speed-walk --]
[-- Type: text/plain, Size: 4602 bytes --]
Cache the unsynced children information in a per-page bitmap.
Index: kvm/arch/x86/kvm/mmu.c
===================================================================
--- kvm.orig/arch/x86/kvm/mmu.c
+++ kvm/arch/x86/kvm/mmu.c
@@ -924,6 +924,38 @@ static void mmu_parent_walk(struct kvm_v
} while (level > start_level-1);
}
+static void kvm_mmu_update_unsync_bitmap(u64 *spte)
+{
+ unsigned int index;
+ struct kvm_mmu_page *sp = page_header(__pa(spte));
+
+ index = spte - sp->spt;
+ __set_bit(index, sp->unsync_child_bitmap);
+ sp->unsync_children = 1;
+}
+
+static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
+{
+ struct kvm_pte_chain *pte_chain;
+ struct hlist_node *node;
+ int i;
+
+ if (!sp->parent_pte)
+ return;
+
+ if (!sp->multimapped) {
+ kvm_mmu_update_unsync_bitmap(sp->parent_pte);
+ return;
+ }
+
+ hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
+ for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
+ if (!pte_chain->parent_ptes[i])
+ break;
+ kvm_mmu_update_unsync_bitmap(pte_chain->parent_ptes[i]);
+ }
+}
+
static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp)
{
@@ -946,33 +978,57 @@ static void nonpaging_invlpg(struct kvm_
static int mmu_unsync_walk(struct kvm_mmu_page *parent, mmu_unsync_fn fn,
void *priv)
{
- int i, ret;
- struct kvm_mmu_page *sp = parent;
+ int ret, level, i;
+ u64 ent;
+ struct kvm_mmu_page *sp, *child;
+ struct walk {
+ struct kvm_mmu_page *sp;
+ int pos;
+ } walk[PT64_ROOT_LEVEL];
- while (parent->unsync_children) {
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- u64 ent = sp->spt[i];
+ WARN_ON(parent->role.level == PT_PAGE_TABLE_LEVEL);
+
+ if (!parent->unsync_children)
+ return 0;
+
+ memset(&walk, 0, sizeof(walk));
+ level = parent->role.level;
+ walk[level-1].sp = parent;
+
+ do {
+ sp = walk[level-1].sp;
+ i = find_next_bit(sp->unsync_child_bitmap, 512, walk[level-1].pos);
+ if (i < 512) {
+ walk[level-1].pos = i+1;
+ ent = sp->spt[i];
if (is_shadow_present_pte(ent)) {
- struct kvm_mmu_page *child;
child = page_header(ent & PT64_BASE_ADDR_MASK);
if (child->unsync_children) {
- sp = child;
- break;
+ --level;
+ walk[level-1].sp = child;
+ walk[level-1].pos = 0;
+ continue;
}
if (child->unsync) {
ret = fn(child, priv);
+ __clear_bit(i, sp->unsync_child_bitmap);
if (ret)
return ret;
}
}
+ __clear_bit(i, sp->unsync_child_bitmap);
+ } else {
+ ++level;
+ if (find_first_bit(sp->unsync_child_bitmap, 512) == 512) {
+ sp->unsync_children = 0;
+ if (level-1 < PT64_ROOT_LEVEL)
+ walk[level-1].pos = 0;
+ }
}
- if (i == PT64_ENT_PER_PAGE) {
- sp->unsync_children = 0;
- sp = parent;
- }
- }
+ } while (level <= parent->role.level);
+
return 0;
}
@@ -1037,6 +1093,13 @@ static void mmu_sync_children(struct kvm
cond_resched_lock(&vcpu->kvm->mmu_lock);
}
+static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+ sp->unsync_children = 1;
+ kvm_mmu_update_parents_unsync(sp);
+ return 1;
+}
+
static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
gfn_t gfn,
gva_t gaddr,
@@ -1075,10 +1138,11 @@ static struct kvm_mmu_page *kvm_mmu_get_
if (sp->role.word != role.word)
continue;
- if (sp->unsync_children)
- vcpu->arch.mmu.need_root_sync = 1;
-
mmu_page_add_parent_pte(vcpu, sp, parent_pte);
+ if (sp->unsync_children) {
+ vcpu->arch.mmu.need_root_sync = 1;
+ mmu_parent_walk(vcpu, sp, unsync_walk_fn);
+ }
pgprintk("%s: found\n", __func__);
return sp;
}
@@ -1337,12 +1401,6 @@ struct page *gva_to_page(struct kvm_vcpu
return page;
}
-static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
-{
- sp->unsync_children = 1;
- return 1;
-}
-
static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
unsigned index;
@@ -1360,6 +1418,7 @@ static int kvm_unsync_page(struct kvm_vc
return 1;
}
mmu_parent_walk(vcpu, sp, unsync_walk_fn);
+ kvm_mmu_update_parents_unsync(sp);
++vcpu->kvm->stat.mmu_unsync;
sp->unsync = 1;
return 0;
Index: kvm/include/asm-x86/kvm_host.h
===================================================================
--- kvm.orig/include/asm-x86/kvm_host.h
+++ kvm/include/asm-x86/kvm_host.h
@@ -201,6 +201,7 @@ struct kvm_mmu_page {
u64 *parent_pte; /* !multimapped */
struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
};
+ DECLARE_BITMAP(unsync_child_bitmap, 512);
};
struct kvm_pv_mmu_op_buffer {
--
^ permalink raw reply [flat|nested] 34+ messages in thread* Re: [patch 10/10] KVM: MMU: speed up mmu_unsync_walk
2008-09-18 21:27 ` [patch 10/10] KVM: MMU: speed up mmu_unsync_walk Marcelo Tosatti
@ 2008-09-20 1:26 ` Avi Kivity
2008-09-21 0:45 ` Marcelo Tosatti
0 siblings, 1 reply; 34+ messages in thread
From: Avi Kivity @ 2008-09-20 1:26 UTC (permalink / raw)
To: Marcelo Tosatti; +Cc: Avi Kivity, kvm, David S. Ahern
Marcelo Tosatti wrote:
> Cache the unsynced children information in a per-page bitmap.
>
> static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
> struct kvm_mmu_page *sp)
> {
> @@ -946,33 +978,57 @@ static void nonpaging_invlpg(struct kvm_
> static int mmu_unsync_walk(struct kvm_mmu_page *parent, mmu_unsync_fn fn,
> void *priv)
> {
> - int i, ret;
> - struct kvm_mmu_page *sp = parent;
> + int ret, level, i;
> + u64 ent;
> + struct kvm_mmu_page *sp, *child;
> + struct walk {
> + struct kvm_mmu_page *sp;
> + int pos;
> + } walk[PT64_ROOT_LEVEL];
>
> - while (parent->unsync_children) {
> - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
> - u64 ent = sp->spt[i];
> + WARN_ON(parent->role.level == PT_PAGE_TABLE_LEVEL);
> +
> + if (!parent->unsync_children)
> + return 0;
> +
> + memset(&walk, 0, sizeof(walk));
> + level = parent->role.level;
> + walk[level-1].sp = parent;
> +
> + do {
> + sp = walk[level-1].sp;
> + i = find_next_bit(sp->unsync_child_bitmap, 512, walk[level-1].pos);
> + if (i < 512) {
> + walk[level-1].pos = i+1;
> + ent = sp->spt[i];
>
> if (is_shadow_present_pte(ent)) {
> - struct kvm_mmu_page *child;
> child = page_header(ent & PT64_BASE_ADDR_MASK);
>
> if (child->unsync_children) {
> - sp = child;
> - break;
> + --level;
> + walk[level-1].sp = child;
> + walk[level-1].pos = 0;
> + continue;
> }
> if (child->unsync) {
> ret = fn(child, priv);
> + __clear_bit(i, sp->unsync_child_bitmap);
> if (ret)
> return ret;
> }
> }
> + __clear_bit(i, sp->unsync_child_bitmap);
> + } else {
> + ++level;
> + if (find_first_bit(sp->unsync_child_bitmap, 512) == 512) {
> + sp->unsync_children = 0;
> + if (level-1 < PT64_ROOT_LEVEL)
> + walk[level-1].pos = 0;
> + }
> }
> - if (i == PT64_ENT_PER_PAGE) {
> - sp->unsync_children = 0;
> - sp = parent;
> - }
> - }
> + } while (level <= parent->role.level);
> +
> return 0;
> }
>
<weeps>
>
> --- kvm.orig/include/asm-x86/kvm_host.h
> +++ kvm/include/asm-x86/kvm_host.h
> @@ -201,6 +201,7 @@ struct kvm_mmu_page {
> u64 *parent_pte; /* !multimapped */
> struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
> };
> + DECLARE_BITMAP(unsync_child_bitmap, 512);
> };
>
Later, we can throw this bitmap out to a separate object. Also, it may
make sense to replace it with an array of u16s.
--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.
^ permalink raw reply [flat|nested] 34+ messages in thread* Re: [patch 10/10] KVM: MMU: speed up mmu_unsync_walk
2008-09-20 1:26 ` Avi Kivity
@ 2008-09-21 0:45 ` Marcelo Tosatti
2008-09-22 20:43 ` Avi Kivity
0 siblings, 1 reply; 34+ messages in thread
From: Marcelo Tosatti @ 2008-09-21 0:45 UTC (permalink / raw)
To: Avi Kivity; +Cc: Avi Kivity, kvm, David S. Ahern
On Fri, Sep 19, 2008 at 06:26:56PM -0700, Avi Kivity wrote:
>> --- kvm.orig/include/asm-x86/kvm_host.h
>> +++ kvm/include/asm-x86/kvm_host.h
>> @@ -201,6 +201,7 @@ struct kvm_mmu_page {
>> u64 *parent_pte; /* !multimapped */
>> struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
>> };
>> + DECLARE_BITMAP(unsync_child_bitmap, 512);
>> };
>>
>
> Later, we can throw this bitmap out to a separate object.
Yeah.
> Also, it may make sense to replace it with an array of u16s.
Why?
^ permalink raw reply [flat|nested] 34+ messages in thread* Re: [patch 10/10] KVM: MMU: speed up mmu_unsync_walk
2008-09-21 0:45 ` Marcelo Tosatti
@ 2008-09-22 20:43 ` Avi Kivity
0 siblings, 0 replies; 34+ messages in thread
From: Avi Kivity @ 2008-09-22 20:43 UTC (permalink / raw)
To: Marcelo Tosatti; +Cc: kvm, David S. Ahern
Marcelo Tosatti wrote:
>> Also, it may make sense to replace it with an array of u16s.
>>
>
> Why?
>
They'll be usually empty or near-empty, no? In which case the array is
faster and smaller.
But I don't think the difference is measurable. So scratch that remark.
--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.
^ permalink raw reply [flat|nested] 34+ messages in thread
* Re: [patch 00/10] out of sync shadow v2
2008-09-18 21:27 [patch 00/10] out of sync shadow v2 Marcelo Tosatti
` (9 preceding siblings ...)
2008-09-18 21:27 ` [patch 10/10] KVM: MMU: speed up mmu_unsync_walk Marcelo Tosatti
@ 2008-09-18 22:36 ` Marcelo Tosatti
2008-09-20 1:28 ` Avi Kivity
10 siblings, 1 reply; 34+ messages in thread
From: Marcelo Tosatti @ 2008-09-18 22:36 UTC (permalink / raw)
To: Avi Kivity; +Cc: kvm, David S. Ahern
On Thu, Sep 18, 2008 at 06:27:49PM -0300, Marcelo Tosatti wrote:
> Addressing earlier comments.
Ugh, forgot to convert shadow_notrap -> shadow_trap on unsync,
so bypass_guest_pf=1 is still broken.
^ permalink raw reply [flat|nested] 34+ messages in thread* Re: [patch 00/10] out of sync shadow v2
2008-09-18 22:36 ` [patch 00/10] out of sync shadow v2 Marcelo Tosatti
@ 2008-09-20 1:28 ` Avi Kivity
0 siblings, 0 replies; 34+ messages in thread
From: Avi Kivity @ 2008-09-20 1:28 UTC (permalink / raw)
To: Marcelo Tosatti; +Cc: Avi Kivity, kvm, David S. Ahern
Marcelo Tosatti wrote:
> On Thu, Sep 18, 2008 at 06:27:49PM -0300, Marcelo Tosatti wrote:
>
>> Addressing earlier comments.
>>
>
> Ugh, forgot to convert shadow_notrap -> shadow_trap on unsync,
> so bypass_guest_pf=1 is still broken.
>
>
>
Also, forgot the nice benchmark results.
I really like this (at least the parts I understand, which I believe are
most of the patchset). This looks much closer to merging.
--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.
^ permalink raw reply [flat|nested] 34+ messages in thread