[patch 0/3] oos shadow optimizations

kvm.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [patch 0/3] oos shadow optimizations
@ 2008-10-25 22:31 Marcelo Tosatti
  2008-10-25 22:31 ` [patch 1/3] KVM: MMU: collapse remote TLB flushes on root sync Marcelo Tosatti
                   ` (2 more replies)
  0 siblings, 3 replies; 20+ messages in thread
From: Marcelo Tosatti @ 2008-10-25 22:31 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm

-- 


^ permalink raw reply	[flat|nested] 20+ messages in thread

* [patch 1/3] KVM: MMU: collapse remote TLB flushes on root sync
  2008-10-25 22:31 [patch 0/3] oos shadow optimizations Marcelo Tosatti
@ 2008-10-25 22:31 ` Marcelo Tosatti
  2008-10-26 11:17   ` Avi Kivity
  2008-10-25 22:31 ` [patch 2/3] KVM: MMU: skip global pgtables on sync due to cr3 switch Marcelo Tosatti
  2008-10-25 22:31 ` [patch 3/3] KVM: MMU: prepopulate the shadow on invlpg Marcelo Tosatti
  2 siblings, 1 reply; 20+ messages in thread
From: Marcelo Tosatti @ 2008-10-25 22:31 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm, Marcelo Tosatti

[-- Attachment #1: kvm-oos-collapse-remote-tlb-flush --]
[-- Type: text/plain, Size: 3535 bytes --]

Instead of flushing remote TLB's at every page resync, do an initial
pass to write protect the sptes, collapsing the flushes on a single
remote TLB invalidation.

kernbench is 2.3% faster on 4-way guest. Improvements have been seen
with other loads such as AIM7.

Avi: feel free to change this if you dislike the style (I do, but can't
think of anything nicer).

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: kvm/arch/x86/kvm/mmu.c
===================================================================
--- kvm.orig/arch/x86/kvm/mmu.c
+++ kvm/arch/x86/kvm/mmu.c
@@ -152,6 +152,7 @@ struct kvm_shadow_walk {
 
 struct kvm_unsync_walk {
 	int (*entry) (struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk);
+	bool clear_unsync;
 };
 
 typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp);
@@ -615,7 +616,7 @@ static u64 *rmap_next(struct kvm *kvm, u
 	return NULL;
 }
 
-static void rmap_write_protect(struct kvm *kvm, u64 gfn)
+static int __rmap_write_protect(struct kvm *kvm, u64 gfn)
 {
 	unsigned long *rmapp;
 	u64 *spte;
@@ -661,7 +662,12 @@ static void rmap_write_protect(struct kv
 		spte = rmap_next(kvm, rmapp, spte);
 	}
 
-	if (write_protected)
+	return write_protected;
+}
+
+static void rmap_write_protect(struct kvm *kvm, u64 gfn)
+{
+	if (__rmap_write_protect(kvm, gfn))
 		kvm_flush_remote_tlbs(kvm);
 }
 
@@ -985,12 +991,14 @@ static int mmu_unsync_walk(struct kvm_mm
 				ret = mmu_unsync_walk(child, walker);
 				if (ret)
 					return ret;
-				__clear_bit(i, sp->unsync_child_bitmap);
+				if (walker->clear_unsync)
+					__clear_bit(i, sp->unsync_child_bitmap);
 			}
 
 			if (child->unsync) {
 				ret = walker->entry(child, walker);
-				__clear_bit(i, sp->unsync_child_bitmap);
+				if (walker->clear_unsync)
+					__clear_bit(i, sp->unsync_child_bitmap);
 				if (ret)
 					return ret;
 			}
@@ -1053,6 +1061,7 @@ static int kvm_sync_page(struct kvm_vcpu
 struct sync_walker {
 	struct kvm_vcpu *vcpu;
 	struct kvm_unsync_walk walker;
+	bool write_protected;
 };
 
 static int mmu_sync_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk)
@@ -1065,13 +1074,35 @@ static int mmu_sync_fn(struct kvm_mmu_pa
 	return (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock));
 }
 
+static int mmu_wprotect_fn(struct kvm_mmu_page *sp,
+			   struct kvm_unsync_walk *walk)
+{
+	struct sync_walker *sync_walk = container_of(walk, struct sync_walker,
+						     walker);
+	struct kvm_vcpu *vcpu = sync_walk->vcpu;
+
+	if (__rmap_write_protect(vcpu->kvm, sp->gfn))
+		sync_walk->write_protected = true;
+	return need_resched();
+}
+
 static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 {
 	struct sync_walker walker = {
-		.walker = { .entry = mmu_sync_fn, },
+		.walker = { .entry = mmu_wprotect_fn,
+			    .clear_unsync = false, },
 		.vcpu = vcpu,
+		.write_protected = false
 	};
 
+	/* collapse the TLB flushes as an optimization */
+	mmu_unsync_walk(sp, &walker.walker);
+	if (walker.write_protected)
+		kvm_flush_remote_tlbs(vcpu->kvm);
+
+	walker.walker.entry = mmu_sync_fn;
+	walker.walker.clear_unsync = true;
+
 	while (mmu_unsync_walk(sp, &walker.walker))
 		cond_resched_lock(&vcpu->kvm->mmu_lock);
 }
@@ -1257,7 +1288,8 @@ static int mmu_zap_fn(struct kvm_mmu_pag
 static int mmu_zap_unsync_children(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 	struct zap_walker walker = {
-		.walker = { .entry = mmu_zap_fn, },
+		.walker = { .entry = mmu_zap_fn,
+			    .clear_unsync = true, },
 		.kvm = kvm,
 		.zapped = 0,
 	};

-- 


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [patch 1/3] KVM: MMU: collapse remote TLB flushes on root sync
  2008-10-25 22:31 ` [patch 1/3] KVM: MMU: collapse remote TLB flushes on root sync Marcelo Tosatti
@ 2008-10-26 11:17   ` Avi Kivity
  2008-10-29 23:26     ` Marcelo Tosatti
  0 siblings, 1 reply; 20+ messages in thread
From: Avi Kivity @ 2008-10-26 11:17 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm

Marcelo Tosatti wrote:
> Instead of flushing remote TLB's at every page resync, do an initial
> pass to write protect the sptes, collapsing the flushes on a single
> remote TLB invalidation.
>
> kernbench is 2.3% faster on 4-way guest. Improvements have been seen
> with other loads such as AIM7.
>
> Avi: feel free to change this if you dislike the style (I do, but can't
> think of anything nicer).
>
>  static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
>  {
>  	struct sync_walker walker = {
> -		.walker = { .entry = mmu_sync_fn, },
> +		.walker = { .entry = mmu_wprotect_fn,
> +			    .clear_unsync = false, },
>  		.vcpu = vcpu,
> +		.write_protected = false
>  	};
>  
> +	/* collapse the TLB flushes as an optimization */
> +	mmu_unsync_walk(sp, &walker.walker);
> +	if (walker.write_protected)
> +		kvm_flush_remote_tlbs(vcpu->kvm);
> +
> +	walker.walker.entry = mmu_sync_fn;
> +	walker.walker.clear_unsync = true;
> +
>  	while (mmu_unsync_walk(sp, &walker.walker))
>  		cond_resched_lock(&vcpu->kvm->mmu_lock);
>   

We're always doing two passes here, which is a bit sad.  How about 
having a single pass which:

- collects unsync pages into an array
- exits on no more unsync pages or max array size reached

Then, iterate over the array:

- write protect all pages
- flush tlb
- sync pages

Loop until the root is synced.

If the number of pages to sync is typically small, and the array is 
sized to be larger than this, then we only walk the pagetables once.

btw, our walkers are a bit awkward (though still better than what we had 
before).  If we rewrite them into for_each style iterators, the code 
could become cleaner and shorter.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [patch 1/3] KVM: MMU: collapse remote TLB flushes on root sync
  2008-10-26 11:17   ` Avi Kivity
@ 2008-10-29 23:26     ` Marcelo Tosatti
  2008-10-30 10:04       ` Avi Kivity
  0 siblings, 1 reply; 20+ messages in thread
From: Marcelo Tosatti @ 2008-10-29 23:26 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm, Chris Wright

On Sun, Oct 26, 2008 at 01:17:14PM +0200, Avi Kivity wrote:
> Marcelo Tosatti wrote:
>> Instead of flushing remote TLB's at every page resync, do an initial
>> pass to write protect the sptes, collapsing the flushes on a single
>> remote TLB invalidation.
>>
>> kernbench is 2.3% faster on 4-way guest. Improvements have been seen
>> with other loads such as AIM7.
>>
>> Avi: feel free to change this if you dislike the style (I do, but can't
>> think of anything nicer).
>>
>>  static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
>>  {
>>  	struct sync_walker walker = {
>> -		.walker = { .entry = mmu_sync_fn, },
>> +		.walker = { .entry = mmu_wprotect_fn,
>> +			    .clear_unsync = false, },
>>  		.vcpu = vcpu,
>> +		.write_protected = false
>>  	};
>>  +	/* collapse the TLB flushes as an optimization */
>> +	mmu_unsync_walk(sp, &walker.walker);
>> +	if (walker.write_protected)
>> +		kvm_flush_remote_tlbs(vcpu->kvm);
>> +
>> +	walker.walker.entry = mmu_sync_fn;
>> +	walker.walker.clear_unsync = true;
>> +
>>  	while (mmu_unsync_walk(sp, &walker.walker))
>>  		cond_resched_lock(&vcpu->kvm->mmu_lock);
>>   
>
> We're always doing two passes here, which is a bit sad.  How about  
> having a single pass which:
>
> - collects unsync pages into an array
> - exits on no more unsync pages or max array size reached
>
> Then, iterate over the array:
>
> - write protect all pages
> - flush tlb
> - sync pages
>
> Loop until the root is synced.
>
> If the number of pages to sync is typically small, and the array is  
> sized to be larger than this, then we only walk the pagetables once.

There is significant overhead now in comparison to the early indexing
scheme with a list per root. It must be optimized.

A problem with your suggestion is how to clean the unsync bitmap bit in
the upper pagetables. The advantage however is that the bitmaps and spte
entries can be cached in L1, while currently the cache is blown at
every page resync.

What i'm testing now is:

#define KVM_PAGE_ARRAY_NR 16

struct kvm_mmu_pages {
        struct kvm_mmu_page *pages[KVM_PAGE_ARRAY_NR];
        struct kvm_mmu_page *parent_pages[KVM_PAGE_ARRAY_NR];
        unsigned int offset[KVM_PAGE_ARRAY_NR];
        unsigned int nr;
};

static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
        int i;
        struct kvm_mmu_pages pages;

        kvm_mmu_pages_init(&pages);
        while (mmu_unsync_walk(sp, &pages)) {
                for_each_sp(pages, i) {
                        struct kvm_mmu_page *parent = pages.parent_pages[i];

                        kvm_sync_page(vcpu, pages.pages[i]);
                        __clear_bit(pages.offset[i],
                                    parent->unsync_child_bitmap);
                }
                kvm_mmu_pages_init(&pages);
                cond_resched_lock(&vcpu->kvm->mmu_lock);
        }
}

But a second pass is still needed for levels 3 and 4 (the second pass
could be postponed for the next CR3 switch, but i'm not sure its
worthwhile).

Also, the array method poorly handles cases with a large number of 
unsync pages, which are common with 2.4/kscand for example.

Hum, Chris suggests a list_head per level instead of the bitmap.

> btw, our walkers are a bit awkward (though still better than what we had  
> before).  If we rewrite them into for_each style iterators, the code  
> could become cleaner and shorter.
>
> -- 
> error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [patch 1/3] KVM: MMU: collapse remote TLB flushes on root sync
  2008-10-29 23:26     ` Marcelo Tosatti
@ 2008-10-30 10:04       ` Avi Kivity
  2008-10-31 19:30         ` Marcelo Tosatti
  0 siblings, 1 reply; 20+ messages in thread
From: Avi Kivity @ 2008-10-30 10:04 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm, Chris Wright

Marcelo Tosatti wrote:
> There is significant overhead now in comparison to the early indexing
> scheme with a list per root. It must be optimized.
>   

What's the typical number of (1) unsynced pages and (2) unsynced pages 
belonging to next cr3 when switching cr3?

I'm guessing (1) and (2) are almost equal, and both fairly large?

(one nice thing is that pages which are no longer used as pagetables 
will not be resynced, so our forky workload preformance should be good)

> A problem with your suggestion is how to clean the unsync bitmap bit in
> the upper pagetables.

True.

> The advantage however is that the bitmaps and spte
> entries can be cached in L1, while currently the cache is blown at
> every page resync.
>
> What i'm testing now is:
>
> #define KVM_PAGE_ARRAY_NR 16
>
> struct kvm_mmu_pages {
>         struct kvm_mmu_page *pages[KVM_PAGE_ARRAY_NR];
>         struct kvm_mmu_page *parent_pages[KVM_PAGE_ARRAY_NR];
>         unsigned int offset[KVM_PAGE_ARRAY_NR];
>   
    struct { ... } ...[KVM_PAGE_ARRAY_NR]; // SCNR
>         unsigned int nr;
> };
>
> static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
> {
>         int i;
>         struct kvm_mmu_pages pages;
>
>         kvm_mmu_pages_init(&pages);
>         while (mmu_unsync_walk(sp, &pages)) {
>   

So mmu_unsync_walk() collects unsynced pages, write protects them, 
flushes the tlb, returns 0 if none found?

Perhaps it can simply collect the pages, then do a write-protect pass, 
tlb flush, and resync pass.  The two passes are now over L1 cached data 
so they're not too expensive.

>                 for_each_sp(pages, i) {
>                         struct kvm_mmu_page *parent = pages.parent_pages[i];
>
>                         kvm_sync_page(vcpu, pages.pages[i]);
>                         __clear_bit(pages.offset[i],
>                                     parent->unsync_child_bitmap);
>   


>                 }
>                 kvm_mmu_pages_init(&pages);
>                 cond_resched_lock(&vcpu->kvm->mmu_lock);
>         }
> }
>
> But a second pass is still needed for levels 3 and 4 (the second pass
> could be postponed for the next CR3 switch, but i'm not sure its
> worthwhile).
>
>   

We could:
  - keep all three parents in the array
  - for the bitmap, keep a count of how many bits are set
  - when we clear a bit, we dec the count, and if zero, we clear the bit 
in the parent's parent.


> Also, the array method poorly handles cases with a large number of 
> unsync pages, which are common with 2.4/kscand for example.
>
>   

Well, depends how large the array is.  If it's large enough, the tlb 
flush cost is overwhelmed by the cost of the actual resync.

> Hum, Chris suggests a list_head per level instead of the bitmap.
>
>   

Doesn't work, a page can have multiple parents, so it would need to be 
linked to multiple lists.

We can have

union {
    bitmap;
    array;
}

and use the array for sparse bitmaps (or use the bitmap for overflowed 
arrays)

-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [patch 1/3] KVM: MMU: collapse remote TLB flushes on root sync
  2008-10-30 10:04       ` Avi Kivity
@ 2008-10-31 19:30         ` Marcelo Tosatti
  0 siblings, 0 replies; 20+ messages in thread
From: Marcelo Tosatti @ 2008-10-31 19:30 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm, Chris Wright

On Thu, Oct 30, 2008 at 12:04:33PM +0200, Avi Kivity wrote:
> Marcelo Tosatti wrote:
>> There is significant overhead now in comparison to the early indexing
>> scheme with a list per root. It must be optimized.
>>   
>
> What's the typical number of (1) unsynced pages and (2) unsynced pages  
> belonging to next cr3 when switching cr3?
>
> I'm guessing (1) and (2) are almost equal, and both fairly large?

About 1% of memory unsynced (including pages that are not pagetables
anymore, so the value is not exact).

Not equal, (2) a hundred or so.

> Perhaps it can simply collect the pages, then do a write-protect pass,  
> tlb flush, and resync pass.  The two passes are now over L1 cached data  
> so they're not too expensive.

Thats what its doing.

> We could:
>  - keep all three parents in the array
>  - for the bitmap, keep a count of how many bits are set
>  - when we clear a bit, we dec the count, and if zero, we clear the bit  
> in the parent's parent.

Yep. 

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [patch 2/3] KVM: MMU: skip global pgtables on sync due to cr3 switch
  2008-10-25 22:31 [patch 0/3] oos shadow optimizations Marcelo Tosatti
  2008-10-25 22:31 ` [patch 1/3] KVM: MMU: collapse remote TLB flushes on root sync Marcelo Tosatti
@ 2008-10-25 22:31 ` Marcelo Tosatti
  2008-10-26 11:27   ` Avi Kivity
  2008-10-25 22:31 ` [patch 3/3] KVM: MMU: prepopulate the shadow on invlpg Marcelo Tosatti
  2 siblings, 1 reply; 20+ messages in thread
From: Marcelo Tosatti @ 2008-10-25 22:31 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm, Marcelo Tosatti

[-- Attachment #1: kvm-oos-global --]
[-- Type: text/plain, Size: 8777 bytes --]

Skip syncing global pages on cr3 switch (but not on cr4/cr0). This is
important for Linux 32-bit guests with PAE, where the kmap page is
marked as global.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: kvm/arch/x86/kvm/mmu.c
===================================================================
--- kvm.orig/arch/x86/kvm/mmu.c
+++ kvm/arch/x86/kvm/mmu.c
@@ -794,9 +794,11 @@ static struct kvm_mmu_page *kvm_mmu_allo
 	sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
 	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
+	INIT_LIST_HEAD(&sp->oos_link);
 	ASSERT(is_empty_shadow_page(sp->spt));
 	sp->slot_bitmap = 0;
 	sp->multimapped = 0;
+	sp->global = 1;
 	sp->parent_pte = parent_pte;
 	--vcpu->kvm->arch.n_free_mmu_pages;
 	return sp;
@@ -1031,10 +1033,18 @@ static struct kvm_mmu_page *kvm_mmu_look
 	return NULL;
 }
 
+static void kvm_unlink_unsync_global(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+	list_del(&sp->oos_link);
+	--kvm->stat.mmu_unsync_global;
+}
+
 static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 	WARN_ON(!sp->unsync);
 	sp->unsync = 0;
+	if (sp->global)
+		kvm_unlink_unsync_global(kvm, sp);
 	--kvm->stat.mmu_unsync;
 }
 
@@ -1070,7 +1080,8 @@ static int mmu_sync_fn(struct kvm_mmu_pa
 						     walker);
 	struct kvm_vcpu *vcpu = sync_walk->vcpu;
 
-	kvm_sync_page(vcpu, sp);
+	if (!sp->global)
+		kvm_sync_page(vcpu, sp);
 	return (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock));
 }
 
@@ -1081,7 +1092,7 @@ static int mmu_wprotect_fn(struct kvm_mm
 						     walker);
 	struct kvm_vcpu *vcpu = sync_walk->vcpu;
 
-	if (__rmap_write_protect(vcpu->kvm, sp->gfn))
+	if (!sp->global && __rmap_write_protect(vcpu->kvm, sp->gfn))
 		sync_walk->write_protected = true;
 	return need_resched();
 }
@@ -1547,9 +1558,15 @@ static int kvm_unsync_page(struct kvm_vc
 		if (s->role.word != sp->role.word)
 			return 1;
 	}
-	kvm_mmu_mark_parents_unsync(vcpu, sp);
 	++vcpu->kvm->stat.mmu_unsync;
 	sp->unsync = 1;
+
+	if (sp->global) {
+		list_add(&sp->oos_link, &vcpu->kvm->arch.oos_global_pages);
+		++vcpu->kvm->stat.mmu_unsync_global;
+	} else
+		kvm_mmu_mark_parents_unsync(vcpu, sp);
+
 	mmu_convert_notrap(sp);
 	return 0;
 }
@@ -1575,12 +1592,21 @@ static int mmu_need_write_protect(struct
 static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 		    unsigned pte_access, int user_fault,
 		    int write_fault, int dirty, int largepage,
-		    gfn_t gfn, pfn_t pfn, bool speculative,
+		    int global, gfn_t gfn, pfn_t pfn, bool speculative,
 		    bool can_unsync)
 {
 	u64 spte;
 	int ret = 0;
 	u64 mt_mask = shadow_mt_mask;
+	struct kvm_mmu_page *sp = page_header(__pa(shadow_pte));
+
+	if (!global && sp->global) {
+		sp->global = 0;
+		if (sp->unsync) {
+			kvm_unlink_unsync_global(vcpu->kvm, sp);
+			kvm_mmu_mark_parents_unsync(vcpu, sp);
+		}
+	}
 
 	/*
 	 * We don't set the accessed bit, since we sometimes want to see
@@ -1640,8 +1666,8 @@ set_pte:
 static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 			 unsigned pt_access, unsigned pte_access,
 			 int user_fault, int write_fault, int dirty,
-			 int *ptwrite, int largepage, gfn_t gfn,
-			 pfn_t pfn, bool speculative)
+			 int *ptwrite, int largepage, int global,
+			 gfn_t gfn, pfn_t pfn, bool speculative)
 {
 	int was_rmapped = 0;
 	int was_writeble = is_writeble_pte(*shadow_pte);
@@ -1674,7 +1700,7 @@ static void mmu_set_spte(struct kvm_vcpu
 		}
 	}
 	if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault,
-		      dirty, largepage, gfn, pfn, speculative, true)) {
+		      dirty, largepage, global, gfn, pfn, speculative, true)) {
 		if (write_fault)
 			*ptwrite = 1;
 		kvm_x86_ops->tlb_flush(vcpu);
@@ -1731,7 +1757,7 @@ static int direct_map_entry(struct kvm_s
 	    || (walk->largepage && level == PT_DIRECTORY_LEVEL)) {
 		mmu_set_spte(vcpu, sptep, ACC_ALL, ACC_ALL,
 			     0, walk->write, 1, &walk->pt_write,
-			     walk->largepage, gfn, walk->pfn, false);
+			     walk->largepage, 0, gfn, walk->pfn, false);
 		++vcpu->stat.pf_fixed;
 		return 1;
 	}
@@ -1918,6 +1944,15 @@ static void mmu_sync_roots(struct kvm_vc
 	}
 }
 
+static void mmu_sync_global(struct kvm_vcpu *vcpu)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_mmu_page *sp, *n;
+
+	list_for_each_entry_safe(sp, n, &kvm->arch.oos_global_pages, oos_link)
+		kvm_sync_page(vcpu, sp);
+}
+
 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
 {
 	spin_lock(&vcpu->kvm->mmu_lock);
@@ -1925,6 +1960,13 @@ void kvm_mmu_sync_roots(struct kvm_vcpu 
 	spin_unlock(&vcpu->kvm->mmu_lock);
 }
 
+void kvm_mmu_sync_global(struct kvm_vcpu *vcpu)
+{
+	spin_lock(&vcpu->kvm->mmu_lock);
+	mmu_sync_global(vcpu);
+	spin_unlock(&vcpu->kvm->mmu_lock);
+}
+
 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
 {
 	return vaddr;
Index: kvm/arch/x86/kvm/paging_tmpl.h
===================================================================
--- kvm.orig/arch/x86/kvm/paging_tmpl.h
+++ kvm/arch/x86/kvm/paging_tmpl.h
@@ -274,7 +274,8 @@ static void FNAME(update_pte)(struct kvm
 		return;
 	kvm_get_pfn(pfn);
 	mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
-		     gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte),
+		     gpte & PT_DIRTY_MASK, NULL, largepage,
+		     gpte & PT_GLOBAL_MASK, gpte_to_gfn(gpte),
 		     pfn, true);
 }
 
@@ -301,8 +302,9 @@ static int FNAME(shadow_walk_entry)(stru
 		mmu_set_spte(vcpu, sptep, access, gw->pte_access & access,
 			     sw->user_fault, sw->write_fault,
 			     gw->ptes[gw->level-1] & PT_DIRTY_MASK,
-			     sw->ptwrite, sw->largepage, gw->gfn, sw->pfn,
-			     false);
+			     sw->ptwrite, sw->largepage,
+			     gw->ptes[gw->level-1] & PT_GLOBAL_MASK,
+			     gw->gfn, sw->pfn, false);
 		sw->sptep = sptep;
 		return 1;
 	}
@@ -579,7 +581,7 @@ static int FNAME(sync_page)(struct kvm_v
 		nr_present++;
 		pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
 		set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
-			 is_dirty_pte(gpte), 0, gfn,
+			 is_dirty_pte(gpte), 0, gpte & PT_GLOBAL_MASK, gfn,
 			 spte_to_pfn(sp->spt[i]), true, false);
 	}
 
Index: kvm/arch/x86/kvm/x86.c
===================================================================
--- kvm.orig/arch/x86/kvm/x86.c
+++ kvm/arch/x86/kvm/x86.c
@@ -104,6 +104,7 @@ struct kvm_stats_debugfs_item debugfs_en
 	{ "mmu_recycled", VM_STAT(mmu_recycled) },
 	{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
 	{ "mmu_unsync", VM_STAT(mmu_unsync) },
+	{ "mmu_unsync_global", VM_STAT(mmu_unsync_global) },
 	{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
 	{ "largepages", VM_STAT(lpages) },
 	{ NULL }
@@ -315,6 +316,7 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, 
 	kvm_x86_ops->set_cr0(vcpu, cr0);
 	vcpu->arch.cr0 = cr0;
 
+	kvm_mmu_sync_global(vcpu);
 	kvm_mmu_reset_context(vcpu);
 	return;
 }
@@ -358,6 +360,7 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, 
 	}
 	kvm_x86_ops->set_cr4(vcpu, cr4);
 	vcpu->arch.cr4 = cr4;
+	kvm_mmu_sync_global(vcpu);
 	kvm_mmu_reset_context(vcpu);
 }
 EXPORT_SYMBOL_GPL(kvm_set_cr4);
@@ -4113,6 +4116,7 @@ struct  kvm *kvm_arch_create_vm(void)
 		return ERR_PTR(-ENOMEM);
 
 	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+	INIT_LIST_HEAD(&kvm->arch.oos_global_pages);
 	INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
 
 	return kvm;
Index: kvm/include/asm-x86/kvm_host.h
===================================================================
--- kvm.orig/include/asm-x86/kvm_host.h
+++ kvm/include/asm-x86/kvm_host.h
@@ -182,6 +182,8 @@ struct kvm_mmu_page {
 	struct list_head link;
 	struct hlist_node hash_link;
 
+	struct list_head oos_link;
+
 	/*
 	 * The following two entries are used to key the shadow page in the
 	 * hash table.
@@ -198,6 +200,7 @@ struct kvm_mmu_page {
 	int multimapped;         /* More than one parent_pte? */
 	int root_count;          /* Currently serving as active root */
 	bool unsync;
+	bool global;
 	bool unsync_children;
 	union {
 		u64 *parent_pte;               /* !multimapped */
@@ -354,6 +357,7 @@ struct kvm_arch{
 	 */
 	struct list_head active_mmu_pages;
 	struct list_head assigned_dev_head;
+	struct list_head oos_global_pages;
 	struct dmar_domain *intel_iommu_domain;
 	struct kvm_pic *vpic;
 	struct kvm_ioapic *vioapic;
@@ -379,6 +383,7 @@ struct kvm_vm_stat {
 	u32 mmu_recycled;
 	u32 mmu_cache_miss;
 	u32 mmu_unsync;
+	u32 mmu_unsync_global;
 	u32 remote_tlb_flush;
 	u32 lpages;
 };
@@ -597,6 +602,7 @@ void __kvm_mmu_free_some_pages(struct kv
 int kvm_mmu_load(struct kvm_vcpu *vcpu);
 void kvm_mmu_unload(struct kvm_vcpu *vcpu);
 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
+void kvm_mmu_sync_global(struct kvm_vcpu *vcpu);
 
 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
 

-- 


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [patch 2/3] KVM: MMU: skip global pgtables on sync due to cr3 switch
  2008-10-25 22:31 ` [patch 2/3] KVM: MMU: skip global pgtables on sync due to cr3 switch Marcelo Tosatti
@ 2008-10-26 11:27   ` Avi Kivity
  2008-10-31 19:36     ` Marcelo Tosatti
  0 siblings, 1 reply; 20+ messages in thread
From: Avi Kivity @ 2008-10-26 11:27 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm

Marcelo Tosatti wrote:
> Skip syncing global pages on cr3 switch (but not on cr4/cr0). This is
> important for Linux 32-bit guests with PAE, where the kmap page is
> marked as global.
>
>   

Patch is good, but won't apply without the first.

>  {
>  	u64 spte;
>  	int ret = 0;
>  	u64 mt_mask = shadow_mt_mask;
> +	struct kvm_mmu_page *sp = page_header(__pa(shadow_pte));
> +
> +	if (!global && sp->global) {
> +		sp->global = 0;
>   

A slight deficiency in this approach is that a page can't transition 
from !global to global.  I don't think this is frequent, so we don't 
need to deal with it.  But a more accurate approach is to keep a count 
of non-global present mappings, and to activate the global logic when 
this count is nonzero.



-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [patch 2/3] KVM: MMU: skip global pgtables on sync due to cr3 switch
  2008-10-26 11:27   ` Avi Kivity
@ 2008-10-31 19:36     ` Marcelo Tosatti
  2008-10-31 19:43       ` Avi Kivity
  0 siblings, 1 reply; 20+ messages in thread
From: Marcelo Tosatti @ 2008-10-31 19:36 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm

On Sun, Oct 26, 2008 at 01:27:34PM +0200, Avi Kivity wrote:
> Marcelo Tosatti wrote:
>> Skip syncing global pages on cr3 switch (but not on cr4/cr0). This is
>> important for Linux 32-bit guests with PAE, where the kmap page is
>> marked as global.
>>
>>   
>
> Patch is good, but won't apply without the first.
>
>>  {
>>  	u64 spte;
>>  	int ret = 0;
>>  	u64 mt_mask = shadow_mt_mask;
>> +	struct kvm_mmu_page *sp = page_header(__pa(shadow_pte));
>> +
>> +	if (!global && sp->global) {
>> +		sp->global = 0;
>>   
>
> A slight deficiency in this approach is that a page can't transition  
> from !global to global.  I don't think this is frequent, so we don't  
> need to deal with it.

Yep.

> But a more accurate approach is to keep a count  
> of non-global present mappings, and to activate the global logic when  
> this count is nonzero.

I don't think the overhead it adds is significant for the added
complexity? 


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [patch 2/3] KVM: MMU: skip global pgtables on sync due to cr3 switch
  2008-10-31 19:36     ` Marcelo Tosatti
@ 2008-10-31 19:43       ` Avi Kivity
  2008-10-31 19:50         ` Marcelo Tosatti
  0 siblings, 1 reply; 20+ messages in thread
From: Avi Kivity @ 2008-10-31 19:43 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm

Marcelo Tosatti wrote:
>> But a more accurate approach is to keep a count  
>> of non-global present mappings, and to activate the global logic when  
>> this count is nonzero.
>>     
>
> I don't think the overhead it adds is significant for the added
> complexity? 
>   

Not sure what you're asking exactly (-ENOPARSE).  I don't think it's 
worthwhile to account for !global -> global transitions.

-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [patch 2/3] KVM: MMU: skip global pgtables on sync due to cr3 switch
  2008-10-31 19:43       ` Avi Kivity
@ 2008-10-31 19:50         ` Marcelo Tosatti
  2008-10-31 19:59           ` Avi Kivity
  0 siblings, 1 reply; 20+ messages in thread
From: Marcelo Tosatti @ 2008-10-31 19:50 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm

On Fri, Oct 31, 2008 at 09:43:50PM +0200, Avi Kivity wrote:
> Marcelo Tosatti wrote:
>>> But a more accurate approach is to keep a count  of non-global 
>>> present mappings, and to activate the global logic when  this count 
>>> is nonzero.
>>>     
>>
>> I don't think the overhead it adds is significant for the added
>> complexity?   
>
> Not sure what you're asking exactly (-ENOPARSE).  I don't think it's  
> worthwhile to account for !global -> global transitions.

Was referring to "active the global logic when this count is nonzero"
(ie don't see the point of keeping non-global count).



^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [patch 2/3] KVM: MMU: skip global pgtables on sync due to cr3 switch
  2008-10-31 19:50         ` Marcelo Tosatti
@ 2008-10-31 19:59           ` Avi Kivity
  0 siblings, 0 replies; 20+ messages in thread
From: Avi Kivity @ 2008-10-31 19:59 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm

Marcelo Tosatti wrote:
>> Not sure what you're asking exactly (-ENOPARSE).  I don't think it's  
>> worthwhile to account for !global -> global transitions.
>>     
>
> Was referring to "active the global logic when this count is nonzero"
> (ie don't see the point of keeping non-global count).
>   

We agree then.

-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.


^ permalink raw reply	[flat|nested] 20+ messages in thread

* [patch 3/3] KVM: MMU: prepopulate the shadow on invlpg
  2008-10-25 22:31 [patch 0/3] oos shadow optimizations Marcelo Tosatti
  2008-10-25 22:31 ` [patch 1/3] KVM: MMU: collapse remote TLB flushes on root sync Marcelo Tosatti
  2008-10-25 22:31 ` [patch 2/3] KVM: MMU: skip global pgtables on sync due to cr3 switch Marcelo Tosatti
@ 2008-10-25 22:31 ` Marcelo Tosatti
  2008-10-26 11:48   ` Avi Kivity
  2 siblings, 1 reply; 20+ messages in thread
From: Marcelo Tosatti @ 2008-10-25 22:31 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm, Marcelo Tosatti

[-- Attachment #1: invlpg-opt --]
[-- Type: text/plain, Size: 5209 bytes --]

If the guest executes invlpg on a non present entry, peek into th
pagetable and attempt to prepopulate the shadow entry.

Also stop dirty fault updates from interfering with the fork dete

For RHEL3 and 32-bit Vista guests the success rate is high. With Win2003
there is a 1:2 success/fail ratio, but even then compilation benchmarks
are slightly faster. 2000 and XP rarely execute invlpg on non present
entries.

2% improvement on RHEL3/AIM7.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: kvm/arch/x86/kvm/mmu.c
===================================================================
--- kvm.orig/arch/x86/kvm/mmu.c
+++ kvm/arch/x86/kvm/mmu.c
@@ -2365,7 +2365,8 @@ static void kvm_mmu_access_page(struct k
 }
 
 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
-		       const u8 *new, int bytes)
+		       const u8 *new, int bytes,
+		       bool speculative)
 {
 	gfn_t gfn = gpa >> PAGE_SHIFT;
 	struct kvm_mmu_page *sp;
@@ -2391,15 +2392,17 @@ void kvm_mmu_pte_write(struct kvm_vcpu *
 	kvm_mmu_free_some_pages(vcpu);
 	++vcpu->kvm->stat.mmu_pte_write;
 	kvm_mmu_audit(vcpu, "pre pte write");
-	if (gfn == vcpu->arch.last_pt_write_gfn
-	    && !last_updated_pte_accessed(vcpu)) {
-		++vcpu->arch.last_pt_write_count;
-		if (vcpu->arch.last_pt_write_count >= 3)
-			flooded = 1;
-	} else {
-		vcpu->arch.last_pt_write_gfn = gfn;
-		vcpu->arch.last_pt_write_count = 1;
-		vcpu->arch.last_pte_updated = NULL;
+	if (!speculative) {
+		if (gfn == vcpu->arch.last_pt_write_gfn
+		    && !last_updated_pte_accessed(vcpu)) {
+			++vcpu->arch.last_pt_write_count;
+			if (vcpu->arch.last_pt_write_count >= 3)
+				flooded = 1;
+		} else {
+			vcpu->arch.last_pt_write_gfn = gfn;
+			vcpu->arch.last_pt_write_count = 1;
+			vcpu->arch.last_pte_updated = NULL;
+		}
 	}
 	index = kvm_page_table_hashfn(gfn);
 	bucket = &vcpu->kvm->arch.mmu_page_hash[index];
@@ -2539,9 +2542,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
 
 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
 {
-	spin_lock(&vcpu->kvm->mmu_lock);
 	vcpu->arch.mmu.invlpg(vcpu, gva);
-	spin_unlock(&vcpu->kvm->mmu_lock);
 	kvm_mmu_flush_tlb(vcpu);
 	++vcpu->stat.invlpg;
 }
Index: kvm/arch/x86/kvm/paging_tmpl.h
===================================================================
--- kvm.orig/arch/x86/kvm/paging_tmpl.h
+++ kvm/arch/x86/kvm/paging_tmpl.h
@@ -82,6 +82,7 @@ struct shadow_walker {
 	int *ptwrite;
 	pfn_t pfn;
 	u64 *sptep;
+	gpa_t pte_gpa;
 };
 
 static gfn_t gpte_to_gfn(pt_element_t gpte)
@@ -222,7 +223,7 @@ walk:
 		if (ret)
 			goto walk;
 		pte |= PT_DIRTY_MASK;
-		kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte));
+		kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte), 1);
 		walker->ptes[walker->level - 1] = pte;
 	}
 
@@ -467,10 +468,18 @@ static int FNAME(shadow_invlpg_entry)(st
 				      struct kvm_vcpu *vcpu, u64 addr,
 				      u64 *sptep, int level)
 {
+	struct shadow_walker *sw =
+		container_of(_sw, struct shadow_walker, walker);
 
 	if (level == PT_PAGE_TABLE_LEVEL) {
-		if (is_shadow_present_pte(*sptep))
+		struct kvm_mmu_page *sp = page_header(__pa(sptep));
+		sw->pte_gpa = (sp->gfn << PAGE_SHIFT);
+		sw->pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
+
+		if (is_shadow_present_pte(*sptep)) {
 			rmap_remove(vcpu->kvm, sptep);
+			sw->pte_gpa = -1;
+		}
 		set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
 		return 1;
 	}
@@ -481,11 +490,26 @@ static int FNAME(shadow_invlpg_entry)(st
 
 static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 {
+	pt_element_t gpte;
 	struct shadow_walker walker = {
 		.walker = { .entry = FNAME(shadow_invlpg_entry), },
+		.pte_gpa = -1,
 	};
 
+	spin_lock(&vcpu->kvm->mmu_lock);
 	walk_shadow(&walker.walker, vcpu, gva);
+	spin_unlock(&vcpu->kvm->mmu_lock);
+	if (walker.pte_gpa == -1)
+		return;
+	if (kvm_read_guest_atomic(vcpu->kvm, walker.pte_gpa, &gpte,
+				  sizeof(pt_element_t)))
+		return;
+	if (is_present_pte(gpte) && (gpte & PT_ACCESSED_MASK)) {
+		if (mmu_topup_memory_caches(vcpu))
+			return;
+		kvm_mmu_pte_write(vcpu, walker.pte_gpa, (const u8 *)&gpte,
+				  sizeof(pt_element_t), 1);
+	}
 }
 
 static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
Index: kvm/arch/x86/kvm/x86.c
===================================================================
--- kvm.orig/arch/x86/kvm/x86.c
+++ kvm/arch/x86/kvm/x86.c
@@ -2044,7 +2044,7 @@ int emulator_write_phys(struct kvm_vcpu 
 	ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
 	if (ret < 0)
 		return 0;
-	kvm_mmu_pte_write(vcpu, gpa, val, bytes);
+	kvm_mmu_pte_write(vcpu, gpa, val, bytes, 0);
 	return 1;
 }
 
Index: kvm/include/asm-x86/kvm_host.h
===================================================================
--- kvm.orig/include/asm-x86/kvm_host.h
+++ kvm/include/asm-x86/kvm_host.h
@@ -596,7 +596,8 @@ unsigned long segment_base(u16 selector)
 
 void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
-		       const u8 *new, int bytes);
+		       const u8 *new, int bytes,
+		       bool speculative);
 int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
 void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
 int kvm_mmu_load(struct kvm_vcpu *vcpu);

-- 


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [patch 3/3] KVM: MMU: prepopulate the shadow on invlpg
  2008-10-25 22:31 ` [patch 3/3] KVM: MMU: prepopulate the shadow on invlpg Marcelo Tosatti
@ 2008-10-26 11:48   ` Avi Kivity
  2008-10-31 19:47     ` Marcelo Tosatti
  0 siblings, 1 reply; 20+ messages in thread
From: Avi Kivity @ 2008-10-26 11:48 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm

Marcelo Tosatti wrote:
> If the guest executes invlpg on a non present entry, peek into th
> pagetable and attempt to prepopulate the shadow entry.
>
> Also stop dirty fault updates from interfering with the fork dete
>
> For RHEL3 and 32-bit Vista guests the success rate is high. With Win2003
> there is a 1:2 success/fail ratio, but even then compilation benchmarks
> are slightly faster. 2000 and XP rarely execute invlpg on non present
> entries.
>
> 2% improvement on RHEL3/AIM7.
>
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
>
> Index: kvm/arch/x86/kvm/mmu.c
> ===================================================================
> --- kvm.orig/arch/x86/kvm/mmu.c
> +++ kvm/arch/x86/kvm/mmu.c
> @@ -2365,7 +2365,8 @@ static void kvm_mmu_access_page(struct k
>  }
>  
>  void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
> -		       const u8 *new, int bytes)
> +		       const u8 *new, int bytes,
> +		       bool speculative)
>   

kvm_mmu_pte_write()s are always speculative.  Maybe this is misnamed?

Perhaps 'guest_initiated' (with the opposite meaning).

> @@ -222,7 +223,7 @@ walk:
>  		if (ret)
>  			goto walk;
>  		pte |= PT_DIRTY_MASK;
> -		kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte));
> +		kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte), 1);
>   

This is definitely not a speculative write.  But it is !guest_initiated.

> @@ -467,10 +468,18 @@ static int FNAME(shadow_invlpg_entry)(st
>  				      struct kvm_vcpu *vcpu, u64 addr,
>  				      u64 *sptep, int level)
>  {
> +	struct shadow_walker *sw =
> +		container_of(_sw, struct shadow_walker, walker);
>  
>  	if (level == PT_PAGE_TABLE_LEVEL) {
> -		if (is_shadow_present_pte(*sptep))
> +		struct kvm_mmu_page *sp = page_header(__pa(sptep));
>   

blank line after declarations.  Or move to head of function.

> +		sw->pte_gpa = (sp->gfn << PAGE_SHIFT);
> +		sw->pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
> +
> +		if (is_shadow_present_pte(*sptep)) {
>  			rmap_remove(vcpu->kvm, sptep);
> +			sw->pte_gpa = -1;
>   

Why?  The pte could have heen replaced (for example, a write access to a 
cow page).


-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [patch 3/3] KVM: MMU: prepopulate the shadow on invlpg
  2008-10-26 11:48   ` Avi Kivity
@ 2008-10-31 19:47     ` Marcelo Tosatti
  2008-10-31 19:58       ` Avi Kivity
  0 siblings, 1 reply; 20+ messages in thread
From: Marcelo Tosatti @ 2008-10-31 19:47 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm

On Sun, Oct 26, 2008 at 01:48:22PM +0200, Avi Kivity wrote:
>> Index: kvm/arch/x86/kvm/mmu.c
>> ===================================================================
>> --- kvm.orig/arch/x86/kvm/mmu.c
>> +++ kvm/arch/x86/kvm/mmu.c
>> @@ -2365,7 +2365,8 @@ static void kvm_mmu_access_page(struct k
>>  }
>>   void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
>> -		       const u8 *new, int bytes)
>> +		       const u8 *new, int bytes,
>> +		       bool speculative)
>>   
>
> kvm_mmu_pte_write()s are always speculative.  Maybe this is misnamed?

Yep.

>> +		sw->pte_gpa = (sp->gfn << PAGE_SHIFT);
>> +		sw->pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
>> +
>> +		if (is_shadow_present_pte(*sptep)) {
>>  			rmap_remove(vcpu->kvm, sptep);
>> +			sw->pte_gpa = -1;
>>   
>
> Why?  The pte could have heen replaced (for example, a write access to a  
> cow page).

Well look-aheads on address space teardown will be useless. OTOH the
guest pte read cost is minimal compared to an exit.

Whatever you prefer. Learning guest behaviour as suggested earlier would 
be optimal, but simple is good.


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [patch 3/3] KVM: MMU: prepopulate the shadow on invlpg
  2008-10-31 19:47     ` Marcelo Tosatti
@ 2008-10-31 19:58       ` Avi Kivity
  2008-10-31 22:33         ` Marcelo Tosatti
  0 siblings, 1 reply; 20+ messages in thread
From: Avi Kivity @ 2008-10-31 19:58 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm

Marcelo Tosatti wrote:
>>> +		sw->pte_gpa = (sp->gfn << PAGE_SHIFT);
>>> +		sw->pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
>>> +
>>> +		if (is_shadow_present_pte(*sptep)) {
>>>  			rmap_remove(vcpu->kvm, sptep);
>>> +			sw->pte_gpa = -1;
>>>   
>>>       
>> Why?  The pte could have heen replaced (for example, a write access to a  
>> cow page).
>>     
>
> Well look-aheads on address space teardown will be useless. OTOH the
> guest pte read cost is minimal compared to an exit.
>   

Don't understand.  We will incur an exit if a pte is replaced and 
invlpg'ed due to a copy-on-write (do guests actually execute invlpg 
after a cow? I don't think they have to).

What is the downside?  A pagetable teardown that does not involve 
zeroing the page?  I don't think we'll see invlpg on that path, more 
likely a complete tlb flush.

> Whatever you prefer. Learning guest behaviour as suggested earlier would 
> be optimal, but simple is good.
>   

We're way past simple.  We can reclaim some of the complexity by always 
doing unsync, and dropping emulation and kvm_mmu_set_pte(), but need to 
make sure we don't regress on performance.  I think Windows does a pde 
write on context switch, which will add a vmexit, but Windows 
applications are not too context switch intensive AFAIK.

-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [patch 3/3] KVM: MMU: prepopulate the shadow on invlpg
  2008-10-31 19:58       ` Avi Kivity
@ 2008-10-31 22:33         ` Marcelo Tosatti
  2008-11-02  8:39           ` Avi Kivity
  0 siblings, 1 reply; 20+ messages in thread
From: Marcelo Tosatti @ 2008-10-31 22:33 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm

On Fri, Oct 31, 2008 at 09:58:17PM +0200, Avi Kivity wrote:
> Marcelo Tosatti wrote:
>>>> +		sw->pte_gpa = (sp->gfn << PAGE_SHIFT);
>>>> +		sw->pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
>>>> +
>>>> +		if (is_shadow_present_pte(*sptep)) {
>>>>  			rmap_remove(vcpu->kvm, sptep);
>>>> +			sw->pte_gpa = -1;
>>>>         
>>> Why?  The pte could have heen replaced (for example, a write access 
>>> to a  cow page).
>>>     
>>
>> Well look-aheads on address space teardown will be useless. OTOH the
>> guest pte read cost is minimal compared to an exit.
>>   
>
> Don't understand.  We will incur an exit if a pte is replaced and  
> invlpg'ed due to a copy-on-write (do guests actually execute invlpg  
> after a cow? I don't think they have to).
>
> What is the downside?  A pagetable teardown that does not involve  
> zeroing the page?  I don't think we'll see invlpg on that path, more  
> likely a complete tlb flush.

Err, I'm on crack. The assumption is that the common case is pte
invalidation + invlpg: kunmap_atomic, page aging clearing the 
accessed bit, page reclaim.

Linux COW will invalidate + invlpg (do_wp_page) first:

                entry = mk_pte(new_page, vma->vm_page_prot);
                entry = maybe_mkwrite(pte_mkdirty(entry), vma);
                /*
                 * Clear the pte entry and flush it first, before
                 * updating the
                 * pte with the new entry. This will avoid a race
                 * condition
                 * seen in the presence of one thread doing SMC and
                 * another
                 * thread doing COW.
                 */
                ptep_clear_flush_notify(vma, address, page_table);

Not sure about Windows.

>> Whatever you prefer. Learning guest behaviour as suggested earlier 
>> would be optimal, but simple is good.
>>   
>
> We're way past simple.  We can reclaim some of the complexity by always  
> doing unsync, and dropping emulation and kvm_mmu_set_pte(), but need to  
> make sure we don't regress on performance.  I think Windows does a pde  
> write on context switch, which will add a vmexit, but Windows  
> applications are not too context switch intensive AFAIK.


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [patch 3/3] KVM: MMU: prepopulate the shadow on invlpg
  2008-10-31 22:33         ` Marcelo Tosatti
@ 2008-11-02  8:39           ` Avi Kivity
  2008-11-02 16:08             ` Marcelo Tosatti
  0 siblings, 1 reply; 20+ messages in thread
From: Avi Kivity @ 2008-11-02  8:39 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm

Marcelo Tosatti wrote:
> Err, I'm on crack. The assumption is that the common case is pte
> invalidation + invlpg: kunmap_atomic, page aging clearing the 
> accessed bit, page reclaim.
>
> Linux COW will invalidate + invlpg (do_wp_page) first:
>
>                 entry = mk_pte(new_page, vma->vm_page_prot);
>                 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
>                 /*
>                  * Clear the pte entry and flush it first, before
>                  * updating the
>                  * pte with the new entry. This will avoid a race
>                  * condition
>                  * seen in the presence of one thread doing SMC and
>                  * another
>                  * thread doing COW.
>                  */
>                 ptep_clear_flush_notify(vma, address, page_table);
>
> Not sure about Windows.
>
>   

Okay, so Linux won't win on this.  But is there any downside, apart from 
fetching the pte and reestablishing the spte?


-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [patch 3/3] KVM: MMU: prepopulate the shadow on invlpg
  2008-11-02  8:39           ` Avi Kivity
@ 2008-11-02 16:08             ` Marcelo Tosatti
  2008-11-02 16:14               ` Avi Kivity
  0 siblings, 1 reply; 20+ messages in thread
From: Marcelo Tosatti @ 2008-11-02 16:08 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm

On Sun, Nov 02, 2008 at 10:39:10AM +0200, Avi Kivity wrote:
> Marcelo Tosatti wrote:
>> Err, I'm on crack. The assumption is that the common case is pte
>> invalidation + invlpg: kunmap_atomic, page aging clearing the accessed 
>> bit, page reclaim.
>>
>> Linux COW will invalidate + invlpg (do_wp_page) first:
>>
>>                 entry = mk_pte(new_page, vma->vm_page_prot);
>>                 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
>>                 /*
>>                  * Clear the pte entry and flush it first, before
>>                  * updating the
>>                  * pte with the new entry. This will avoid a race
>>                  * condition
>>                  * seen in the presence of one thread doing SMC and
>>                  * another
>>                  * thread doing COW.
>>                  */
>>                 ptep_clear_flush_notify(vma, address, page_table);
>>
>> Not sure about Windows.
>>
>>   
>
> Okay, so Linux won't win on this.  But is there any downside, apart from  
> fetching the pte and reestablishing the spte?

No, the downside is fetching the pte without being able to establish a
valid spte. 

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [patch 3/3] KVM: MMU: prepopulate the shadow on invlpg
  2008-11-02 16:08             ` Marcelo Tosatti
@ 2008-11-02 16:14               ` Avi Kivity
  0 siblings, 0 replies; 20+ messages in thread
From: Avi Kivity @ 2008-11-02 16:14 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm

Marcelo Tosatti wrote:

  

>> Okay, so Linux won't win on this.  But is there any downside, apart from  
>> fetching the pte and reestablishing the spte?
>>     
>
> No, the downside is fetching the pte without being able to establish a
> valid spte. 
>   

That doesn't seem so expensive.  Just a kvm_read_guest_atomic() with a 
couple of bit tests for present/accessed.

(we do need to optimize copy_from_user_inatomic() to use a single mov insn)

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2008-11-02 16:14 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-10-25 22:31 [patch 0/3] oos shadow optimizations Marcelo Tosatti
2008-10-25 22:31 ` [patch 1/3] KVM: MMU: collapse remote TLB flushes on root sync Marcelo Tosatti
2008-10-26 11:17   ` Avi Kivity
2008-10-29 23:26     ` Marcelo Tosatti
2008-10-30 10:04       ` Avi Kivity
2008-10-31 19:30         ` Marcelo Tosatti
2008-10-25 22:31 ` [patch 2/3] KVM: MMU: skip global pgtables on sync due to cr3 switch Marcelo Tosatti
2008-10-26 11:27   ` Avi Kivity
2008-10-31 19:36     ` Marcelo Tosatti
2008-10-31 19:43       ` Avi Kivity
2008-10-31 19:50         ` Marcelo Tosatti
2008-10-31 19:59           ` Avi Kivity
2008-10-25 22:31 ` [patch 3/3] KVM: MMU: prepopulate the shadow on invlpg Marcelo Tosatti
2008-10-26 11:48   ` Avi Kivity
2008-10-31 19:47     ` Marcelo Tosatti
2008-10-31 19:58       ` Avi Kivity
2008-10-31 22:33         ` Marcelo Tosatti
2008-11-02  8:39           ` Avi Kivity
2008-11-02 16:08             ` Marcelo Tosatti
2008-11-02 16:14               ` Avi Kivity

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).