* [PATCH v5 1/5] KVM: MMU: fix Dirty bit missed if CR0.WP = 0
@ 2013-01-08 6:36 Xiao Guangrong
2013-01-08 6:36 ` [PATCH v5 2/5] KVM: MMU: fix infinite fault access retry Xiao Guangrong
` (3 more replies)
0 siblings, 4 replies; 16+ messages in thread
From: Xiao Guangrong @ 2013-01-08 6:36 UTC (permalink / raw)
To: Marcelo Tosatti; +Cc: Gleb Natapov, LKML, KVM
If the write-fault access is from supervisor and CR0.WP is not set on the
vcpu, kvm will fix it by adjusting pte access - it sets the W bit on pte
and clears U bit. This is the chance that kvm can change pte access from
readonly to writable
Unfortunately, the pte access is the access of 'direct' shadow page table,
means direct sp.role.access = pte_access, then we will create a writable
spte entry on the readonly shadow page table. It will cause Dirty bit is
not tracked when two guest ptes point to the same large page. Note, it
has no impact other than the Dirty bit, since cr0.wp is encoded into
sp.role
It can be fixed by adjusting pte access before establishing shadow page
table. Also, after that, no MMU-specific code remains in the common function,
and two parameters of set_spte can be dropped
Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
---
arch/x86/kvm/mmu.c | 47 ++++++++++++-------------------------------
arch/x86/kvm/paging_tmpl.h | 30 +++++++++++++++++++++++----
2 files changed, 38 insertions(+), 39 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 01d7c2a..2a3c890 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2342,8 +2342,7 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
}
static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
- unsigned pte_access, int user_fault,
- int write_fault, int level,
+ unsigned pte_access, int level,
gfn_t gfn, pfn_t pfn, bool speculative,
bool can_unsync, bool host_writable)
{
@@ -2378,9 +2377,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
spte |= (u64)pfn << PAGE_SHIFT;
- if ((pte_access & ACC_WRITE_MASK)
- || (!vcpu->arch.mmu.direct_map && write_fault
- && !is_write_protection(vcpu) && !user_fault)) {
+ if (pte_access & ACC_WRITE_MASK) {
/*
* There are two cases:
@@ -2399,19 +2396,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
- if (!vcpu->arch.mmu.direct_map
- && !(pte_access & ACC_WRITE_MASK)) {
- spte &= ~PT_USER_MASK;
- /*
- * If we converted a user page to a kernel page,
- * so that the kernel can write to it when cr0.wp=0,
- * then we should prevent the kernel from executing it
- * if SMEP is enabled.
- */
- if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
- spte |= PT64_NX_MASK;
- }
-
/*
* Optimization: for pte sync, if spte was writable the hash
* lookup is unnecessary (and expensive). Write protection
@@ -2442,18 +2426,15 @@ done:
static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
unsigned pt_access, unsigned pte_access,
- int user_fault, int write_fault,
- int *emulate, int level, gfn_t gfn,
- pfn_t pfn, bool speculative,
- bool host_writable)
+ int write_fault, int *emulate, int level, gfn_t gfn,
+ pfn_t pfn, bool speculative, bool host_writable)
{
int was_rmapped = 0;
int rmap_count;
- pgprintk("%s: spte %llx access %x write_fault %d"
- " user_fault %d gfn %llx\n",
+ pgprintk("%s: spte %llx access %x write_fault %d gfn %llx\n",
__func__, *sptep, pt_access,
- write_fault, user_fault, gfn);
+ write_fault, gfn);
if (is_rmap_spte(*sptep)) {
/*
@@ -2477,9 +2458,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
was_rmapped = 1;
}
- if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
- level, gfn, pfn, speculative, true,
- host_writable)) {
+ if (set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative,
+ true, host_writable)) {
if (write_fault)
*emulate = 1;
kvm_mmu_flush_tlb(vcpu);
@@ -2571,10 +2551,9 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
return -1;
for (i = 0; i < ret; i++, gfn++, start++)
- mmu_set_spte(vcpu, start, ACC_ALL,
- access, 0, 0, NULL,
- sp->role.level, gfn,
- page_to_pfn(pages[i]), true, true);
+ mmu_set_spte(vcpu, start, ACC_ALL, access, 0, NULL,
+ sp->role.level, gfn, page_to_pfn(pages[i]),
+ true, true);
return 0;
}
@@ -2636,8 +2615,8 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
unsigned pte_access = ACC_ALL;
mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
- 0, write, &emulate,
- level, gfn, pfn, prefault, map_writable);
+ write, &emulate, level, gfn, pfn,
+ prefault, map_writable);
direct_pte_prefetch(vcpu, iterator.sptep);
++vcpu->stat.pf_fixed;
break;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index a7b24cf..7c575e7 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -326,7 +326,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
* we call mmu_set_spte() with host_writable = true because
* pte_prefetch_gfn_to_pfn always gets a writable pfn.
*/
- mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
+ mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0,
NULL, PT_PAGE_TABLE_LEVEL, gfn, pfn, true, true);
return true;
@@ -401,7 +401,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
*/
static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
struct guest_walker *gw,
- int user_fault, int write_fault, int hlevel,
+ int write_fault, int hlevel,
pfn_t pfn, bool map_writable, bool prefault)
{
struct kvm_mmu_page *sp = NULL;
@@ -474,7 +474,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
clear_sp_write_flooding_count(it.sptep);
mmu_set_spte(vcpu, it.sptep, access, gw->pte_access,
- user_fault, write_fault, &emulate, it.level,
+ write_fault, &emulate, it.level,
gw->gfn, pfn, prefault, map_writable);
FNAME(pte_prefetch)(vcpu, gw, it.sptep);
@@ -560,6 +560,26 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
walker.gfn, pfn, walker.pte_access, &r))
return r;
+ /*
+ * Do not change pte_access if the pfn is a mmio page, otherwise
+ * we will cache the incorrect access into mmio spte.
+ */
+ if (write_fault && !(walker.pte_access & ACC_WRITE_MASK) &&
+ !is_write_protection(vcpu) && !user_fault &&
+ !is_noslot_pfn(pfn)) {
+ walker.pte_access |= ACC_WRITE_MASK;
+ walker.pte_access &= ~ACC_USER_MASK;
+
+ /*
+ * If we converted a user page to a kernel page,
+ * so that the kernel can write to it when cr0.wp=0,
+ * then we should prevent the kernel from executing it
+ * if SMEP is enabled.
+ */
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
+ walker.pte_access &= ~ACC_EXEC_MASK;
+ }
+
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
goto out_unlock;
@@ -568,7 +588,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
kvm_mmu_free_some_pages(vcpu);
if (!force_pt_level)
transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
- r = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
+ r = FNAME(fetch)(vcpu, addr, &walker, write_fault,
level, pfn, map_writable, prefault);
++vcpu->stat.pf_fixed;
kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
@@ -743,7 +763,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;
- set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
+ set_spte(vcpu, &sp->spt[i], pte_access,
PT_PAGE_TABLE_LEVEL, gfn,
spte_to_pfn(sp->spt[i]), true, false,
host_writable);
--
1.7.7.6
^ permalink raw reply related [flat|nested] 16+ messages in thread* [PATCH v5 2/5] KVM: MMU: fix infinite fault access retry
2013-01-08 6:36 [PATCH v5 1/5] KVM: MMU: fix Dirty bit missed if CR0.WP = 0 Xiao Guangrong
@ 2013-01-08 6:36 ` Xiao Guangrong
2013-01-08 6:37 ` [PATCH v5 3/5] KVM: x86: clean up reexecute_instruction Xiao Guangrong
` (2 subsequent siblings)
3 siblings, 0 replies; 16+ messages in thread
From: Xiao Guangrong @ 2013-01-08 6:36 UTC (permalink / raw)
To: Xiao Guangrong; +Cc: Marcelo Tosatti, Gleb Natapov, LKML, KVM
We have two issues in current code:
- if target gfn is used as its page table, guest will refault then kvm will use
small page size to map it. We need two #PF to fix its shadow page table
- sometimes, say an exception is triggered during vm-exit caused by #PF
(see handle_exception() in vmx.c), we remove all the shadow pages shadowed
by the target gfn before going into the page fault path, it will cause infinite
loop:
delete shadow pages shadowed by the gfn -> try to use large page size to map
the gfn -> retry the access ->...
To fix these, we can adjust page size early if the target gfn is used as page
table
Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
---
arch/x86/kvm/mmu.c | 13 ++++---------
arch/x86/kvm/paging_tmpl.h | 35 ++++++++++++++++++++++++++++++++++-
2 files changed, 38 insertions(+), 10 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 2a3c890..54fc61e 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2380,15 +2380,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
if (pte_access & ACC_WRITE_MASK) {
/*
- * There are two cases:
- * - the one is other vcpu creates new sp in the window
- * between mapping_level() and acquiring mmu-lock.
- * - the another case is the new sp is created by itself
- * (page-fault path) when guest uses the target gfn as
- * its page table.
- * Both of these cases can be fixed by allowing guest to
- * retry the access, it will refault, then we can establish
- * the mapping by using small page.
+ * Other vcpu creates new sp in the window between
+ * mapping_level() and acquiring mmu-lock. We can
+ * allow guest to retry the access, the mapping can
+ * be fixed if guest refault.
*/
if (level > PT_PAGE_TABLE_LEVEL &&
has_wrprotected_page(vcpu->kvm, gfn, level))
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 7c575e7..67b390d 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -487,6 +487,38 @@ out_gpte_changed:
return 0;
}
+ /*
+ * To see whether the mapped gfn can write its page table in the current
+ * mapping.
+ *
+ * It is the helper function of FNAME(page_fault). When guest uses large page
+ * size to map the writable gfn which is used as current page table, we should
+ * force kvm to use small page size to map it because new shadow page will be
+ * created when kvm establishes shadow page table that stop kvm using large
+ * page size. Do it early can avoid unnecessary #PF and emulation.
+ *
+ * Note: the PDPT page table is not checked for PAE-32 bit guest. It is ok
+ * since the PDPT is always shadowed, that means, we can not use large page
+ * size to map the gfn which is used as PDPT.
+ */
+static bool
+FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
+ struct guest_walker *walker, int user_fault)
+{
+ int level;
+ gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker->level) - 1);
+
+ if (!(walker->pte_access & ACC_WRITE_MASK ||
+ (!is_write_protection(vcpu) && !user_fault)))
+ return false;
+
+ for (level = walker->level; level <= walker->max_level; level++)
+ if (!((walker->gfn ^ walker->table_gfn[level - 1]) & mask))
+ return true;
+
+ return false;
+}
+
/*
* Page fault handler. There are several causes for a page fault:
* - there is no shadow pte for the guest pte
@@ -541,7 +573,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
}
if (walker.level >= PT_DIRECTORY_LEVEL)
- force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn);
+ force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn)
+ || FNAME(is_self_change_mapping)(vcpu, &walker, user_fault);
else
force_pt_level = 1;
if (!force_pt_level) {
--
1.7.7.6
^ permalink raw reply related [flat|nested] 16+ messages in thread* [PATCH v5 3/5] KVM: x86: clean up reexecute_instruction
2013-01-08 6:36 [PATCH v5 1/5] KVM: MMU: fix Dirty bit missed if CR0.WP = 0 Xiao Guangrong
2013-01-08 6:36 ` [PATCH v5 2/5] KVM: MMU: fix infinite fault access retry Xiao Guangrong
@ 2013-01-08 6:37 ` Xiao Guangrong
2013-01-08 6:37 ` [PATCH v5 4/5] KVM: x86: let reexecute_instruction work for tdp Xiao Guangrong
2013-01-08 6:38 ` [PATCH v5 5/5] KVM: x86: improve reexecute_instruction Xiao Guangrong
3 siblings, 0 replies; 16+ messages in thread
From: Xiao Guangrong @ 2013-01-08 6:37 UTC (permalink / raw)
To: Xiao Guangrong; +Cc: Marcelo Tosatti, Gleb Natapov, LKML, KVM
Little cleanup for reexecute_instruction, also use gpa_to_gfn in
retry_instruction
Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
---
arch/x86/kvm/x86.c | 13 ++++++-------
1 files changed, 6 insertions(+), 7 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1c9c834..08cacd9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4761,19 +4761,18 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
if (tdp_enabled)
return false;
+ gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
+ if (gpa == UNMAPPED_GVA)
+ return true; /* let cpu generate fault */
+
/*
* if emulation was due to access to shadowed page table
* and it failed try to unshadow page and re-enter the
* guest to let CPU execute the instruction.
*/
- if (kvm_mmu_unprotect_page_virt(vcpu, gva))
+ if (kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)))
return true;
- gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
-
- if (gpa == UNMAPPED_GVA)
- return true; /* let cpu generate fault */
-
/*
* Do not retry the unhandleable instruction if it faults on the
* readonly host memory, otherwise it will goto a infinite loop:
@@ -4828,7 +4827,7 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
if (!vcpu->arch.mmu.direct_map)
gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
- kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+ kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
return true;
}
--
1.7.7.6
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [PATCH v5 4/5] KVM: x86: let reexecute_instruction work for tdp
2013-01-08 6:36 [PATCH v5 1/5] KVM: MMU: fix Dirty bit missed if CR0.WP = 0 Xiao Guangrong
2013-01-08 6:36 ` [PATCH v5 2/5] KVM: MMU: fix infinite fault access retry Xiao Guangrong
2013-01-08 6:37 ` [PATCH v5 3/5] KVM: x86: clean up reexecute_instruction Xiao Guangrong
@ 2013-01-08 6:37 ` Xiao Guangrong
2013-01-08 6:38 ` [PATCH v5 5/5] KVM: x86: improve reexecute_instruction Xiao Guangrong
3 siblings, 0 replies; 16+ messages in thread
From: Xiao Guangrong @ 2013-01-08 6:37 UTC (permalink / raw)
To: Xiao Guangrong; +Cc: Marcelo Tosatti, Gleb Natapov, LKML, KVM
Currently, reexecute_instruction refuses to retry all instructions if
tdp is enabled. If nested npt is used, the emulation may be caused by
shadow page, it can be fixed by dropping the shadow page. And the only
condition that tdp can not retry the instruction is the access fault
on error pfn
Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
---
arch/x86/kvm/x86.c | 61 ++++++++++++++++++++++++++++++++++++---------------
1 files changed, 43 insertions(+), 18 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 08cacd9..6f13e03 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4753,25 +4753,25 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu)
return r;
}
-static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
+static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2)
{
- gpa_t gpa;
+ gpa_t gpa = cr2;
pfn_t pfn;
- if (tdp_enabled)
- return false;
-
- gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
- if (gpa == UNMAPPED_GVA)
- return true; /* let cpu generate fault */
+ if (!vcpu->arch.mmu.direct_map) {
+ /*
+ * Write permission should be allowed since only
+ * write access need to be emulated.
+ */
+ gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
- /*
- * if emulation was due to access to shadowed page table
- * and it failed try to unshadow page and re-enter the
- * guest to let CPU execute the instruction.
- */
- if (kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)))
- return true;
+ /*
+ * If the mapping is invalid in guest, let cpu retry
+ * it to generate fault.
+ */
+ if (gpa == UNMAPPED_GVA)
+ return true;
+ }
/*
* Do not retry the unhandleable instruction if it faults on the
@@ -4780,12 +4780,37 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
* instruction -> ...
*/
pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
- if (!is_error_noslot_pfn(pfn)) {
- kvm_release_pfn_clean(pfn);
+
+ /*
+ * If the instruction failed on the error pfn, it can not be fixed,
+ * report the error to userspace.
+ */
+ if (is_error_noslot_pfn(pfn))
+ return false;
+
+ kvm_release_pfn_clean(pfn);
+
+ /* The instructions are well-emulated on direct mmu. */
+ if (vcpu->arch.mmu.direct_map) {
+ unsigned int indirect_shadow_pages;
+
+ spin_lock(&vcpu->kvm->mmu_lock);
+ indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages;
+ spin_unlock(&vcpu->kvm->mmu_lock);
+
+ if (indirect_shadow_pages)
+ kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
+
return true;
}
- return false;
+ /*
+ * if emulation was due to access to shadowed page table
+ * and it failed try to unshadow page and re-enter the
+ * guest to let CPU execute the instruction.
+ */
+ kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
+ return true;
}
static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
--
1.7.7.6
^ permalink raw reply related [flat|nested] 16+ messages in thread* [PATCH v5 5/5] KVM: x86: improve reexecute_instruction
2013-01-08 6:36 [PATCH v5 1/5] KVM: MMU: fix Dirty bit missed if CR0.WP = 0 Xiao Guangrong
` (2 preceding siblings ...)
2013-01-08 6:37 ` [PATCH v5 4/5] KVM: x86: let reexecute_instruction work for tdp Xiao Guangrong
@ 2013-01-08 6:38 ` Xiao Guangrong
2013-01-10 17:26 ` Marcelo Tosatti
2013-01-10 17:30 ` Marcelo Tosatti
3 siblings, 2 replies; 16+ messages in thread
From: Xiao Guangrong @ 2013-01-08 6:38 UTC (permalink / raw)
To: Xiao Guangrong; +Cc: Marcelo Tosatti, Gleb Natapov, LKML, KVM
The current reexecute_instruction can not properly detect failed instruction
emulation. It allows the guest to retry all instructions except those that
access the error pfn
For example, some cases are nested-write-protect - if the page we want to
write is used as PDE but it chains to itself. Under this case, we should
stop the emulation and report the case to userspace
Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
---
arch/x86/include/asm/kvm_host.h | 7 +++++++
arch/x86/kvm/paging_tmpl.h | 27 ++++++++++++++++++++-------
arch/x86/kvm/x86.c | 8 +++++++-
3 files changed, 34 insertions(+), 8 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c431b33..d6ab8d2 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -502,6 +502,13 @@ struct kvm_vcpu_arch {
u64 msr_val;
struct gfn_to_hva_cache data;
} pv_eoi;
+
+ /*
+ * Indicate whether the access faults on its page table in guest
+ * which is set when fix page fault and used to detect unhandeable
+ * instruction.
+ */
+ bool write_fault_to_shadow_pgtable;
};
struct kvm_lpage_info {
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 67b390d..df50560 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -497,26 +497,34 @@ out_gpte_changed:
* created when kvm establishes shadow page table that stop kvm using large
* page size. Do it early can avoid unnecessary #PF and emulation.
*
+ * @write_fault_to_shadow_pgtable will return true if the fault gfn is
+ * currently used as its page table.
+ *
* Note: the PDPT page table is not checked for PAE-32 bit guest. It is ok
* since the PDPT is always shadowed, that means, we can not use large page
* size to map the gfn which is used as PDPT.
*/
static bool
FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
- struct guest_walker *walker, int user_fault)
+ struct guest_walker *walker, int user_fault,
+ bool *write_fault_to_shadow_pgtable)
{
int level;
gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker->level) - 1);
+ bool self_changed = false;
if (!(walker->pte_access & ACC_WRITE_MASK ||
(!is_write_protection(vcpu) && !user_fault)))
return false;
- for (level = walker->level; level <= walker->max_level; level++)
- if (!((walker->gfn ^ walker->table_gfn[level - 1]) & mask))
- return true;
+ for (level = walker->level; level <= walker->max_level; level++) {
+ gfn_t gfn = walker->gfn ^ walker->table_gfn[level - 1];
+
+ self_changed |= !(gfn & mask);
+ *write_fault_to_shadow_pgtable |= !gfn;
+ }
- return false;
+ return self_changed;
}
/*
@@ -544,7 +552,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
int level = PT_PAGE_TABLE_LEVEL;
int force_pt_level;
unsigned long mmu_seq;
- bool map_writable;
+ bool map_writable, is_self_change_mapping;
pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
@@ -572,9 +580,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
return 0;
}
+ vcpu->arch.write_fault_to_shadow_pgtable = false;
+
+ is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
+ &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable);
+
if (walker.level >= PT_DIRECTORY_LEVEL)
force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn)
- || FNAME(is_self_change_mapping)(vcpu, &walker, user_fault);
+ || is_self_change_mapping;
else
force_pt_level = 1;
if (!force_pt_level) {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6f13e03..2957012 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4810,7 +4810,13 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2)
* guest to let CPU execute the instruction.
*/
kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
- return true;
+
+ /*
+ * If the access faults on its page table, it can not
+ * be fixed by unprotecting shadow page and it should
+ * be reported to userspace.
+ */
+ return !vcpu->arch.write_fault_to_shadow_pgtable;
}
static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
--
1.7.7.6
^ permalink raw reply related [flat|nested] 16+ messages in thread* Re: [PATCH v5 5/5] KVM: x86: improve reexecute_instruction
2013-01-08 6:38 ` [PATCH v5 5/5] KVM: x86: improve reexecute_instruction Xiao Guangrong
@ 2013-01-10 17:26 ` Marcelo Tosatti
2013-01-10 18:05 ` Xiao Guangrong
2013-01-10 17:30 ` Marcelo Tosatti
1 sibling, 1 reply; 16+ messages in thread
From: Marcelo Tosatti @ 2013-01-10 17:26 UTC (permalink / raw)
To: Xiao Guangrong; +Cc: Gleb Natapov, LKML, KVM
On Tue, Jan 08, 2013 at 02:38:36PM +0800, Xiao Guangrong wrote:
> The current reexecute_instruction can not well detect the failed instruction
> emulation. It allows guest to retry all the instructions except it accesses
> on error pfn
>
> For example, some cases are nested-write-protect - if the page we want to
> write is used as PDE but it chains to itself. Under this case, we should
> stop the emulation and report the case to userspace
>
> Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
> ---
> arch/x86/include/asm/kvm_host.h | 7 +++++++
> arch/x86/kvm/paging_tmpl.h | 27 ++++++++++++++++++++-------
> arch/x86/kvm/x86.c | 8 +++++++-
> 3 files changed, 34 insertions(+), 8 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index c431b33..d6ab8d2 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -502,6 +502,13 @@ struct kvm_vcpu_arch {
> u64 msr_val;
> struct gfn_to_hva_cache data;
> } pv_eoi;
> +
> + /*
> + * Indicate whether the access faults on its page table in guest
> + * which is set when fix page fault and used to detect unhandeable
> + * instruction.
> + */
> + bool write_fault_to_shadow_pgtable;
> };
>
> struct kvm_lpage_info {
> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> index 67b390d..df50560 100644
> --- a/arch/x86/kvm/paging_tmpl.h
> +++ b/arch/x86/kvm/paging_tmpl.h
> @@ -497,26 +497,34 @@ out_gpte_changed:
> * created when kvm establishes shadow page table that stop kvm using large
> * page size. Do it early can avoid unnecessary #PF and emulation.
> *
> + * @write_fault_to_shadow_pgtable will return true if the fault gfn is
> + * currently used as its page table.
> + *
> * Note: the PDPT page table is not checked for PAE-32 bit guest. It is ok
> * since the PDPT is always shadowed, that means, we can not use large page
> * size to map the gfn which is used as PDPT.
> */
> static bool
> FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
> - struct guest_walker *walker, int user_fault)
> + struct guest_walker *walker, int user_fault,
> + bool *write_fault_to_shadow_pgtable)
> {
> int level;
> gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker->level) - 1);
> + bool self_changed = false;
>
> if (!(walker->pte_access & ACC_WRITE_MASK ||
> (!is_write_protection(vcpu) && !user_fault)))
> return false;
>
> - for (level = walker->level; level <= walker->max_level; level++)
> - if (!((walker->gfn ^ walker->table_gfn[level - 1]) & mask))
> - return true;
> + for (level = walker->level; level <= walker->max_level; level++) {
> + gfn_t gfn = walker->gfn ^ walker->table_gfn[level - 1];
> +
> + self_changed |= !(gfn & mask);
> + *write_fault_to_shadow_pgtable |= !gfn;
> + }
>
> - return false;
> + return self_changed;
> }
>
> /*
> @@ -544,7 +552,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
> int level = PT_PAGE_TABLE_LEVEL;
> int force_pt_level;
> unsigned long mmu_seq;
> - bool map_writable;
> + bool map_writable, is_self_change_mapping;
>
> pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
>
> @@ -572,9 +580,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
> return 0;
> }
>
> + vcpu->arch.write_fault_to_shadow_pgtable = false;
> +
> + is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
> + &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable);
> +
> if (walker.level >= PT_DIRECTORY_LEVEL)
> force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn)
> - || FNAME(is_self_change_mapping)(vcpu, &walker, user_fault);
> + || is_self_change_mapping;
> else
> force_pt_level = 1;
> if (!force_pt_level) {
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 6f13e03..2957012 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -4810,7 +4810,13 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2)
> * guest to let CPU execute the instruction.
> */
> kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
> - return true;
> +
> + /*
> + * If the access faults on its page table, it can not
> + * be fixed by unprotecting shadow page and it should
> + * be reported to userspace.
> + */
> + return !vcpu->arch.write_fault_to_shadow_pgtable;
> }
This sounds wrong: only reporting emulation failure in case
of a write fault to shadow pagetable?
The current pattern is sane:
if (condition_1 which allows reexecution is true)
return true;
if (condition_2 which allows reexecution is true)
return true;
...
return false;
Applied 1-2.
^ permalink raw reply [flat|nested] 16+ messages in thread* Re: [PATCH v5 5/5] KVM: x86: improve reexecute_instruction
2013-01-10 17:26 ` Marcelo Tosatti
@ 2013-01-10 18:05 ` Xiao Guangrong
2013-01-10 19:48 ` Marcelo Tosatti
0 siblings, 1 reply; 16+ messages in thread
From: Xiao Guangrong @ 2013-01-10 18:05 UTC (permalink / raw)
To: Marcelo Tosatti; +Cc: Gleb Natapov, LKML, KVM
On 01/11/2013 01:26 AM, Marcelo Tosatti wrote:
> On Tue, Jan 08, 2013 at 02:38:36PM +0800, Xiao Guangrong wrote:
>> The current reexecute_instruction can not well detect the failed instruction
>> emulation. It allows guest to retry all the instructions except it accesses
>> on error pfn
>>
>> For example, some cases are nested-write-protect - if the page we want to
>> write is used as PDE but it chains to itself. Under this case, we should
>> stop the emulation and report the case to userspace
>>
>> Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
>> ---
>> arch/x86/include/asm/kvm_host.h | 7 +++++++
>> arch/x86/kvm/paging_tmpl.h | 27 ++++++++++++++++++++-------
>> arch/x86/kvm/x86.c | 8 +++++++-
>> 3 files changed, 34 insertions(+), 8 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>> index c431b33..d6ab8d2 100644
>> --- a/arch/x86/include/asm/kvm_host.h
>> +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -502,6 +502,13 @@ struct kvm_vcpu_arch {
>> u64 msr_val;
>> struct gfn_to_hva_cache data;
>> } pv_eoi;
>> +
>> + /*
>> + * Indicate whether the access faults on its page table in guest
>> + * which is set when fix page fault and used to detect unhandeable
>> + * instruction.
>> + */
>> + bool write_fault_to_shadow_pgtable;
>> };
>>
>> struct kvm_lpage_info {
>> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
>> index 67b390d..df50560 100644
>> --- a/arch/x86/kvm/paging_tmpl.h
>> +++ b/arch/x86/kvm/paging_tmpl.h
>> @@ -497,26 +497,34 @@ out_gpte_changed:
>> * created when kvm establishes shadow page table that stop kvm using large
>> * page size. Do it early can avoid unnecessary #PF and emulation.
>> *
>> + * @write_fault_to_shadow_pgtable will return true if the fault gfn is
>> + * currently used as its page table.
>> + *
>> * Note: the PDPT page table is not checked for PAE-32 bit guest. It is ok
>> * since the PDPT is always shadowed, that means, we can not use large page
>> * size to map the gfn which is used as PDPT.
>> */
>> static bool
>> FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
>> - struct guest_walker *walker, int user_fault)
>> + struct guest_walker *walker, int user_fault,
>> + bool *write_fault_to_shadow_pgtable)
>> {
>> int level;
>> gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker->level) - 1);
>> + bool self_changed = false;
>>
>> if (!(walker->pte_access & ACC_WRITE_MASK ||
>> (!is_write_protection(vcpu) && !user_fault)))
>> return false;
>>
>> - for (level = walker->level; level <= walker->max_level; level++)
>> - if (!((walker->gfn ^ walker->table_gfn[level - 1]) & mask))
>> - return true;
>> + for (level = walker->level; level <= walker->max_level; level++) {
>> + gfn_t gfn = walker->gfn ^ walker->table_gfn[level - 1];
>> +
>> + self_changed |= !(gfn & mask);
>> + *write_fault_to_shadow_pgtable |= !gfn;
>> + }
>>
>> - return false;
>> + return self_changed;
>> }
>>
>> /*
>> @@ -544,7 +552,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
>> int level = PT_PAGE_TABLE_LEVEL;
>> int force_pt_level;
>> unsigned long mmu_seq;
>> - bool map_writable;
>> + bool map_writable, is_self_change_mapping;
>>
>> pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
>>
>> @@ -572,9 +580,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
>> return 0;
>> }
>>
>> + vcpu->arch.write_fault_to_shadow_pgtable = false;
>> +
>> + is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
>> + &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable);
>> +
>> if (walker.level >= PT_DIRECTORY_LEVEL)
>> force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn)
>> - || FNAME(is_self_change_mapping)(vcpu, &walker, user_fault);
>> + || is_self_change_mapping;
>> else
>> force_pt_level = 1;
>> if (!force_pt_level) {
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index 6f13e03..2957012 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -4810,7 +4810,13 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2)
>> * guest to let CPU execute the instruction.
>> */
>> kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
>> - return true;
>> +
>> + /*
>> + * If the access faults on its page table, it can not
>> + * be fixed by unprotecting shadow page and it should
>> + * be reported to userspace.
>> + */
>> + return !vcpu->arch.write_fault_to_shadow_pgtable;
>> }
>
> This sounds wrong: only reporting emulation failure in case
> of a write fault to shadow pagetable?
We suppose unprotecting target-gfn can avoid emulation, the same
as current code. :(
>
> The current pattern is sane:
>
> if (condition_1 which allows reexecution is true)
> return true;
>
> if (condition_2 which allows reexecution is true)
> return true;
> ...
> return false;
Unfortunately, the current code reports failure only when the access
faults on the error pfn:
pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
if (!is_error_pfn(pfn)) {
kvm_release_pfn_clean(pfn);
return true;
}
return false;
All !is_error_pfn cases return true.
^ permalink raw reply [flat|nested] 16+ messages in thread* Re: [PATCH v5 5/5] KVM: x86: improve reexecute_instruction
2013-01-10 18:05 ` Xiao Guangrong
@ 2013-01-10 19:48 ` Marcelo Tosatti
2013-01-10 20:18 ` Xiao Guangrong
0 siblings, 1 reply; 16+ messages in thread
From: Marcelo Tosatti @ 2013-01-10 19:48 UTC (permalink / raw)
To: Xiao Guangrong; +Cc: Gleb Natapov, LKML, KVM
On Fri, Jan 11, 2013 at 02:05:33AM +0800, Xiao Guangrong wrote:
> On 01/11/2013 01:26 AM, Marcelo Tosatti wrote:
> > On Tue, Jan 08, 2013 at 02:38:36PM +0800, Xiao Guangrong wrote:
> >> The current reexecute_instruction can not well detect the failed instruction
> >> emulation. It allows guest to retry all the instructions except it accesses
> >> on error pfn
> >>
> >> For example, some cases are nested-write-protect - if the page we want to
> >> write is used as PDE but it chains to itself. Under this case, we should
> >> stop the emulation and report the case to userspace
> >>
> >> Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
> >> ---
> >> arch/x86/include/asm/kvm_host.h | 7 +++++++
> >> arch/x86/kvm/paging_tmpl.h | 27 ++++++++++++++++++++-------
> >> arch/x86/kvm/x86.c | 8 +++++++-
> >> 3 files changed, 34 insertions(+), 8 deletions(-)
> >>
> >> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> >> index c431b33..d6ab8d2 100644
> >> --- a/arch/x86/include/asm/kvm_host.h
> >> +++ b/arch/x86/include/asm/kvm_host.h
> >> @@ -502,6 +502,13 @@ struct kvm_vcpu_arch {
> >> u64 msr_val;
> >> struct gfn_to_hva_cache data;
> >> } pv_eoi;
> >> +
> >> + /*
> >> + * Indicate whether the access faults on its page table in guest
> >> + * which is set when fix page fault and used to detect unhandeable
> >> + * instruction.
> >> + */
> >> + bool write_fault_to_shadow_pgtable;
> >> };
> >>
> >> struct kvm_lpage_info {
> >> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> >> index 67b390d..df50560 100644
> >> --- a/arch/x86/kvm/paging_tmpl.h
> >> +++ b/arch/x86/kvm/paging_tmpl.h
> >> @@ -497,26 +497,34 @@ out_gpte_changed:
> >> * created when kvm establishes shadow page table that stop kvm using large
> >> * page size. Do it early can avoid unnecessary #PF and emulation.
> >> *
> >> + * @write_fault_to_shadow_pgtable will return true if the fault gfn is
> >> + * currently used as its page table.
> >> + *
> >> * Note: the PDPT page table is not checked for PAE-32 bit guest. It is ok
> >> * since the PDPT is always shadowed, that means, we can not use large page
> >> * size to map the gfn which is used as PDPT.
> >> */
> >> static bool
> >> FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
> >> - struct guest_walker *walker, int user_fault)
> >> + struct guest_walker *walker, int user_fault,
> >> + bool *write_fault_to_shadow_pgtable)
> >> {
> >> int level;
> >> gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker->level) - 1);
> >> + bool self_changed = false;
> >>
> >> if (!(walker->pte_access & ACC_WRITE_MASK ||
> >> (!is_write_protection(vcpu) && !user_fault)))
> >> return false;
> >>
> >> - for (level = walker->level; level <= walker->max_level; level++)
> >> - if (!((walker->gfn ^ walker->table_gfn[level - 1]) & mask))
> >> - return true;
> >> + for (level = walker->level; level <= walker->max_level; level++) {
> >> + gfn_t gfn = walker->gfn ^ walker->table_gfn[level - 1];
> >> +
> >> + self_changed |= !(gfn & mask);
> >> + *write_fault_to_shadow_pgtable |= !gfn;
> >> + }
> >>
> >> - return false;
> >> + return self_changed;
> >> }
> >>
> >> /*
> >> @@ -544,7 +552,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
> >> int level = PT_PAGE_TABLE_LEVEL;
> >> int force_pt_level;
> >> unsigned long mmu_seq;
> >> - bool map_writable;
> >> + bool map_writable, is_self_change_mapping;
> >>
> >> pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
> >>
> >> @@ -572,9 +580,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
> >> return 0;
> >> }
> >>
> >> + vcpu->arch.write_fault_to_shadow_pgtable = false;
> >> +
> >> + is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
> >> + &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable);
> >> +
> >> if (walker.level >= PT_DIRECTORY_LEVEL)
> >> force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn)
> >> - || FNAME(is_self_change_mapping)(vcpu, &walker, user_fault);
> >> + || is_self_change_mapping;
> >> else
> >> force_pt_level = 1;
> >> if (!force_pt_level) {
> >> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> >> index 6f13e03..2957012 100644
> >> --- a/arch/x86/kvm/x86.c
> >> +++ b/arch/x86/kvm/x86.c
> >> @@ -4810,7 +4810,13 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2)
> >> * guest to let CPU execute the instruction.
> >> */
> >> kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
> >> - return true;
> >> +
> >> + /*
> >> + * If the access faults on its page table, it can not
> >> + * be fixed by unprotecting shadow page and it should
> >> + * be reported to userspace.
> >> + */
> >> + return !vcpu->arch.write_fault_to_shadow_pgtable;
> >> }
> >
> > This sounds wrong: only reporting emulation failure in case
> > of a write fault to shadow pagetable?
>
> We suppose unprotecting target-gfn can avoid emulation, the same
> as current code. :(
Current code treats access to non-mapped guest address as indication to
exit reporting emulation failure.
The patch above restricts emulation failure reporting to when
write_fault_to_shadow_pgtable = true.
> > The current pattern is sane:
> >
> > if (condition_1 which allows reexecution is true)
> > return true;
> >
> > if (condition_2 which allows reexecution is true)
> > return true;
> > ...
> > return false;
>
> Unfortunately, the current code reports failure only when the access
> faults on an error pfn:
>
> pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
> if (!is_error_pfn(pfn)) {
> kvm_release_pfn_clean(pfn);
> return true;
> }
>
> return false;
>
> All !is_error_pfn cases return true.
^ permalink raw reply [flat|nested] 16+ messages in thread* Re: [PATCH v5 5/5] KVM: x86: improve reexecute_instruction
2013-01-10 19:48 ` Marcelo Tosatti
@ 2013-01-10 20:18 ` Xiao Guangrong
2013-01-11 13:15 ` Marcelo Tosatti
0 siblings, 1 reply; 16+ messages in thread
From: Xiao Guangrong @ 2013-01-10 20:18 UTC (permalink / raw)
To: Marcelo Tosatti; +Cc: Gleb Natapov, LKML, KVM
On 01/11/2013 03:48 AM, Marcelo Tosatti wrote:
> On Fri, Jan 11, 2013 at 02:05:33AM +0800, Xiao Guangrong wrote:
>> On 01/11/2013 01:26 AM, Marcelo Tosatti wrote:
>>> On Tue, Jan 08, 2013 at 02:38:36PM +0800, Xiao Guangrong wrote:
>>>> The current reexecute_instruction can not well detect the failed instruction
>>>> emulation. It allows guest to retry all the instructions except it accesses
>>>> on error pfn
>>>>
>>>> For example, some cases are nested-write-protect - if the page we want to
>>>> write is used as PDE but it chains to itself. Under this case, we should
>>>> stop the emulation and report the case to userspace
>>>>
>>>> Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
>>>> ---
>>>> arch/x86/include/asm/kvm_host.h | 7 +++++++
>>>> arch/x86/kvm/paging_tmpl.h | 27 ++++++++++++++++++++-------
>>>> arch/x86/kvm/x86.c | 8 +++++++-
>>>> 3 files changed, 34 insertions(+), 8 deletions(-)
>>>>
>>>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>>>> index c431b33..d6ab8d2 100644
>>>> --- a/arch/x86/include/asm/kvm_host.h
>>>> +++ b/arch/x86/include/asm/kvm_host.h
>>>> @@ -502,6 +502,13 @@ struct kvm_vcpu_arch {
>>>> u64 msr_val;
>>>> struct gfn_to_hva_cache data;
>>>> } pv_eoi;
>>>> +
>>>> + /*
>>>> + * Indicate whether the access faults on its page table in guest
>>>> + * which is set when fix page fault and used to detect unhandeable
>>>> + * instruction.
>>>> + */
>>>> + bool write_fault_to_shadow_pgtable;
>>>> };
>>>>
>>>> struct kvm_lpage_info {
>>>> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
>>>> index 67b390d..df50560 100644
>>>> --- a/arch/x86/kvm/paging_tmpl.h
>>>> +++ b/arch/x86/kvm/paging_tmpl.h
>>>> @@ -497,26 +497,34 @@ out_gpte_changed:
>>>> * created when kvm establishes shadow page table that stop kvm using large
>>>> * page size. Do it early can avoid unnecessary #PF and emulation.
>>>> *
>>>> + * @write_fault_to_shadow_pgtable will return true if the fault gfn is
>>>> + * currently used as its page table.
>>>> + *
>>>> * Note: the PDPT page table is not checked for PAE-32 bit guest. It is ok
>>>> * since the PDPT is always shadowed, that means, we can not use large page
>>>> * size to map the gfn which is used as PDPT.
>>>> */
>>>> static bool
>>>> FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
>>>> - struct guest_walker *walker, int user_fault)
>>>> + struct guest_walker *walker, int user_fault,
>>>> + bool *write_fault_to_shadow_pgtable)
>>>> {
>>>> int level;
>>>> gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker->level) - 1);
>>>> + bool self_changed = false;
>>>>
>>>> if (!(walker->pte_access & ACC_WRITE_MASK ||
>>>> (!is_write_protection(vcpu) && !user_fault)))
>>>> return false;
>>>>
>>>> - for (level = walker->level; level <= walker->max_level; level++)
>>>> - if (!((walker->gfn ^ walker->table_gfn[level - 1]) & mask))
>>>> - return true;
>>>> + for (level = walker->level; level <= walker->max_level; level++) {
>>>> + gfn_t gfn = walker->gfn ^ walker->table_gfn[level - 1];
>>>> +
>>>> + self_changed |= !(gfn & mask);
>>>> + *write_fault_to_shadow_pgtable |= !gfn;
>>>> + }
>>>>
>>>> - return false;
>>>> + return self_changed;
>>>> }
>>>>
>>>> /*
>>>> @@ -544,7 +552,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
>>>> int level = PT_PAGE_TABLE_LEVEL;
>>>> int force_pt_level;
>>>> unsigned long mmu_seq;
>>>> - bool map_writable;
>>>> + bool map_writable, is_self_change_mapping;
>>>>
>>>> pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
>>>>
>>>> @@ -572,9 +580,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
>>>> return 0;
>>>> }
>>>>
>>>> + vcpu->arch.write_fault_to_shadow_pgtable = false;
>>>> +
>>>> + is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
>>>> + &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable);
>>>> +
>>>> if (walker.level >= PT_DIRECTORY_LEVEL)
>>>> force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn)
>>>> - || FNAME(is_self_change_mapping)(vcpu, &walker, user_fault);
>>>> + || is_self_change_mapping;
>>>> else
>>>> force_pt_level = 1;
>>>> if (!force_pt_level) {
>>>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>>>> index 6f13e03..2957012 100644
>>>> --- a/arch/x86/kvm/x86.c
>>>> +++ b/arch/x86/kvm/x86.c
>>>> @@ -4810,7 +4810,13 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2)
>>>> * guest to let CPU execute the instruction.
>>>> */
>>>> kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
>>>> - return true;
>>>> +
>>>> + /*
>>>> + * If the access faults on its page table, it can not
>>>> + * be fixed by unprotecting shadow page and it should
>>>> + * be reported to userspace.
>>>> + */
>>>> + return !vcpu->arch.write_fault_to_shadow_pgtable;
>>>> }
>>>
>>> This sounds wrong: only reporting emulation failure in case
>>> of a write fault to shadow pagetable?
>>
>> We suppose unprotecting target-gfn can avoid emulation, the same
>> as current code. :(
>
> Current code treats access to non-mapped guest address as indication to
> exit reporting emulation failure.
>
> The patch above restricts emulation failure reporting to when
> write_fault_to_shadow_pgtable = true.
In the patch 4:
+ /*
+ * If the instruction failed on the error pfn, it can not be fixed,
+ * report the error to userspace.
+ */
+ if (is_error_noslot_pfn(pfn))
+ return false;
+
+ kvm_release_pfn_clean(pfn);
That means, two cases can cause emulation failure:
1): access on non-mapped guest address (The same as the current code)
2): !vcpu->arch.write_fault_to_shadow_pgtable (The new case added in this patch)
Hmm, or i missed something?
^ permalink raw reply [flat|nested] 16+ messages in thread* Re: [PATCH v5 5/5] KVM: x86: improve reexecute_instruction
2013-01-10 20:18 ` Xiao Guangrong
@ 2013-01-11 13:15 ` Marcelo Tosatti
0 siblings, 0 replies; 16+ messages in thread
From: Marcelo Tosatti @ 2013-01-11 13:15 UTC (permalink / raw)
To: Xiao Guangrong; +Cc: Gleb Natapov, LKML, KVM
On Fri, Jan 11, 2013 at 04:18:22AM +0800, Xiao Guangrong wrote:
> On 01/11/2013 03:48 AM, Marcelo Tosatti wrote:
> > On Fri, Jan 11, 2013 at 02:05:33AM +0800, Xiao Guangrong wrote:
> >> On 01/11/2013 01:26 AM, Marcelo Tosatti wrote:
> >>> On Tue, Jan 08, 2013 at 02:38:36PM +0800, Xiao Guangrong wrote:
> >>>> The current reexecute_instruction can not well detect the failed instruction
> >>>> emulation. It allows guest to retry all the instructions except it accesses
> >>>> on error pfn
> >>>>
> >>>> For example, some cases are nested-write-protect - if the page we want to
> >>>> write is used as PDE but it chains to itself. Under this case, we should
> >>>> stop the emulation and report the case to userspace
> >>>>
> >>>> Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
> >>>> ---
> >>>> arch/x86/include/asm/kvm_host.h | 7 +++++++
> >>>> arch/x86/kvm/paging_tmpl.h | 27 ++++++++++++++++++++-------
> >>>> arch/x86/kvm/x86.c | 8 +++++++-
> >>>> 3 files changed, 34 insertions(+), 8 deletions(-)
> >>>>
> >>>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> >>>> index c431b33..d6ab8d2 100644
> >>>> --- a/arch/x86/include/asm/kvm_host.h
> >>>> +++ b/arch/x86/include/asm/kvm_host.h
> >>>> @@ -502,6 +502,13 @@ struct kvm_vcpu_arch {
> >>>> u64 msr_val;
> >>>> struct gfn_to_hva_cache data;
> >>>> } pv_eoi;
> >>>> +
> >>>> + /*
> >>>> + * Indicate whether the access faults on its page table in guest
> >>>> + * which is set when fix page fault and used to detect unhandeable
> >>>> + * instruction.
> >>>> + */
> >>>> + bool write_fault_to_shadow_pgtable;
> >>>> };
> >>>>
> >>>> struct kvm_lpage_info {
> >>>> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> >>>> index 67b390d..df50560 100644
> >>>> --- a/arch/x86/kvm/paging_tmpl.h
> >>>> +++ b/arch/x86/kvm/paging_tmpl.h
> >>>> @@ -497,26 +497,34 @@ out_gpte_changed:
> >>>> * created when kvm establishes shadow page table that stop kvm using large
> >>>> * page size. Do it early can avoid unnecessary #PF and emulation.
> >>>> *
> >>>> + * @write_fault_to_shadow_pgtable will return true if the fault gfn is
> >>>> + * currently used as its page table.
> >>>> + *
> >>>> * Note: the PDPT page table is not checked for PAE-32 bit guest. It is ok
> >>>> * since the PDPT is always shadowed, that means, we can not use large page
> >>>> * size to map the gfn which is used as PDPT.
> >>>> */
> >>>> static bool
> >>>> FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
> >>>> - struct guest_walker *walker, int user_fault)
> >>>> + struct guest_walker *walker, int user_fault,
> >>>> + bool *write_fault_to_shadow_pgtable)
> >>>> {
> >>>> int level;
> >>>> gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker->level) - 1);
> >>>> + bool self_changed = false;
> >>>>
> >>>> if (!(walker->pte_access & ACC_WRITE_MASK ||
> >>>> (!is_write_protection(vcpu) && !user_fault)))
> >>>> return false;
> >>>>
> >>>> - for (level = walker->level; level <= walker->max_level; level++)
> >>>> - if (!((walker->gfn ^ walker->table_gfn[level - 1]) & mask))
> >>>> - return true;
> >>>> + for (level = walker->level; level <= walker->max_level; level++) {
> >>>> + gfn_t gfn = walker->gfn ^ walker->table_gfn[level - 1];
> >>>> +
> >>>> + self_changed |= !(gfn & mask);
> >>>> + *write_fault_to_shadow_pgtable |= !gfn;
> >>>> + }
> >>>>
> >>>> - return false;
> >>>> + return self_changed;
> >>>> }
> >>>>
> >>>> /*
> >>>> @@ -544,7 +552,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
> >>>> int level = PT_PAGE_TABLE_LEVEL;
> >>>> int force_pt_level;
> >>>> unsigned long mmu_seq;
> >>>> - bool map_writable;
> >>>> + bool map_writable, is_self_change_mapping;
> >>>>
> >>>> pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
> >>>>
> >>>> @@ -572,9 +580,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
> >>>> return 0;
> >>>> }
> >>>>
> >>>> + vcpu->arch.write_fault_to_shadow_pgtable = false;
> >>>> +
> >>>> + is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
> >>>> + &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable);
> >>>> +
> >>>> if (walker.level >= PT_DIRECTORY_LEVEL)
> >>>> force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn)
> >>>> - || FNAME(is_self_change_mapping)(vcpu, &walker, user_fault);
> >>>> + || is_self_change_mapping;
> >>>> else
> >>>> force_pt_level = 1;
> >>>> if (!force_pt_level) {
> >>>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> >>>> index 6f13e03..2957012 100644
> >>>> --- a/arch/x86/kvm/x86.c
> >>>> +++ b/arch/x86/kvm/x86.c
> >>>> @@ -4810,7 +4810,13 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2)
> >>>> * guest to let CPU execute the instruction.
> >>>> */
> >>>> kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
> >>>> - return true;
> >>>> +
> >>>> + /*
> >>>> + * If the access faults on its page table, it can not
> >>>> + * be fixed by unprotecting shadow page and it should
> >>>> + * be reported to userspace.
> >>>> + */
> >>>> + return !vcpu->arch.write_fault_to_shadow_pgtable;
> >>>> }
> >>>
> >>> This sounds wrong: only reporting emulation failure in case
> >>> of a write fault to shadow pagetable?
> >>
> >> We suppose unprotecting target-gfn can avoid emulation, the same
> >> as current code. :(
> >
> > Current code treats access to non-mapped guest address as indication to
> > exit reporting emulation failure.
> >
> > The patch above restricts emulation failure reporting to when
> > write_fault_to_shadow_pgtable = true.
>
> In the patch 4:
>
> + /*
> + * If the instruction failed on the error pfn, it can not be fixed,
> + * report the error to userspace.
> + */
> + if (is_error_noslot_pfn(pfn))
> + return false;
> +
> + kvm_release_pfn_clean(pfn);
>
> That means, two cases can cause emulation failure:
>
> 1): access on non-mapped guest address (The same as the current code)
> 2): !vcpu->arch.write_fault_to_shadow_pgtable (The new case added in this patch)
>
> Hmm, or i missed something?
No, i did. Its correct.
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH v5 5/5] KVM: x86: improve reexecute_instruction
2013-01-08 6:38 ` [PATCH v5 5/5] KVM: x86: improve reexecute_instruction Xiao Guangrong
2013-01-10 17:26 ` Marcelo Tosatti
@ 2013-01-10 17:30 ` Marcelo Tosatti
2013-01-10 17:38 ` Gleb Natapov
2013-01-10 18:16 ` Xiao Guangrong
1 sibling, 2 replies; 16+ messages in thread
From: Marcelo Tosatti @ 2013-01-10 17:30 UTC (permalink / raw)
To: Xiao Guangrong; +Cc: Gleb Natapov, LKML, KVM
On Tue, Jan 08, 2013 at 02:38:36PM +0800, Xiao Guangrong wrote:
> The current reexecute_instruction can not well detect the failed instruction
> emulation. It allows guest to retry all the instructions except it accesses
> on error pfn
>
> For example, some cases are nested-write-protect - if the page we want to
> write is used as PDE but it chains to itself. Under this case, we should
> stop the emulation and report the case to userspace
>
> Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
> ---
> arch/x86/include/asm/kvm_host.h | 7 +++++++
> arch/x86/kvm/paging_tmpl.h | 27 ++++++++++++++++++++-------
> arch/x86/kvm/x86.c | 8 +++++++-
> 3 files changed, 34 insertions(+), 8 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index c431b33..d6ab8d2 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -502,6 +502,13 @@ struct kvm_vcpu_arch {
> u64 msr_val;
> struct gfn_to_hva_cache data;
> } pv_eoi;
> +
> + /*
> + * Indicate whether the access faults on its page table in guest
> + * which is set when fix page fault and used to detect unhandeable
> + * instruction.
> + */
> + bool write_fault_to_shadow_pgtable;
> };
>
> struct kvm_lpage_info {
> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> index 67b390d..df50560 100644
> --- a/arch/x86/kvm/paging_tmpl.h
> +++ b/arch/x86/kvm/paging_tmpl.h
> @@ -497,26 +497,34 @@ out_gpte_changed:
> * created when kvm establishes shadow page table that stop kvm using large
> * page size. Do it early can avoid unnecessary #PF and emulation.
> *
> + * @write_fault_to_shadow_pgtable will return true if the fault gfn is
> + * currently used as its page table.
> + *
> * Note: the PDPT page table is not checked for PAE-32 bit guest. It is ok
> * since the PDPT is always shadowed, that means, we can not use large page
> * size to map the gfn which is used as PDPT.
> */
> static bool
> FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
> - struct guest_walker *walker, int user_fault)
> + struct guest_walker *walker, int user_fault,
> + bool *write_fault_to_shadow_pgtable)
> {
> int level;
> gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker->level) - 1);
> + bool self_changed = false;
>
> if (!(walker->pte_access & ACC_WRITE_MASK ||
> (!is_write_protection(vcpu) && !user_fault)))
> return false;
>
> - for (level = walker->level; level <= walker->max_level; level++)
> - if (!((walker->gfn ^ walker->table_gfn[level - 1]) & mask))
> - return true;
> + for (level = walker->level; level <= walker->max_level; level++) {
> + gfn_t gfn = walker->gfn ^ walker->table_gfn[level - 1];
> +
> + self_changed |= !(gfn & mask);
> + *write_fault_to_shadow_pgtable |= !gfn;
> + }
>
> - return false;
> + return self_changed;
> }
>
> /*
> @@ -544,7 +552,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
> int level = PT_PAGE_TABLE_LEVEL;
> int force_pt_level;
> unsigned long mmu_seq;
> - bool map_writable;
> + bool map_writable, is_self_change_mapping;
>
> pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
>
> @@ -572,9 +580,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
> return 0;
> }
>
> + vcpu->arch.write_fault_to_shadow_pgtable = false;
> +
> + is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
> + &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable);
> +
> if (walker.level >= PT_DIRECTORY_LEVEL)
> force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn)
> - || FNAME(is_self_change_mapping)(vcpu, &walker, user_fault);
> + || is_self_change_mapping;
> else
> force_pt_level = 1;
> if (!force_pt_level) {
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 6f13e03..2957012 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -4810,7 +4810,13 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2)
> * guest to let CPU execute the instruction.
> */
> kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
> - return true;
> +
> + /*
> + * If the access faults on its page table, it can not
> + * be fixed by unprotecting shadow page and it should
> + * be reported to userspace.
> + */
> + return !vcpu->arch.write_fault_to_shadow_pgtable;
> }
>
> static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
Also should make sure vcpu->arch.write_fault_to_shadow_pgtable is never
reused. Say, clean when exiting x86_emulate_instruction?
^ permalink raw reply [flat|nested] 16+ messages in thread* Re: [PATCH v5 5/5] KVM: x86: improve reexecute_instruction
2013-01-10 17:30 ` Marcelo Tosatti
@ 2013-01-10 17:38 ` Gleb Natapov
2013-01-10 18:16 ` Xiao Guangrong
1 sibling, 0 replies; 16+ messages in thread
From: Gleb Natapov @ 2013-01-10 17:38 UTC (permalink / raw)
To: Marcelo Tosatti; +Cc: Xiao Guangrong, LKML, KVM
On Thu, Jan 10, 2013 at 03:30:36PM -0200, Marcelo Tosatti wrote:
> On Tue, Jan 08, 2013 at 02:38:36PM +0800, Xiao Guangrong wrote:
> > The current reexecute_instruction can not well detect the failed instruction
> > emulation. It allows guest to retry all the instructions except it accesses
> > on error pfn
> >
> > For example, some cases are nested-write-protect - if the page we want to
> > write is used as PDE but it chains to itself. Under this case, we should
> > stop the emulation and report the case to userspace
> >
> > Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
> > ---
> > arch/x86/include/asm/kvm_host.h | 7 +++++++
> > arch/x86/kvm/paging_tmpl.h | 27 ++++++++++++++++++++-------
> > arch/x86/kvm/x86.c | 8 +++++++-
> > 3 files changed, 34 insertions(+), 8 deletions(-)
> >
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index c431b33..d6ab8d2 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -502,6 +502,13 @@ struct kvm_vcpu_arch {
> > u64 msr_val;
> > struct gfn_to_hva_cache data;
> > } pv_eoi;
> > +
> > + /*
> > + * Indicate whether the access faults on its page table in guest
> > + * which is set when fix page fault and used to detect unhandeable
> > + * instruction.
> > + */
> > + bool write_fault_to_shadow_pgtable;
> > };
> >
> > struct kvm_lpage_info {
> > diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> > index 67b390d..df50560 100644
> > --- a/arch/x86/kvm/paging_tmpl.h
> > +++ b/arch/x86/kvm/paging_tmpl.h
> > @@ -497,26 +497,34 @@ out_gpte_changed:
> > * created when kvm establishes shadow page table that stop kvm using large
> > * page size. Do it early can avoid unnecessary #PF and emulation.
> > *
> > + * @write_fault_to_shadow_pgtable will return true if the fault gfn is
> > + * currently used as its page table.
> > + *
> > * Note: the PDPT page table is not checked for PAE-32 bit guest. It is ok
> > * since the PDPT is always shadowed, that means, we can not use large page
> > * size to map the gfn which is used as PDPT.
> > */
> > static bool
> > FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
> > - struct guest_walker *walker, int user_fault)
> > + struct guest_walker *walker, int user_fault,
> > + bool *write_fault_to_shadow_pgtable)
> > {
> > int level;
> > gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker->level) - 1);
> > + bool self_changed = false;
> >
> > if (!(walker->pte_access & ACC_WRITE_MASK ||
> > (!is_write_protection(vcpu) && !user_fault)))
> > return false;
> >
> > - for (level = walker->level; level <= walker->max_level; level++)
> > - if (!((walker->gfn ^ walker->table_gfn[level - 1]) & mask))
> > - return true;
> > + for (level = walker->level; level <= walker->max_level; level++) {
> > + gfn_t gfn = walker->gfn ^ walker->table_gfn[level - 1];
> > +
> > + self_changed |= !(gfn & mask);
> > + *write_fault_to_shadow_pgtable |= !gfn;
> > + }
> >
> > - return false;
> > + return self_changed;
> > }
> >
> > /*
> > @@ -544,7 +552,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
> > int level = PT_PAGE_TABLE_LEVEL;
> > int force_pt_level;
> > unsigned long mmu_seq;
> > - bool map_writable;
> > + bool map_writable, is_self_change_mapping;
> >
> > pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
> >
> > @@ -572,9 +580,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
> > return 0;
> > }
> >
> > + vcpu->arch.write_fault_to_shadow_pgtable = false;
> > +
> > + is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
> > + &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable);
> > +
> > if (walker.level >= PT_DIRECTORY_LEVEL)
> > force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn)
> > - || FNAME(is_self_change_mapping)(vcpu, &walker, user_fault);
> > + || is_self_change_mapping;
> > else
> > force_pt_level = 1;
> > if (!force_pt_level) {
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index 6f13e03..2957012 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -4810,7 +4810,13 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2)
> > * guest to let CPU execute the instruction.
> > */
> > kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
> > - return true;
> > +
> > + /*
> > + * If the access faults on its page table, it can not
> > + * be fixed by unprotecting shadow page and it should
> > + * be reported to userspace.
> > + */
> > + return !vcpu->arch.write_fault_to_shadow_pgtable;
> > }
> >
> > static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
>
> Also should make sure vcpu->arch.write_fault_to_shadow_pgtable is never
> reused. Say, clean when exiting x86_emulate_instruction?
Clear it right here for clarity.
--
Gleb.
^ permalink raw reply [flat|nested] 16+ messages in thread* Re: [PATCH v5 5/5] KVM: x86: improve reexecute_instruction
2013-01-10 17:30 ` Marcelo Tosatti
2013-01-10 17:38 ` Gleb Natapov
@ 2013-01-10 18:16 ` Xiao Guangrong
2013-01-11 13:15 ` Marcelo Tosatti
1 sibling, 1 reply; 16+ messages in thread
From: Xiao Guangrong @ 2013-01-10 18:16 UTC (permalink / raw)
To: Marcelo Tosatti; +Cc: Gleb Natapov, LKML, KVM
On 01/11/2013 01:30 AM, Marcelo Tosatti wrote:
> On Tue, Jan 08, 2013 at 02:38:36PM +0800, Xiao Guangrong wrote:
>> The current reexecute_instruction can not well detect the failed instruction
>> emulation. It allows guest to retry all the instructions except it accesses
>> on error pfn
>>
>> For example, some cases are nested-write-protect - if the page we want to
>> write is used as PDE but it chains to itself. Under this case, we should
>> stop the emulation and report the case to userspace
>>
>> Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
>> ---
>> arch/x86/include/asm/kvm_host.h | 7 +++++++
>> arch/x86/kvm/paging_tmpl.h | 27 ++++++++++++++++++++-------
>> arch/x86/kvm/x86.c | 8 +++++++-
>> 3 files changed, 34 insertions(+), 8 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>> index c431b33..d6ab8d2 100644
>> --- a/arch/x86/include/asm/kvm_host.h
>> +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -502,6 +502,13 @@ struct kvm_vcpu_arch {
>> u64 msr_val;
>> struct gfn_to_hva_cache data;
>> } pv_eoi;
>> +
>> + /*
>> + * Indicate whether the access faults on its page table in guest
>> + * which is set when fix page fault and used to detect unhandeable
>> + * instruction.
>> + */
>> + bool write_fault_to_shadow_pgtable;
>> };
>>
>> struct kvm_lpage_info {
>> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
>> index 67b390d..df50560 100644
>> --- a/arch/x86/kvm/paging_tmpl.h
>> +++ b/arch/x86/kvm/paging_tmpl.h
>> @@ -497,26 +497,34 @@ out_gpte_changed:
>> * created when kvm establishes shadow page table that stop kvm using large
>> * page size. Do it early can avoid unnecessary #PF and emulation.
>> *
>> + * @write_fault_to_shadow_pgtable will return true if the fault gfn is
>> + * currently used as its page table.
>> + *
>> * Note: the PDPT page table is not checked for PAE-32 bit guest. It is ok
>> * since the PDPT is always shadowed, that means, we can not use large page
>> * size to map the gfn which is used as PDPT.
>> */
>> static bool
>> FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
>> - struct guest_walker *walker, int user_fault)
>> + struct guest_walker *walker, int user_fault,
>> + bool *write_fault_to_shadow_pgtable)
>> {
>> int level;
>> gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker->level) - 1);
>> + bool self_changed = false;
>>
>> if (!(walker->pte_access & ACC_WRITE_MASK ||
>> (!is_write_protection(vcpu) && !user_fault)))
>> return false;
>>
>> - for (level = walker->level; level <= walker->max_level; level++)
>> - if (!((walker->gfn ^ walker->table_gfn[level - 1]) & mask))
>> - return true;
>> + for (level = walker->level; level <= walker->max_level; level++) {
>> + gfn_t gfn = walker->gfn ^ walker->table_gfn[level - 1];
>> +
>> + self_changed |= !(gfn & mask);
>> + *write_fault_to_shadow_pgtable |= !gfn;
>> + }
>>
>> - return false;
>> + return self_changed;
>> }
>>
>> /*
>> @@ -544,7 +552,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
>> int level = PT_PAGE_TABLE_LEVEL;
>> int force_pt_level;
>> unsigned long mmu_seq;
>> - bool map_writable;
>> + bool map_writable, is_self_change_mapping;
>>
>> pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
>>
>> @@ -572,9 +580,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
>> return 0;
>> }
>>
>> + vcpu->arch.write_fault_to_shadow_pgtable = false;
>> +
>> + is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
>> + &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable);
>> +
>> if (walker.level >= PT_DIRECTORY_LEVEL)
>> force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn)
>> - || FNAME(is_self_change_mapping)(vcpu, &walker, user_fault);
>> + || is_self_change_mapping;
>> else
>> force_pt_level = 1;
>> if (!force_pt_level) {
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index 6f13e03..2957012 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -4810,7 +4810,13 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2)
>> * guest to let CPU execute the instruction.
>> */
>> kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
>> - return true;
>> +
>> + /*
>> + * If the access faults on its page table, it can not
>> + * be fixed by unprotecting shadow page and it should
>> + * be reported to userspace.
>> + */
>> + return !vcpu->arch.write_fault_to_shadow_pgtable;
>> }
>>
>> static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
>
> Also should make sure vcpu->arch.write_fault_to_shadow_pgtable is never
> reused. Say, clean when exiting x86_emulate_instruction?
Yes, it is more clear.
But i am thinking if it is really needed because 'cr2' is only valid when it
is called on page fault path, vcpu->arch.write_fault_to_shadow_pgtable is reset
at the beginning of page-fault path.
For other paths, cr2 is always 0 which is always 'NULL' pointer and not mapped
on guest, reexecute_instruction will always return true:
gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
/*
* If the mapping is invalid in guest, let cpu retry
* it to generate fault.
*/
if (gpa == UNMAPPED_GVA)
return true;
^ permalink raw reply [flat|nested] 16+ messages in thread* Re: [PATCH v5 5/5] KVM: x86: improve reexecute_instruction
2013-01-10 18:16 ` Xiao Guangrong
@ 2013-01-11 13:15 ` Marcelo Tosatti
2013-01-11 14:12 ` [PATCH v5 6/5] KVM: x86: clear write_fault_to_shadow_pgtable explicitly Xiao Guangrong
0 siblings, 1 reply; 16+ messages in thread
From: Marcelo Tosatti @ 2013-01-11 13:15 UTC (permalink / raw)
To: Xiao Guangrong; +Cc: Gleb Natapov, LKML, KVM
On Fri, Jan 11, 2013 at 02:16:11AM +0800, Xiao Guangrong wrote:
> On 01/11/2013 01:30 AM, Marcelo Tosatti wrote:
> > On Tue, Jan 08, 2013 at 02:38:36PM +0800, Xiao Guangrong wrote:
> >> The current reexecute_instruction can not well detect the failed instruction
> >> emulation. It allows guest to retry all the instructions except it accesses
> >> on error pfn
> >>
> >> For example, some cases are nested-write-protect - if the page we want to
> >> write is used as PDE but it chains to itself. Under this case, we should
> >> stop the emulation and report the case to userspace
> >>
> >> Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
> >> ---
> >> arch/x86/include/asm/kvm_host.h | 7 +++++++
> >> arch/x86/kvm/paging_tmpl.h | 27 ++++++++++++++++++++-------
> >> arch/x86/kvm/x86.c | 8 +++++++-
> >> 3 files changed, 34 insertions(+), 8 deletions(-)
> >>
> >> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> >> index c431b33..d6ab8d2 100644
> >> --- a/arch/x86/include/asm/kvm_host.h
> >> +++ b/arch/x86/include/asm/kvm_host.h
> >> @@ -502,6 +502,13 @@ struct kvm_vcpu_arch {
> >> u64 msr_val;
> >> struct gfn_to_hva_cache data;
> >> } pv_eoi;
> >> +
> >> + /*
> >> + * Indicate whether the access faults on its page table in guest
> >> + * which is set when fix page fault and used to detect unhandeable
> >> + * instruction.
> >> + */
> >> + bool write_fault_to_shadow_pgtable;
> >> };
> >>
> >> struct kvm_lpage_info {
> >> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> >> index 67b390d..df50560 100644
> >> --- a/arch/x86/kvm/paging_tmpl.h
> >> +++ b/arch/x86/kvm/paging_tmpl.h
> >> @@ -497,26 +497,34 @@ out_gpte_changed:
> >> * created when kvm establishes shadow page table that stop kvm using large
> >> * page size. Do it early can avoid unnecessary #PF and emulation.
> >> *
> >> + * @write_fault_to_shadow_pgtable will return true if the fault gfn is
> >> + * currently used as its page table.
> >> + *
> >> * Note: the PDPT page table is not checked for PAE-32 bit guest. It is ok
> >> * since the PDPT is always shadowed, that means, we can not use large page
> >> * size to map the gfn which is used as PDPT.
> >> */
> >> static bool
> >> FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
> >> - struct guest_walker *walker, int user_fault)
> >> + struct guest_walker *walker, int user_fault,
> >> + bool *write_fault_to_shadow_pgtable)
> >> {
> >> int level;
> >> gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker->level) - 1);
> >> + bool self_changed = false;
> >>
> >> if (!(walker->pte_access & ACC_WRITE_MASK ||
> >> (!is_write_protection(vcpu) && !user_fault)))
> >> return false;
> >>
> >> - for (level = walker->level; level <= walker->max_level; level++)
> >> - if (!((walker->gfn ^ walker->table_gfn[level - 1]) & mask))
> >> - return true;
> >> + for (level = walker->level; level <= walker->max_level; level++) {
> >> + gfn_t gfn = walker->gfn ^ walker->table_gfn[level - 1];
> >> +
> >> + self_changed |= !(gfn & mask);
> >> + *write_fault_to_shadow_pgtable |= !gfn;
> >> + }
> >>
> >> - return false;
> >> + return self_changed;
> >> }
> >>
> >> /*
> >> @@ -544,7 +552,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
> >> int level = PT_PAGE_TABLE_LEVEL;
> >> int force_pt_level;
> >> unsigned long mmu_seq;
> >> - bool map_writable;
> >> + bool map_writable, is_self_change_mapping;
> >>
> >> pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
> >>
> >> @@ -572,9 +580,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
> >> return 0;
> >> }
> >>
> >> + vcpu->arch.write_fault_to_shadow_pgtable = false;
> >> +
> >> + is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
> >> + &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable);
> >> +
> >> if (walker.level >= PT_DIRECTORY_LEVEL)
> >> force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn)
> >> - || FNAME(is_self_change_mapping)(vcpu, &walker, user_fault);
> >> + || is_self_change_mapping;
> >> else
> >> force_pt_level = 1;
> >> if (!force_pt_level) {
> >> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> >> index 6f13e03..2957012 100644
> >> --- a/arch/x86/kvm/x86.c
> >> +++ b/arch/x86/kvm/x86.c
> >> @@ -4810,7 +4810,13 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2)
> >> * guest to let CPU execute the instruction.
> >> */
> >> kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
> >> - return true;
> >> +
> >> + /*
> >> + * If the access faults on its page table, it can not
> >> + * be fixed by unprotecting shadow page and it should
> >> + * be reported to userspace.
> >> + */
> >> + return !vcpu->arch.write_fault_to_shadow_pgtable;
> >> }
> >>
> >> static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
> >
> > Also should make sure vcpu->arch.write_fault_to_shadow_pgtable is never
> > reused. Say, clean when exiting x86_emulate_instruction?
>
> Yes, it is more clear.
>
> But i am thinking if it is really needed because 'cr2' is only valid when it
> is called on page fault path, vcpu->arch.write_fault_to_shadow_pgtable is reset
> at the beginning of page-fault path.
>
> For other paths, cr2 is always 0 which is always 'NULL' pointer and not mapped
> on guest, reexecute_instruction will always return true:
>
> gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
>
> /*
> * If the mapping is invalid in guest, let cpu retry
> * it to generate fault.
> */
> if (gpa == UNMAPPED_GVA)
> return true;
This is cryptic. It's not obvious at all to someone modifying the code,
for example.
Can you please clear it explicitly?
^ permalink raw reply [flat|nested] 16+ messages in thread* [PATCH v5 6/5] KVM: x86: clear write_fault_to_shadow_pgtable explicitly
2013-01-11 13:15 ` Marcelo Tosatti
@ 2013-01-11 14:12 ` Xiao Guangrong
2013-01-11 19:09 ` Marcelo Tosatti
0 siblings, 1 reply; 16+ messages in thread
From: Xiao Guangrong @ 2013-01-11 14:12 UTC (permalink / raw)
To: Marcelo Tosatti; +Cc: Gleb Natapov, LKML, KVM
On 01/11/2013 09:15 PM, Marcelo Tosatti wrote:
>
> This is cryptic. Its not obvious at all for someone modifying the code,
> for example.
>
> Can you please clear it explicitly?
Sure, here is the patch applying your idea — does it look good to you? :)
============================================
Subject: [PATCH 6/6] KVM: x86: clear write_fault_to_shadow_pgtable explicitly
Clear it explicitly when exiting x86_emulate_instruction to clarify the code,
as suggested by Marcelo.
Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
---
arch/x86/kvm/x86.c | 16 ++++++++++++----
1 files changed, 12 insertions(+), 4 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2957012..89d01a8 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4753,7 +4753,8 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu)
return r;
}
-static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2)
+static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
+ bool write_fault_to_shadow_pgtable)
{
gpa_t gpa = cr2;
pfn_t pfn;
@@ -4816,7 +4817,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2)
* be fixed by unprotecting shadow page and it should
* be reported to userspace.
*/
- return !vcpu->arch.write_fault_to_shadow_pgtable;
+ return !write_fault_to_shadow_pgtable;
}
static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
@@ -4875,7 +4876,13 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
int r;
struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
bool writeback = true;
+ bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
+ /*
+ * Clear write_fault_to_shadow_pgtable here to ensure it is
+ * never reused.
+ */
+ vcpu->arch.write_fault_to_shadow_pgtable = false;
kvm_clear_exception_queue(vcpu);
if (!(emulation_type & EMULTYPE_NO_DECODE)) {
@@ -4894,7 +4901,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
if (r != EMULATION_OK) {
if (emulation_type & EMULTYPE_TRAP_UD)
return EMULATE_FAIL;
- if (reexecute_instruction(vcpu, cr2))
+ if (reexecute_instruction(vcpu, cr2,
+ write_fault_to_spt))
return EMULATE_DONE;
if (emulation_type & EMULTYPE_SKIP)
return EMULATE_FAIL;
@@ -4924,7 +4932,7 @@ restart:
return EMULATE_DONE;
if (r == EMULATION_FAILED) {
- if (reexecute_instruction(vcpu, cr2))
+ if (reexecute_instruction(vcpu, cr2, write_fault_to_spt))
return EMULATE_DONE;
return handle_emulation_failure(vcpu);
--
1.7.7.6
^ permalink raw reply related [flat|nested] 16+ messages in thread* Re: [PATCH v5 6/5] KVM: x86: clear write_fault_to_shadow_pgtable explicitly
2013-01-11 14:12 ` [PATCH v5 6/5] KVM: x86: clear write_fault_to_shadow_pgtable explicitly Xiao Guangrong
@ 2013-01-11 19:09 ` Marcelo Tosatti
0 siblings, 0 replies; 16+ messages in thread
From: Marcelo Tosatti @ 2013-01-11 19:09 UTC (permalink / raw)
To: Xiao Guangrong; +Cc: Gleb Natapov, LKML, KVM
On Fri, Jan 11, 2013 at 10:12:58PM +0800, Xiao Guangrong wrote:
> On 01/11/2013 09:15 PM, Marcelo Tosatti wrote:
>
> >
> > This is cryptic. Its not obvious at all for someone modifying the code,
> > for example.
> >
> > Can you please clear it explicitly?
>
> Sure, this is the patch to apply your idea, is it good to you? :)
>
> ============================================
> Subject: [PATCH 6/6] KVM: x86: clear write_fault_to_shadow_pgtable explicitly
>
> Clear it explicitly when exiting x86_emulate_instruction to clarify the code,
> it is suggested by Marcelo
>
> Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
> ---
> arch/x86/kvm/x86.c | 16 ++++++++++++----
> 1 files changed, 12 insertions(+), 4 deletions(-)
Fine, please rebase against queue.
^ permalink raw reply [flat|nested] 16+ messages in thread
end of thread, other threads:[~2013-01-11 19:09 UTC | newest]
Thread overview: 16+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2013-01-08 6:36 [PATCH v5 1/5] KVM: MMU: fix Dirty bit missed if CR0.WP = 0 Xiao Guangrong
2013-01-08 6:36 ` [PATCH v5 2/5] KVM: MMU: fix infinite fault access retry Xiao Guangrong
2013-01-08 6:37 ` [PATCH v5 3/5] KVM: x86: clean up reexecute_instruction Xiao Guangrong
2013-01-08 6:37 ` [PATCH v5 4/5] KVM: x86: let reexecute_instruction work for tdp Xiao Guangrong
2013-01-08 6:38 ` [PATCH v5 5/5] KVM: x86: improve reexecute_instruction Xiao Guangrong
2013-01-10 17:26 ` Marcelo Tosatti
2013-01-10 18:05 ` Xiao Guangrong
2013-01-10 19:48 ` Marcelo Tosatti
2013-01-10 20:18 ` Xiao Guangrong
2013-01-11 13:15 ` Marcelo Tosatti
2013-01-10 17:30 ` Marcelo Tosatti
2013-01-10 17:38 ` Gleb Natapov
2013-01-10 18:16 ` Xiao Guangrong
2013-01-11 13:15 ` Marcelo Tosatti
2013-01-11 14:12 ` [PATCH v5 6/5] KVM: x86: clear write_fault_to_shadow_pgtable explicitly Xiao Guangrong
2013-01-11 19:09 ` Marcelo Tosatti
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox