public inbox for kvm@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/1] Enable a guest to access a device's memory mapped I/O regions directly.
  2008-04-16 13:25 direct mmio for passthrough - kernel part benami
@ 2008-04-16 13:25 ` benami
  2008-04-16 13:25   ` benami
  2008-04-18 15:50   ` Avi Kivity
  0 siblings, 2 replies; 9+ messages in thread
From: benami @ 2008-04-16 13:25 UTC (permalink / raw)
  To: amit.shah; +Cc: kvm-devel, allen.m.kay, andrea, benami

From: Ben-Ami Yassour <benami@il.ibm.com>

Signed-off-by: Ben-Ami Yassour <benami@il.ibm.com>
Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com>
---
 arch/x86/kvm/mmu.c         |   59 +++++++++++++++++++++++++++++--------------
 arch/x86/kvm/paging_tmpl.h |   19 +++++++++----
 include/linux/kvm_host.h   |    2 +-
 virt/kvm/kvm_main.c        |   17 +++++++++++-
 4 files changed, 69 insertions(+), 28 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 078a7f1..c89029d 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -112,6 +112,8 @@ static int dbg = 1;
 #define PT_FIRST_AVAIL_BITS_SHIFT 9
 #define PT64_SECOND_AVAIL_BITS_SHIFT 52
 
+#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
+
 #define VALID_PAGE(x) ((x) != INVALID_PAGE)
 
 #define PT64_LEVEL_BITS 9
@@ -237,6 +239,9 @@ static int is_dirty_pte(unsigned long pte)
 
 static int is_rmap_pte(u64 pte)
 {
+	if (pte & PT_SHADOW_IO_MARK)
+		return false;
+
 	return is_shadow_present_pte(pte);
 }
 
@@ -1034,7 +1039,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 			 unsigned pt_access, unsigned pte_access,
 			 int user_fault, int write_fault, int dirty,
 			 int *ptwrite, int largepage, gfn_t gfn,
-			 pfn_t pfn, bool speculative)
+			 pfn_t pfn, bool speculative,
+			 int direct_mmio)
 {
 	u64 spte;
 	int was_rmapped = 0;
@@ -1114,6 +1120,9 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 		}
 	}
 
+	if (direct_mmio)
+		spte |= PT_SHADOW_IO_MARK;
+
 unshadowed:
 
 	if (pte_access & ACC_WRITE_MASK)
@@ -1129,16 +1138,19 @@ unshadowed:
 		++vcpu->kvm->stat.lpages;
 
 	page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
-	if (!was_rmapped) {
-		rmap_add(vcpu, shadow_pte, gfn, largepage);
-		if (!is_rmap_pte(*shadow_pte))
-			kvm_release_pfn_clean(pfn);
-	} else {
-		if (was_writeble)
-			kvm_release_pfn_dirty(pfn);
-		else
-			kvm_release_pfn_clean(pfn);
+	if (!direct_mmio) {
+		if (!was_rmapped) {
+			rmap_add(vcpu, shadow_pte, gfn, largepage);
+			if (!is_rmap_pte(*shadow_pte))
+				kvm_release_pfn_clean(pfn);
+		} else {
+			if (was_writeble)
+				kvm_release_pfn_dirty(pfn);
+			else
+				kvm_release_pfn_clean(pfn);
+		}
 	}
+
 	if (!ptwrite || !*ptwrite)
 		vcpu->arch.last_pte_updated = shadow_pte;
 }
@@ -1149,7 +1161,7 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
 
 static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 			   int largepage, gfn_t gfn, pfn_t pfn,
-			   int level)
+			   int level, int direct_mmio)
 {
 	hpa_t table_addr = vcpu->arch.mmu.root_hpa;
 	int pt_write = 0;
@@ -1163,13 +1175,15 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 
 		if (level == 1) {
 			mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
-				     0, write, 1, &pt_write, 0, gfn, pfn, false);
+				     0, write, 1, &pt_write, 0, gfn, pfn,
+				     false, direct_mmio);
 			return pt_write;
 		}
 
 		if (largepage && level == 2) {
 			mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
-				     0, write, 1, &pt_write, 1, gfn, pfn, false);
+				     0, write, 1, &pt_write, 1, gfn, pfn,
+				     false, direct_mmio);
 			return pt_write;
 		}
 
@@ -1200,6 +1214,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 	int r;
 	int largepage = 0;
 	pfn_t pfn;
+	int direct_mmio = 0;
 
 	down_read(&current->mm->mmap_sem);
 	if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
@@ -1207,10 +1222,10 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 		largepage = 1;
 	}
 
-	pfn = gfn_to_pfn(vcpu->kvm, gfn);
+	pfn = gfn_to_pfn(vcpu->kvm, gfn, &direct_mmio);
 	up_read(&current->mm->mmap_sem);
 
-	/* mmio */
+	/* handle emulated mmio */
 	if (is_error_pfn(pfn)) {
 		kvm_release_pfn_clean(pfn);
 		return 1;
@@ -1219,7 +1234,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 	spin_lock(&vcpu->kvm->mmu_lock);
 	kvm_mmu_free_some_pages(vcpu);
 	r = __direct_map(vcpu, v, write, largepage, gfn, pfn,
-			 PT32E_ROOT_LEVEL);
+			 PT32E_ROOT_LEVEL, direct_mmio);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 
 
@@ -1355,6 +1370,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
 	int r;
 	int largepage = 0;
 	gfn_t gfn = gpa >> PAGE_SHIFT;
+	int direct_mmio = 0;
 
 	ASSERT(vcpu);
 	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
@@ -1368,7 +1384,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
 		gfn &= ~(KVM_PAGES_PER_HPAGE-1);
 		largepage = 1;
 	}
-	pfn = gfn_to_pfn(vcpu->kvm, gfn);
+	pfn = gfn_to_pfn(vcpu->kvm, gfn, &direct_mmio);
 	up_read(&current->mm->mmap_sem);
 	if (is_error_pfn(pfn)) {
 		kvm_release_pfn_clean(pfn);
@@ -1377,7 +1393,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
 	spin_lock(&vcpu->kvm->mmu_lock);
 	kvm_mmu_free_some_pages(vcpu);
 	r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
-			 largepage, gfn, pfn, TDP_ROOT_LEVEL);
+			 largepage, gfn, pfn, TDP_ROOT_LEVEL,
+			 direct_mmio);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 
 	return r;
@@ -1643,6 +1660,7 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	int r;
 	u64 gpte = 0;
 	pfn_t pfn;
+	int direct_mmio = 0;
 
 	vcpu->arch.update_pte.largepage = 0;
 
@@ -1678,9 +1696,12 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 		gfn &= ~(KVM_PAGES_PER_HPAGE-1);
 		vcpu->arch.update_pte.largepage = 1;
 	}
-	pfn = gfn_to_pfn(vcpu->kvm, gfn);
+	pfn = gfn_to_pfn(vcpu->kvm, gfn, &direct_mmio);
 	up_read(&current->mm->mmap_sem);
 
+	if (direct_mmio)
+		return;
+
 	if (is_error_pfn(pfn)) {
 		kvm_release_pfn_clean(pfn);
 		return;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 156fe10..e85d8ae 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -264,9 +264,10 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
 	if (is_error_pfn(pfn))
 		return;
 	kvm_get_pfn(pfn);
+
 	mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
 		     gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte),
-		     pfn, true);
+		     pfn, true, false);
 }
 
 /*
@@ -275,7 +276,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
 static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 			 struct guest_walker *walker,
 			 int user_fault, int write_fault, int largepage,
-			 int *ptwrite, pfn_t pfn)
+			 int *ptwrite, pfn_t pfn, int direct_mmio)
 {
 	hpa_t shadow_addr;
 	int level;
@@ -349,11 +350,15 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 	mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access,
 		     user_fault, write_fault,
 		     walker->ptes[walker->level-1] & PT_DIRTY_MASK,
-		     ptwrite, largepage, walker->gfn, pfn, false);
+		     ptwrite, largepage, walker->gfn, pfn, false,
+		     direct_mmio);
 
 	return shadow_ent;
 }
 
+static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr);
+
+
 /*
  * Page fault handler.  There are several causes for a page fault:
  *   - there is no shadow pte for the guest pte
@@ -380,6 +385,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	int r;
 	pfn_t pfn;
 	int largepage = 0;
+	int direct_mmio = 0;
 
 	pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
 	kvm_mmu_audit(vcpu, "pre page fault");
@@ -413,10 +419,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 			largepage = 1;
 		}
 	}
-	pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
+	pfn = gfn_to_pfn(vcpu->kvm, walker.gfn, &direct_mmio);
 	up_read(&current->mm->mmap_sem);
 
-	/* mmio */
+	/* handle emulated mmio */
 	if (is_error_pfn(pfn)) {
 		pgprintk("gfn %x is mmio\n", walker.gfn);
 		kvm_release_pfn_clean(pfn);
@@ -426,7 +432,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	spin_lock(&vcpu->kvm->mmu_lock);
 	kvm_mmu_free_some_pages(vcpu);
 	shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
-				  largepage, &write_pt, pfn);
+				  largepage, &write_pt, pfn,
+				  direct_mmio);
 
 	pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
 		 shadow_pte, *shadow_pte, write_pt);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 578c363..0910cc1 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -173,7 +173,7 @@ void kvm_release_page_dirty(struct page *page);
 void kvm_set_page_dirty(struct page *page);
 void kvm_set_page_accessed(struct page *page);
 
-pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
+pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn, int *direct_mmio);
 void kvm_release_pfn_dirty(pfn_t);
 void kvm_release_pfn_clean(pfn_t pfn);
 void kvm_set_pfn_dirty(pfn_t pfn);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 6a52c08..07b95f7 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -526,20 +526,33 @@ unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
 /*
  * Requires current->mm->mmap_sem to be held
  */
-pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
+pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn, int *direct_mmio)
 {
 	struct page *page[1];
 	unsigned long addr;
 	int npages;
+	struct vm_area_struct *vma;
 
 	might_sleep();
 
+	if (direct_mmio)
+		*direct_mmio = 0;
+
 	addr = gfn_to_hva(kvm, gfn);
 	if (kvm_is_error_hva(addr)) {
 		get_page(bad_page);
 		return page_to_pfn(bad_page);
 	}
 
+	/* handle mmio */
+	vma = find_vma(current->mm, addr);
+	if (vma->vm_flags & VM_IO) {
+		if (direct_mmio)
+			*direct_mmio = 1;
+
+		return ((addr - vma->vm_start) >> PAGE_SHIFT) +  vma->vm_pgoff;
+	}
+
 	npages = get_user_pages(current, current->mm, addr, 1, 1, 1, page,
 				NULL);
 
@@ -555,7 +568,7 @@ EXPORT_SYMBOL_GPL(gfn_to_pfn);
 
 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
 {
-	return pfn_to_page(gfn_to_pfn(kvm, gfn));
+	return pfn_to_page(gfn_to_pfn(kvm, gfn, NULL));
 }
 
 EXPORT_SYMBOL_GPL(gfn_to_page);
-- 
1.5.4.5


-------------------------------------------------------------------------
This SF.net email is sponsored by the 2008 JavaOne(SM) Conference 
Don't miss this year's exciting event. There's still time to save $100. 
Use priority code J8TL2D2. 
http://ad.doubleclick.net/clk;198757673;13503038;p?http://java.sun.com/javaone

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH 1/1] Enable a guest to access a device's memory mapped I/O regions directly.
  2008-04-16 13:25 ` [PATCH 1/1] Enable a guest to access a device's memory mapped I/O regions directly benami
@ 2008-04-16 13:25   ` benami
  2008-04-18 15:50   ` Avi Kivity
  1 sibling, 0 replies; 9+ messages in thread
From: benami @ 2008-04-16 13:25 UTC (permalink / raw)
  To: amit.shah; +Cc: kvm-devel, allen.m.kay, andrea, benami

From: Ben-Ami Yassour <benami@il.ibm.com>

Signed-off-by: Ben-Ami Yassour <benami@il.ibm.com>
Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com>
---
 arch/x86/kvm/mmu.c         |   59 +++++++++++++++++++++++++++++--------------
 arch/x86/kvm/paging_tmpl.h |   19 +++++++++----
 include/linux/kvm_host.h   |    2 +-
 virt/kvm/kvm_main.c        |   17 +++++++++++-
 4 files changed, 69 insertions(+), 28 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 078a7f1..c89029d 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -112,6 +112,8 @@ static int dbg = 1;
 #define PT_FIRST_AVAIL_BITS_SHIFT 9
 #define PT64_SECOND_AVAIL_BITS_SHIFT 52
 
+#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
+
 #define VALID_PAGE(x) ((x) != INVALID_PAGE)
 
 #define PT64_LEVEL_BITS 9
@@ -237,6 +239,9 @@ static int is_dirty_pte(unsigned long pte)
 
 static int is_rmap_pte(u64 pte)
 {
+	if (pte & PT_SHADOW_IO_MARK)
+		return false;
+
 	return is_shadow_present_pte(pte);
 }
 
@@ -1034,7 +1039,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 			 unsigned pt_access, unsigned pte_access,
 			 int user_fault, int write_fault, int dirty,
 			 int *ptwrite, int largepage, gfn_t gfn,
-			 pfn_t pfn, bool speculative)
+			 pfn_t pfn, bool speculative,
+			 int direct_mmio)
 {
 	u64 spte;
 	int was_rmapped = 0;
@@ -1114,6 +1120,9 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 		}
 	}
 
+	if (direct_mmio)
+		spte |= PT_SHADOW_IO_MARK;
+
 unshadowed:
 
 	if (pte_access & ACC_WRITE_MASK)
@@ -1129,16 +1138,19 @@ unshadowed:
 		++vcpu->kvm->stat.lpages;
 
 	page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
-	if (!was_rmapped) {
-		rmap_add(vcpu, shadow_pte, gfn, largepage);
-		if (!is_rmap_pte(*shadow_pte))
-			kvm_release_pfn_clean(pfn);
-	} else {
-		if (was_writeble)
-			kvm_release_pfn_dirty(pfn);
-		else
-			kvm_release_pfn_clean(pfn);
+	if (!direct_mmio) {
+		if (!was_rmapped) {
+			rmap_add(vcpu, shadow_pte, gfn, largepage);
+			if (!is_rmap_pte(*shadow_pte))
+				kvm_release_pfn_clean(pfn);
+		} else {
+			if (was_writeble)
+				kvm_release_pfn_dirty(pfn);
+			else
+				kvm_release_pfn_clean(pfn);
+		}
 	}
+
 	if (!ptwrite || !*ptwrite)
 		vcpu->arch.last_pte_updated = shadow_pte;
 }
@@ -1149,7 +1161,7 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
 
 static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 			   int largepage, gfn_t gfn, pfn_t pfn,
-			   int level)
+			   int level, int direct_mmio)
 {
 	hpa_t table_addr = vcpu->arch.mmu.root_hpa;
 	int pt_write = 0;
@@ -1163,13 +1175,15 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 
 		if (level == 1) {
 			mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
-				     0, write, 1, &pt_write, 0, gfn, pfn, false);
+				     0, write, 1, &pt_write, 0, gfn, pfn,
+				     false, direct_mmio);
 			return pt_write;
 		}
 
 		if (largepage && level == 2) {
 			mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
-				     0, write, 1, &pt_write, 1, gfn, pfn, false);
+				     0, write, 1, &pt_write, 1, gfn, pfn,
+				     false, direct_mmio);
 			return pt_write;
 		}
 
@@ -1200,6 +1214,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 	int r;
 	int largepage = 0;
 	pfn_t pfn;
+	int direct_mmio = 0;
 
 	down_read(&current->mm->mmap_sem);
 	if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
@@ -1207,10 +1222,10 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 		largepage = 1;
 	}
 
-	pfn = gfn_to_pfn(vcpu->kvm, gfn);
+	pfn = gfn_to_pfn(vcpu->kvm, gfn, &direct_mmio);
 	up_read(&current->mm->mmap_sem);
 
-	/* mmio */
+	/* handle emulated mmio */
 	if (is_error_pfn(pfn)) {
 		kvm_release_pfn_clean(pfn);
 		return 1;
@@ -1219,7 +1234,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 	spin_lock(&vcpu->kvm->mmu_lock);
 	kvm_mmu_free_some_pages(vcpu);
 	r = __direct_map(vcpu, v, write, largepage, gfn, pfn,
-			 PT32E_ROOT_LEVEL);
+			 PT32E_ROOT_LEVEL, direct_mmio);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 
 
@@ -1355,6 +1370,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
 	int r;
 	int largepage = 0;
 	gfn_t gfn = gpa >> PAGE_SHIFT;
+	int direct_mmio = 0;
 
 	ASSERT(vcpu);
 	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
@@ -1368,7 +1384,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
 		gfn &= ~(KVM_PAGES_PER_HPAGE-1);
 		largepage = 1;
 	}
-	pfn = gfn_to_pfn(vcpu->kvm, gfn);
+	pfn = gfn_to_pfn(vcpu->kvm, gfn, &direct_mmio);
 	up_read(&current->mm->mmap_sem);
 	if (is_error_pfn(pfn)) {
 		kvm_release_pfn_clean(pfn);
@@ -1377,7 +1393,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
 	spin_lock(&vcpu->kvm->mmu_lock);
 	kvm_mmu_free_some_pages(vcpu);
 	r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
-			 largepage, gfn, pfn, TDP_ROOT_LEVEL);
+			 largepage, gfn, pfn, TDP_ROOT_LEVEL,
+			 direct_mmio);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 
 	return r;
@@ -1643,6 +1660,7 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	int r;
 	u64 gpte = 0;
 	pfn_t pfn;
+	int direct_mmio = 0;
 
 	vcpu->arch.update_pte.largepage = 0;
 
@@ -1678,9 +1696,12 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 		gfn &= ~(KVM_PAGES_PER_HPAGE-1);
 		vcpu->arch.update_pte.largepage = 1;
 	}
-	pfn = gfn_to_pfn(vcpu->kvm, gfn);
+	pfn = gfn_to_pfn(vcpu->kvm, gfn, &direct_mmio);
 	up_read(&current->mm->mmap_sem);
 
+	if (direct_mmio)
+		return;
+
 	if (is_error_pfn(pfn)) {
 		kvm_release_pfn_clean(pfn);
 		return;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 156fe10..e85d8ae 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -264,9 +264,10 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
 	if (is_error_pfn(pfn))
 		return;
 	kvm_get_pfn(pfn);
+
 	mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
 		     gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte),
-		     pfn, true);
+		     pfn, true, false);
 }
 
 /*
@@ -275,7 +276,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
 static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 			 struct guest_walker *walker,
 			 int user_fault, int write_fault, int largepage,
-			 int *ptwrite, pfn_t pfn)
+			 int *ptwrite, pfn_t pfn, int direct_mmio)
 {
 	hpa_t shadow_addr;
 	int level;
@@ -349,11 +350,15 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 	mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access,
 		     user_fault, write_fault,
 		     walker->ptes[walker->level-1] & PT_DIRTY_MASK,
-		     ptwrite, largepage, walker->gfn, pfn, false);
+		     ptwrite, largepage, walker->gfn, pfn, false,
+		     direct_mmio);
 
 	return shadow_ent;
 }
 
+static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr);
+
+
 /*
  * Page fault handler.  There are several causes for a page fault:
  *   - there is no shadow pte for the guest pte
@@ -380,6 +385,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	int r;
 	pfn_t pfn;
 	int largepage = 0;
+	int direct_mmio = 0;
 
 	pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
 	kvm_mmu_audit(vcpu, "pre page fault");
@@ -413,10 +419,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 			largepage = 1;
 		}
 	}
-	pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
+	pfn = gfn_to_pfn(vcpu->kvm, walker.gfn, &direct_mmio);
 	up_read(&current->mm->mmap_sem);
 
-	/* mmio */
+	/* handle emulated mmio */
 	if (is_error_pfn(pfn)) {
 		pgprintk("gfn %x is mmio\n", walker.gfn);
 		kvm_release_pfn_clean(pfn);
@@ -426,7 +432,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	spin_lock(&vcpu->kvm->mmu_lock);
 	kvm_mmu_free_some_pages(vcpu);
 	shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
-				  largepage, &write_pt, pfn);
+				  largepage, &write_pt, pfn,
+				  direct_mmio);
 
 	pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
 		 shadow_pte, *shadow_pte, write_pt);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 578c363..0910cc1 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -173,7 +173,7 @@ void kvm_release_page_dirty(struct page *page);
 void kvm_set_page_dirty(struct page *page);
 void kvm_set_page_accessed(struct page *page);
 
-pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
+pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn, int *direct_mmio);
 void kvm_release_pfn_dirty(pfn_t);
 void kvm_release_pfn_clean(pfn_t pfn);
 void kvm_set_pfn_dirty(pfn_t pfn);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 6a52c08..07b95f7 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -526,20 +526,33 @@ unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
 /*
  * Requires current->mm->mmap_sem to be held
  */
-pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
+pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn, int *direct_mmio)
 {
 	struct page *page[1];
 	unsigned long addr;
 	int npages;
+	struct vm_area_struct *vma;
 
 	might_sleep();
 
+	if (direct_mmio)
+		*direct_mmio = 0;
+
 	addr = gfn_to_hva(kvm, gfn);
 	if (kvm_is_error_hva(addr)) {
 		get_page(bad_page);
 		return page_to_pfn(bad_page);
 	}
 
+	/* handle mmio */
+	vma = find_vma(current->mm, addr);
+	if (vma->vm_flags & VM_IO) {
+		if (direct_mmio)
+			*direct_mmio = 1;
+
+		return ((addr - vma->vm_start) >> PAGE_SHIFT) +  vma->vm_pgoff;
+	}
+
 	npages = get_user_pages(current, current->mm, addr, 1, 1, 1, page,
 				NULL);
 
@@ -555,7 +568,7 @@ EXPORT_SYMBOL_GPL(gfn_to_pfn);
 
 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
 {
-	return pfn_to_page(gfn_to_pfn(kvm, gfn));
+	return pfn_to_page(gfn_to_pfn(kvm, gfn, NULL));
 }
 
 EXPORT_SYMBOL_GPL(gfn_to_page);
-- 
1.5.4.5


-------------------------------------------------------------------------
This SF.net email is sponsored by the 2008 JavaOne(SM) Conference 
Don't miss this year's exciting event. There's still time to save $100. 
Use priority code J8TL2D2. 
http://ad.doubleclick.net/clk;198757673;13503038;p?http://java.sun.com/javaone

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH 1/1] Enable a guest to access a device's memory mapped I/O regions directly.
@ 2008-04-16 13:26 benami
  2008-04-16 13:26 ` benami
  2008-04-18 15:56 ` Avi Kivity
  0 siblings, 2 replies; 9+ messages in thread
From: benami @ 2008-04-16 13:26 UTC (permalink / raw)
  To: amit.shah; +Cc: kvm-devel, allen.m.kay, andrea, benami

From: Ben-Ami Yassour <benami@il.ibm.com>

Signed-off-by: Ben-Ami Yassour <benami@il.ibm.com>
Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com>
---
 libkvm/libkvm.c           |   24 ++++++++----
 qemu/hw/pci-passthrough.c |   89 +++++++++++----------------------------------
 qemu/hw/pci-passthrough.h |    2 +
 3 files changed, 40 insertions(+), 75 deletions(-)

diff --git a/libkvm/libkvm.c b/libkvm/libkvm.c
index de91328..8c02af9 100644
--- a/libkvm/libkvm.c
+++ b/libkvm/libkvm.c
@@ -400,7 +400,7 @@ void *kvm_create_userspace_phys_mem(kvm_context_t kvm, unsigned long phys_start,
 {
 	int r;
 	int prot = PROT_READ;
-	void *ptr;
+	void *ptr = NULL;
 	struct kvm_userspace_memory_region memory = {
 		.memory_size = len,
 		.guest_phys_addr = phys_start,
@@ -410,16 +410,24 @@ void *kvm_create_userspace_phys_mem(kvm_context_t kvm, unsigned long phys_start,
 	if (writable)
 		prot |= PROT_WRITE;
 
-	ptr = mmap(NULL, len, prot, MAP_ANONYMOUS | MAP_SHARED, -1, 0);
-	if (ptr == MAP_FAILED) {
-		fprintf(stderr, "create_userspace_phys_mem: %s", strerror(errno));
-		return 0;
-	}
+	if (len > 0) {
+		ptr = mmap(NULL, len, prot, MAP_ANONYMOUS | MAP_SHARED, -1, 0);
+		if (ptr == MAP_FAILED) {
+			fprintf(stderr, "create_userspace_phys_mem: %s",
+				strerror(errno));
+			return 0;
+		}
 
-	memset(ptr, 0, len);
+		memset(ptr, 0, len);
+	}
 
 	memory.userspace_addr = (unsigned long)ptr;
-	memory.slot = get_free_slot(kvm);
+
+	if (len > 0)
+		memory.slot = get_free_slot(kvm);
+	else
+		memory.slot = get_slot(phys_start);
+
 	r = ioctl(kvm->vm_fd, KVM_SET_USER_MEMORY_REGION, &memory);
 	if (r == -1) {
 		fprintf(stderr, "create_userspace_phys_mem: %s", strerror(errno));
diff --git a/qemu/hw/pci-passthrough.c b/qemu/hw/pci-passthrough.c
index 7ffcc7b..a5894d9 100644
--- a/qemu/hw/pci-passthrough.c
+++ b/qemu/hw/pci-passthrough.c
@@ -25,18 +25,6 @@ typedef __u64 resource_size_t;
 extern kvm_context_t kvm_context;
 extern FILE *logfile;
 
-CPUReadMemoryFunc *pt_mmio_read_cb[3] = {
-	pt_mmio_readb,
-	pt_mmio_readw,
-	pt_mmio_readl
-};
-
-CPUWriteMemoryFunc *pt_mmio_write_cb[3] = {
-	pt_mmio_writeb,
-	pt_mmio_writew,
-	pt_mmio_writel
-};
-
 //#define PT_DEBUG
 
 #ifdef PT_DEBUG
@@ -45,47 +33,6 @@ CPUWriteMemoryFunc *pt_mmio_write_cb[3] = {
 #define DEBUG(fmt, args...)
 #endif
 
-#define pt_mmio_write(suffix, type)					\
-void pt_mmio_write##suffix(void *opaque, target_phys_addr_t e_phys,	\
-				uint32_t value)				\
-{									\
-	pt_region_t *r_access = (pt_region_t *)opaque;			\
-	void *r_virt = (u8 *)r_access->r_virtbase + 			\
-			(e_phys - r_access->e_physbase);		\
-	if (r_access->debug & PT_DEBUG_MMIO) {				\
-		fprintf(logfile, "pt_mmio_write" #suffix		\
-			": e_physbase=%p e_phys=%p r_virt=%p value=%08x\n", \
-			(void *)r_access->e_physbase, (void *)e_phys,	\
-			r_virt, value);					\
-	}								\
-	*(type *)r_virt = (type)value;					\
-}
-
-pt_mmio_write(b, u8)
-pt_mmio_write(w, u16)
-pt_mmio_write(l, u32)
-
-#define pt_mmio_read(suffix, type)					\
-uint32_t pt_mmio_read##suffix(void *opaque, target_phys_addr_t e_phys)	\
-{									\
-	pt_region_t *r_access = (pt_region_t *)opaque;			\
-	void *r_virt = (u8 *)r_access->r_virtbase + 			\
-			(e_phys - r_access->e_physbase);		\
-	uint32_t value = (u32) (*(type *) r_virt);			\
-	if (r_access->debug & PT_DEBUG_MMIO) {				\
-		fprintf(logfile,					\
-			"pt_mmio_read" #suffix ": e_physbase=%p "	\
-			"e_phys=%p r_virt=%p value=%08x\n",		\
-			(void *)r_access->e_physbase,			\
-			(void *)e_phys, r_virt, value);			\
-	}								\
-	return value;							\
-}
-
-pt_mmio_read(b, u8)
-pt_mmio_read(w, u16)
-pt_mmio_read(l, u32)
-
 #define pt_ioport_write(suffix)						\
 void pt_ioport_write##suffix(void *opaque, uint32_t addr, uint32_t value) \
 {									\
@@ -127,22 +74,33 @@ pt_ioport_read(b)
 pt_ioport_read(w)
 pt_ioport_read(l)
 
-static void pt_iomem_map(PCIDevice * d, int region_num,
-			 uint32_t e_phys, uint32_t e_size, int type)
+void pt_iomem_map(PCIDevice * pci_dev, int region_num, uint32_t e_phys,
+		  uint32_t e_size, int type)
 {
-	pt_dev_t *r_dev = (pt_dev_t *) d;
-
-	r_dev->v_addrs[region_num].e_physbase = e_phys;
+	pt_dev_t *r_dev = (pt_dev_t *) pci_dev;
+	pt_region_t *region = &r_dev->v_addrs[region_num];
+	int first_map = (region->e_size == 0);
+	int ret = 0;
 
 	DEBUG("e_phys=%08x r_virt=%p type=%d len=%08x region_num=%d \n",
 	      e_phys, r_dev->v_addrs[region_num].r_virtbase, type, e_size,
 	      region_num);
 
-	cpu_register_physical_memory(e_phys,
-				     r_dev->dev.io_regions[region_num].size,
-				     r_dev->v_addrs[region_num].memory_index);
-}
+	region->e_physbase = e_phys;
+	region->e_size = e_size;
+
+	if (!first_map)
+		kvm_destroy_phys_mem(kvm_context, e_phys, e_size);
 
+	if (e_size > 0)
+		ret = kvm_register_userspace_phys_mem(kvm_context,
+						      e_phys,
+						      region->r_virtbase,
+						      e_size,
+						      0);
+	if (ret != 0)
+		fprintf(logfile, "Error: create new mapping failed\n");
+}
 
 static void pt_ioport_map(PCIDevice * pci_dev, int region_num,
 			  uint32_t addr, uint32_t size, int type)
@@ -265,6 +223,8 @@ static int pt_register_regions(pci_region_t * io_regions,
 					(uint32_t) (cur_region->base_addr));
 				return (-1);
 			}
+			pci_dev->v_addrs[i].r_size = cur_region->size;
+			pci_dev->v_addrs[i].e_size = 0;
 
 			/* add offset */
 			pci_dev->v_addrs[i].r_virtbase +=
@@ -274,11 +234,6 @@ static int pt_register_regions(pci_region_t * io_regions,
 					       cur_region->size, t,
 					       pt_iomem_map);
 
-			pci_dev->v_addrs[i].memory_index =
-			    cpu_register_io_memory(0, pt_mmio_read_cb,
-						   pt_mmio_write_cb,
-						   (void *) &(pci_dev->v_addrs[i]));
-
 			continue;
 		}
 		/* handle port io regions */
diff --git a/qemu/hw/pci-passthrough.h b/qemu/hw/pci-passthrough.h
index 012014a..49db1d2 100644
--- a/qemu/hw/pci-passthrough.h
+++ b/qemu/hw/pci-passthrough.h
@@ -54,6 +54,8 @@ typedef struct pt_region_s {
 	uint32_t memory_index;
 	void *r_virtbase;	/* mmapped access address */
 	int num;		/* our index within v_addrs[] */
+	uint32_t e_size;          /* emulated size of region in bytes */
+	uint32_t r_size;          /* real size of region in bytes */
 	uint32_t debug;
 } pt_region_t;
 
-- 
1.5.4.5


-------------------------------------------------------------------------
This SF.net email is sponsored by the 2008 JavaOne(SM) Conference 
Don't miss this year's exciting event. There's still time to save $100. 
Use priority code J8TL2D2. 
http://ad.doubleclick.net/clk;198757673;13503038;p?http://java.sun.com/javaone

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH 1/1] Enable a guest to access a device's memory mapped I/O regions directly.
  2008-04-16 13:26 [PATCH 1/1] Enable a guest to access a device's memory mapped I/O regions directly benami
@ 2008-04-16 13:26 ` benami
  2008-04-18 15:56 ` Avi Kivity
  1 sibling, 0 replies; 9+ messages in thread
From: benami @ 2008-04-16 13:26 UTC (permalink / raw)
  To: amit.shah; +Cc: kvm-devel, allen.m.kay, andrea, benami

From: Ben-Ami Yassour <benami@il.ibm.com>

Signed-off-by: Ben-Ami Yassour <benami@il.ibm.com>
Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com>
---
 libkvm/libkvm.c           |   24 ++++++++----
 qemu/hw/pci-passthrough.c |   89 +++++++++++----------------------------------
 qemu/hw/pci-passthrough.h |    2 +
 3 files changed, 40 insertions(+), 75 deletions(-)

diff --git a/libkvm/libkvm.c b/libkvm/libkvm.c
index de91328..8c02af9 100644
--- a/libkvm/libkvm.c
+++ b/libkvm/libkvm.c
@@ -400,7 +400,7 @@ void *kvm_create_userspace_phys_mem(kvm_context_t kvm, unsigned long phys_start,
 {
 	int r;
 	int prot = PROT_READ;
-	void *ptr;
+	void *ptr = NULL;
 	struct kvm_userspace_memory_region memory = {
 		.memory_size = len,
 		.guest_phys_addr = phys_start,
@@ -410,16 +410,24 @@ void *kvm_create_userspace_phys_mem(kvm_context_t kvm, unsigned long phys_start,
 	if (writable)
 		prot |= PROT_WRITE;
 
-	ptr = mmap(NULL, len, prot, MAP_ANONYMOUS | MAP_SHARED, -1, 0);
-	if (ptr == MAP_FAILED) {
-		fprintf(stderr, "create_userspace_phys_mem: %s", strerror(errno));
-		return 0;
-	}
+	if (len > 0) {
+		ptr = mmap(NULL, len, prot, MAP_ANONYMOUS | MAP_SHARED, -1, 0);
+		if (ptr == MAP_FAILED) {
+			fprintf(stderr, "create_userspace_phys_mem: %s",
+				strerror(errno));
+			return 0;
+		}
 
-	memset(ptr, 0, len);
+		memset(ptr, 0, len);
+	}
 
 	memory.userspace_addr = (unsigned long)ptr;
-	memory.slot = get_free_slot(kvm);
+
+	if (len > 0)
+		memory.slot = get_free_slot(kvm);
+	else
+		memory.slot = get_slot(phys_start);
+
 	r = ioctl(kvm->vm_fd, KVM_SET_USER_MEMORY_REGION, &memory);
 	if (r == -1) {
 		fprintf(stderr, "create_userspace_phys_mem: %s", strerror(errno));
diff --git a/qemu/hw/pci-passthrough.c b/qemu/hw/pci-passthrough.c
index 7ffcc7b..a5894d9 100644
--- a/qemu/hw/pci-passthrough.c
+++ b/qemu/hw/pci-passthrough.c
@@ -25,18 +25,6 @@ typedef __u64 resource_size_t;
 extern kvm_context_t kvm_context;
 extern FILE *logfile;
 
-CPUReadMemoryFunc *pt_mmio_read_cb[3] = {
-	pt_mmio_readb,
-	pt_mmio_readw,
-	pt_mmio_readl
-};
-
-CPUWriteMemoryFunc *pt_mmio_write_cb[3] = {
-	pt_mmio_writeb,
-	pt_mmio_writew,
-	pt_mmio_writel
-};
-
 //#define PT_DEBUG
 
 #ifdef PT_DEBUG
@@ -45,47 +33,6 @@ CPUWriteMemoryFunc *pt_mmio_write_cb[3] = {
 #define DEBUG(fmt, args...)
 #endif
 
-#define pt_mmio_write(suffix, type)					\
-void pt_mmio_write##suffix(void *opaque, target_phys_addr_t e_phys,	\
-				uint32_t value)				\
-{									\
-	pt_region_t *r_access = (pt_region_t *)opaque;			\
-	void *r_virt = (u8 *)r_access->r_virtbase + 			\
-			(e_phys - r_access->e_physbase);		\
-	if (r_access->debug & PT_DEBUG_MMIO) {				\
-		fprintf(logfile, "pt_mmio_write" #suffix		\
-			": e_physbase=%p e_phys=%p r_virt=%p value=%08x\n", \
-			(void *)r_access->e_physbase, (void *)e_phys,	\
-			r_virt, value);					\
-	}								\
-	*(type *)r_virt = (type)value;					\
-}
-
-pt_mmio_write(b, u8)
-pt_mmio_write(w, u16)
-pt_mmio_write(l, u32)
-
-#define pt_mmio_read(suffix, type)					\
-uint32_t pt_mmio_read##suffix(void *opaque, target_phys_addr_t e_phys)	\
-{									\
-	pt_region_t *r_access = (pt_region_t *)opaque;			\
-	void *r_virt = (u8 *)r_access->r_virtbase + 			\
-			(e_phys - r_access->e_physbase);		\
-	uint32_t value = (u32) (*(type *) r_virt);			\
-	if (r_access->debug & PT_DEBUG_MMIO) {				\
-		fprintf(logfile,					\
-			"pt_mmio_read" #suffix ": e_physbase=%p "	\
-			"e_phys=%p r_virt=%p value=%08x\n",		\
-			(void *)r_access->e_physbase,			\
-			(void *)e_phys, r_virt, value);			\
-	}								\
-	return value;							\
-}
-
-pt_mmio_read(b, u8)
-pt_mmio_read(w, u16)
-pt_mmio_read(l, u32)
-
 #define pt_ioport_write(suffix)						\
 void pt_ioport_write##suffix(void *opaque, uint32_t addr, uint32_t value) \
 {									\
@@ -127,22 +74,33 @@ pt_ioport_read(b)
 pt_ioport_read(w)
 pt_ioport_read(l)
 
-static void pt_iomem_map(PCIDevice * d, int region_num,
-			 uint32_t e_phys, uint32_t e_size, int type)
+void pt_iomem_map(PCIDevice * pci_dev, int region_num, uint32_t e_phys,
+		  uint32_t e_size, int type)
 {
-	pt_dev_t *r_dev = (pt_dev_t *) d;
-
-	r_dev->v_addrs[region_num].e_physbase = e_phys;
+	pt_dev_t *r_dev = (pt_dev_t *) pci_dev;
+	pt_region_t *region = &r_dev->v_addrs[region_num];
+	int first_map = (region->e_size == 0);
+	int ret = 0;
 
 	DEBUG("e_phys=%08x r_virt=%p type=%d len=%08x region_num=%d \n",
 	      e_phys, r_dev->v_addrs[region_num].r_virtbase, type, e_size,
 	      region_num);
 
-	cpu_register_physical_memory(e_phys,
-				     r_dev->dev.io_regions[region_num].size,
-				     r_dev->v_addrs[region_num].memory_index);
-}
+	region->e_physbase = e_phys;
+	region->e_size = e_size;
+
+	if (!first_map)
+		kvm_destroy_phys_mem(kvm_context, e_phys, e_size);
 
+	if (e_size > 0)
+		ret = kvm_register_userspace_phys_mem(kvm_context,
+						      e_phys,
+						      region->r_virtbase,
+						      e_size,
+						      0);
+	if (ret != 0)
+		fprintf(logfile, "Error: create new mapping failed\n");
+}
 
 static void pt_ioport_map(PCIDevice * pci_dev, int region_num,
 			  uint32_t addr, uint32_t size, int type)
@@ -265,6 +223,8 @@ static int pt_register_regions(pci_region_t * io_regions,
 					(uint32_t) (cur_region->base_addr));
 				return (-1);
 			}
+			pci_dev->v_addrs[i].r_size = cur_region->size;
+			pci_dev->v_addrs[i].e_size = 0;
 
 			/* add offset */
 			pci_dev->v_addrs[i].r_virtbase +=
@@ -274,11 +234,6 @@ static int pt_register_regions(pci_region_t * io_regions,
 					       cur_region->size, t,
 					       pt_iomem_map);
 
-			pci_dev->v_addrs[i].memory_index =
-			    cpu_register_io_memory(0, pt_mmio_read_cb,
-						   pt_mmio_write_cb,
-						   (void *) &(pci_dev->v_addrs[i]));
-
 			continue;
 		}
 		/* handle port io regions */
diff --git a/qemu/hw/pci-passthrough.h b/qemu/hw/pci-passthrough.h
index 012014a..49db1d2 100644
--- a/qemu/hw/pci-passthrough.h
+++ b/qemu/hw/pci-passthrough.h
@@ -54,6 +54,8 @@ typedef struct pt_region_s {
 	uint32_t memory_index;
 	void *r_virtbase;	/* mmapped access address */
 	int num;		/* our index within v_addrs[] */
+	uint32_t e_size;          /* emulated size of region in bytes */
+	uint32_t r_size;          /* real size of region in bytes */
 	uint32_t debug;
 } pt_region_t;
 
-- 
1.5.4.5


-------------------------------------------------------------------------
This SF.net email is sponsored by the 2008 JavaOne(SM) Conference 
Don't miss this year's exciting event. There's still time to save $100. 
Use priority code J8TL2D2. 
http://ad.doubleclick.net/clk;198757673;13503038;p?http://java.sun.com/javaone

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH 1/1] Enable a guest to access a device's memory mapped I/O regions directly.
  2008-04-16 13:25 ` [PATCH 1/1] Enable a guest to access a device's memory mapped I/O regions directly benami
  2008-04-16 13:25   ` benami
@ 2008-04-18 15:50   ` Avi Kivity
  2008-04-19 14:35     ` Muli Ben-Yehuda
  1 sibling, 1 reply; 9+ messages in thread
From: Avi Kivity @ 2008-04-18 15:50 UTC (permalink / raw)
  To: benami; +Cc: kvm-devel, andrea, allen.m.kay

benami@il.ibm.com wrote:
> From: Ben-Ami Yassour <benami@il.ibm.com>
>
> Signed-off-by: Ben-Ami Yassour <benami@il.ibm.com>
> Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com>
> ---
>  arch/x86/kvm/mmu.c         |   59 +++++++++++++++++++++++++++++--------------
>  arch/x86/kvm/paging_tmpl.h |   19 +++++++++----
>  include/linux/kvm_host.h   |    2 +-
>  virt/kvm/kvm_main.c        |   17 +++++++++++-
>  4 files changed, 69 insertions(+), 28 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 078a7f1..c89029d 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -112,6 +112,8 @@ static int dbg = 1;
>  #define PT_FIRST_AVAIL_BITS_SHIFT 9
>  #define PT64_SECOND_AVAIL_BITS_SHIFT 52
>  
> +#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
> +
>   

Please rename this PT_SHADOW_MMIO_MASK.


>  #define VALID_PAGE(x) ((x) != INVALID_PAGE)
>  
>  #define PT64_LEVEL_BITS 9
> @@ -237,6 +239,9 @@ static int is_dirty_pte(unsigned long pte)
>  
>  static int is_rmap_pte(u64 pte)
>  {
> +	if (pte & PT_SHADOW_IO_MARK)
> +		return false;
> +
>  	return is_shadow_present_pte(pte);
>  }
>   

Why avoid rmap on mmio pages?  Sure it's unnecessary work, but having 
less cases improves overall reliability.

You can use pfn_valid() in gfn_to_pfn() and kvm_release_pfn_*() to 
conditionally update the page refcounts.

-- 
Any sufficiently difficult bug is indistinguishable from a feature.


-------------------------------------------------------------------------
This SF.net email is sponsored by the 2008 JavaOne(SM) Conference 
Don't miss this year's exciting event. There's still time to save $100. 
Use priority code J8TL2D2. 
http://ad.doubleclick.net/clk;198757673;13503038;p?http://java.sun.com/javaone

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 1/1] Enable a guest to access a device's memory mapped I/O regions directly.
  2008-04-16 13:26 [PATCH 1/1] Enable a guest to access a device's memory mapped I/O regions directly benami
  2008-04-16 13:26 ` benami
@ 2008-04-18 15:56 ` Avi Kivity
  2008-04-19 14:56   ` Muli Ben-Yehuda
  1 sibling, 1 reply; 9+ messages in thread
From: Avi Kivity @ 2008-04-18 15:56 UTC (permalink / raw)
  To: benami; +Cc: kvm-devel, andrea, allen.m.kay

benami@il.ibm.com wrote:
> From: Ben-Ami Yassour <benami@il.ibm.com>
>
> Signed-off-by: Ben-Ami Yassour <benami@il.ibm.com>
> Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com>
> ---
>  libkvm/libkvm.c           |   24 ++++++++----
>  qemu/hw/pci-passthrough.c |   89 +++++++++++----------------------------------
>  qemu/hw/pci-passthrough.h |    2 +
>  3 files changed, 40 insertions(+), 75 deletions(-)
>
> diff --git a/libkvm/libkvm.c b/libkvm/libkvm.c
> index de91328..8c02af9 100644
> --- a/libkvm/libkvm.c
> +++ b/libkvm/libkvm.c
> @@ -400,7 +400,7 @@ void *kvm_create_userspace_phys_mem(kvm_context_t kvm, unsigned long phys_start,
>  {
>  	int r;
>  	int prot = PROT_READ;
> -	void *ptr;
> +	void *ptr = NULL;
>  	struct kvm_userspace_memory_region memory = {
>  		.memory_size = len,
>  		.guest_phys_addr = phys_start,
> @@ -410,16 +410,24 @@ void *kvm_create_userspace_phys_mem(kvm_context_t kvm, unsigned long phys_start,
>  	if (writable)
>  		prot |= PROT_WRITE;
>  
> -	ptr = mmap(NULL, len, prot, MAP_ANONYMOUS | MAP_SHARED, -1, 0);
> -	if (ptr == MAP_FAILED) {
> -		fprintf(stderr, "create_userspace_phys_mem: %s", strerror(errno));
> -		return 0;
> -	}
> +	if (len > 0) {
> +		ptr = mmap(NULL, len, prot, MAP_ANONYMOUS | MAP_SHARED, -1, 0);
> +		if (ptr == MAP_FAILED) {
> +			fprintf(stderr, "create_userspace_phys_mem: %s",
> +				strerror(errno));
> +			return 0;
> +		}
>  
> -	memset(ptr, 0, len);
> +		memset(ptr, 0, len);
> +	}
>  
>  	memory.userspace_addr = (unsigned long)ptr;
> -	memory.slot = get_free_slot(kvm);
> +
> +	if (len > 0)
> +		memory.slot = get_free_slot(kvm);
> +	else
> +		memory.slot = get_slot(phys_start);
> +
>  	r = ioctl(kvm->vm_fd, KVM_SET_USER_MEMORY_REGION, &memory);
>  	if (r == -1) {
>  		fprintf(stderr, "create_userspace_phys_mem: %s", strerror(errno));
>   

This looks like support for zero-length memory slots? Why is it needed?

It needs to be in a separate patch.

> diff --git a/qemu/hw/pci-passthrough.c b/qemu/hw/pci-passthrough.c
> index 7ffcc7b..a5894d9 100644
> --- a/qemu/hw/pci-passthrough.c
> +++ b/qemu/hw/pci-passthrough.c
> @@ -25,18 +25,6 @@ typedef __u64 resource_size_t;
>  extern kvm_context_t kvm_context;
>  extern FILE *logfile;
>  
> -CPUReadMemoryFunc *pt_mmio_read_cb[3] = {
> -	pt_mmio_readb,
> -	pt_mmio_readw,
> -	pt_mmio_readl
> -};
> -
> -CPUWriteMemoryFunc *pt_mmio_write_cb[3] = {
> -	pt_mmio_writeb,
> -	pt_mmio_writew,
> -	pt_mmio_writel
> -};
> -
>   

There's at least one use case for keeping mmio in userspace: 
reverse-engineering a device driver. So if it doesn't cause too much 
trouble, please keep this an option.


-- 
Any sufficiently difficult bug is indistinguishable from a feature.


-------------------------------------------------------------------------
This SF.net email is sponsored by the 2008 JavaOne(SM) Conference 
Don't miss this year's exciting event. There's still time to save $100. 
Use priority code J8TL2D2. 
http://ad.doubleclick.net/clk;198757673;13503038;p?http://java.sun.com/javaone

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 1/1] Enable a guest to access a device's memory mapped I/O regions directly.
  2008-04-18 15:50   ` Avi Kivity
@ 2008-04-19 14:35     ` Muli Ben-Yehuda
  2008-04-20 10:29       ` Avi Kivity
  0 siblings, 1 reply; 9+ messages in thread
From: Muli Ben-Yehuda @ 2008-04-19 14:35 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel, allen.m.kay, andrea, Ben-Ami Yassour1

On Fri, Apr 18, 2008 at 06:50:03PM +0300, Avi Kivity wrote:
> benami@il.ibm.com wrote:
> > From: Ben-Ami Yassour <benami@il.ibm.com>
> >
> > Signed-off-by: Ben-Ami Yassour <benami@il.ibm.com>
> > Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com>
> > ---
> >  arch/x86/kvm/mmu.c         |   59 +++++++++++++++++++++++++++++--------------
> >  arch/x86/kvm/paging_tmpl.h |   19 +++++++++----
> >  include/linux/kvm_host.h   |    2 +-
> >  virt/kvm/kvm_main.c        |   17 +++++++++++-
> >  4 files changed, 69 insertions(+), 28 deletions(-)
> >
> > diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> > index 078a7f1..c89029d 100644
> > --- a/arch/x86/kvm/mmu.c
> > +++ b/arch/x86/kvm/mmu.c
> > @@ -112,6 +112,8 @@ static int dbg = 1;
> >  #define PT_FIRST_AVAIL_BITS_SHIFT 9
> >  #define PT64_SECOND_AVAIL_BITS_SHIFT 52
> >  
> > +#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
> > +
> >   
> 
> Please rename this PT_SHADOW_MMIO_MASK.

Sure thing.

> >  #define VALID_PAGE(x) ((x) != INVALID_PAGE)
> >  
> >  #define PT64_LEVEL_BITS 9
> > @@ -237,6 +239,9 @@ static int is_dirty_pte(unsigned long pte)
> >  
> >  static int is_rmap_pte(u64 pte)
> >  {
> > +	if (pte & PT_SHADOW_IO_MARK)
> > +		return false;
> > +
> >  	return is_shadow_present_pte(pte);
> >  }
> >   
> 
> Why avoid rmap on mmio pages?  Sure it's unnecessary work, but
> having less cases improves overall reliability.

The rmap functions already have a check to bail out if the pte is not
an rmap pte, so in that sense, we aren't adding a new case for the
code to handle, just adding direct MMIO ptes to the existing list of
non-rmap ptes.

> You can use pfn_valid() in gfn_to_pfn() and kvm_release_pfn_*() to
> conditionally update the page refcounts.

Since rmap isn't useful for direct MMIO ptes, doesn't it make more
sense to "bail out" early rather than in the bowels of the rmap code?

Chag Same'ach
Muli


-------------------------------------------------------------------------
This SF.net email is sponsored by the 2008 JavaOne(SM) Conference 
Don't miss this year's exciting event. There's still time to save $100. 
Use priority code J8TL2D2. 
http://ad.doubleclick.net/clk;198757673;13503038;p?http://java.sun.com/javaone

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 1/1] Enable a guest to access a device's memory mapped I/O regions directly.
  2008-04-18 15:56 ` Avi Kivity
@ 2008-04-19 14:56   ` Muli Ben-Yehuda
  0 siblings, 0 replies; 9+ messages in thread
From: Muli Ben-Yehuda @ 2008-04-19 14:56 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel, allen.m.kay, andrea, Ben-Ami Yassour1

On Fri, Apr 18, 2008 at 06:56:41PM +0300, Avi Kivity wrote:
> benami@il.ibm.com wrote:
> > From: Ben-Ami Yassour <benami@il.ibm.com>
> >
> > Signed-off-by: Ben-Ami Yassour <benami@il.ibm.com>
> > Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com>
> > ---
> >  libkvm/libkvm.c           |   24 ++++++++----
> >  qemu/hw/pci-passthrough.c |   89 +++++++++++----------------------------------
> >  qemu/hw/pci-passthrough.h |    2 +
> >  3 files changed, 40 insertions(+), 75 deletions(-)
> >
> > diff --git a/libkvm/libkvm.c b/libkvm/libkvm.c
> > index de91328..8c02af9 100644
> > --- a/libkvm/libkvm.c
> > +++ b/libkvm/libkvm.c
> > @@ -400,7 +400,7 @@ void *kvm_create_userspace_phys_mem(kvm_context_t kvm, unsigned long phys_start,
> >  {
> >  	int r;
> >  	int prot = PROT_READ;
> > -	void *ptr;
> > +	void *ptr = NULL;
> >  	struct kvm_userspace_memory_region memory = {
> >  		.memory_size = len,
> >  		.guest_phys_addr = phys_start,
> > @@ -410,16 +410,24 @@ void *kvm_create_userspace_phys_mem(kvm_context_t kvm, unsigned long phys_start,
> >  	if (writable)
> >  		prot |= PROT_WRITE;
> >  
> > -	ptr = mmap(NULL, len, prot, MAP_ANONYMOUS | MAP_SHARED, -1, 0);
> > -	if (ptr == MAP_FAILED) {
> > -		fprintf(stderr, "create_userspace_phys_mem: %s", strerror(errno));
> > -		return 0;
> > -	}
> > +	if (len > 0) {
> > +		ptr = mmap(NULL, len, prot, MAP_ANONYMOUS | MAP_SHARED, -1, 0);
> > +		if (ptr == MAP_FAILED) {
> > +			fprintf(stderr, "create_userspace_phys_mem: %s",
> > +				strerror(errno));
> > +			return 0;
> > +		}
> >  
> > -	memset(ptr, 0, len);
> > +		memset(ptr, 0, len);
> > +	}
> >  
> >  	memory.userspace_addr = (unsigned long)ptr;
> > -	memory.slot = get_free_slot(kvm);
> > +
> > +	if (len > 0)
> > +		memory.slot = get_free_slot(kvm);
> > +	else
> > +		memory.slot = get_slot(phys_start);
> > +
> >  	r = ioctl(kvm->vm_fd, KVM_SET_USER_MEMORY_REGION, &memory);
> >  	if (r == -1) {
> >  		fprintf(stderr, "create_userspace_phys_mem: %s", strerror(errno));
> >   
> 
> This looks like support for zero-length memory slots? Why is it
> needed?
> 
> It needs to be in a separate patch.

We need an interface to remove a memslot. When the guest writes to a
direct assigned device's BAR and changes an MMIO region, we need to
remove the old slot and establish a new one. The kernel side treats
0-sized memslot as "remove", but the userspace side didn't quite
handle it properly.

Personally I would've preferred a proper "remove" interface, rather
than shoehorning it into kvm_create_userspace_phys_mem and a 0-sized
slot. If that's acceptable, we'll put together a patch.

> > diff --git a/qemu/hw/pci-passthrough.c b/qemu/hw/pci-passthrough.c
> > index 7ffcc7b..a5894d9 100644
> > --- a/qemu/hw/pci-passthrough.c
> > +++ b/qemu/hw/pci-passthrough.c
> > @@ -25,18 +25,6 @@ typedef __u64 resource_size_t;
> >  extern kvm_context_t kvm_context;
> >  extern FILE *logfile;
> >  
> > -CPUReadMemoryFunc *pt_mmio_read_cb[3] = {
> > -	pt_mmio_readb,
> > -	pt_mmio_readw,
> > -	pt_mmio_readl
> > -};
> > -
> > -CPUWriteMemoryFunc *pt_mmio_write_cb[3] = {
> > -	pt_mmio_writeb,
> > -	pt_mmio_writew,
> > -	pt_mmio_writel
> > -};
> > -
> >   
> 
> There's at least one use case for keeping mmio in userspace:
> reverse-engineering a device driver. So if it doesn't cause too much
> trouble, please keep this an option.

I don't think it's a big deal to support both, although I'm not sure
how useful it would be (especially considering mmiotrace). Did you
have a user-interface for specifying it in mind?

Cheers,
Muli

-------------------------------------------------------------------------
This SF.net email is sponsored by the 2008 JavaOne(SM) Conference 
Don't miss this year's exciting event. There's still time to save $100. 
Use priority code J8TL2D2. 
http://ad.doubleclick.net/clk;198757673;13503038;p?http://java.sun.com/javaone

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 1/1] Enable a guest to access a device's memory mapped I/O regions directly.
  2008-04-19 14:35     ` Muli Ben-Yehuda
@ 2008-04-20 10:29       ` Avi Kivity
  0 siblings, 0 replies; 9+ messages in thread
From: Avi Kivity @ 2008-04-20 10:29 UTC (permalink / raw)
  To: Muli Ben-Yehuda; +Cc: kvm-devel, allen.m.kay, andrea, Ben-Ami Yassour1

Muli Ben-Yehuda wrote:
>>>   
>>>       
>> Why avoid rmap on mmio pages?  Sure it's unnecessary work, but
>> having less cases improves overall reliability.
>>     
>
> The rmap functions already have a check to bail out if the pte is not
> an rmap pte, so in that sense, we aren't adding a new case for the
> code to handle, just adding direct MMIO ptes to the existing list of
> non-rmap ptes.
>
>   

I'm worried about the huge chain of direct_mmio parameters passed to 
functions, the impact on the audit code (at the end of mmu.c), and the 
poor souls who debug the mmu.

>> You can use pfn_valid() in gfn_to_pfn() and kvm_release_pfn_*() to
>> conditionally update the page refcounts.
>>     
>
> Since rmap isn't useful for direct MMIO ptes, doesn't it make more
> sense to "bail out" early rather than in the bowls of the rmap code?
>   

It does, from a purist point of view (which also favors explicit 
parameters a la direct_mmio rather than indirect parameters like 
pfn_valid()), but I'm looking from the practical point of view now.

With mmu notifiers, we don't need to hold the refcount at all.  So 
presuming we drop the refcounting code completely, are any changes 
actually necessary here?

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.


-------------------------------------------------------------------------
This SF.net email is sponsored by the 2008 JavaOne(SM) Conference 
Don't miss this year's exciting event. There's still time to save $100. 
Use priority code J8TL2D2. 
http://ad.doubleclick.net/clk;198757673;13503038;p?http://java.sun.com/javaone

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2008-04-20 10:29 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-04-16 13:26 [PATCH 1/1] Enable a guest to access a device's memory mapped I/O regions directly benami
2008-04-16 13:26 ` benami
2008-04-18 15:56 ` Avi Kivity
2008-04-19 14:56   ` Muli Ben-Yehuda
  -- strict thread matches above, loose matches on Subject: below --
2008-04-16 13:25 direct mmio for passthrough - kernel part benami
2008-04-16 13:25 ` [PATCH 1/1] Enable a guest to access a device's memory mapped I/O regions directly benami
2008-04-16 13:25   ` benami
2008-04-18 15:50   ` Avi Kivity
2008-04-19 14:35     ` Muli Ben-Yehuda
2008-04-20 10:29       ` Avi Kivity

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox