public inbox for kvm@vger.kernel.org
 help / color / mirror / Atom feed
* [RFC] direct mmio for passthrough
@ 2008-04-01 11:52 benami
  2008-04-01 11:52 ` [PATCH 1/1] direct mmio for passthrough - kernel part benami
  0 siblings, 1 reply; 32+ messages in thread
From: benami @ 2008-04-01 11:52 UTC (permalink / raw)
  To: amit.shah; +Cc: kvm-devel, allen.m.kay, andrea, benami


This patch for PCI passthrough devices enables a guest to access a device's
memory mapped I/O regions directly, without requiring the host to trap and
emulate every MMIO access. We save the list of MMIO regions of the guest's
devices and the corresponding host MMIO regions in the host kernel. When the
guest page faults due to an access to an MMIO region, the host updates the
guest's sptes to map the correct host MMIO region. The kernel part of this
patchset applies to kvm HEAD and the userspace part applies to Amit's
pci-passthrough tree. Tested on a Lenovo M57p with an e1000 NIC assigned
directly to an FC8 guest.

Comments are appreciated. 


-------------------------------------------------------------------------
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace

^ permalink raw reply	[flat|nested] 32+ messages in thread

* [PATCH 1/1] direct mmio for passthrough - kernel part
  2008-04-01 11:52 [RFC] direct mmio for passthrough benami
@ 2008-04-01 11:52 ` benami
  2008-04-01 13:30   ` Avi Kivity
  0 siblings, 1 reply; 32+ messages in thread
From: benami @ 2008-04-01 11:52 UTC (permalink / raw)
  To: amit.shah; +Cc: kvm-devel, allen.m.kay, andrea, benami

From: Ben-Ami Yassour <benami@il.ibm.com>

Enable a guest to access a device's memory mapped I/O regions directly.
Userspace sends the mmio regions that the guest can access. On the first
page fault for an access to an mmio address the host translates the gva to hpa,
and updates the sptes.

Signed-off-by: Ben-Ami Yassour <benami@il.ibm.com>
Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com>
---
 arch/x86/kvm/Makefile      |    2 +-
 arch/x86/kvm/mmu.c         |   27 ++++++++
 arch/x86/kvm/mmu.h         |    2 +
 arch/x86/kvm/paging_tmpl.h |   71 +++++++++++++++++++---
 arch/x86/kvm/passthrough.c |  144 ++++++++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/passthrough.h |   23 +++++++
 arch/x86/kvm/x86.c         |    2 +-
 include/asm-x86/kvm_host.h |   15 +++++
 include/linux/kvm.h        |   13 ++++
 include/linux/kvm_host.h   |   10 +++
 virt/kvm/kvm_main.c        |   24 +++++++
 11 files changed, 323 insertions(+), 10 deletions(-)
 create mode 100644 arch/x86/kvm/passthrough.c
 create mode 100644 arch/x86/kvm/passthrough.h

diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 4d0c22e..2fa4932 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -7,7 +7,7 @@ common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o)
 EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
 
 kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \
-	i8254.o
+	i8254.o passthrough.o
 obj-$(CONFIG_KVM) += kvm.o
 kvm-intel-objs = vmx.o
 obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 8a6a4f9..dccd898 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -34,6 +34,8 @@
 #include <asm/cmpxchg.h>
 #include <asm/io.h>
 
+#include "passthrough.h"
+
 /*
  * When setting this variable to true it enables Two-Dimensional-Paging
  * where the hardware walks 2 page tables:
@@ -112,6 +114,8 @@ static int dbg = 1;
 #define PT_FIRST_AVAIL_BITS_SHIFT 9
 #define PT64_SECOND_AVAIL_BITS_SHIFT 52
 
+#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
+
 #define VALID_PAGE(x) ((x) != INVALID_PAGE)
 
 #define PT64_LEVEL_BITS 9
@@ -545,6 +549,11 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
 	unsigned long *rmapp;
 	int i;
 
+	/* bail out if this is an spte mapping an MMIO region */
+	if (kvm->arch.pt.pt_mmio_mapped)
+		if (*spte & PT_SHADOW_IO_MARK)
+			return;
+
 	if (!is_rmap_pte(*spte))
 		return;
 	sp = page_header(__pa(spte));
@@ -1273,6 +1282,24 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
 	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
 }
 
+/*
+ * This is a very big sledgehammer, it should be called very rarely.
+ * We call this when a passthrough mmio mapping changes in order to
+ * remove the old passthrough mmio sptes. This usually only occurs on
+ * guest initialization.
+ */
+void mmu_invalidate_all_sptes(struct kvm *kvm)
+{
+	int i;
+
+	for (i = 0 ; i < KVM_MAX_VCPUS ; i++) {
+		if (kvm->vcpus[i])
+			mmu_free_roots(kvm->vcpus[i]);
+	}
+
+	kvm_flush_remote_tlbs(kvm);
+}
+
 static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
 {
 	int i;
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index e64e9f5..5c9e33e 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -47,4 +47,6 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
 	return vcpu->arch.cr0 & X86_CR0_PG;
 }
 
+void mmu_invalidate_all_sptes(struct kvm *kvm);
+
 #endif
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 57d872a..c33b6cf 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -73,6 +73,8 @@ struct guest_walker {
 	u32 error_code;
 };
 
+static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr);
+
 static gfn_t gpte_to_gfn(pt_element_t gpte)
 {
 	return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
@@ -275,7 +277,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
 static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 			 struct guest_walker *walker,
 			 int user_fault, int write_fault, int largepage,
-			 int *ptwrite, struct page *page)
+			 int *ptwrite, int pt_mmio, struct page *page)
 {
 	hpa_t shadow_addr;
 	int level;
@@ -346,15 +348,61 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 		*shadow_ent = shadow_pte;
 	}
 
-	mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access,
-		     user_fault, write_fault,
-		     walker->ptes[walker->level-1] & PT_DIRTY_MASK,
-		     ptwrite, largepage, walker->gfn, page, false);
-
+	/* for the passthrough mmio case, spte is set by page_fault_pt_mmio */
+	if (!pt_mmio)
+		mmu_set_spte(vcpu, shadow_ent, access,
+			     walker->pte_access & access,
+			     user_fault, write_fault,
+			     walker->ptes[walker->level-1] & PT_DIRTY_MASK,
+			     ptwrite, largepage, walker->gfn, page, false);
 	return shadow_ent;
 }
 
 /*
+ * Handle pagefault for passthrough mmio
+ */
+static int FNAME(page_fault_pt_mmio)(struct kvm_vcpu *vcpu, gva_t addr,
+				     struct guest_walker *walker,
+				     int user_fault, int write_fault,
+				     int largepage, int *ptwrite,
+				     struct page *page)
+{
+	u64 *shadow_pte;
+	gpa_t gpa;
+	hpa_t hpa = UNMAPPED_HPA;
+	u64 spte;
+	int rc = 1;
+	int write_pt = 0;
+
+	gpa = FNAME(gva_to_gpa)(vcpu, addr);
+	if (gpa != UNMAPPED_GVA)
+		hpa = pt_mmio_gpa_to_hpa(vcpu->kvm, gpa);
+
+	if (hpa != UNMAPPED_HPA) {
+		spin_lock(&vcpu->kvm->mmu_lock);
+		kvm_mmu_free_some_pages(vcpu);
+		shadow_pte = FNAME(fetch)(vcpu, addr, walker, user_fault,
+					  write_fault, largepage, &write_pt,
+					  1, page);
+
+		if (shadow_pte) {
+			set_shadow_pte(&spte, *shadow_pte);
+			spte = hpa;
+			spte |= PT_PRESENT_MASK | PT_WRITABLE_MASK |
+				PT_USER_MASK | PT_PWT_MASK | PT_PCD_MASK |
+				PT_ACCESSED_MASK | PT_SHADOW_IO_MARK;
+			set_shadow_pte(shadow_pte, spte);
+			rc = 0;
+		} else {
+			pgprintk("fetch failed to return shadow_pte "
+				 "for address 0x%x", (unsigned int)addr);
+		}
+		spin_unlock(&vcpu->kvm->mmu_lock);
+	}
+	return rc;
+}
+
+/*
  * Page fault handler.  There are several causes for a page fault:
  *   - there is no shadow pte for the guest pte
  *   - write access through a shadow pte marked read only so that we can set
@@ -418,15 +466,22 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 
 	/* mmio */
 	if (is_error_page(page)) {
+		int rc = 1;
 		pgprintk("gfn %x is mmio\n", walker.gfn);
+		if (vcpu->kvm->arch.pt.pt_mmio_mapped) {
+			rc = FNAME(page_fault_pt_mmio)(vcpu, addr, &walker,
+						       user_fault, write_fault,
+						       largepage, &write_pt,
+						       page);
+		}
 		kvm_release_page_clean(page);
-		return 1;
+		return rc;
 	}
 
 	spin_lock(&vcpu->kvm->mmu_lock);
 	kvm_mmu_free_some_pages(vcpu);
 	shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
-				  largepage, &write_pt, page);
+				  largepage, &write_pt, 0, page);
 
 	pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
 		 shadow_pte, *shadow_pte, write_pt);
diff --git a/arch/x86/kvm/passthrough.c b/arch/x86/kvm/passthrough.c
new file mode 100644
index 0000000..654d1fe
--- /dev/null
+++ b/arch/x86/kvm/passthrough.c
@@ -0,0 +1,144 @@
+/*
+ * This module enables guest shadow page tables to map host mmio regions of
+ * passthrough devices directly.
+ *
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ *  Ben-Ami Yassour   <benami@il.ibm.com>
+ *  Muli Ben-Yehuda   <muli@il.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "linux/kvm_host.h"
+#include "passthrough.h"
+#include "mmu.h"
+
+void pt_init_vm(struct pt *pt)
+{
+	spin_lock_init(&pt->pt_list_lock);
+	INIT_LIST_HEAD(&pt->mmio_range_list);
+	pt->pt_mmio_mapped = 0;
+}
+
+hpa_t pt_mmio_gpa_to_hpa(struct kvm *kvm, gpa_t gpa)
+{
+	hpa_t hpa = UNMAPPED_HPA;
+	unsigned long flags;
+	struct mmio_range *mmio = NULL;
+
+	spin_lock_irqsave(&kvm->arch.pt.pt_list_lock, flags);
+
+	/* loop the list of regions */
+	list_for_each_entry(mmio, &kvm->arch.pt.mmio_range_list, list) {
+		if ((gpa >= mmio->gpa) && (gpa < mmio->gpa + mmio->size)) {
+			hpa = mmio->hpa + (gpa - mmio->gpa);
+			break;
+		}
+	}
+
+	spin_unlock_irqrestore(&kvm->arch.pt.pt_list_lock, flags);
+
+	return hpa;
+}
+
+int kvm_vm_ioctl_pt_memory_mapping_add(struct kvm *kvm,
+				       struct kvm_pt_memory_mapping*
+				       memory_mapping)
+{
+	gpa_t gpa = memory_mapping->gpa;
+	hpa_t hpa = memory_mapping->hpa;
+	u64 size = memory_mapping->size;
+	int ret = 0;
+	struct mmio_range *mmio_new;
+	struct mmio_range *mmio_curr = NULL;
+	struct mmio_range *mmio_prev = NULL;
+	unsigned long flags;
+
+	mmio_new = kmalloc(sizeof(*mmio_new), GFP_KERNEL);
+	if (!mmio_new)
+		return -ENOMEM;
+
+	mmio_new->gpa = gpa;
+	mmio_new->hpa = hpa;
+	mmio_new->size = size;
+
+	spin_lock_irqsave(&kvm->arch.pt.pt_list_lock, flags);
+
+	/* search for the location to add this range */
+	ret = 0;
+	list_for_each_entry(mmio_curr, &kvm->arch.pt.mmio_range_list, list) {
+		if ((mmio_curr->gpa + mmio_curr->size) <= gpa) {
+			mmio_prev = mmio_curr;
+			continue;
+		} else if ((gpa + size) <= mmio_curr->gpa) {
+			/* no intersection between ranges */
+			break;
+		} else {
+			/* ranges intersect */
+			ret = -EINVAL;
+			break;
+		}
+	}
+
+	if (!ret) {
+		struct mmio_range *place;
+		place = mmio_prev ? mmio_prev :
+			(struct mmio_range *)&kvm->arch.pt.mmio_range_list;
+		list_add((struct list_head *)mmio_new,
+			 (struct list_head *)place);
+
+		kvm->arch.pt.pt_mmio_mapped = 1;
+	}
+
+	spin_unlock_irqrestore(&kvm->arch.pt.pt_list_lock, flags);
+
+	if (ret)
+		kfree(mmio_new);
+
+	return ret;
+}
+
+int kvm_vm_ioctl_pt_memory_mapping_remove(struct kvm *kvm,
+					  struct kvm_pt_memory_mapping*
+					  memory_mapping)
+{
+	gpa_t gpa = memory_mapping->gpa;
+	u64 size = memory_mapping->size;
+	int ret = 0;
+	struct mmio_range *mmio_curr = NULL;
+	unsigned long flags;
+
+	/* search the range to remove */
+	ret = 0;
+	spin_lock_irqsave(&kvm->arch.pt.pt_list_lock, flags);
+	list_for_each_entry(mmio_curr, &kvm->arch.pt.mmio_range_list, list) {
+		if ((mmio_curr->gpa + mmio_curr->size) <= gpa) {
+			continue;
+		} else if ((gpa + size) <= mmio_curr->gpa) {
+			/* not found */
+			ret = -EINVAL;
+			break;
+		} else {
+			/* ranges intersect */
+			if ((gpa != mmio_curr->gpa) ||
+			    (size != mmio_curr->size)) {
+				/* ranges are not equal */
+				ret = -EINVAL;
+			}
+			break;
+		}
+	}
+
+	if (!ret)
+		list_del((struct list_head *)mmio_curr);
+
+	spin_unlock_irqrestore(&kvm->arch.pt.pt_list_lock, flags);
+
+	mmu_invalidate_all_sptes(kvm);
+
+	return ret;
+}
diff --git a/arch/x86/kvm/passthrough.h b/arch/x86/kvm/passthrough.h
new file mode 100644
index 0000000..68810c6
--- /dev/null
+++ b/arch/x86/kvm/passthrough.h
@@ -0,0 +1,23 @@
+/*
+ * This module enables guest shadow page tables to map host mmio regions of
+ * passthrough devices directly.
+ *
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ *  Ben-Ami Yassour   <benami@il.ibm.com>
+ *  Muli Ben-Yehuda   <muli@il.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef PASSTHROUGH_H
+#define PASSTHROUGH_H
+
+#include <asm/kvm_host.h>
+
+hpa_t pt_mmio_gpa_to_hpa(struct kvm *kvm, gpa_t gpa);
+
+#endif
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 94d77e8..3398ae7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3811,7 +3811,7 @@ struct  kvm *kvm_arch_create_vm(void)
 		return ERR_PTR(-ENOMEM);
 
 	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
-
+	pt_init_vm(&(kvm->arch.pt));
 	return kvm;
 }
 
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index e6d8df6..01e96f7 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -44,6 +44,7 @@
 
 #define INVALID_PAGE (~(hpa_t)0)
 #define UNMAPPED_GVA (~(gpa_t)0)
+#define UNMAPPED_HPA (~(hpa_t)0)
 
 /* shadow tables are PAE even on non-PAE hosts */
 #define KVM_HPAGE_SHIFT 21
@@ -296,6 +297,19 @@ struct kvm_mem_alias {
 	gfn_t target_gfn;
 };
 
+struct mmio_range {
+	struct list_head list;
+	gpa_t gpa;
+	hpa_t hpa;
+	u64 size;
+};
+
+struct pt {
+	int pt_mmio_mapped; /* guest sptes map host mmio regions directly */
+	spinlock_t pt_list_lock; /* protect pt specific lists */
+	struct list_head mmio_range_list; /*  ordered list of mmio mapping */
+};
+
 struct kvm_arch{
 	int naliases;
 	struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS];
@@ -317,6 +331,7 @@ struct kvm_arch{
 	struct page *apic_access_page;
 
 	gpa_t wall_clock;
+	struct pt pt;
 };
 
 struct kvm_vm_stat {
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 37b963e..956512d 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -289,6 +289,19 @@ struct kvm_s390_interrupt {
 #define KVM_SET_USER_MEMORY_REGION _IOW(KVMIO, 0x46,\
 					struct kvm_userspace_memory_region)
 #define KVM_SET_TSS_ADDR          _IO(KVMIO, 0x47)
+
+/* Bind host I/O address range to guest address range. */
+struct kvm_pt_memory_mapping {
+	__u64 gpa;
+	__u64 hpa;
+	__u64 size; /* in bytes */
+};
+
+#define KVM_PT_MEMORY_MAPPING_ADD    _IOWR(KVMIO, 0x48, \
+					   struct kvm_pt_memory_mapping)
+#define KVM_PT_MEMORY_MAPPING_REMOVE _IOWR(KVMIO, 0x49, \
+					   struct kvm_pt_memory_mapping)
+
 /*
  * KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns
  * a vcpu fd.
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index f4e1436..3b97624 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -200,6 +200,14 @@ int kvm_get_dirty_log(struct kvm *kvm,
 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 				struct kvm_dirty_log *log);
 
+int kvm_vm_ioctl_pt_memory_mapping_add(struct kvm *kvm,
+				       struct kvm_pt_memory_mapping
+				       *memory_mapping);
+
+int kvm_vm_ioctl_pt_memory_mapping_remove(struct kvm *kvm,
+					  struct kvm_pt_memory_mapping
+					  *memory_mapping);
+
 int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
 				   struct
 				   kvm_userspace_memory_region *mem,
@@ -294,4 +302,6 @@ struct kvm_stats_debugfs_item {
 };
 extern struct kvm_stats_debugfs_item debugfs_entries[];
 
+void pt_init_vm(struct pt *pt);
+
 #endif
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 30bf832..c2c6c05 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1030,6 +1030,30 @@ static long kvm_vm_ioctl(struct file *filp,
 			goto out;
 		break;
 	}
+	case KVM_PT_MEMORY_MAPPING_ADD: {
+		struct kvm_pt_memory_mapping input;
+
+		r = -EFAULT;
+		if (copy_from_user(&input, argp,
+				   sizeof(struct kvm_pt_memory_mapping)))
+			goto out;
+		r = kvm_vm_ioctl_pt_memory_mapping_add(kvm, &input);
+		if (r)
+			goto out;
+		break;
+	}
+	case KVM_PT_MEMORY_MAPPING_REMOVE: {
+		struct kvm_pt_memory_mapping input;
+
+		r = -EFAULT;
+		if (copy_from_user(&input, argp,
+				   sizeof(struct kvm_pt_memory_mapping)))
+			goto out;
+		r = kvm_vm_ioctl_pt_memory_mapping_remove(kvm, &input);
+		if (r)
+			goto out;
+		break;
+	}
 	default:
 		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
 	}
-- 
1.5.0.3


-------------------------------------------------------------------------
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace

^ permalink raw reply related	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/1] direct mmio for passthrough - kernel part
  2008-04-01 11:52 ` [PATCH 1/1] direct mmio for passthrough - kernel part benami
@ 2008-04-01 13:30   ` Avi Kivity
  2008-04-01 14:42     ` Anthony Liguori
  2008-04-01 19:28     ` Ben-Ami Yassour1
  0 siblings, 2 replies; 32+ messages in thread
From: Avi Kivity @ 2008-04-01 13:30 UTC (permalink / raw)
  To: benami; +Cc: kvm-devel, andrea, allen.m.kay

benami@il.ibm.com wrote:
> From: Ben-Ami Yassour <benami@il.ibm.com>
>
> Enable a guest to access a device's memory mapped I/O regions directly.
> Userspace sends the mmio regions that the guest can access. On the first
> page fault for an access to an mmio address the host translates the gva to hpa,
> and updates the sptes.
>
>   

Can you explain why you're not using the regular memory slot mechanism? 
i.e. have userspace mmap(/dev/mem) and create a memslot containing that 
at the appropriate guest physical address?

There are some issues with refcounting, but Andrea has some tricks to 
deal with that.

-- 
Any sufficiently difficult bug is indistinguishable from a feature.


-------------------------------------------------------------------------
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/1] direct mmio for passthrough - kernel part
  2008-04-01 13:30   ` Avi Kivity
@ 2008-04-01 14:42     ` Anthony Liguori
  2008-04-01 15:20       ` Anthony Liguori
  2008-04-01 17:03       ` Avi Kivity
  2008-04-01 19:28     ` Ben-Ami Yassour1
  1 sibling, 2 replies; 32+ messages in thread
From: Anthony Liguori @ 2008-04-01 14:42 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel, allen.m.kay, andrea, benami

Avi Kivity wrote:
> benami@il.ibm.com wrote:
>   
>> From: Ben-Ami Yassour <benami@il.ibm.com>
>>
>> Enable a guest to access a device's memory mapped I/O regions directly.
>> Userspace sends the mmio regions that the guest can access. On the first
>> page fault for an access to an mmio address the host translates the gva to hpa,
>> and updates the sptes.
>>
>>   
>>     
>
> Can you explain why you're not using the regular memory slot mechanism? 
> i.e. have userspace mmap(/dev/mem) and create a memslot containing that 
> at the appropriate guest physical address?
>   

/dev/mem is often restricted in what memory can be mapped.  However, we 
can't add something like this to KVM that allows arbitrary HPA's to be 
mapped into a guest from userspace.  This is just as bad as /dev/mem and 
is going to upset a lot of people.

Regardless of whether we can use /dev/mem, I think we should introduce a 
new char device anyway.  We only need to mmap() MMIO regions which are 
mapped by the PCI bus, presumably, the kernel should know about these 
mappings.  The driver should only allow mappings that are valid for a 
particular PCI device such that it cannot be abused to map arbitrary 
regions of memory into a guest.  Bonus points if it can validate that 
there isn't a valid Linux driver loaded for the given PCI device.

Regards,

Anthony Liguori

> There are some issues with refcounting, but Andrea has some tricks to 
> deal with that.
>
>   


-------------------------------------------------------------------------
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/1] direct mmio for passthrough - kernel part
  2008-04-01 14:42     ` Anthony Liguori
@ 2008-04-01 15:20       ` Anthony Liguori
  2008-04-01 17:05         ` Avi Kivity
  2008-04-01 18:18         ` Andrea Arcangeli
  2008-04-01 17:03       ` Avi Kivity
  1 sibling, 2 replies; 32+ messages in thread
From: Anthony Liguori @ 2008-04-01 15:20 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel, allen.m.kay, andrea, benami

Anthony Liguori wrote:
> Avi Kivity wrote:
>> benami@il.ibm.com wrote:
>>  
>>> From: Ben-Ami Yassour <benami@il.ibm.com>
>>>
>>> Enable a guest to access a device's memory mapped I/O regions directly.
>>> Userspace sends the mmio regions that the guest can access. On the 
>>> first
>>> page fault for an access to an mmio address the host translates the 
>>> gva to hpa,
>>> and updates the sptes.
>>>
>>>       
>>
>> Can you explain why you're not using the regular memory slot 
>> mechanism? i.e. have userspace mmap(/dev/mem) and create a memslot 
>> containing that at the appropriate guest physical address?
>>   
>
> /dev/mem is often restricted in what memory can be mapped.  However, 
> we can't add something like this to KVM that allows arbitrary HPA's to 
> be mapped into a guest from userspace.  This is just as bad as 
> /dev/mem and is going to upset a lot of people.
>
> Regardless of whether we can use /dev/mem, I think we should introduce 
> a new char device anyway.  We only need to mmap() MMIO regions which 
> are mapped by the PCI bus, presumably, the kernel should know about 
> these mappings.  The driver should only allow mappings that are valid 
> for a particular PCI device such that it cannot be abused to map 
> arbitrary regions of memory into a guest.  Bonus points if it can 
> validate that there isn't a valid Linux driver loaded for the given 
> PCI device.

Which is apparently entirely unnecessary as we already have 
/sys/bus/pci/.../region.  It's just a matter of checking if a vma is 
VM_IO and then dealing with the subsequent reference counting issues as 
Avi points out.

Regards,

Anthony Liguori

> Regards,
>
> Anthony Liguori
>
>> There are some issues with refcounting, but Andrea has some tricks to 
>> deal with that.
>>
>>   
>


-------------------------------------------------------------------------
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/1] direct mmio for passthrough - kernel part
  2008-04-01 14:42     ` Anthony Liguori
  2008-04-01 15:20       ` Anthony Liguori
@ 2008-04-01 17:03       ` Avi Kivity
  2008-04-01 17:18         ` Daniel P. Berrange
  2008-04-01 18:21         ` Anthony Liguori
  1 sibling, 2 replies; 32+ messages in thread
From: Avi Kivity @ 2008-04-01 17:03 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: kvm-devel, allen.m.kay, andrea, benami

Anthony Liguori wrote:
> Avi Kivity wrote:
>> benami@il.ibm.com wrote:
>>  
>>> From: Ben-Ami Yassour <benami@il.ibm.com>
>>>
>>> Enable a guest to access a device's memory mapped I/O regions directly.
>>> Userspace sends the mmio regions that the guest can access. On the 
>>> first
>>> page fault for an access to an mmio address the host translates the 
>>> gva to hpa,
>>> and updates the sptes.
>>>
>>>       
>>
>> Can you explain why you're not using the regular memory slot 
>> mechanism? i.e. have userspace mmap(/dev/mem) and create a memslot 
>> containing that at the appropriate guest physical address?
>>   
>
> /dev/mem is often restricted in what memory can be mapped.  

Please elaborate.

> However, we can't add something like this to KVM that allows arbitrary 
> HPA's to be mapped into a guest from userspace.  This is just as bad 
> as /dev/mem and is going to upset a lot of people.

Device assignment is as rootish as you get.

>
>
> Regardless of whether we can use /dev/mem, I think we should introduce 
> a new char device anyway.  We only need to mmap() MMIO regions which 
> are mapped by the PCI bus, presumably, the kernel should know about 
> these mappings.  The driver should only allow mappings that are valid 
> for a particular PCI device such that it cannot be abused to map 
> arbitrary regions of memory into a guest.  Bonus points if it can 
> validate that there isn't a valid Linux driver loaded for the given 
> PCI device.

This is a very good idea.

The interface exposed would be the same as /dev/mem, so any kvm 
modifications to get /dev/mem working would be applicable to /dev/pci/*, no?

-- 
Any sufficiently difficult bug is indistinguishable from a feature.


-------------------------------------------------------------------------
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/1] direct mmio for passthrough - kernel part
  2008-04-01 15:20       ` Anthony Liguori
@ 2008-04-01 17:05         ` Avi Kivity
  2008-04-01 18:18         ` Andrea Arcangeli
  1 sibling, 0 replies; 32+ messages in thread
From: Avi Kivity @ 2008-04-01 17:05 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: kvm-devel, allen.m.kay, andrea, benami

Anthony Liguori wrote:
>>
>> Regardless of whether we can use /dev/mem, I think we should 
>> introduce a new char device anyway.  We only need to mmap() MMIO 
>> regions which are mapped by the PCI bus, presumably, the kernel 
>> should know about these mappings.  The driver should only allow 
>> mappings that are valid for a particular PCI device such that it 
>> cannot be abused to map arbitrary regions of memory into a guest.  
>> Bonus points if it can validate that there isn't a valid Linux driver 
>> loaded for the given PCI device.
>
> Which is apparently entirely unnecessary as we already have 
> /sys/bus/pci/.../region.  It's just a matter of checking if a vma is 
> VM_IO and then dealing with the subsequent reference counting issues 
> as Avi points out.
>

 From idea to working implementation in 38 minutes.  Congrats.


-- 
Any sufficiently difficult bug is indistinguishable from a feature.


-------------------------------------------------------------------------
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/1] direct mmio for passthrough - kernel part
  2008-04-01 17:03       ` Avi Kivity
@ 2008-04-01 17:18         ` Daniel P. Berrange
  2008-04-01 18:10           ` Andrea Arcangeli
  2008-04-01 18:21         ` Anthony Liguori
  1 sibling, 1 reply; 32+ messages in thread
From: Daniel P. Berrange @ 2008-04-01 17:18 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel, andrea, allen.m.kay, benami

On Tue, Apr 01, 2008 at 08:03:14PM +0300, Avi Kivity wrote:
> Anthony Liguori wrote:
> > Avi Kivity wrote:
> >> benami@il.ibm.com wrote:
> >>  
> >>> From: Ben-Ami Yassour <benami@il.ibm.com>
> >>>
> >>> Enable a guest to access a device's memory mapped I/O regions directly.
> >>> Userspace sends the mmio regions that the guest can access. On the 
> >>> first
> >>> page fault for an access to an mmio address the host translates the 
> >>> gva to hpa,
> >>> and updates the sptes.
> >>>
> >>>       
> >>
> >> Can you explain why you're not using the regular memory slot 
> >> mechanism? i.e. have userspace mmap(/dev/mem) and create a memslot 
> >> containing that at the appropriate guest physical address?
> >>   
> >
> > /dev/mem is often restricted in what memory can be mapped.  
> 
> Please elaborate.

The /dev/mem, /dev/kmem devices have a special SELinux context memory_device_t
and very few application domains are allowed to access them. The KVM/QEMU
policy will not allow this for example. Basically on the X server, HAL and
dmidecode have access in current policy. It would be undesirable to give
all KVM guests full access to /dev/mem, so a more fine grained access method
would have benefits here. 

Dan.
-- 
|: Red Hat, Engineering, Boston   -o-   http://people.redhat.com/berrange/ :|
|: http://libvirt.org  -o-  http://virt-manager.org  -o-  http://ovirt.org :|
|: http://autobuild.org       -o-         http://search.cpan.org/~danberr/ :|
|: GnuPG: 7D3B9505  -o-  F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|

-------------------------------------------------------------------------
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/1] direct mmio for passthrough - kernel part
  2008-04-01 17:18         ` Daniel P. Berrange
@ 2008-04-01 18:10           ` Andrea Arcangeli
  2008-04-01 18:18             ` Daniel P. Berrange
  2008-04-01 18:23             ` Anthony Liguori
  0 siblings, 2 replies; 32+ messages in thread
From: Andrea Arcangeli @ 2008-04-01 18:10 UTC (permalink / raw)
  To: Daniel P. Berrange; +Cc: kvm-devel, allen.m.kay, Avi Kivity, benami

On Tue, Apr 01, 2008 at 06:18:07PM +0100, Daniel P. Berrange wrote:
> and very few application domains are allowed to access them. The KVM/QEMU
> policy will not allow this for example. Basically on the X server, HAL and
> dmidecode have access in current policy. It would be undesirable to have to
> all KVM guests full access to /dev/mem, so a more fine grained access method
> would have benefits here. 

But pci-passthrough can give root on the host even to the ring0
guest, just like /dev/mem without VT-d, so there's not much difference
with using /dev/mem as far as security is concerned. Only on the CPUs
including VT-d it's possible to retain a mostly equivalent security
level despite pci-passthrough.

-------------------------------------------------------------------------
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/1] direct mmio for passthrough - kernel part
  2008-04-01 15:20       ` Anthony Liguori
  2008-04-01 17:05         ` Avi Kivity
@ 2008-04-01 18:18         ` Andrea Arcangeli
  2008-04-01 18:28           ` Anthony Liguori
  1 sibling, 1 reply; 32+ messages in thread
From: Andrea Arcangeli @ 2008-04-01 18:18 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: kvm-devel, allen.m.kay, benami, Avi Kivity

On Tue, Apr 01, 2008 at 10:20:49AM -0500, Anthony Liguori wrote:
> Which is apparently entirely unnecessary as we already have 
> /sys/bus/pci/.../region.  It's just a matter of checking if a vma is VM_IO 
> and then dealing with the subsequent reference counting issues as Avi 
> points out.

Do you need to map it in userland too, isn't it enough to map it in
the sptes?

For the ram I had to map it in userland too with /dev/mem, and then I
used the pte_pfn to fill the spte, so the emulated qemu drivers can
input/output. But for the mmio space I doubt the userland side is
needed.

If you add a direct memslot (new bitflag type) I will use it too
instead of catching get_user_pages failures and walking ptes on the
RAM pieces overwritten by /dev/mem.

-------------------------------------------------------------------------
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/1] direct mmio for passthrough - kernel part
  2008-04-01 18:10           ` Andrea Arcangeli
@ 2008-04-01 18:18             ` Daniel P. Berrange
  2008-04-01 18:23             ` Anthony Liguori
  1 sibling, 0 replies; 32+ messages in thread
From: Daniel P. Berrange @ 2008-04-01 18:18 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: kvm-devel, allen.m.kay, Avi Kivity, benami

On Tue, Apr 01, 2008 at 08:10:31PM +0200, Andrea Arcangeli wrote:
> On Tue, Apr 01, 2008 at 06:18:07PM +0100, Daniel P. Berrange wrote:
> > and very few application domains are allowed to access them. The KVM/QEMU
> > policy will not allow this for example. Basically on the X server, HAL and
> > dmidecode have access in current policy. It would be undesirable to give
> > all KVM guests full access to /dev/mem, so a more fine grained access method
> > would have benefits here. 
> 
> But pci-passthrough can give root on the host even to the ring0
> guest, just like /dev/mem without VT-d, so there's no much difference
> with using /dev/mem as far as security is concerned. Only on the CPUs
> including VT-d it's possible to retain a mostly equivalent security
> level despite pci-passthrough.

Clearly it is a losing battle without VT-d. That doesn't mean we should 
design it to lose in general. So we should design so that when we do have
VT-d it will have the maximum security possible. VT-d will only become more
widespread over time.

Dan.
-- 
|: Red Hat, Engineering, Boston   -o-   http://people.redhat.com/berrange/ :|
|: http://libvirt.org  -o-  http://virt-manager.org  -o-  http://ovirt.org :|
|: http://autobuild.org       -o-         http://search.cpan.org/~danberr/ :|
|: GnuPG: 7D3B9505  -o-  F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|

-------------------------------------------------------------------------
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/1] direct mmio for passthrough - kernel part
  2008-04-01 17:03       ` Avi Kivity
  2008-04-01 17:18         ` Daniel P. Berrange
@ 2008-04-01 18:21         ` Anthony Liguori
  2008-04-01 19:22           ` Avi Kivity
  2008-04-01 22:22           ` Andrea Arcangeli
  1 sibling, 2 replies; 32+ messages in thread
From: Anthony Liguori @ 2008-04-01 18:21 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel, allen.m.kay, andrea, benami

Avi Kivity wrote:
> Anthony Liguori wrote:
>> Avi Kivity wrote:
>>> benami@il.ibm.com wrote:
>>>  
>>>> From: Ben-Ami Yassour <benami@il.ibm.com>
>>>>
>>>> Enable a guest to access a device's memory mapped I/O regions 
>>>> directly.
>>>> Userspace sends the mmio regions that the guest can access. On the 
>>>> first
>>>> page fault for an access to an mmio address the host translates the 
>>>> gva to hpa,
>>>> and updates the sptes.
>>>>
>>>>       
>>>
>>> Can you explain why you're not using the regular memory slot 
>>> mechanism? i.e. have userspace mmap(/dev/mem) and create a memslot 
>>> containing that at the appropriate guest physical address?
>>>   
>>
>> /dev/mem is often restricted in what memory can be mapped.  
>
> Please elaborate.

http://lkml.org/lkml/2008/1/30/473

Ubuntu now carries this and I think Fedora has carried something like 
this for a long time.

It's okay for us because we just need device memory access but who knows 
if it will be restricted more in the future.  If X ever gets their act 
together and gets proper kernel drivers, I think there would be a strong 
case for removing /dev/mem altogether.

>> However, we can't add something like this to KVM that allows 
>> arbitrary HPA's to be mapped into a guest from userspace.  This is 
>> just as bad as /dev/mem and is going to upset a lot of people.
>
> Device assignment is as rootish as you get.

In an SELinux world, root may still have limited access to the system.  
In an ideal world, we would be able to guarantee that at worst, a guest 
can only harm the device it has access to and nothing else on the 
system.  This probably requires MSI and VT-d.  Even then, you may be 
able to physically damage the system by improperly programming the device.

>>
>>
>> Regardless of whether we can use /dev/mem, I think we should 
>> introduce a new char device anyway.  We only need to mmap() MMIO 
>> regions which are mapped by the PCI bus, presumably, the kernel 
>> should know about these mappings.  The driver should only allow 
>> mappings that are valid for a particular PCI device such that it 
>> cannot be abused to map arbitrary regions of memory into a guest.  
>> Bonus points if it can validate that there isn't a valid Linux driver 
>> loaded for the given PCI device.
>
> This is a very good idea.
>
> The interface exposed would be the same as /dev/mem, so any kvm 
> modifications to get /dev/mem working would be applicable to 
> /dev/pci/*, no?

I looked at Andrea's patches and I didn't see any special handling for 
non-RAM pages.  Something Muli mentioned that kept them from doing 
/sys/devices/pci/.../region to begin with was the fact that IO pages do 
not have a struct page backing them so get_user_pages() will fail in 
gfn_to_page().

It's not too hard to modify the code to also try get_user_pages() to 
look up the VMA instead of the struct page.  From the VMA, you can get 
the HPA for an IO mapping.  The problem though is that gfn_to_page() 
wants to return a page, not a HPA.  I haven't looked too deeply yet, but 
my suspicion is that to properly support mapping in VM_IO pages will 
require some general refactoring since we always assume that a struct 
page exists for any HPA.

Regards,

Anthony Liguori


-------------------------------------------------------------------------
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/1] direct mmio for passthrough - kernel part
  2008-04-01 18:10           ` Andrea Arcangeli
  2008-04-01 18:18             ` Daniel P. Berrange
@ 2008-04-01 18:23             ` Anthony Liguori
  1 sibling, 0 replies; 32+ messages in thread
From: Anthony Liguori @ 2008-04-01 18:23 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: kvm-devel, allen.m.kay, benami, Avi Kivity

Andrea Arcangeli wrote:
> On Tue, Apr 01, 2008 at 06:18:07PM +0100, Daniel P. Berrange wrote:
>   
>> and very few application domains are allowed to access them. The KVM/QEMU
>> policy will not allow this for example. Basically on the X server, HAL and
>> dmidecode have access in current policy. It would be undesirable to give
>> all KVM guests full access to /dev/mem, so a more fine grained access method
>> would have benefits here. 
>>     
>
> But pci-passthrough can give root on the host even to the ring0
> guest, just like /dev/mem without VT-d, so there's no much difference
> with using /dev/mem as far as security is concerned. Only on the CPUs
> including VT-d it's possible to retain a mostly equivalent security
> level despite pci-passthrough.
>   

Eventually, most machines will have an IOMMU so that's the assumption to 
design to.  It is true that PCI pass-through w/o VT-d is always going to 
be equivalent to /dev/mem access but it's really a special case.  Unless 
you have an IOMMU, you would not do PCI pass-through if you cared at all 
about security/reliability.

Regards,

Anthony Liguori


-------------------------------------------------------------------------
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/1] direct mmio for passthrough - kernel part
  2008-04-01 18:18         ` Andrea Arcangeli
@ 2008-04-01 18:28           ` Anthony Liguori
  0 siblings, 0 replies; 32+ messages in thread
From: Anthony Liguori @ 2008-04-01 18:28 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: kvm-devel, allen.m.kay, benami, Avi Kivity

Andrea Arcangeli wrote:
> On Tue, Apr 01, 2008 at 10:20:49AM -0500, Anthony Liguori wrote:
>   
>> Which is apparently entirely unnecessary as we already have 
>> /sys/bus/pci/.../region.  It's just a matter of checking if a vma is VM_IO 
>> and then dealing with the subsequent reference counting issues as Avi 
>> points out.
>>     
>
> Do you need to map it in userland too, isn't it enough to map it in
> the sptes?
>
> For the ram I had to map it in userland too with /dev/mem, and then I
> used the pte_pfn to fill the spte, so the emulated qemu drivers can
> input/output. But for the mmio space I doubt the userland side is
> needed.
>
> If you add a direct memslot (new bitflag type) I will use it too
> instead of catching get_user_pages failures and walking ptes on the
> RAM pieces overwritten by /dev/mem.
>   

There's a certain amount of elegance in mapping to userspace and not 
introducing a direct memslot.  It helps simplify the security model 
since an application isn't able to do anything more than it could 
without KVM.

The difficulty with a direct memslot is how you introduce policies to 
limit what guests can access what memslots directly.  You would have to 
teach KVM to interact with the PCI subsystem to determine what memory 
was within a particular PCI IO region.  Not impossible, just ugly.

Regards,

Anthony Liguori

-------------------------------------------------------------------------
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/1] direct mmio for passthrough - kernel part
  2008-04-01 18:21         ` Anthony Liguori
@ 2008-04-01 19:22           ` Avi Kivity
  2008-04-01 22:38             ` Andrea Arcangeli
  2008-04-01 22:22           ` Andrea Arcangeli
  1 sibling, 1 reply; 32+ messages in thread
From: Avi Kivity @ 2008-04-01 19:22 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: kvm-devel, andrea, allen.m.kay, benami

Anthony Liguori wrote:
> I looked at Andrea's patches and I didn't see any special handling for 
> non-RAM pages.  Something Muli mentioned that kept them from doing 
> /sys/devices/pci/.../region to begin with was the fact that IO pages do 
> not have a struct page backing them so get_user_pages() will fail in 
> gfn_to_page().
>   

It's just something we discussed, not code.

-- 
Any sufficiently difficult bug is indistinguishable from a feature.


-------------------------------------------------------------------------
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/1] direct mmio for passthrough - kernel part
  2008-04-01 13:30   ` Avi Kivity
  2008-04-01 14:42     ` Anthony Liguori
@ 2008-04-01 19:28     ` Ben-Ami Yassour1
  2008-04-01 19:43       ` Avi Kivity
  1 sibling, 1 reply; 32+ messages in thread
From: Ben-Ami Yassour1 @ 2008-04-01 19:28 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel, andrea, allen.m.kay



Avi Kivity <avi@qumranet.com> wrote on 01/04/2008 16:30:00:

> benami@il.ibm.com wrote:
> > From: Ben-Ami Yassour <benami@il.ibm.com>
> >
> > Enable a guest to access a device's memory mapped I/O regions directly.
> > Userspace sends the mmio regions that the guest can access. On the
first
> > page fault for an access to an mmio address the host translates
> the gva to hpa,
> > and updates the sptes.
> >
> >
>
> Can you explain why you're not using the regular memory slot mechanism?
> i.e. have userspace mmap(/dev/mem) and create a memslot containing that
> at the appropriate guest physical address?
>
> There are some issues with refcounting, but Andrea has some tricks to
> deal with that.
>
> --
> Any sufficiently difficult bug is indistinguishable from a feature.
>

Our initial approach was to mmap /sys/bus/pci/devices/.../resource#
and create a memory slot for it. However eventually we realized that for
mmio we don't need hva mapped to the mmio region, we can map the gpa
directly to hpa.

As far as I understand, the memory slots mechanism is used to map
gpa to hva. Then gfn_to_page uses get_user_pages to map hva to hpa.
However, get_user_pages does not work for mmio, and in addition we
know the hpa to begin with, so there is no real need to map an hva
for the mmio region.

In addition there is an assumption in the code that there is a page
struct for the frame which is not the case for mmio. So it was
easier to simply add a list of mmio gpa-hpa mapping.

I guess we can use the memory slots array to hold the gpa to hpa
mapping but it is not necessarily natural, and would probably
require more hooks in the code to handle an mmio memory slot. BTW,
note that for a guest that has multiple passthrough devices each
with a set of mmio regions, it might become a long list, so there
might be value to keep it separate from that respect as well.

With regards to the potential security issue Anthony pointed out
(ioctls taking hpa's) we can change the interface so that they will
take (device, BAR) instead and the kernel will translate to hpa

What do you think? Given that the shadow page table code has to be
modified anyway (due to the struct page issue), is it worthwhile to
experiment with mmap(...region) or is the current approach sufficient?

Thanks,
Ben


-------------------------------------------------------------------------
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/1] direct mmio for passthrough - kernel part
  2008-04-01 19:28     ` Ben-Ami Yassour1
@ 2008-04-01 19:43       ` Avi Kivity
  2008-04-01 20:04         ` Anthony Liguori
  0 siblings, 1 reply; 32+ messages in thread
From: Avi Kivity @ 2008-04-01 19:43 UTC (permalink / raw)
  To: Ben-Ami Yassour1; +Cc: kvm-devel, andrea, allen.m.kay

Ben-Ami Yassour1 wrote:
>>
>> Can you explain why you're not using the regular memory slot mechanism?
>> i.e. have userspace mmap(/dev/mem) and create a memslot containing that
>> at the appropriate guest physical address?
>>
>>     
> Our initial approach was to mmap /sys/bus/pci/devices/.../resource#
> and create a memory slot for it. However eventually we realized that for
> mmio we don't need hva mapped to the mmio region, we can map the gpa
> directly to hpa.
>
> As far as I understand, the memory slots mechanism is used to map
> gpa to hva. Then gfn_to_page uses get_user_pages to map hva to hpa.
> However, get_user_pages does not work for mmio, and in addition we
> know the hpa to begin with, so there is no real need to map an hva
> for the mmio region.
>
> In addition there is an assumption in the code that there is a page
> struct for the frame which is not the case for mmio. So it was
> easier to simply add a list of mmio gpa-hpa mapping.
>
> I guess we can use the memory slots array to hold the gpa to hpa
> mapping but it is not necessarily natural, and would probably
> require more hooks in the code to handle an mmio memory slot. BTW,
> note that for a guest that has multiple passthrough devices each
> with a set of mmio regions, it might become a long list, so there
> might be value to keep it separate from that respect as well.
>
> With regards to the potential security issue Anthony pointed out
> (ioctls taking hpa's) we can change the interface so that they will
> take (device, BAR) instead and the kernel will translate to hpa
>
>   

Not enough.  How do you know if this calling process has permissions to 
access that pci device (I retract my previous "pci passthrough is as 
rootish as you get" remark).

> What do you think? Given that the shadow page table code has to be
> modified anyway (due to the struct page issue), is it worthwhile to
> experiment with mmap(...region) or is the current approach sufficient?
>   

As Anthony points out, the advantage to mmap() is that whatever security 
is needed can be applied at that level.  Passing host physical addresses 
from userspace requires a whole new security model.

The issue with gfn_to_page() is real.  We can either try to work around 
it somehow, or we can try to back mmio regions with struct pages.  Now 
that it looks like mem_map is becoming virtually mapped, the cost is 
minimal, but I expect this approach will meet some resistance.

-- 
Any sufficiently difficult bug is indistinguishable from a feature.


-------------------------------------------------------------------------
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/1] direct mmio for passthrough - kernel part
  2008-04-01 19:43       ` Avi Kivity
@ 2008-04-01 20:04         ` Anthony Liguori
  2008-04-02  4:32           ` Avi Kivity
  0 siblings, 1 reply; 32+ messages in thread
From: Anthony Liguori @ 2008-04-01 20:04 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel, allen.m.kay, andrea, Ben-Ami Yassour1

Avi Kivity wrote:
> Ben-Ami Yassour1 wrote:
>   
>>   
>>     
>
> Not enough.  How do you know if this calling process has permissions to 
> access that pci device (I retract my previous "pci passthrough is as 
> rootish as you get" remark).
>
>   
>> What do you think? Given that the shadow page table code has to be
>> modified anyway (due to the struct page issue), is it worthwhile to
>> experiment with mmap(...region) or is the current approach sufficient?
>>   
>>     
>
> As Anthony points out, the advantage to mmap() is that whatever security 
> is needed can be applied at that level.  Passing host physical addresses 
> from userspace requires a whole new security model.
>
> The issue with gfn_to_page() is real.  We can either try to work around 
> it somehow, or we can try to back mmio regions with struct pages.  Now 
> that it looks like mem_map is becoming virtually mapped, the cost is 
> minimal, but I expect this approach will meet some resistance.
>   

What about switching the KVM MMU code to use hfn_t instead of struct 
page? The initial conversion is pretty straight forward as the places 
where you actually need a struct page you can always get it from 
pfn_to_page() (like in kvm_release_page_dirty).

We can then teach the MMU to deal with hfn_t's that don't have a 
corresponding page.  IIUC, the implementation of something like 
kvm_release_page_dirty is a nop for pfn's that don't have a 
corresponding page.  We just have to be able to detect a pfn_to_page() 
failure and then assume we're dealing with IO memory.

Regards,

Anthony Liguori



-------------------------------------------------------------------------
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/1] direct mmio for passthrough - kernel part
  2008-04-01 18:21         ` Anthony Liguori
  2008-04-01 19:22           ` Avi Kivity
@ 2008-04-01 22:22           ` Andrea Arcangeli
  2008-04-01 22:29             ` Anthony Liguori
  1 sibling, 1 reply; 32+ messages in thread
From: Andrea Arcangeli @ 2008-04-01 22:22 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: kvm-devel, allen.m.kay, benami, Avi Kivity

On Tue, Apr 01, 2008 at 01:21:37PM -0500, Anthony Liguori wrote:
> return a page, not a HPA.  I haven't looked too deeply yet, but my 
> suspicion is that to properly support mapping in VM_IO pages will require 
> some general refactoring since we always assume that a struct page exists 
> for any HPA.

Yes, that was potentially a problem for reserved _ram_ pages too, as it
isn't guaranteed that memmap_t (old days nomenclature) will exist for
physical addresses not defined as ram in the e820 map (to make it work
without VT-d I have to reserve the ram in the host at the e820 map
parsing time). If the memmap will not exist for the reserved ram
physical range, the pfn_valid() will fail at runtime in kvm and the
bad_page will generate a graceful emulation failure, so it's very
safe. But once we handle direct memslots for mmio regions, the
reserved ram will better stop depending on the memmap too.

-------------------------------------------------------------------------
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/1] direct mmio for passthrough - kernel part
  2008-04-01 22:22           ` Andrea Arcangeli
@ 2008-04-01 22:29             ` Anthony Liguori
  2008-04-02  4:00               ` Avi Kivity
  0 siblings, 1 reply; 32+ messages in thread
From: Anthony Liguori @ 2008-04-01 22:29 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: kvm-devel, allen.m.kay, benami, Avi Kivity

Andrea Arcangeli wrote:
> On Tue, Apr 01, 2008 at 01:21:37PM -0500, Anthony Liguori wrote:
>   
>> return a page, not a HPA.  I haven't looked too deeply yet, but my 
>> suspicion is that to properly support mapping in VM_IO pages will require 
>> some general refactoring since we always assume that a struct page exists 
>> for any HPA.
>>     
>
> Yes, that was potentially a problem for reserved _ram_ pages too, as it
> isn't guaranteed that memmap_t (old days nomenclature) will exist for
> physical addresses not defined as ram in the e820 map (to make it work
> without VT-d I have to reserve the ram in the host at the e820 map
> parsing time). If the memmap will not exist for the reserved ram
> physical range, the pfn_valid() will fail at runtime in kvm and the
> bad_page will generate a graceful emulation failure, so it's very
> safe. But once we handle direct memslots for mmio regions, the
> reserved ram will better stop depending on the memmap too.
>   

Direct memslots is quite a sledgehammer though to deal with IO pages.

You could get away with supporting reserved RAM another way though.  If 
we refactored the MMU to use hfn_t instead of struct page, you would 
then need a mechanism to mmap() reserved ram into userspace (similar to 
ioremap I guess).  In fact, you may be able to just lie and claim 
reserved ram is IO ram.

Then we could treat the reserved ram in the same way as IO ram and not 
have to introduce a new interface in KVM for mapping arbitrary regions 
of memory.

I would be interested in this sort of mechanism not for reserving low 
RAM, but for reserving high RAM.  Basically, I'd like to boot with mem= 
something and then still be able to map the higher memory in QEMU 
userspace and then create KVM guests with it.

Regards,

Anthony Liguori

-------------------------------------------------------------------------
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/1] direct mmio for passthrough - kernel part
  2008-04-01 19:22           ` Avi Kivity
@ 2008-04-01 22:38             ` Andrea Arcangeli
  0 siblings, 0 replies; 32+ messages in thread
From: Andrea Arcangeli @ 2008-04-01 22:38 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel, allen.m.kay, benami

On Tue, Apr 01, 2008 at 10:22:51PM +0300, Avi Kivity wrote:
> It's just something we discussed, not code.

Yes, the pfn_valid check should skip all refcounting for mmio regions
without a struct page. But gfn_to_page can't work without a struct
page, so some change will be needed there. With the reserved ram patch
I didn't need to touch the gfn_to_page interface because the memmap
happens to exist despite the ram is reserved in the e820 map, so I
used !page_count() instead of the previously discussed !pfn_valid to
skip all refcounting (reserved ram must skip the refcounting too
because those ram pages can't be freed as they're not-ram as far as
linux is concerned).

-------------------------------------------------------------------------
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/1] direct mmio for passthrough - kernel part
  2008-04-01 22:29             ` Anthony Liguori
@ 2008-04-02  4:00               ` Avi Kivity
  0 siblings, 0 replies; 32+ messages in thread
From: Avi Kivity @ 2008-04-02  4:00 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: kvm-devel, allen.m.kay, Andrea Arcangeli, benami

Anthony Liguori wrote:
>
> You could get away with supporting reserved RAM another way though.  
> If we refactored the MMU to use hfn_t instead of struct page, you 
> would then need a mechanism to mmap() reserved ram into userspace 
> (similar to ioremap I guess).  In fact, you may be able to just lie 
> and claim reserved ram is IO ram.

We'd need it to return a (hfn_t, page_type_t) pair so the caller would 
know whether to use page refcounting or not.  The caller might be able 
to ask Linux whether the page has a struct page or not, but it gets hairy.

That's why I'm interested in the possibility of adding struct pages at 
runtime. vmemmap is needed for many other reasons (NUMA, memory hotplug, 
sparse SGI memory) so it's reasonable that it will be enabled on most 
distros.

-- 
Any sufficiently difficult bug is indistinguishable from a feature.


-------------------------------------------------------------------------
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/1] direct mmio for passthrough - kernel part
  2008-04-01 20:04         ` Anthony Liguori
@ 2008-04-02  4:32           ` Avi Kivity
  2008-04-02  7:03             ` Andrea Arcangeli
  0 siblings, 1 reply; 32+ messages in thread
From: Avi Kivity @ 2008-04-02  4:32 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: kvm-devel, andrea, allen.m.kay, Ben-Ami Yassour1

Anthony Liguori wrote:
> What about switching the KVM MMU code to use hfn_t instead of struct 
> page? The initial conversion is pretty straight forward as the places 
> where you actually need a struct page you can always get it from 
> pfn_to_page() (like in kvm_release_page_dirty).
>
> We can then teach the MMU to deal with hfn_t's that don't have a 
> corresponding page.  IIUC, the implementation of something like 
> kvm_release_page_dirty is a nop for pfn's that don't have a 
> corresponding page.  We just have to be able to detect a pfn_to_page() 
> failure and then assume we're dealing with IO memory.
>
>   

It ought to work.  gfn_to_hfn() (old gfn_to_page) will still need to 
take a refcount if possible.

This will increase the potential for type errors, so maybe we need to 
make gfn_t and hfn_t distinct structures, and use accessors to get the 
actual values.

-- 
Any sufficiently difficult bug is indistinguishable from a feature.


-------------------------------------------------------------------------
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/1] direct mmio for passthrough - kernel part
  2008-04-02  4:32           ` Avi Kivity
@ 2008-04-02  7:03             ` Andrea Arcangeli
  2008-04-02  9:50               ` Avi Kivity
  0 siblings, 1 reply; 32+ messages in thread
From: Andrea Arcangeli @ 2008-04-02  7:03 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel, allen.m.kay, Ben-Ami Yassour1

On Wed, Apr 02, 2008 at 07:32:35AM +0300, Avi Kivity wrote:
> It ought to work.  gfn_to_hfn() (old gfn_to_page) will still need to take a 
> refcount if possible.

This reminds me that with mmu notifiers we could implement gfn_to_hfn only
with follow_page and skip the refcounting on the struct page.

I'm not suggesting that though, the refcount makes the code more
robust IMHO, and notably it allows to run on kernels without mmu
notifiers.

So the trick is to do something like

   if (pfn_valid(pfn))
      put_page(pfn_to_page(pfn));

That's the trick I was suggesting to take care of mmio space.

If the reserved ram is used instead of VT-d to allow DMA, that will
have to change to:

   if (pfn_valid(pfn))
      page = pfn_to_page(pfn);
      if (page_count(page))
      	 put_page(page);

that will take care of both the reserved ram and the pfn.

In general I made it for the reserved-ram with only the page_count !=
0 check, because 'struct page' existed.

I'm unsure if it's good to add struct pages for non-ram, I find it
slightly confusing and not the right thing, it takes memory for stuff
that can't happen (refcounting only makes sense if the page finally
goes in the freelist when count reaches 0, and PG_dirty/referenced
bits and the like don't make sense either for non-ram or
reserved-ram). So I'm not sure the vmemmap trick is the best.

> This will increase the potential for type errors, so maybe we need to make 
> gfn_t and hfn_t distinct structures, and use accessors to get the actual 
> values.

That's also a possibility. If we go for this then hfn_t is probably a
better name as it's less likely to collide with core kernel VM
code. Otherwise perhaps "pfn" can be used instead of hfn, it's up to
you ;).

-------------------------------------------------------------------------
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/1] direct mmio for passthrough - kernel part
  2008-04-02  7:03             ` Andrea Arcangeli
@ 2008-04-02  9:50               ` Avi Kivity
  2008-04-02 10:28                 ` Andrea Arcangeli
  0 siblings, 1 reply; 32+ messages in thread
From: Avi Kivity @ 2008-04-02  9:50 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: kvm-devel, allen.m.kay, Ben-Ami Yassour1

Andrea Arcangeli wrote:
> On Wed, Apr 02, 2008 at 07:32:35AM +0300, Avi Kivity wrote:
>   
>> It ought to work.  gfn_to_hfn() (old gfn_to_page) will still need to take a 
>> refcount if possible.
>>     
>
> This reminds me, that mmu notifiers we could implement gfn_to_hfn only
> with follow_page and skip the refcounting on the struct page.
>
> I'm not suggesting that though, the refcount makes the code more
> robust IMHO, and notably it allows to run on kernels without mmu
> notifiers.
>
>   

Isn't it faster though?  We don't need to pull in the cacheline 
containing the struct page anymore.

We could hack something to make pre mmu notifier kernels work.

>
> I'm unsure if it's good to add struct pages for non-ram, I find it
> slightly confusing and not the right thing, it takes memory for stuff
> that can't happen (refcounting only makes sense if the page finally
> goes in the freelist when count reaches 0, and PG_dirty/referenced
> bits and the like don't make sense either for non-ram or
> reserved-ram). So I'm not sure the vmemmap trick is the best.
>
>   

I thought we'd meet with resistance to that idea :) though I'd like to 
point out that struct pages do exist for mmio on some machines (those 
with >= 4GB).

>> This will increase the potential for type errors, so maybe we need to make 
>> gfn_t and hfn_t distinct structures, and use accessors to get the actual 
>> values.
>>     
>
> That's also a possibility. If we go for this then hfn_t is probably a
> better name as it's less likely to collide with core kernel VM
> code. Otherwise perhaps "pfn" can be used instead of hfn, it's up to
> you ;).
>   

I guess we can move to pfn, they're unambiguous enough.

-- 
error compiling committee.c: too many arguments to function


-------------------------------------------------------------------------
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/1] direct mmio for passthrough - kernel part
  2008-04-02  9:50               ` Avi Kivity
@ 2008-04-02 10:28                 ` Andrea Arcangeli
  2008-04-02 10:59                   ` Avi Kivity
  0 siblings, 1 reply; 32+ messages in thread
From: Andrea Arcangeli @ 2008-04-02 10:28 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel, allen.m.kay, Ben-Ami Yassour1

On Wed, Apr 02, 2008 at 12:50:50PM +0300, Avi Kivity wrote:
> Isn't it faster though?  We don't need to pull in the cacheline containing 
> the struct page anymore.

Exactly, not only that, get_user_pages is likely a bit slower than we
need for just kvm pte lookup. GRU uses follow_page directly because it
runs in the tlb miss handler, for us instead the tlb miss handler
doesn't invoke a page fault unless the spte is non present.

I expect for us that optimization will be mostly lost in the noise
but it is likely measurable in heavy VM workloads and in general it
sure worth it in the long run (if nothing else as a microoptimization
for whatever spte fault benchmark).

> We could hack something to make pre mmu notifier kernels work.

Actually I already removed the refcounting for the reserved ram, so
it's going to be very easy to do with a few CONFIG_MMU_NOTIFIER.

But because it's a low prio I would defer it for later, first patch
can do the refcounting unconditionally to better test it. (I'm
assuming we're going to deal with kernels without mmu notifiers [or
without EMM ;) ] for a long while)

> I thought we'd meet with resistance to that idea :) though I'd like to 
> point out that struct pages to exist for mmio on some machines (those with 
> >= 4GB).

Sure, those should have page_count 0 like the reserved ram. I'm
uncertain about the PageReserved semantics, I thought Nick nuked it
long ago but perhaps it was something else. Now copy_page_range bails
out on PFNMAP vmas, in the old days it would treat PG_reserved pages
mapped with remap_page_range like a VM_SHARED by not wrprotecting and
not refcounting even if this was a private mapping.

In general using page_t for anything but ram is something that should
be avoided and that also makes PG_reserved semantics mostly
irrelevant. The users of remap_pfn_range (like /dev/mem) should never
use pages regardless if it's ram or non-ram or if page_t exists or
not. The fact page_t sometime exists for non-ram is strictly a
performance optimization to make the pfn_to_page resolution quicker at
runtime (speedup is significant and memory loss is negligible).

> I guess we can move to pfn, they're unambiguous enough.

Ok! ;)

-------------------------------------------------------------------------
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/1] direct mmio for passthrough - kernel part
  2008-04-02 10:28                 ` Andrea Arcangeli
@ 2008-04-02 10:59                   ` Avi Kivity
  2008-04-02 11:16                     ` Avi Kivity
  2008-04-02 14:59                     ` Anthony Liguori
  0 siblings, 2 replies; 32+ messages in thread
From: Avi Kivity @ 2008-04-02 10:59 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: kvm-devel, allen.m.kay, Ben-Ami Yassour1

Andrea Arcangeli wrote:
> On Wed, Apr 02, 2008 at 12:50:50PM +0300, Avi Kivity wrote:
>   
>> Isn't it faster though?  We don't need to pull in the cacheline containing 
>> the struct page anymore.
>>     
>
> Exactly, not only that, get_user_pages is likely a bit slower that we
> need for just kvm pte lookup. GRU uses follow_page directly because it
> runs in the tlb miss handler, for us instead the tlb miss handler
> doesn't invoke a page fault unless the spte is non present.
>
>   

How about this plan?

0. Merge mmu notifiers

1. gfn_to_page() -> gfn_to_pfn()

Still keeping the refcount.  Change bad_page to kvm_bad_hfn.

2. Drop the refcounting from gfn_to_pfn() and from kvm_release_page_*()

Still using get_user_pages() (and dropping the refcount immediately)

Simultaneously, change hack_module.awk to add the refcount back.

3. Export follow_page() or something based on fast_gup(), and use it

btw, if we change the method we use to read the Linux pte, I'd like to 
get the writable bit out of it.  This way, when we create an spte for a 
gpte that is writable and dirty, we can set the spte writable iff the 
Linux pte is writable.  This avoids breaking COW unnecessarily.

-- 
error compiling committee.c: too many arguments to function


-------------------------------------------------------------------------
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/1] direct mmio for passthrough - kernel part
  2008-04-02 10:59                   ` Avi Kivity
@ 2008-04-02 11:16                     ` Avi Kivity
  2008-04-02 11:50                       ` Andrea Arcangeli
  2008-04-02 14:59                     ` Anthony Liguori
  1 sibling, 1 reply; 32+ messages in thread
From: Avi Kivity @ 2008-04-02 11:16 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: kvm-devel, allen.m.kay, Ben-Ami Yassour1

Avi Kivity wrote:
> Andrea Arcangeli wrote:
>> On Wed, Apr 02, 2008 at 12:50:50PM +0300, Avi Kivity wrote:
>>  
>>> Isn't it faster though?  We don't need to pull in the cacheline 
>>> containing the struct page anymore.
>>>     
>>
>> Exactly, not only that, get_user_pages is likely a bit slower that we
>> need for just kvm pte lookup. GRU uses follow_page directly because it
>> runs in the tlb miss handler, for us instead the tlb miss handler
>> doesn't invoke a page fault unless the spte is non present.
>>
>>   
>
> How about this plan?
>
> 0. Merge mmu notifiers
>
> 1. gfn_to_page() -> gfn_to_pfn()
>
> Still keeping the refcount.  Change bad_page to kvm_bad_hfn.
>
> 2. Drop the refcounting from gfn_to_pfn() and from kvm_release_page_*()
>
> Still using get_user_pages() (and dropping the refcount immediately)
>
> Simultaneously, change hack_module.awk to add the refcount back.
>
> 3. Export follow_page() or something based on fast_gup(), and use it
>
> btw, if we change the method we use to read the Linux pte, I'd like to 
> get the writable bit out of it.  This was, when we create an spte for 
> a gpte that is writable and dirty, we can set the spte writable iff 
> the Linux pte is writable.  This avoids breaking COW unnecessarily.
>

Ugh, there's still mark_page_accessed() and SetPageDirty().


-- 
error compiling committee.c: too many arguments to function


-------------------------------------------------------------------------
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/1] direct mmio for passthrough - kernel part
  2008-04-02 11:16                     ` Avi Kivity
@ 2008-04-02 11:50                       ` Andrea Arcangeli
  2008-04-02 11:53                         ` Andrea Arcangeli
  2008-04-03  8:51                         ` Avi Kivity
  0 siblings, 2 replies; 32+ messages in thread
From: Andrea Arcangeli @ 2008-04-02 11:50 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel, allen.m.kay, Ben-Ami Yassour1

On Wed, Apr 02, 2008 at 02:16:41PM +0300, Avi Kivity wrote:
> Ugh, there's still mark_page_accessed() and SetPageDirty().

btw, like PG_dirty is only set if the spte is writeable,
mark_page_accessed should only run if the accessed bit is set in the
spte. It doesn't matter now as nobody could possibly clear it, but
with mmu notifiers the ->clear_young will clear that accessed bit for
the first time, and if the guest didn't use the spte in the meantime,
we don't need to mark the page accessed and we can let the VM swapout
the page at the next round of the lru.

But that's ok, we'll simply run mark_page_accessed and SetPageDirty if
pfn_valid is true. We're under mmu_lock so the mmu notifiers will
prevent the page from going away from under us. 

The detail I'm uncertain about is why my reserved-ram patch had both
pfn_valid = 1 and page_count =0 and PG_reserved =0 for all reserved
ram. Perhaps I triggered a bug in some memmap initialization as the
common code should keep all pages that aren't supposed to go in the
freelist when freed as PG_reserved and page_count 1. Anyway in
practice it worked fine and there's zero risk I'm not using
reserved-ram with the page_count = 0 check ;).

The rmap_remove should look like this (this won't work with the
reserved ram but that needs fixing somehow as reserved-ram must be
like mmio region but with a pfn_valid and in turn with a page_t, and
then the reserved-ram patch will be able to cope with the reserved ram
not having a allocated memmap by luck of how page_alloc.c works).

	if (pfn_valid(pfn)) {
		page = pfn_to_page(pfn);
		if (!PageReserved(page)) {
			BUG_ON(page_count(page) != 1);
		if (is_writeable_pte(*spte))
			SetPageDirty(page);
		if (*spte & PT_ACCESSED_MASK)
			mark_page_accessed(page);
		}
	}

It still skips an atomic op. Your plan still sounds just fine despite
the above, infact it sounds too perfect: the awk hack to re-add the
refcounting when building the external module if CONFIG_MMU_NOTIFIER
isn't defined is going to be messy, a plain CONFIG_MMU_NOTIFIER in
kvm.git would be simpler and more robust IMHO even if less perfect :).

-------------------------------------------------------------------------
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/1] direct mmio for passthrough - kernel part
  2008-04-02 11:50                       ` Andrea Arcangeli
@ 2008-04-02 11:53                         ` Andrea Arcangeli
  2008-04-03  8:51                         ` Avi Kivity
  1 sibling, 0 replies; 32+ messages in thread
From: Andrea Arcangeli @ 2008-04-02 11:53 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel, allen.m.kay, Ben-Ami Yassour1

On Wed, Apr 02, 2008 at 01:50:19PM +0200, Andrea Arcangeli wrote:
> 	if (pfn_valid(pfn)) {
> 		page = pfn_to_page(pfn);
> 		if (!PageReserved(page)) {
> 			BUG_ON(page_count(page) != 1);
> 		if (is_writeable_pte(*spte))
> 			SetPageDirty(page);
> 		if (*spte & PT_ACCESSED_MASK)
> 			mark_page_accessed(page);
> 		}
> 	}

warning the indenting of the above in text-mode was wrong... read it
like a c compiler would do sorry ;)

-------------------------------------------------------------------------
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/1] direct mmio for passthrough - kernel part
  2008-04-02 10:59                   ` Avi Kivity
  2008-04-02 11:16                     ` Avi Kivity
@ 2008-04-02 14:59                     ` Anthony Liguori
  1 sibling, 0 replies; 32+ messages in thread
From: Anthony Liguori @ 2008-04-02 14:59 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel, allen.m.kay, Andrea Arcangeli, Ben-Ami Yassour1

Avi Kivity wrote:
> Andrea Arcangeli wrote:
>> On Wed, Apr 02, 2008 at 12:50:50PM +0300, Avi Kivity wrote:
>>  
>>> Isn't it faster though?  We don't need to pull in the cacheline 
>>> containing the struct page anymore.
>>>     
>>
>> Exactly, not only that, get_user_pages is likely a bit slower that we
>> need for just kvm pte lookup. GRU uses follow_page directly because it
>> runs in the tlb miss handler, for us instead the tlb miss handler
>> doesn't invoke a page fault unless the spte is non present.
>>
>>   
>
> How about this plan?
>
> 0. Merge mmu notifiers

Are mmu-notifiers likely to get pushed when the 2.6.26 window opens up?

> 1. gfn_to_page() -> gfn_to_pfn()
>
> Still keeping the refcount.  Change bad_page to kvm_bad_hfn.

kvm_bad_pfn.

> 2. Drop the refcounting from gfn_to_pfn() and from kvm_release_page_*()
>
> Still using get_user_pages() (and dropping the refcount immediately)
>
> Simultaneously, change hack_module.awk to add the refcount back.

This has the dependency on mmu notifiers so step 1 can conceivably be 
merged in the absence of mmu notifiers.

Regards,

Anthony Liguori

> 3. Export follow_page() or something based on fast_gup(), and use it
>
> btw, if we change the method we use to read the Linux pte, I'd like to 
> get the writable bit out of it.  This was, when we create an spte for 
> a gpte that is writable and dirty, we can set the spte writable iff 
> the Linux pte is writable.  This avoids breaking COW unnecessarily.
>


-------------------------------------------------------------------------
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/1] direct mmio for passthrough - kernel part
  2008-04-02 11:50                       ` Andrea Arcangeli
  2008-04-02 11:53                         ` Andrea Arcangeli
@ 2008-04-03  8:51                         ` Avi Kivity
  1 sibling, 0 replies; 32+ messages in thread
From: Avi Kivity @ 2008-04-03  8:51 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: kvm-devel, allen.m.kay, Ben-Ami Yassour1

Andrea Arcangeli wrote:
> On Wed, Apr 02, 2008 at 02:16:41PM +0300, Avi Kivity wrote:
>   
>> Ugh, there's still mark_page_accessed() and SetPageDirty().
>>     
>
> btw, like PG_dirty is only set if the spte is writeable,
> mark_page_accessed should only run if the accessed bit is set in the
> spte. It doesn't matter now as nobody could possibly clear it, 

No one will clear it now, but it can start out cleared.  This is done on 
speculative mmu_set_spte(): when the guest writes into its page tables, 
we update the spte speculatively, but the guest may not actually access 
that location (for example, due to a page fault clustering).

So the change makes sense even now.

> It still skips an atomic op. Your plan still sounds just fine despite
> the above, infact it sounds too perfect: the awk hack to re-add the
> refcounting when building the external module if CONFIG_MMU_NOTIFIER
> isn't defined is going to be messy, a plain CONFIG_MMU_NOTIFIER in
> kvm.git would be simpler and more robust IMHO even if less perfect :).
>   

Worst case, we stick a get_user_pages() inside the memslot setup 
function.  That makes things not swappable for pre-mmu notifiers, but 
completely safe.

I'd rather avoid special casing the core code, whenever possible.

-- 
error compiling committee.c: too many arguments to function


-------------------------------------------------------------------------
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace

^ permalink raw reply	[flat|nested] 32+ messages in thread

end of thread, other threads:[~2008-04-03  8:51 UTC | newest]

Thread overview: 32+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-04-01 11:52 [RFC] direct mmio for passthrough benami
2008-04-01 11:52 ` [PATCH 1/1] direct mmio for passthrough - kernel part benami
2008-04-01 13:30   ` Avi Kivity
2008-04-01 14:42     ` Anthony Liguori
2008-04-01 15:20       ` Anthony Liguori
2008-04-01 17:05         ` Avi Kivity
2008-04-01 18:18         ` Andrea Arcangeli
2008-04-01 18:28           ` Anthony Liguori
2008-04-01 17:03       ` Avi Kivity
2008-04-01 17:18         ` Daniel P. Berrange
2008-04-01 18:10           ` Andrea Arcangeli
2008-04-01 18:18             ` Daniel P. Berrange
2008-04-01 18:23             ` Anthony Liguori
2008-04-01 18:21         ` Anthony Liguori
2008-04-01 19:22           ` Avi Kivity
2008-04-01 22:38             ` Andrea Arcangeli
2008-04-01 22:22           ` Andrea Arcangeli
2008-04-01 22:29             ` Anthony Liguori
2008-04-02  4:00               ` Avi Kivity
2008-04-01 19:28     ` Ben-Ami Yassour1
2008-04-01 19:43       ` Avi Kivity
2008-04-01 20:04         ` Anthony Liguori
2008-04-02  4:32           ` Avi Kivity
2008-04-02  7:03             ` Andrea Arcangeli
2008-04-02  9:50               ` Avi Kivity
2008-04-02 10:28                 ` Andrea Arcangeli
2008-04-02 10:59                   ` Avi Kivity
2008-04-02 11:16                     ` Avi Kivity
2008-04-02 11:50                       ` Andrea Arcangeli
2008-04-02 11:53                         ` Andrea Arcangeli
2008-04-03  8:51                         ` Avi Kivity
2008-04-02 14:59                     ` Anthony Liguori

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox