* [RFC] direct mmio for passthrough @ 2008-04-01 11:52 benami 2008-04-01 11:52 ` [PATCH 1/1] direct mmio for passthrough - kernel part benami 0 siblings, 1 reply; 32+ messages in thread From: benami @ 2008-04-01 11:52 UTC (permalink / raw) To: amit.shah; +Cc: kvm-devel, allen.m.kay, andrea, benami This patch for PCI passthrough devices enables a guest to access a device's memory mapped I/O regions directly, without requiring the host to trap and emulate every MMIO access. We save the list of MMIO regions of the guest's devices and the corresponding host MMIO regions in the host kernel. When the guest page faults due to an access to an MMIO region, the host updates the guest's sptes to map the correct host MMIO region. The kernel part of this patchset applies to kvm HEAD and the userspace part applies to Amit's pci-passthrough tree. Tested on a Lenovo M57p with an e1000 NIC assigned directly to an FC8 guest. Comments are appreciated. ------------------------------------------------------------------------- Check out the new SourceForge.net Marketplace. It's the best place to buy or sell services for just about anything Open Source. http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace ^ permalink raw reply [flat|nested] 32+ messages in thread
* [PATCH 1/1] direct mmio for passthrough - kernel part 2008-04-01 11:52 [RFC] direct mmio for passthrough benami @ 2008-04-01 11:52 ` benami 2008-04-01 13:30 ` Avi Kivity 0 siblings, 1 reply; 32+ messages in thread From: benami @ 2008-04-01 11:52 UTC (permalink / raw) To: amit.shah; +Cc: kvm-devel, allen.m.kay, andrea, benami From: Ben-Ami Yassour <benami@il.ibm.com> Enable a guest to access a device's memory mapped I/O regions directly. Userspace sends the mmio regions that the guest can access. On the first page fault for an access to an mmio address the host translates the gva to hpa, and updates the sptes. Signed-off-by: Ben-Ami Yassour <benami@il.ibm.com> Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com> --- arch/x86/kvm/Makefile | 2 +- arch/x86/kvm/mmu.c | 27 ++++++++ arch/x86/kvm/mmu.h | 2 + arch/x86/kvm/paging_tmpl.h | 71 +++++++++++++++++++--- arch/x86/kvm/passthrough.c | 144 ++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/passthrough.h | 23 +++++++ arch/x86/kvm/x86.c | 2 +- include/asm-x86/kvm_host.h | 15 +++++ include/linux/kvm.h | 13 ++++ include/linux/kvm_host.h | 10 +++ virt/kvm/kvm_main.c | 24 +++++++ 11 files changed, 323 insertions(+), 10 deletions(-) create mode 100644 arch/x86/kvm/passthrough.c create mode 100644 arch/x86/kvm/passthrough.h diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 4d0c22e..2fa4932 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -7,7 +7,7 @@ common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o) EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \ - i8254.o + i8254.o passthrough.o obj-$(CONFIG_KVM) += kvm.o kvm-intel-objs = vmx.o obj-$(CONFIG_KVM_INTEL) += kvm-intel.o diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 8a6a4f9..dccd898 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -34,6 +34,8 @@ #include <asm/cmpxchg.h> #include <asm/io.h> +#include "passthrough.h" + /* * When 
setting this variable to true it enables Two-Dimensional-Paging * where the hardware walks 2 page tables: @@ -112,6 +114,8 @@ static int dbg = 1; #define PT_FIRST_AVAIL_BITS_SHIFT 9 #define PT64_SECOND_AVAIL_BITS_SHIFT 52 +#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) + #define VALID_PAGE(x) ((x) != INVALID_PAGE) #define PT64_LEVEL_BITS 9 @@ -545,6 +549,11 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) unsigned long *rmapp; int i; + /* bail out if this is an spte mapping an MMIO region */ + if (kvm->arch.pt.pt_mmio_mapped) + if (*spte & PT_SHADOW_IO_MARK) + return; + if (!is_rmap_pte(*spte)) return; sp = page_header(__pa(spte)); @@ -1273,6 +1282,24 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) vcpu->arch.mmu.root_hpa = INVALID_PAGE; } +/* + * This is a very big sledgehammer, it should be called very rarely. + * We call this when a passthrough mmio mapping changes in order to + * remove the old passthrough mmio sptes. This usually only occurs on + * guest initialization. 
+ */ +void mmu_invalidate_all_sptes(struct kvm *kvm) +{ + int i; + + for (i = 0 ; i < KVM_MAX_VCPUS ; i++) { + if (kvm->vcpus[i]) + mmu_free_roots(kvm->vcpus[i]); + } + + kvm_flush_remote_tlbs(kvm); +} + static void mmu_alloc_roots(struct kvm_vcpu *vcpu) { int i; diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index e64e9f5..5c9e33e 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -47,4 +47,6 @@ static inline int is_paging(struct kvm_vcpu *vcpu) return vcpu->arch.cr0 & X86_CR0_PG; } +void mmu_invalidate_all_sptes(struct kvm *kvm); + #endif diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 57d872a..c33b6cf 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -73,6 +73,8 @@ struct guest_walker { u32 error_code; }; +static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr); + static gfn_t gpte_to_gfn(pt_element_t gpte) { return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; @@ -275,7 +277,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, struct guest_walker *walker, int user_fault, int write_fault, int largepage, - int *ptwrite, struct page *page) + int *ptwrite, int pt_mmio, struct page *page) { hpa_t shadow_addr; int level; @@ -346,15 +348,61 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, *shadow_ent = shadow_pte; } - mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access, - user_fault, write_fault, - walker->ptes[walker->level-1] & PT_DIRTY_MASK, - ptwrite, largepage, walker->gfn, page, false); - + /* for the passthrough mmio case, spte is set by page_fault_pt_mmio */ + if (!pt_mmio) + mmu_set_spte(vcpu, shadow_ent, access, + walker->pte_access & access, + user_fault, write_fault, + walker->ptes[walker->level-1] & PT_DIRTY_MASK, + ptwrite, largepage, walker->gfn, page, false); return shadow_ent; } /* + * Handle pagefault for passthrough mmio + */ +static int 
FNAME(page_fault_pt_mmio)(struct kvm_vcpu *vcpu, gva_t addr, + struct guest_walker *walker, + int user_fault, int write_fault, + int largepage, int *ptwrite, + struct page *page) +{ + u64 *shadow_pte; + gpa_t gpa; + hpa_t hpa = UNMAPPED_HPA; + u64 spte; + int rc = 1; + int write_pt = 0; + + gpa = FNAME(gva_to_gpa)(vcpu, addr); + if (gpa != UNMAPPED_GVA) + hpa = pt_mmio_gpa_to_hpa(vcpu->kvm, gpa); + + if (hpa != UNMAPPED_HPA) { + spin_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_free_some_pages(vcpu); + shadow_pte = FNAME(fetch)(vcpu, addr, walker, user_fault, + write_fault, largepage, &write_pt, + 1, page); + + if (shadow_pte) { + set_shadow_pte(&spte, *shadow_pte); + spte = hpa; + spte |= PT_PRESENT_MASK | PT_WRITABLE_MASK | + PT_USER_MASK | PT_PWT_MASK | PT_PCD_MASK | + PT_ACCESSED_MASK | PT_SHADOW_IO_MARK; + set_shadow_pte(shadow_pte, spte); + rc = 0; + } else { + pgprintk("fetch failed to return shadow_pte " + "for address 0x%x", (unsigned int)addr); + } + spin_unlock(&vcpu->kvm->mmu_lock); + } + return rc; +} + +/* * Page fault handler. 
There are several causes for a page fault: * - there is no shadow pte for the guest pte * - write access through a shadow pte marked read only so that we can set @@ -418,15 +466,22 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, /* mmio */ if (is_error_page(page)) { + int rc = 1; pgprintk("gfn %x is mmio\n", walker.gfn); + if (vcpu->kvm->arch.pt.pt_mmio_mapped) { + rc = FNAME(page_fault_pt_mmio)(vcpu, addr, &walker, + user_fault, write_fault, + largepage, &write_pt, + page); + } kvm_release_page_clean(page); - return 1; + return rc; } spin_lock(&vcpu->kvm->mmu_lock); kvm_mmu_free_some_pages(vcpu); shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, - largepage, &write_pt, page); + largepage, &write_pt, 0, page); pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__, shadow_pte, *shadow_pte, write_pt); diff --git a/arch/x86/kvm/passthrough.c b/arch/x86/kvm/passthrough.c new file mode 100644 index 0000000..654d1fe --- /dev/null +++ b/arch/x86/kvm/passthrough.c @@ -0,0 +1,144 @@ +/* + * This module enable guest shadow page tables to map host mmio regions of + * passthrough devices directly. + * + * Copyright IBM, Corp. 2008 + * + * Authors: + * Ben-Ami Yassour <benami@il.ibm.com> + * Muli Ben-Yehuda <muli@il.ibm.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. 
+ * + */ + +#include "linux/kvm_host.h" +#include "passthrough.h" +#include "mmu.h" + +void pt_init_vm(struct pt *pt) +{ + spin_lock_init(&pt->pt_list_lock); + INIT_LIST_HEAD(&pt->mmio_range_list); + pt->pt_mmio_mapped = 0; +} + +hpa_t pt_mmio_gpa_to_hpa(struct kvm *kvm, gpa_t gpa) +{ + hpa_t hpa = UNMAPPED_HPA; + unsigned long flags; + struct mmio_range *mmio = NULL; + + spin_lock_irqsave(&kvm->arch.pt.pt_list_lock, flags); + + /* loop the list of regions */ + list_for_each_entry(mmio, &kvm->arch.pt.mmio_range_list, list) { + if ((gpa >= mmio->gpa) && (gpa < mmio->gpa + mmio->size)) { + hpa = mmio->hpa + (gpa - mmio->gpa); + break; + } + } + + spin_unlock_irqrestore(&kvm->arch.pt.pt_list_lock, flags); + + return hpa; +} + +int kvm_vm_ioctl_pt_memory_mapping_add(struct kvm *kvm, + struct kvm_pt_memory_mapping* + memory_mapping) +{ + gpa_t gpa = memory_mapping->gpa; + hpa_t hpa = memory_mapping->hpa; + u64 size = memory_mapping->size; + int ret = 0; + struct mmio_range *mmio_new; + struct mmio_range *mmio_curr = NULL; + struct mmio_range *mmio_prev = NULL; + unsigned long flags; + + mmio_new = kmalloc(sizeof(*mmio_new), GFP_KERNEL); + if (!mmio_new) + return -ENOMEM; + + mmio_new->gpa = gpa; + mmio_new->hpa = hpa; + mmio_new->size = size; + + spin_lock_irqsave(&kvm->arch.pt.pt_list_lock, flags); + + /* search for the location to add this range */ + ret = 0; + list_for_each_entry(mmio_curr, &kvm->arch.pt.mmio_range_list, list) { + if ((mmio_curr->gpa + mmio_curr->size) <= gpa) { + mmio_prev = mmio_curr; + continue; + } else if ((gpa + size) <= mmio_curr->gpa) { + /* no intersection between ranges */ + break; + } else { + /* ranges intersect */ + ret = -EINVAL; + break; + } + } + + if (!ret) { + struct mmio_range *place; + place = mmio_prev ? 
mmio_prev : + (struct mmio_range *)&kvm->arch.pt.mmio_range_list; + list_add((struct list_head *)mmio_new, + (struct list_head *)place); + + kvm->arch.pt.pt_mmio_mapped = 1; + } + + spin_unlock_irqrestore(&kvm->arch.pt.pt_list_lock, flags); + + if (ret) + kfree(mmio_new); + + return ret; +} + +int kvm_vm_ioctl_pt_memory_mapping_remove(struct kvm *kvm, + struct kvm_pt_memory_mapping* + memory_mapping) +{ + gpa_t gpa = memory_mapping->gpa; + u64 size = memory_mapping->size; + int ret = 0; + struct mmio_range *mmio_curr = NULL; + unsigned long flags; + + /* search the range to remove */ + ret = 0; + spin_lock_irqsave(&kvm->arch.pt.pt_list_lock, flags); + list_for_each_entry(mmio_curr, &kvm->arch.pt.mmio_range_list, list) { + if ((mmio_curr->gpa + mmio_curr->size) <= gpa) { + continue; + } else if ((gpa + size) <= mmio_curr->gpa) { + /* not found */ + ret = -EINVAL; + break; + } else { + /* ranges intersect */ + if ((gpa != mmio_curr->gpa) || + (size != mmio_curr->size)) { + /* ranges are not equal */ + ret = -EINVAL; + } + break; + } + } + + if (!ret) + list_del((struct list_head *)mmio_curr); + + spin_unlock_irqrestore(&kvm->arch.pt.pt_list_lock, flags); + + mmu_invalidate_all_sptes(kvm); + + return ret; +} diff --git a/arch/x86/kvm/passthrough.h b/arch/x86/kvm/passthrough.h new file mode 100644 index 0000000..68810c6 --- /dev/null +++ b/arch/x86/kvm/passthrough.h @@ -0,0 +1,23 @@ +/* + * This module enable guest shadow page tables to map host mmio regions of + * passthrough devices directly. + * + * Copyright IBM, Corp. 2008 + * + * Authors: + * Ben-Ami Yassour <benami@il.ibm.com> + * Muli Ben-Yehuda <muli@il.ibm.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. 
+ * + */ + +#ifndef PASSTHROUGH_H +#define PASSTHROUGH_H + +#include <asm/kvm_host.h> + +hpa_t pt_mmio_gpa_to_hpa(struct kvm *kvm, gpa_t gpa); + +#endif diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 94d77e8..3398ae7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3811,7 +3811,7 @@ struct kvm *kvm_arch_create_vm(void) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); - + pt_init_vm(&(kvm->arch.pt)); return kvm; } diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h index e6d8df6..01e96f7 100644 --- a/include/asm-x86/kvm_host.h +++ b/include/asm-x86/kvm_host.h @@ -44,6 +44,7 @@ #define INVALID_PAGE (~(hpa_t)0) #define UNMAPPED_GVA (~(gpa_t)0) +#define UNMAPPED_HPA (~(hpa_t)-1) /* shadow tables are PAE even on non-PAE hosts */ #define KVM_HPAGE_SHIFT 21 @@ -296,6 +297,19 @@ struct kvm_mem_alias { gfn_t target_gfn; }; +struct mmio_range { + struct list_head list; + gpa_t gpa; + hpa_t hpa; + u64 size; +}; + +struct pt { + int pt_mmio_mapped; /* guest sptes map host mmio regions directly */ + spinlock_t pt_list_lock; /* protect pt specific lists */ + struct list_head mmio_range_list; /* ordered list of mmio mapping */ +}; + struct kvm_arch{ int naliases; struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS]; @@ -317,6 +331,7 @@ struct kvm_arch{ struct page *apic_access_page; gpa_t wall_clock; + struct pt pt; }; struct kvm_vm_stat { diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 37b963e..956512d 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -289,6 +289,19 @@ struct kvm_s390_interrupt { #define KVM_SET_USER_MEMORY_REGION _IOW(KVMIO, 0x46,\ struct kvm_userspace_memory_region) #define KVM_SET_TSS_ADDR _IO(KVMIO, 0x47) + +/* Bind host I/O address range to guest address range. 
*/ +struct kvm_pt_memory_mapping { + __u64 gpa; + __u64 hpa; + __u64 size; /* in bytes */ +}; + +#define KVM_PT_MEMORY_MAPPING_ADD _IOWR(KVMIO, 0x48, \ + struct kvm_pt_memory_mapping) +#define KVM_PT_MEMORY_MAPPING_REMOVE _IOWR(KVMIO, 0x49, \ + struct kvm_pt_memory_mapping) + /* * KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns * a vcpu fd. diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index f4e1436..3b97624 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -200,6 +200,14 @@ int kvm_get_dirty_log(struct kvm *kvm, int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log); +int kvm_vm_ioctl_pt_memory_mapping_add(struct kvm *kvm, + struct kvm_pt_memory_mapping + *memory_mapping); + +int kvm_vm_ioctl_pt_memory_mapping_remove(struct kvm *kvm, + struct kvm_pt_memory_mapping + *memory_mapping); + int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, @@ -294,4 +302,6 @@ struct kvm_stats_debugfs_item { }; extern struct kvm_stats_debugfs_item debugfs_entries[]; +void pt_init_vm(struct pt *pt); + #endif diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 30bf832..c2c6c05 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1030,6 +1030,30 @@ static long kvm_vm_ioctl(struct file *filp, goto out; break; } + case KVM_PT_MEMORY_MAPPING_ADD: { + struct kvm_pt_memory_mapping input; + + r = -EFAULT; + if (copy_from_user(&input, argp, + sizeof(struct kvm_pt_memory_mapping))) + goto out; + r = kvm_vm_ioctl_pt_memory_mapping_add(kvm, &input); + if (r) + goto out; + break; + } + case KVM_PT_MEMORY_MAPPING_REMOVE: { + struct kvm_pt_memory_mapping input; + + r = -EFAULT; + if (copy_from_user(&input, argp, + sizeof(struct kvm_pt_memory_mapping))) + goto out; + r = kvm_vm_ioctl_pt_memory_mapping_remove(kvm, &input); + if (r) + goto out; + break; + } default: r = kvm_arch_vm_ioctl(filp, ioctl, arg); } -- 1.5.0.3 
------------------------------------------------------------------------- Check out the new SourceForge.net Marketplace. It's the best place to buy or sell services for just about anything Open Source. http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace ^ permalink raw reply related [flat|nested] 32+ messages in thread
* Re: [PATCH 1/1] direct mmio for passthrough - kernel part 2008-04-01 11:52 ` [PATCH 1/1] direct mmio for passthrough - kernel part benami @ 2008-04-01 13:30 ` Avi Kivity 2008-04-01 14:42 ` Anthony Liguori 2008-04-01 19:28 ` Ben-Ami Yassour1 0 siblings, 2 replies; 32+ messages in thread From: Avi Kivity @ 2008-04-01 13:30 UTC (permalink / raw) To: benami; +Cc: kvm-devel, andrea, allen.m.kay benami@il.ibm.com wrote: > From: Ben-Ami Yassour <benami@il.ibm.com> > > Enable a guest to access a device's memory mapped I/O regions directly. > Userspace sends the mmio regions that the guest can access. On the first > page fault for an access to an mmio address the host translates the gva to hpa, > and updates the sptes. > > Can you explain why you're not using the regular memory slot mechanism? i.e. have userspace mmap(/dev/mem) and create a memslot containing that at the appropriate guest physical address? There are some issues with refcounting, but Andrea has some tricks to deal with that. -- Any sufficiently difficult bug is indistinguishable from a feature. ------------------------------------------------------------------------- Check out the new SourceForge.net Marketplace. It's the best place to buy or sell services for just about anything Open Source. http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH 1/1] direct mmio for passthrough - kernel part 2008-04-01 13:30 ` Avi Kivity @ 2008-04-01 14:42 ` Anthony Liguori 2008-04-01 15:20 ` Anthony Liguori 2008-04-01 17:03 ` Avi Kivity 2008-04-01 19:28 ` Ben-Ami Yassour1 1 sibling, 2 replies; 32+ messages in thread From: Anthony Liguori @ 2008-04-01 14:42 UTC (permalink / raw) To: Avi Kivity; +Cc: kvm-devel, allen.m.kay, andrea, benami Avi Kivity wrote: > benami@il.ibm.com wrote: > >> From: Ben-Ami Yassour <benami@il.ibm.com> >> >> Enable a guest to access a device's memory mapped I/O regions directly. >> Userspace sends the mmio regions that the guest can access. On the first >> page fault for an access to an mmio address the host translates the gva to hpa, >> and updates the sptes. >> >> >> > > Can you explain why you're not using the regular memory slot mechanism? > i.e. have userspace mmap(/dev/mem) and create a memslot containing that > at the appropriate guest physical address? > /dev/mem is often restricted in what memory can be mapped. However, we can't add something like this to KVM that allows arbitrary HPA's to be mapped into a guest from userspace. This is just as bad as /dev/mem and is going to upset a lot of people. Regardless of whether we can use /dev/mem, I think we should introduce a new char device anyway. We only need to mmap() MMIO regions which are mapped by the PCI bus, presumably, the kernel should know about these mappings. The driver should only allow mappings that are valid for a particular PCI device such that it cannot be abused to map arbitrary regions of memory into a guest. Bonus points if it can validate that there isn't a valid Linux driver loaded for the given PCI device. Regards, Anthony Liguori > There are some issues with refcounting, but Andrea has some tricks to > deal with that. > > ------------------------------------------------------------------------- Check out the new SourceForge.net Marketplace. 
It's the best place to buy or sell services for just about anything Open Source. http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH 1/1] direct mmio for passthrough - kernel part 2008-04-01 14:42 ` Anthony Liguori @ 2008-04-01 15:20 ` Anthony Liguori 2008-04-01 17:05 ` Avi Kivity 2008-04-01 18:18 ` Andrea Arcangeli 2008-04-01 17:03 ` Avi Kivity 1 sibling, 2 replies; 32+ messages in thread From: Anthony Liguori @ 2008-04-01 15:20 UTC (permalink / raw) To: Avi Kivity; +Cc: kvm-devel, allen.m.kay, andrea, benami Anthony Liguori wrote: > Avi Kivity wrote: >> benami@il.ibm.com wrote: >> >>> From: Ben-Ami Yassour <benami@il.ibm.com> >>> >>> Enable a guest to access a device's memory mapped I/O regions directly. >>> Userspace sends the mmio regions that the guest can access. On the >>> first >>> page fault for an access to an mmio address the host translates the >>> gva to hpa, >>> and updates the sptes. >>> >>> >> >> Can you explain why you're not using the regular memory slot >> mechanism? i.e. have userspace mmap(/dev/mem) and create a memslot >> containing that at the appropriate guest physical address? >> > > /dev/mem is often restricted in what memory can be mapped. However, > we can't add something like this to KVM that allows arbitrary HPA's to > be mapped into a guest from userspace. This is just as bad as > /dev/mem and is going to upset a lot of people. > > Regardless of whether we can use /dev/mem, I think we should introduce > a new char device anyway. We only need to mmap() MMIO regions which > are mapped by the PCI bus, presumably, the kernel should know about > these mappings. The driver should only allow mappings that are valid > for a particular PCI device such that it cannot be abused to map > arbitrary regions of memory into a guest. Bonus points if it can > validate that there isn't a valid Linux driver loaded for the given > PCI device. Which is apparently entirely unnecessary as we already have /sys/bus/pci/.../region. It's just a matter of checking if a vma is VM_IO and then dealing with the subsequent reference counting issues as Avi points out. 
Regards, Anthony Liguori > Regards, > > Anthony Liguori > >> There are some issues with refcounting, but Andrea has some tricks to >> deal with that. >> >> > ------------------------------------------------------------------------- Check out the new SourceForge.net Marketplace. It's the best place to buy or sell services for just about anything Open Source. http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH 1/1] direct mmio for passthrough - kernel part 2008-04-01 15:20 ` Anthony Liguori @ 2008-04-01 17:05 ` Avi Kivity 2008-04-01 18:18 ` Andrea Arcangeli 1 sibling, 0 replies; 32+ messages in thread From: Avi Kivity @ 2008-04-01 17:05 UTC (permalink / raw) To: Anthony Liguori; +Cc: kvm-devel, allen.m.kay, andrea, benami Anthony Liguori wrote: >> >> Regardless of whether we can use /dev/mem, I think we should >> introduce a new char device anyway. We only need to mmap() MMIO >> regions which are mapped by the PCI bus, presumably, the kernel >> should know about these mappings. The driver should only allow >> mappings that are valid for a particular PCI device such that it >> cannot be abused to map arbitrary regions of memory into a guest. >> Bonus points if it can validate that there isn't a valid Linux driver >> loaded for the given PCI device. > > Which is apparently entirely unnecessary as we already have > /sys/bus/pci/.../region. It's just a matter of checking if a vma is > VM_IO and then dealing with the subsequent reference counting issues > as Avi points out. > From idea to working implementation in 38 minutes. Congrats. -- Any sufficiently difficult bug is indistinguishable from a feature. ------------------------------------------------------------------------- Check out the new SourceForge.net Marketplace. It's the best place to buy or sell services for just about anything Open Source. http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH 1/1] direct mmio for passthrough - kernel part 2008-04-01 15:20 ` Anthony Liguori 2008-04-01 17:05 ` Avi Kivity @ 2008-04-01 18:18 ` Andrea Arcangeli 2008-04-01 18:28 ` Anthony Liguori 1 sibling, 1 reply; 32+ messages in thread From: Andrea Arcangeli @ 2008-04-01 18:18 UTC (permalink / raw) To: Anthony Liguori; +Cc: kvm-devel, allen.m.kay, benami, Avi Kivity On Tue, Apr 01, 2008 at 10:20:49AM -0500, Anthony Liguori wrote: > Which is apparently entirely unnecessary as we already have > /sys/bus/pci/.../region. It's just a matter of checking if a vma is VM_IO > and then dealing with the subsequent reference counting issues as Avi > points out. Do you need to map it in userland too, isn't it enough to map it in the sptes? For the ram I had to map it in userland too with /dev/mem, and then I used the pte_pfn to fill the spte, so the emulated qemu drivers can input/output. But for the mmio space I doubt the userland side is needed. If you add a direct memslot (new bitflag type) I will use it too instead of catching get_user_pages failures and walking ptes on the RAM pieces overwritten by /dev/mem. ------------------------------------------------------------------------- Check out the new SourceForge.net Marketplace. It's the best place to buy or sell services for just about anything Open Source. http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH 1/1] direct mmio for passthrough - kernel part 2008-04-01 18:18 ` Andrea Arcangeli @ 2008-04-01 18:28 ` Anthony Liguori 0 siblings, 0 replies; 32+ messages in thread From: Anthony Liguori @ 2008-04-01 18:28 UTC (permalink / raw) To: Andrea Arcangeli; +Cc: kvm-devel, allen.m.kay, benami, Avi Kivity Andrea Arcangeli wrote: > On Tue, Apr 01, 2008 at 10:20:49AM -0500, Anthony Liguori wrote: > >> Which is apparently entirely unnecessary as we already have >> /sys/bus/pci/.../region. It's just a matter of checking if a vma is VM_IO >> and then dealing with the subsequent reference counting issues as Avi >> points out. >> > > Do you need to map it in userland too, isn't it enough to map it in > the sptes? > > For the ram I had to map it in userland too with /dev/mem, and then I > used the pte_pfn to fill the spte, so the emulated qemu drivers can > input/output. But for the mmio space I doubt the userland side is > needed. > > If you add a direct memslot (new bitflag type) I will use it too > instead of catching get_user_pages failures and walking ptes on the > RAM pieces overwritten by /dev/mem. > There's a certain amount of elegance in mapping to userspace and not introducing a direct memslot. It helps simplify the security model since an application isn't able to do anything more than it could without KVM. The difficulty with a direct memslot is how you introduce policies to limit what guests can access what memslots directly. You would have to teach KVM to interact with the PCI subsystem to determine what memory was within a particular PCI IO region. Not impossible, just ugly. Regards, Anthony Liguori ------------------------------------------------------------------------- Check out the new SourceForge.net Marketplace. It's the best place to buy or sell services for just about anything Open Source. http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH 1/1] direct mmio for passthrough - kernel part 2008-04-01 14:42 ` Anthony Liguori 2008-04-01 15:20 ` Anthony Liguori @ 2008-04-01 17:03 ` Avi Kivity 2008-04-01 17:18 ` Daniel P. Berrange 2008-04-01 18:21 ` Anthony Liguori 1 sibling, 2 replies; 32+ messages in thread From: Avi Kivity @ 2008-04-01 17:03 UTC (permalink / raw) To: Anthony Liguori; +Cc: kvm-devel, allen.m.kay, andrea, benami Anthony Liguori wrote: > Avi Kivity wrote: >> benami@il.ibm.com wrote: >> >>> From: Ben-Ami Yassour <benami@il.ibm.com> >>> >>> Enable a guest to access a device's memory mapped I/O regions directly. >>> Userspace sends the mmio regions that the guest can access. On the >>> first >>> page fault for an access to an mmio address the host translates the >>> gva to hpa, >>> and updates the sptes. >>> >>> >> >> Can you explain why you're not using the regular memory slot >> mechanism? i.e. have userspace mmap(/dev/mem) and create a memslot >> containing that at the appropriate guest physical address? >> > > /dev/mem is often restricted in what memory can be mapped. Please elaborate. > However, we can't add something like this to KVM that allows arbitrary > HPA's to be mapped into a guest from userspace. This is just as bad > as /dev/mem and is going to upset a lot of people. Device assignment is as rootish as you get. > > > Regardless of whether we can use /dev/mem, I think we should introduce > a new char device anyway. We only need to mmap() MMIO regions which > are mapped by the PCI bus, presumably, the kernel should know about > these mappings. The driver should only allow mappings that are valid > for a particular PCI device such that it cannot be abused to map > arbitrary regions of memory into a guest. Bonus points if it can > validate that there isn't a valid Linux driver loaded for the given > PCI device. This is a very good idea. 
The interface exposed would be the same as /dev/mem, so any kvm modifications to get /dev/mem working would be applicable to /dev/pci/*, no? -- Any sufficiently difficult bug is indistinguishable from a feature. ------------------------------------------------------------------------- Check out the new SourceForge.net Marketplace. It's the best place to buy or sell services for just about anything Open Source. http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH 1/1] direct mmio for passthrough - kernel part 2008-04-01 17:03 ` Avi Kivity @ 2008-04-01 17:18 ` Daniel P. Berrange 2008-04-01 18:10 ` Andrea Arcangeli 0 siblings, 1 reply; 32+ messages in thread From: Daniel P. Berrange @ 2008-04-01 17:18 UTC (permalink / raw) To: Avi Kivity; +Cc: kvm-devel, andrea, allen.m.kay, benami On Tue, Apr 01, 2008 at 08:03:14PM +0300, Avi Kivity wrote: > Anthony Liguori wrote: > > Avi Kivity wrote: > >> benami@il.ibm.com wrote: > >> > >>> From: Ben-Ami Yassour <benami@il.ibm.com> > >>> > >>> Enable a guest to access a device's memory mapped I/O regions directly. > >>> Userspace sends the mmio regions that the guest can access. On the > >>> first > >>> page fault for an access to an mmio address the host translates the > >>> gva to hpa, > >>> and updates the sptes. > >>> > >>> > >> > >> Can you explain why you're not using the regular memory slot > >> mechanism? i.e. have userspace mmap(/dev/mem) and create a memslot > >> containing that at the appropriate guest physical address? > >> > > > > /dev/mem is often restricted in what memory can be mapped. > > Please elaborate. The /dev/mem, /dev/kmem devices have a special SELinux context memory_device_t and very few application domains are allowed to access them. The KVM/QEMU policy will not allow this for example. Basically on the X server, HAL and dmidecode have access in current policy. It would be undesirable to give all KVM guests full access to /dev/mem, so a more fine grained access method would have benefits here. Dan. -- |: Red Hat, Engineering, Boston -o- http://people.redhat.com/berrange/ :| |: http://libvirt.org -o- http://virt-manager.org -o- http://ovirt.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :| ------------------------------------------------------------------------- Check out the new SourceForge.net Marketplace. 
It's the best place to buy or sell services for just about anything Open Source. http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH 1/1] direct mmio for passthrough - kernel part 2008-04-01 17:18 ` Daniel P. Berrange @ 2008-04-01 18:10 ` Andrea Arcangeli 2008-04-01 18:18 ` Daniel P. Berrange 2008-04-01 18:23 ` Anthony Liguori 0 siblings, 2 replies; 32+ messages in thread From: Andrea Arcangeli @ 2008-04-01 18:10 UTC (permalink / raw) To: Daniel P. Berrange; +Cc: kvm-devel, allen.m.kay, Avi Kivity, benami On Tue, Apr 01, 2008 at 06:18:07PM +0100, Daniel P. Berrange wrote: > and very few application domains are allowed to access them. THe KVM/QEMU > policy will not allow this for example. Basically on the X server, HAL and > dmidecode have access in current policy. It would be undesirable to have to > all KVM guests full access to /dev/mem, so a more fine grained access method > would have benefits here. But pci-passthrough can give a root on the host even to the ring0 guest, just like /dev/mem without VT-d, so there's not much difference with using /dev/mem as far as security is concerned. Only on the CPUs including VT-d it's possible to retain a mostly equivalent security level despite pci-passthrough. ------------------------------------------------------------------------- Check out the new SourceForge.net Marketplace. It's the best place to buy or sell services for just about anything Open Source. http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH 1/1] direct mmio for passthrough - kernel part 2008-04-01 18:10 ` Andrea Arcangeli @ 2008-04-01 18:18 ` Daniel P. Berrange 2008-04-01 18:23 ` Anthony Liguori 1 sibling, 0 replies; 32+ messages in thread From: Daniel P. Berrange @ 2008-04-01 18:18 UTC (permalink / raw) To: Andrea Arcangeli; +Cc: kvm-devel, allen.m.kay, Avi Kivity, benami On Tue, Apr 01, 2008 at 08:10:31PM +0200, Andrea Arcangeli wrote: > On Tue, Apr 01, 2008 at 06:18:07PM +0100, Daniel P. Berrange wrote: > > and very few application domains are allowed to access them. THe KVM/QEMU > > policy will not allow this for example. Basically on the X server, HAL and > > dmidecode have access in current policy. It would be undesirable to have to > > all KVM guests full access to /dev/mem, so a more fine grained access method > > would have benefits here. > > But pci-passthrough can give a root on the host even to the ring0 > guest, just like /dev/mem without VT-d, so there's no muchx difference > with using /dev/mem as far as security is concerned. Only on the CPUs > including VT-d it's possible to retain a mostly equivalent security > level despite pci-passthrough. Clearly it is a losing battle without VT-d. That doesn't mean we should design it to lose in general. So we should design so that when we do have VT-D it will have the maximum security possible. VT-d will only become more widespread over time. Dan. -- |: Red Hat, Engineering, Boston -o- http://people.redhat.com/berrange/ :| |: http://libvirt.org -o- http://virt-manager.org -o- http://ovirt.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :| ------------------------------------------------------------------------- Check out the new SourceForge.net Marketplace. It's the best place to buy or sell services for just about anything Open Source. 
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH 1/1] direct mmio for passthrough - kernel part 2008-04-01 18:10 ` Andrea Arcangeli 2008-04-01 18:18 ` Daniel P. Berrange @ 2008-04-01 18:23 ` Anthony Liguori 1 sibling, 0 replies; 32+ messages in thread From: Anthony Liguori @ 2008-04-01 18:23 UTC (permalink / raw) To: Andrea Arcangeli; +Cc: kvm-devel, allen.m.kay, benami, Avi Kivity Andrea Arcangeli wrote: > On Tue, Apr 01, 2008 at 06:18:07PM +0100, Daniel P. Berrange wrote: > >> and very few application domains are allowed to access them. THe KVM/QEMU >> policy will not allow this for example. Basically on the X server, HAL and >> dmidecode have access in current policy. It would be undesirable to have to >> all KVM guests full access to /dev/mem, so a more fine grained access method >> would have benefits here. >> > > But pci-passthrough can give a root on the host even to the ring0 > guest, just like /dev/mem without VT-d, so there's no muchx difference > with using /dev/mem as far as security is concerned. Only on the CPUs > including VT-d it's possible to retain a mostly equivalent security > level despite pci-passthrough. > Eventually, most machines will have an IOMMU so that's the assumption to design to. It is true that PCI pass-through w/o VT-d is always going to be equivalent to /dev/mem access but it's really a special case. Unless you have an IOMMU, you would not do PCI pass-through if you cared at all about security/reliability. Regards, Anthony Liguori ------------------------------------------------------------------------- Check out the new SourceForge.net Marketplace. It's the best place to buy or sell services for just about anything Open Source. http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH 1/1] direct mmio for passthrough - kernel part 2008-04-01 17:03 ` Avi Kivity 2008-04-01 17:18 ` Daniel P. Berrange @ 2008-04-01 18:21 ` Anthony Liguori 2008-04-01 19:22 ` Avi Kivity 2008-04-01 22:22 ` Andrea Arcangeli 1 sibling, 2 replies; 32+ messages in thread From: Anthony Liguori @ 2008-04-01 18:21 UTC (permalink / raw) To: Avi Kivity; +Cc: kvm-devel, allen.m.kay, andrea, benami Avi Kivity wrote: > Anthony Liguori wrote: >> Avi Kivity wrote: >>> benami@il.ibm.com wrote: >>> >>>> From: Ben-Ami Yassour <benami@il.ibm.com> >>>> >>>> Enable a guest to access a device's memory mapped I/O regions >>>> directly. >>>> Userspace sends the mmio regions that the guest can access. On the >>>> first >>>> page fault for an access to an mmio address the host translates the >>>> gva to hpa, >>>> and updates the sptes. >>>> >>>> >>> >>> Can you explain why you're not using the regular memory slot >>> mechanism? i.e. have userspace mmap(/dev/mem) and create a memslot >>> containing that at the appropriate guest physical address? >>> >> >> /dev/mem is often restricted in what memory can be mapped. > > Please elaborate. http://lkml.org/lkml/2008/1/30/473 Ubuntu now carries this and I think Fedora has carried something like this for a long time. It's okay for us because we just need device memory access but who knows if it will be restricted more in the future. If X ever gets their act together and gets proper kernel drivers, I think there would be a strong case for removing /dev/mem altogether. >> However, we can't add something like this to KVM that allows >> arbitrary HPA's to be mapped into a guest from userspace. This is >> just as bad as /dev/mem and is going to upset a lot of people. > > Device assignment is as rootish as you get. In an SELinux world, root may still have limited access to the system. In an ideal world, we would be able to guarantee that at worst, a guest can only harm the device it has access to and nothing else on the system. 
This probably requires MSI and VT-d. Even then, you may be able to physically damage the system by improperly programming the device. >> >> >> Regardless of whether we can use /dev/mem, I think we should >> introduce a new char device anyway. We only need to mmap() MMIO >> regions which are mapped by the PCI bus, presumably, the kernel >> should know about these mappings. The driver should only allow >> mappings that are valid for a particular PCI device such that it >> cannot be abused to map arbitrary regions of memory into a guest. >> Bonus points if it can validate that there isn't a valid Linux driver >> loaded for the given PCI device. > > This is a very good idea. > > The interface exposed would be the same as /dev/mem, so any kvm > modifications to get /dev/mem working would be applicable to > /dev/pci/*, no? I looked at Andrea's patches and I didn't see any special handling for non-RAM pages. Something Muli mentioned that kept them from doing /sys/devices/pci/.../region to begin with was the fact that IO pages do not have a struct page backing them so get_user_pages() will fail in gfn_to_page(). It's not too hard to modify the code to also try get_user_pages() to look up the VMA instead of the struct page. From the VMA, you can get the HPA for an IO mapping. The problem though is that gfn_to_page() wants to return a page, not a HPA. I haven't looked too deeply yet, but my suspicion is that to properly support mapping in VM_IO pages will require some general refactoring since we always assume that a struct page exists for any HPA. Regards, Anthony Liguori ------------------------------------------------------------------------- Check out the new SourceForge.net Marketplace. It's the best place to buy or sell services for just about anything Open Source. http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH 1/1] direct mmio for passthrough - kernel part 2008-04-01 18:21 ` Anthony Liguori @ 2008-04-01 19:22 ` Avi Kivity 2008-04-01 22:38 ` Andrea Arcangeli 2008-04-01 22:22 ` Andrea Arcangeli 1 sibling, 1 reply; 32+ messages in thread From: Avi Kivity @ 2008-04-01 19:22 UTC (permalink / raw) To: Anthony Liguori; +Cc: kvm-devel, andrea, allen.m.kay, benami Anthony Liguori wrote: > I looked at Andrea's patches and I didn't see any special handling for > non-RAM pages. Something Muli mentioned that kept them from doing > /sys/devices/pci/.../region to begin with was the fact that IO pages do > not have a struct page backing them so get_user_pages() will fail in > gfn_to_page(). > It's just something we discussed, not code. -- Any sufficiently difficult bug is indistinguishable from a feature. ------------------------------------------------------------------------- Check out the new SourceForge.net Marketplace. It's the best place to buy or sell services for just about anything Open Source. http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH 1/1] direct mmio for passthrough - kernel part 2008-04-01 19:22 ` Avi Kivity @ 2008-04-01 22:38 ` Andrea Arcangeli 0 siblings, 0 replies; 32+ messages in thread From: Andrea Arcangeli @ 2008-04-01 22:38 UTC (permalink / raw) To: Avi Kivity; +Cc: kvm-devel, allen.m.kay, benami On Tue, Apr 01, 2008 at 10:22:51PM +0300, Avi Kivity wrote: > It's just something we discussed, not code. Yes, the pfn_valid check should skip all refcounting for mmio regions without a struct page. But gfn_to_page can't work without a struct page, so some change will be needed there. With the reserved ram patch I didn't need to touch the gfn_to_page interface because the memmap happens to exist despite the ram is reserved in the e820 map, so I used !page_count() instead of the previously discussed !pfn_valid to skip all refcounting (reserved ram must skip the refcounting too because those ram pages can't be freed as they're not-ram as far as linux is concerned). ------------------------------------------------------------------------- Check out the new SourceForge.net Marketplace. It's the best place to buy or sell services for just about anything Open Source. http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH 1/1] direct mmio for passthrough - kernel part 2008-04-01 18:21 ` Anthony Liguori 2008-04-01 19:22 ` Avi Kivity @ 2008-04-01 22:22 ` Andrea Arcangeli 2008-04-01 22:29 ` Anthony Liguori 1 sibling, 1 reply; 32+ messages in thread From: Andrea Arcangeli @ 2008-04-01 22:22 UTC (permalink / raw) To: Anthony Liguori; +Cc: kvm-devel, allen.m.kay, benami, Avi Kivity On Tue, Apr 01, 2008 at 01:21:37PM -0500, Anthony Liguori wrote: > return a page, not a HPA. I haven't looked too deeply yet, but my > suspicion is that to properly support mapping in VM_IO pages will require > some general refactoring since we always assume that a struct page exists > for any HPA. Yes, that was potentially problem for reserved _ram_ pages too, as it isn't guaranteed that memmap_t (old days nomenclature) will exist for physical addresses not defined as ram in the e820 map (to make it work without VT-d I have to reserve the ram in the host at the e820 map parsing time). If the memmap will not exist for the reserved ram physical range, the pfn_valid() will fail at runtime in kvm and the bad_page will generate a graceful emulation failure, so it's very safe. But once we handle direct memslots for mmio regions, the reserved ram will better stop depending on the memmap too. ------------------------------------------------------------------------- Check out the new SourceForge.net Marketplace. It's the best place to buy or sell services for just about anything Open Source. http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH 1/1] direct mmio for passthrough - kernel part 2008-04-01 22:22 ` Andrea Arcangeli @ 2008-04-01 22:29 ` Anthony Liguori 2008-04-02 4:00 ` Avi Kivity 0 siblings, 1 reply; 32+ messages in thread From: Anthony Liguori @ 2008-04-01 22:29 UTC (permalink / raw) To: Andrea Arcangeli; +Cc: kvm-devel, allen.m.kay, benami, Avi Kivity Andrea Arcangeli wrote: > On Tue, Apr 01, 2008 at 01:21:37PM -0500, Anthony Liguori wrote: > >> return a page, not a HPA. I haven't looked too deeply yet, but my >> suspicion is that to properly support mapping in VM_IO pages will require >> some general refactoring since we always assume that a struct page exists >> for any HPA. >> > > Yes, that was potentially problem for reserved _ram_ pages too, as it > isn't guaranteed that memmap_t (old days nomenclature) will exist for > physical addresses not defined as ram in the e820 map (to make it work > without VT-d I have to reserve the ram in the host at the e820 map > parsing time). If the memmap will not exist for the reserved ram > physical range, the pfn_valid() will fail at runtime in kvm and the > bad_page will generate a graceful emulation failure, so it's very > safe. But once we handle direct memslots for mmio regions, the > reserved ram will better stop depending on the memmap too. > Direct memslots is quite a sledgehammer though to deal with IO pages. You could get away with supporting reserved RAM another way though. If we refactored the MMU to use hfn_t instead of struct page, you would then need a mechanism to mmap() reserved ram into userspace (similar to ioremap I guess). In fact, you may be able to just lie and claim reserved ram is IO ram. Then we could treat the reserved ram in the same way as IO ram and not have to introduce a new interface in KVM for mapping arbitrary regions of memory. I would be interested in this sort of mechanism not for reserving low RAM, but for reserving high RAM. 
Basically, I'd like to boot with mem= something and then still be able to map the higher memory in QEMU userspace and then create KVM guests with it. Regards, Anthony Liguori ------------------------------------------------------------------------- Check out the new SourceForge.net Marketplace. It's the best place to buy or sell services for just about anything Open Source. http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH 1/1] direct mmio for passthrough - kernel part 2008-04-01 22:29 ` Anthony Liguori @ 2008-04-02 4:00 ` Avi Kivity 0 siblings, 0 replies; 32+ messages in thread From: Avi Kivity @ 2008-04-02 4:00 UTC (permalink / raw) To: Anthony Liguori; +Cc: kvm-devel, allen.m.kay, Andrea Arcangeli, benami Anthony Liguori wrote: > > You could get away with supporting reserved RAM another way though. > If we refactored the MMU to use hfn_t instead of struct page, you > would then need a mechanism to mmap() reserved ram into userspace > (similar to ioremap I guess). In fact, you may be able to just lie > and claim reserved ram is IO ram. We'd need it to return a (hfn_t, page_type_t) pair so the caller would know whether to use page refcounting or not. The caller might be able to ask Linux whether the page has a struct page or not, but it gets hairy. That's why I'm interested in the possibility of adding struct pages at runtime. vmemmap is needed for many other reasons (NUMA, memory hotplug, sparse SGI memory) so it's reasonable that it will be enabled on most distros. -- Any sufficiently difficult bug is indistinguishable from a feature. ------------------------------------------------------------------------- Check out the new SourceForge.net Marketplace. It's the best place to buy or sell services for just about anything Open Source. http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH 1/1] direct mmio for passthrough - kernel part 2008-04-01 13:30 ` Avi Kivity 2008-04-01 14:42 ` Anthony Liguori @ 2008-04-01 19:28 ` Ben-Ami Yassour1 2008-04-01 19:43 ` Avi Kivity 1 sibling, 1 reply; 32+ messages in thread From: Ben-Ami Yassour1 @ 2008-04-01 19:28 UTC (permalink / raw) To: Avi Kivity; +Cc: kvm-devel, andrea, allen.m.kay Avi Kivity <avi@qumranet.com> wrote on 01/04/2008 16:30:00: > benami@il.ibm.com wrote: > > From: Ben-Ami Yassour <benami@il.ibm.com> > > > > Enable a guest to access a device's memory mapped I/O regions directly. > > Userspace sends the mmio regions that the guest can access. On the first > > page fault for an access to an mmio address the host translates > the gva to hpa, > > and updates the sptes. > > > > > > Can you explain why you're not using the regular memory slot mechanism? > i.e. have userspace mmap(/dev/mem) and create a memslot containing that > at the appropriate guest physical address? > > There are some issues with refcounting, but Andrea has some tricks to > deal with that. > > -- > Any sufficiently difficult bug is indistinguishable from a feature. > Our initial approach was to mmap /sys/bus/pci/devices/.../resource# and create a memory slot for it. However eventually we realized that for mmio we don't need hva mapped to the mmio region, we can map the gpa directly to hpa. As far as I understand, the memory slots mechanism is used to map gpa to hva. Then gfn_to_page uses get_user_pages to map hva to hpa. However, get_user_pages does not work for mmio, and in addition we know the hpa to begin with, so there is no real need to map an hva for the mmio region. In addition there is an assumption in the code that there is a page struct for the frame which is not the case for mmio. So it was easier to simply add a list of mmio gpa-hpa mapping. 
I guess we can use the memory slots array to hold the gpa to hpa mapping but it is not necessarily natural, and would probably require more hooks in the code to handle an mmio memory slot. BTW, note that for a guest that has multiple passthrough devices each with a set of mmio regions, it might become a long list, so there might be value to keep it separate from that respect as well. With regards to the potential security issue Anthony pointed out (ioctls taking hpa's) we can change the interface so that they will take (device, BAR) instead and the kernel will translate to hpa. What do you think? Given that the shadow page table code has to be modified anyway (due to the struct page issue), is it worthwhile to experiment with mmap(...region) or is the current approach sufficient? Thanks, Ben ------------------------------------------------------------------------- Check out the new SourceForge.net Marketplace. It's the best place to buy or sell services for just about anything Open Source. http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH 1/1] direct mmio for passthrough - kernel part 2008-04-01 19:28 ` Ben-Ami Yassour1 @ 2008-04-01 19:43 ` Avi Kivity 2008-04-01 20:04 ` Anthony Liguori 0 siblings, 1 reply; 32+ messages in thread From: Avi Kivity @ 2008-04-01 19:43 UTC (permalink / raw) To: Ben-Ami Yassour1; +Cc: kvm-devel, andrea, allen.m.kay Ben-Ami Yassour1 wrote: >> >> Can you explain why you're not using the regular memory slot mechanism? >> i.e. have userspace mmap(/dev/mem) and create a memslot containing that >> at the appropriate guest physical address? >> >> > Our initial approach was to mmap /sys/bus/pci/devices/.../resource# > and create a memory slot for it. However eventually we realized that for > mmio we don't need hva mapped to the mmio region, we can map the gpa > directly to hpa. > > As far as I understand, the memory slots mechanism is used to map > gpa to hva. Then gfn_to_page uses get_user_pages to map hva to hpa. > However, get_user_pages does not work for mmio, and in addition we > know the hpa to begin with, so there is no real need to map an hva > for the mmio region. > > In addition there is an assumption in the code that there is a page > struct for the frame which is not the case for mmio. So it was > easier to simply add a list of mmio gpa-hpa mapping. > > I guess we can use the memory slots array to holds the gpa to hpa > mapping but it is not necessarily natural, and would probably > require more hooks in the code to handle an mmio memory slot. BTW, > note that for a guest that has multiple passthrough devices each > with a set of mmio regions, it might become a long list, so there > might be value to keep it separate from that respect as well. > > With regards to the potential security issue Anthony pointed out > (ioctls taking hpa's) we can change the interface so that they will > take (device, BAR) instead and the kernel will translate to hpa > > Not enough. 
How do you know if this calling process has permissions to access that pci device (I retract my previous "pci passthrough is as rootish as you get" remark). > What do you think? Given that the shadow page table code has to be > modified anyway (due to the struct page issue), is it worthwhile to > experiment with mmap(...region) or is the current approach sufficient? > As Anthony points out, the advantage to mmap() is that whatever security is needed can be applied at that level. Passing host physical addresses from userspace requires a whole new security model. The issue with gfn_to_page() is real. We can either try to work around it somehow, or we can try to back mmio regions with struct pages. Now that it looks like mem_map is becoming virtually mapped, the cost is minimal, but I expect this approach will meet some resistance. -- Any sufficiently difficult bug is indistinguishable from a feature. ------------------------------------------------------------------------- Check out the new SourceForge.net Marketplace. It's the best place to buy or sell services for just about anything Open Source. http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH 1/1] direct mmio for passthrough - kernel part 2008-04-01 19:43 ` Avi Kivity @ 2008-04-01 20:04 ` Anthony Liguori 2008-04-02 4:32 ` Avi Kivity 0 siblings, 1 reply; 32+ messages in thread From: Anthony Liguori @ 2008-04-01 20:04 UTC (permalink / raw) To: Avi Kivity; +Cc: kvm-devel, allen.m.kay, andrea, Ben-Ami Yassour1 Avi Kivity wrote: > Ben-Ami Yassour1 wrote: > >> >> > > Not enough. How do you know if this calling process has permissions to > access that pci device (I retract my previous "pci passthrough is as > rootish as you get" remark). > > >> What do you think? Given that the shadow page table code has to be >> modified anyway (due to the struct page issue), is it worthwhile to >> experiment with mmap(...region) or is the current approach sufficient? >> >> > > As Anthony points out, the advantage to mmap() is that whatever security > is needed can be applied at that level. Passing host physical addresses > from userspace requires a whole new security model. > > The issue with gfn_to_page() is real. We can either try to work around > it somehow, or we can try to back mmio regions with struct pages. Now > that it looks like mem_map is becoming virtually mapped, the cost is > minimal, but I expect this approach will meet some resistance. > What about switching the KVM MMU code to use hfn_t instead of struct page? The initial conversion is pretty straight forward as the places where you actually need a struct page you can always get it from pfn_to_page() (like in kvm_release_page_dirty). We can then teach the MMU to deal with hfn_t's that don't have a corresponding page. IIUC, the implementation of something like kvm_release_page_dirty is a nop for pfn's that don't have a corresponding page. We just have to be able to detect a pfn_to_page() failure and then assume we're dealing with IO memory. Regards, Anthony Liguori ------------------------------------------------------------------------- Check out the new SourceForge.net Marketplace. 
It's the best place to buy or sell services for just about anything Open Source. http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH 1/1] direct mmio for passthrough - kernel part 2008-04-01 20:04 ` Anthony Liguori @ 2008-04-02 4:32 ` Avi Kivity 2008-04-02 7:03 ` Andrea Arcangeli 0 siblings, 1 reply; 32+ messages in thread From: Avi Kivity @ 2008-04-02 4:32 UTC (permalink / raw) To: Anthony Liguori; +Cc: kvm-devel, andrea, allen.m.kay, Ben-Ami Yassour1 Anthony Liguori wrote: > What about switching the KVM MMU code to use hfn_t instead of struct > page? The initial conversion is pretty straight forward as the places > where you actually need a struct page you can always get it from > pfn_to_page() (like in kvm_release_page_dirty). > > We can then teach the MMU to deal with hfn_t's that don't have a > corresponding page. IIUC, the implementation of something like > kvm_release_page_dirty is a nop for pfn's that don't have a > corresponding page. We just have to be able to detect a pfn_to_page() > failure and then assume we're dealing with IO memory. > > It ought to work. gfn_to_hfn() (old gfn_to_page) will still need to take a refcount if possible. This will increase the potential for type errors, so maybe we need to make gfn_t and hfn_t distinct structures, and use accessors to get the actual values. -- Any sufficiently difficult bug is indistinguishable from a feature. ------------------------------------------------------------------------- Check out the new SourceForge.net Marketplace. It's the best place to buy or sell services for just about anything Open Source. http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH 1/1] direct mmio for passthrough - kernel part 2008-04-02 4:32 ` Avi Kivity @ 2008-04-02 7:03 ` Andrea Arcangeli 2008-04-02 9:50 ` Avi Kivity 0 siblings, 1 reply; 32+ messages in thread From: Andrea Arcangeli @ 2008-04-02 7:03 UTC (permalink / raw) To: Avi Kivity; +Cc: kvm-devel, allen.m.kay, Ben-Ami Yassour1 On Wed, Apr 02, 2008 at 07:32:35AM +0300, Avi Kivity wrote: > It ought to work. gfn_to_hfn() (old gfn_to_page) will still need to take a > refcount if possible. This reminds me, that mmu notifiers we could implement gfn_to_hfn only with follow_page and skip the refcounting on the struct page. I'm not suggesting that though, the refcount makes the code more robust IMHO, and notably it allows to run on kernels without mmu notifiers. So the trick is to do something like if (pfn_valid(pfn)) put_page(pfn_to_page(pfn)); That's the trick I was suggesting to take care of mmio space. If the reserved ram is used instead of VT-d to allow DMA, that will have change to: if (pfn_valid(pfn)) page = pfn_to_page(pfn); if (page_count(page)) put_page(page); that will take care of both the reserved ram and the pfn. In general I made it for the reserved-ram with only the page_count != 0 check, because 'struct page' existed. I'm unsure if it's good to add struct pages for non-ram, I find it slightly confusing and not the right thing, it takes memory for stuff that can't happen (refcounting only makes sense if the page finally goes in the freelist when count reaches 0, and PG_dirty/referenced bits and the like don't make sense either for non-ram or reserved-ram). So I'm not sure the vmemmap trick is the best. > This will increase the potential for type errors, so maybe we need to make > gfn_t and hfn_t distinct structures, and use accessors to get the actual > values. That's also a possibility. If we go for this then hfn_t is probably a better name as it's less likely to collide with core kernel VM code. 
Otherwise perhaps "pfn" can be used instead of hfn, it's up to you ;). ------------------------------------------------------------------------- Check out the new SourceForge.net Marketplace. It's the best place to buy or sell services for just about anything Open Source. http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH 1/1] direct mmio for passthrough - kernel part 2008-04-02 7:03 ` Andrea Arcangeli @ 2008-04-02 9:50 ` Avi Kivity 2008-04-02 10:28 ` Andrea Arcangeli 0 siblings, 1 reply; 32+ messages in thread From: Avi Kivity @ 2008-04-02 9:50 UTC (permalink / raw) To: Andrea Arcangeli; +Cc: kvm-devel, allen.m.kay, Ben-Ami Yassour1 Andrea Arcangeli wrote: > On Wed, Apr 02, 2008 at 07:32:35AM +0300, Avi Kivity wrote: > >> It ought to work. gfn_to_hfn() (old gfn_to_page) will still need to take a >> refcount if possible. >> > > This reminds me, that mmu notifiers we could implement gfn_to_hfn only > with follow_page and skip the refcounting on the struct page. > > I'm not suggesting that though, the refcount makes the code more > robust IMHO, and notably it allows to run on kernels without mmu > notifiers. > > Isn't it faster though? We don't need to pull in the cacheline containing the struct page anymore. We could hack something to make pre mmu notifier kernels work. > > I'm unsure if it's good to add struct pages for non-ram, I find it > slightly confusing and not the right thing, it takes memory for stuff > that can't happen (refcounting only makes sense if the page finally > goes in the freelist when count reaches 0, and PG_dirty/referenced > bits and the like don't make sense either for non-ram or > reserved-ram). So I'm not sure the vmemmap trick is the best. > > I thought we'd meet with resistance to that idea :) though I'd like to point out that struct pages to exist for mmio on some machines (those with >= 4GB). >> This will increase the potential for type errors, so maybe we need to make >> gfn_t and hfn_t distinct structures, and use accessors to get the actual >> values. >> > > That's also a possibility. If we go for this then hfn_t is probably a > better name as it's less likely to collide with core kernel VM > code. Otherwise perhaps "pfn" can be used instead of hfn, it's up to > you ;). > I guess we can move to pfn, they're unambiguous enough. 
-- error compiling committee.c: too many arguments to function ------------------------------------------------------------------------- Check out the new SourceForge.net Marketplace. It's the best place to buy or sell services for just about anything Open Source. http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH 1/1] direct mmio for passthrough - kernel part 2008-04-02 9:50 ` Avi Kivity @ 2008-04-02 10:28 ` Andrea Arcangeli 2008-04-02 10:59 ` Avi Kivity 0 siblings, 1 reply; 32+ messages in thread From: Andrea Arcangeli @ 2008-04-02 10:28 UTC (permalink / raw) To: Avi Kivity; +Cc: kvm-devel, allen.m.kay, Ben-Ami Yassour1 On Wed, Apr 02, 2008 at 12:50:50PM +0300, Avi Kivity wrote: > Isn't it faster though? We don't need to pull in the cacheline containing > the struct page anymore. Exactly, not only that, get_user_pages is likely a bit slower that we need for just kvm pte lookup. GRU uses follow_page directly because it runs in the tlb miss handler, for us instead the tlb miss handler doesn't invoke a page fault unless the spte is non present. I expect for us that optimization will be mostly lost in the noise but it is likely measurable in heavy VM workloads and in general it sure worth it in the long run (if nothing else as a microoptimization for whatever spte fault benchmark). > We could hack something to make pre mmu notifier kernels work. Actually I already removed the refcounting for the reserved ram, so it's going to be very easy to do with a few CONFIG_MMU_NOTIFIER. But because it's a low prio I would defer it for later, first patch can do the refcounting unconditionally to better test it. (I'm assuming we're going to deal with kernels without mmu notifiers [or without EMM ;) ] for a long while) > I thought we'd meet with resistance to that idea :) though I'd like to > point out that struct pages to exist for mmio on some machines (those with > >= 4GB). Sure, those should have page_count 0 like the reserved ram. I'm uncertain about the PageReserved semantics, I thought Nick nuked it long ago but perhaps it was something else. Now copy_page_range bails out on PFNMAP vmas, in the old days it would threat PG_reserved pages mapped with remap_page_range like a VM_SHARED by not wrprotecting and not refcounting even if this was a private mapping. 
In general using page_t for anything but ram is something that should be avoided and that also makes PG_reserved semantics mostly irrelevant. The users of remap_pfn_range (like /dev/mem) should never use pages regardless of whether it's ram or non-ram or whether page_t exists or not. The fact page_t sometimes exists for non-ram is strictly a performance optimization to make the pfn_to_page resolution quicker at runtime (the speedup is significant and the memory loss is negligible). > I guess we can move to pfn, they're unambiguous enough. Ok! ;) ------------------------------------------------------------------------- Check out the new SourceForge.net Marketplace. It's the best place to buy or sell services for just about anything Open Source. http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH 1/1] direct mmio for passthrough - kernel part 2008-04-02 10:28 ` Andrea Arcangeli @ 2008-04-02 10:59 ` Avi Kivity 2008-04-02 11:16 ` Avi Kivity 2008-04-02 14:59 ` Anthony Liguori 0 siblings, 2 replies; 32+ messages in thread From: Avi Kivity @ 2008-04-02 10:59 UTC (permalink / raw) To: Andrea Arcangeli; +Cc: kvm-devel, allen.m.kay, Ben-Ami Yassour1 Andrea Arcangeli wrote: > On Wed, Apr 02, 2008 at 12:50:50PM +0300, Avi Kivity wrote: > >> Isn't it faster though? We don't need to pull in the cacheline containing >> the struct page anymore. >> > > Exactly, not only that, get_user_pages is likely a bit slower that we > need for just kvm pte lookup. GRU uses follow_page directly because it > runs in the tlb miss handler, for us instead the tlb miss handler > doesn't invoke a page fault unless the spte is non present. > > How about this plan? 0. Merge mmu notifiers 1. gfn_to_page() -> gfn_to_pfn() Still keeping the refcount. Change bad_page to kvm_bad_hfn. 2. Drop the refcounting from gfn_to_pfn() and from kvm_release_page_*() Still using get_user_pages() (and dropping the refcount immediately) Simultaneously, change hack_module.awk to add the refcount back. 3. Export follow_page() or something based on fast_gup(), and use it btw, if we change the method we use to read the Linux pte, I'd like to get the writable bit out of it. This way, when we create an spte for a gpte that is writable and dirty, we can set the spte writable iff the Linux pte is writable. This avoids breaking COW unnecessarily. -- error compiling committee.c: too many arguments to function ------------------------------------------------------------------------- Check out the new SourceForge.net Marketplace. It's the best place to buy or sell services for just about anything Open Source. http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH 1/1] direct mmio for passthrough - kernel part 2008-04-02 10:59 ` Avi Kivity @ 2008-04-02 11:16 ` Avi Kivity 2008-04-02 11:50 ` Andrea Arcangeli 2008-04-02 14:59 ` Anthony Liguori 1 sibling, 1 reply; 32+ messages in thread From: Avi Kivity @ 2008-04-02 11:16 UTC (permalink / raw) To: Andrea Arcangeli; +Cc: kvm-devel, allen.m.kay, Ben-Ami Yassour1 Avi Kivity wrote: > Andrea Arcangeli wrote: >> On Wed, Apr 02, 2008 at 12:50:50PM +0300, Avi Kivity wrote: >> >>> Isn't it faster though? We don't need to pull in the cacheline >>> containing the struct page anymore. >>> >> >> Exactly, not only that, get_user_pages is likely a bit slower that we >> need for just kvm pte lookup. GRU uses follow_page directly because it >> runs in the tlb miss handler, for us instead the tlb miss handler >> doesn't invoke a page fault unless the spte is non present. >> >> > > How about this plan? > > 0. Merge mmu notifiers > > 1. gfn_to_page() -> gfn_to_pfn() > > Still keeping the refcount. Change bad_page to kvm_bad_hfn. > > 2. Drop the refcounting from gfn_to_pfn() and from kvm_release_page_*() > > Still using get_user_pages() (and dropping the refcount immediately) > > Simultaneously, change hack_module.awk to add the refcount back. > > 3. Export follow_page() or something based on fast_gup(), and use it > > btw, if we change the method we use to read the Linux pte, I'd like to > get the writable bit out of it. This was, when we create an spte for > a gpte that is writable and dirty, we can set the spte writable iff > the Linux pte is writable. This avoids breaking COW unnecessarily. > Ugh, there's still mark_page_accessed() and SetPageDirty(). -- error compiling committee.c: too many arguments to function ------------------------------------------------------------------------- Check out the new SourceForge.net Marketplace. It's the best place to buy or sell services for just about anything Open Source. 
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH 1/1] direct mmio for passthrough - kernel part 2008-04-02 11:16 ` Avi Kivity @ 2008-04-02 11:50 ` Andrea Arcangeli 2008-04-02 11:53 ` Andrea Arcangeli 2008-04-03 8:51 ` Avi Kivity 0 siblings, 2 replies; 32+ messages in thread From: Andrea Arcangeli @ 2008-04-02 11:50 UTC (permalink / raw) To: Avi Kivity; +Cc: kvm-devel, allen.m.kay, Ben-Ami Yassour1 On Wed, Apr 02, 2008 at 02:16:41PM +0300, Avi Kivity wrote: > Ugh, there's still mark_page_accessed() and SetPageDirty(). btw, like PG_dirty is only set if the spte is writeable, mark_page_accessed should only run if the accessed bit is set in the spte. It doesn't matter now as nobody could possibly clear it, but with mmu notifiers the ->clear_young will clear that accessed bit for the first time, and if the guest didn't use the spte in the meantime, we don't need to mark the page accessed and we can let the VM swapout the page at the next round of the lru. But that's ok, we'll simply run mark_page_accessed and SetPageDirty if pfn_valid is true. We're under mmu_lock so the mmu notifiers will prevent the page from going away from under us. The detail I'm uncertain about is why my reserved-ram patch had both pfn_valid = 1 and page_count =0 and PG_reserved =0 for all reserved ram. Perhaps I triggered a bug in some memmap initialization as the common code should keep all pages that aren't supposed to go in the freelist when freed as PG_reserved and page_count 1. Anyway in practice it worked fine and there's zero risk I'm not using reserved-ram with the page_count = 0 check ;). The rmap_remove should look like this (this won't work with the reserved ram but that needs fixing somehow as reserved-ram must be like mmio region but with a pfn_valid and in turn with a page_t, and then the reserved-ram patch will be able to cope with the reserved ram not having an allocated memmap by luck of how page_alloc.c works). 
if (pfn_valid(pfn)) { page = pfn_to_page(pfn); if (!PageReserved(page)) { BUG_ON(page_count(page) != 1); if (is_writeable_pte(*spte)) SetPageDirty(page); if (*spte & PT_ACCESSED_MASK) mark_page_accessed(page); } } It still skips an atomic op. Your plan still sounds just fine despite the above, in fact it sounds too perfect: the awk hack to re-add the refcounting when building the external module if CONFIG_MMU_NOTIFIER isn't defined is going to be messy, a plain CONFIG_MMU_NOTIFIER in kvm.git would be simpler and more robust IMHO even if less perfect :). ------------------------------------------------------------------------- Check out the new SourceForge.net Marketplace. It's the best place to buy or sell services for just about anything Open Source. http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH 1/1] direct mmio for passthrough - kernel part 2008-04-02 11:50 ` Andrea Arcangeli @ 2008-04-02 11:53 ` Andrea Arcangeli 2008-04-03 8:51 ` Avi Kivity 1 sibling, 0 replies; 32+ messages in thread From: Andrea Arcangeli @ 2008-04-02 11:53 UTC (permalink / raw) To: Avi Kivity; +Cc: kvm-devel, allen.m.kay, Ben-Ami Yassour1 On Wed, Apr 02, 2008 at 01:50:19PM +0200, Andrea Arcangeli wrote: > if (pfn_valid(pfn)) { > page = pfn_to_page(pfn); > if (!PageReserved(page)) { > BUG_ON(page_count(page) != 1); > if (is_writeable_pte(*spte)) > SetPageDirty(page); > if (*spte & PT_ACCESSED_MASK) > mark_page_accessed(page); > } > } warning the indenting of the above in text-mode was wrong... read it like a c compiler would do sorry ;) ------------------------------------------------------------------------- Check out the new SourceForge.net Marketplace. It's the best place to buy or sell services for just about anything Open Source. http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH 1/1] direct mmio for passthrough - kernel part 2008-04-02 11:50 ` Andrea Arcangeli 2008-04-02 11:53 ` Andrea Arcangeli @ 2008-04-03 8:51 ` Avi Kivity 1 sibling, 0 replies; 32+ messages in thread From: Avi Kivity @ 2008-04-03 8:51 UTC (permalink / raw) To: Andrea Arcangeli; +Cc: kvm-devel, allen.m.kay, Ben-Ami Yassour1 Andrea Arcangeli wrote: > On Wed, Apr 02, 2008 at 02:16:41PM +0300, Avi Kivity wrote: > >> Ugh, there's still mark_page_accessed() and SetPageDirty(). >> > > btw, like PG_dirty is only set if the spte is writeable, > mark_page_accessed should only run if the accessed bit is set in the > spte. It doesn't matter now as nobody could possibly clear it, No one will clear it now, but it can start out cleared. This is done on speculative mmu_set_spte(): when the guest writes into its page tables, we update the spte speculatively, but the guest may not actually access that location (for example, due to a page fault clustering). So the change makes sense even now. > It still skips an atomic op. Your plan still sounds just fine despite > the above, infact it sounds too perfect: the awk hack to re-add the > refcounting when building the external module if CONFIG_MMU_NOTIFIER > isn't defined is going to be messy, a plain CONFIG_MMU_NOTIFIER in > kvm.git would be simpler and more robust IMHO even if less perfect :). > Worst case, we stick a get_user_pages() inside the memslot setup function. That makes things not swappable for pre-mmu notifiers, but completely safe. I'd rather avoid special casing the core code, whenever possible. -- error compiling committee.c: too many arguments to function ------------------------------------------------------------------------- Check out the new SourceForge.net Marketplace. It's the best place to buy or sell services for just about anything Open Source. http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace ^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH 1/1] direct mmio for passthrough - kernel part 2008-04-02 10:59 ` Avi Kivity 2008-04-02 11:16 ` Avi Kivity @ 2008-04-02 14:59 ` Anthony Liguori 1 sibling, 0 replies; 32+ messages in thread From: Anthony Liguori @ 2008-04-02 14:59 UTC (permalink / raw) To: Avi Kivity; +Cc: kvm-devel, allen.m.kay, Andrea Arcangeli, Ben-Ami Yassour1 Avi Kivity wrote: > Andrea Arcangeli wrote: >> On Wed, Apr 02, 2008 at 12:50:50PM +0300, Avi Kivity wrote: >> >>> Isn't it faster though? We don't need to pull in the cacheline >>> containing the struct page anymore. >>> >> >> Exactly, not only that, get_user_pages is likely a bit slower that we >> need for just kvm pte lookup. GRU uses follow_page directly because it >> runs in the tlb miss handler, for us instead the tlb miss handler >> doesn't invoke a page fault unless the spte is non present. >> >> > > How about this plan? > > 0. Merge mmu notifiers Are mmu-notifiers likely to get pushed when the 2.6.26 window opens up? > 1. gfn_to_page() -> gfn_to_pfn() > > Still keeping the refcount. Change bad_page to kvm_bad_hfn. kvm_bad_pfn. > 2. Drop the refcounting from gfn_to_pfn() and from kvm_release_page_*() > > Still using get_user_pages() (and dropping the refcount immediately) > > Simultaneously, change hack_module.awk to add the refcount back. This has the dependency on mmu notifiers so step 1 can conceivably be merged in the absence of mmu notifiers. Regards, Anthony Liguori > 3. Export follow_page() or something based on fast_gup(), and use it > > btw, if we change the method we use to read the Linux pte, I'd like to > get the writable bit out of it. This was, when we create an spte for > a gpte that is writable and dirty, we can set the spte writable iff > the Linux pte is writable. This avoids breaking COW unnecessarily. > ------------------------------------------------------------------------- Check out the new SourceForge.net Marketplace. 
It's the best place to buy or sell services for just about anything Open Source. http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace ^ permalink raw reply [flat|nested] 32+ messages in thread
end of thread, other threads:[~2008-04-03 8:51 UTC | newest] Thread overview: 32+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2008-04-01 11:52 [RFC] direct mmio for passthrough benami 2008-04-01 11:52 ` [PATCH 1/1] direct mmio for passthrough - kernel part benami 2008-04-01 13:30 ` Avi Kivity 2008-04-01 14:42 ` Anthony Liguori 2008-04-01 15:20 ` Anthony Liguori 2008-04-01 17:05 ` Avi Kivity 2008-04-01 18:18 ` Andrea Arcangeli 2008-04-01 18:28 ` Anthony Liguori 2008-04-01 17:03 ` Avi Kivity 2008-04-01 17:18 ` Daniel P. Berrange 2008-04-01 18:10 ` Andrea Arcangeli 2008-04-01 18:18 ` Daniel P. Berrange 2008-04-01 18:23 ` Anthony Liguori 2008-04-01 18:21 ` Anthony Liguori 2008-04-01 19:22 ` Avi Kivity 2008-04-01 22:38 ` Andrea Arcangeli 2008-04-01 22:22 ` Andrea Arcangeli 2008-04-01 22:29 ` Anthony Liguori 2008-04-02 4:00 ` Avi Kivity 2008-04-01 19:28 ` Ben-Ami Yassour1 2008-04-01 19:43 ` Avi Kivity 2008-04-01 20:04 ` Anthony Liguori 2008-04-02 4:32 ` Avi Kivity 2008-04-02 7:03 ` Andrea Arcangeli 2008-04-02 9:50 ` Avi Kivity 2008-04-02 10:28 ` Andrea Arcangeli 2008-04-02 10:59 ` Avi Kivity 2008-04-02 11:16 ` Avi Kivity 2008-04-02 11:50 ` Andrea Arcangeli 2008-04-02 11:53 ` Andrea Arcangeli 2008-04-03 8:51 ` Avi Kivity 2008-04-02 14:59 ` Anthony Liguori
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox