From: Marcelo Tosatti <marcelo@kvack.org>
To: Avi Kivity <avi@qumranet.com>
Cc: kvm-devel <kvm-devel@lists.sourceforge.net>
Subject: Re: large page support for kvm
Date: Thu, 14 Feb 2008 21:17:39 -0200 [thread overview]
Message-ID: <20080214231739.GA7787@dmt> (raw)
In-Reply-To: <47B2921F.1040905@qumranet.com>
[-- Attachment #1: Type: text/plain, Size: 21758 bytes --]
On Wed, Feb 13, 2008 at 08:45:51AM +0200, Avi Kivity wrote:
> >gfn_to_page() needs to grab the struct page corresponding to the large
> >page, not the offset struct page for the faulting 4k address within
> >the large frame. Since gfn_to_page can sleep, there is no way to do
> >that in the mapping logic which happens under mmu_lock protection.
> >We don't want to grab the large page frame "struct page" unless the
> >is_largepage_backed() checks are successful.
> >
> >The checks could be done in page_fault() if walker->level == 2, before
> >gfn_to_page()... But I don't see much difference of that and doing
> >it inside walk_addr(). What do you say?
> >
> >
>
> I'd like to keep walk_addr() independent of the rest of the mmu (i.e.
> walk_addr is 100% guest oriented). Also, the issue you point out is
> shared by direct_map which doesn't call walk_addr().
>
> An unrelated issue (pointed out by Jun Nakajima) is that this kills
> dirty log tracking (needed for migration). It could be solved simply by
> not using large page backing if dirty log tracking is enabled for that slot.
OK, I addressed your comments and fixed a bug in which a root page was shadowed
in the large area being mapped. access.flat is happy.
Joerg, can you give this a try on an NPT-enabled system (it needs the
attached qemu-largepage-hack.patch)?
Thanks
Index: kvm.largepages/arch/x86/kvm/mmu.c
===================================================================
--- kvm.largepages.orig/arch/x86/kvm/mmu.c
+++ kvm.largepages/arch/x86/kvm/mmu.c
@@ -27,6 +27,7 @@
#include <linux/highmem.h>
#include <linux/module.h>
#include <linux/swap.h>
+#include <linux/hugetlb.h>
#include <asm/page.h>
#include <asm/cmpxchg.h>
@@ -211,6 +212,11 @@ static int is_shadow_present_pte(u64 pte
&& pte != shadow_notrap_nonpresent_pte;
}
+static int is_large_pte(u64 pte)
+{
+ return pte & PT_PAGE_SIZE_MASK;
+}
+
static int is_writeble_pte(unsigned long pte)
{
return pte & PT_WRITABLE_MASK;
@@ -350,17 +356,120 @@ static void mmu_free_rmap_desc(struct kv
kfree(rd);
}
+static int hpage_align_diff(unsigned long gfn)
+{
+ return ((gfn+KVM_PAGES_PER_HPAGE-1) & ~(KVM_PAGES_PER_HPAGE-1)) - gfn;
+}
+
+/*
+ * Return the pointer to the largepage write count for a given
+ * gfn, handling slots that are not large page aligned.
+ */
+static int *slot_largepage_idx(gfn_t gfn, struct kvm_memory_slot *slot)
+{
+ unsigned long idx;
+
+ idx = (gfn - slot->base_gfn) + hpage_align_diff(slot->base_gfn);
+ idx /= KVM_PAGES_PER_HPAGE;
+ return &slot->lpage_info[idx].write_count;
+}
+
+static void account_shadowed(struct kvm *kvm, gfn_t gfn)
+{
+ int *write_count;
+
+ write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn));
+ *write_count += 1;
+ WARN_ON(*write_count > KVM_PAGES_PER_HPAGE);
+}
+
+static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
+{
+ int *write_count;
+
+ write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn));
+ *write_count -= 1;
+ WARN_ON(*write_count < 0);
+}
+
+static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn)
+{
+ struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
+ int *largepage_idx;
+
+ if (slot) {
+ largepage_idx = slot_largepage_idx(gfn, slot);
+ return *largepage_idx;
+ }
+
+ return 1;
+}
+
+static int host_largepage_backed(struct kvm *kvm, gfn_t gfn)
+{
+ struct vm_area_struct *vma;
+ unsigned long addr;
+
+ addr = gfn_to_hva(kvm, gfn);
+ if (kvm_is_error_hva(addr))
+ return 0;
+
+ vma = find_vma(current->mm, addr);
+ if (vma && is_vm_hugetlb_page(vma))
+ return 1;
+
+ return 0;
+}
+
+static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn)
+{
+ struct kvm_memory_slot *slot;
+
+ if (has_wrprotected_page(vcpu->kvm, large_gfn))
+ return 0;
+
+ if (!host_largepage_backed(vcpu->kvm, large_gfn))
+ return 0;
+
+ slot = gfn_to_memslot(vcpu->kvm, large_gfn);
+ if (slot && slot->dirty_bitmap)
+ return 0;
+
+ /* guest has 4M pages, host 2M */
+ if (!is_pae(vcpu) && HPAGE_SHIFT == 21)
+ return 0;
+
+ return 1;
+}
+
+static int is_physical_memory(struct kvm *kvm, gfn_t gfn)
+{
+ unsigned long addr;
+
+ addr = gfn_to_hva(kvm, gfn);
+ if (kvm_is_error_hva(addr))
+ return 0;
+
+ return 1;
+}
+
/*
* Take gfn and return the reverse mapping to it.
* Note: gfn must be unaliased before this function get called
*/
-static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn)
+static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage)
{
struct kvm_memory_slot *slot;
+ unsigned long idx;
slot = gfn_to_memslot(kvm, gfn);
- return &slot->rmap[gfn - slot->base_gfn];
+ if (!lpage)
+ return &slot->rmap[gfn - slot->base_gfn];
+
+ idx = gfn - slot->base_gfn + hpage_align_diff(slot->base_gfn);
+ idx /= KVM_PAGES_PER_HPAGE;
+ return &slot->lpage_info[idx].rmap_pde;
}
/*
@@ -372,7 +481,7 @@ static unsigned long *gfn_to_rmap(struct
* If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc
* containing more mappings.
*/
-static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
+static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage)
{
struct kvm_mmu_page *sp;
struct kvm_rmap_desc *desc;
@@ -384,7 +493,7 @@ static void rmap_add(struct kvm_vcpu *vc
gfn = unalias_gfn(vcpu->kvm, gfn);
sp = page_header(__pa(spte));
sp->gfns[spte - sp->spt] = gfn;
- rmapp = gfn_to_rmap(vcpu->kvm, gfn);
+ rmapp = gfn_to_rmap(vcpu->kvm, gfn, lpage);
if (!*rmapp) {
rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
*rmapp = (unsigned long)spte;
@@ -450,7 +559,7 @@ static void rmap_remove(struct kvm *kvm,
kvm_release_page_dirty(page);
else
kvm_release_page_clean(page);
- rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt]);
+ rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], is_large_pte(*spte));
if (!*rmapp) {
printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
BUG();
@@ -516,7 +625,7 @@ static void rmap_write_protect(struct kv
int write_protected = 0;
gfn = unalias_gfn(kvm, gfn);
- rmapp = gfn_to_rmap(kvm, gfn);
+ rmapp = gfn_to_rmap(kvm, gfn, 0);
spte = rmap_next(kvm, rmapp, NULL);
while (spte) {
@@ -529,8 +638,27 @@ static void rmap_write_protect(struct kv
}
spte = rmap_next(kvm, rmapp, spte);
}
+ /* check for huge page mappings */
+ rmapp = gfn_to_rmap(kvm, gfn, 1);
+ spte = rmap_next(kvm, rmapp, NULL);
+ while (spte) {
+ BUG_ON(!spte);
+ BUG_ON(!(*spte & PT_PRESENT_MASK));
+ BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
+ pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
+ if (is_writeble_pte(*spte)) {
+ rmap_remove(kvm, spte);
+ --kvm->stat.lpages;
+ set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+ write_protected = 1;
+ }
+ spte = rmap_next(kvm, rmapp, spte);
+ }
+
if (write_protected)
kvm_flush_remote_tlbs(kvm);
+
+ account_shadowed(kvm, gfn);
}
#ifdef MMU_DEBUG
@@ -750,11 +878,17 @@ static void kvm_mmu_page_unlink_children
for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
ent = pt[i];
+ if (is_shadow_present_pte(ent)) {
+ if (!is_large_pte(ent)) {
+ ent &= PT64_BASE_ADDR_MASK;
+ mmu_page_remove_parent_pte(page_header(ent),
+ &pt[i]);
+ } else {
+ --kvm->stat.lpages;
+ rmap_remove(kvm, &pt[i]);
+ }
+ }
pt[i] = shadow_trap_nonpresent_pte;
- if (!is_shadow_present_pte(ent))
- continue;
- ent &= PT64_BASE_ADDR_MASK;
- mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
}
kvm_flush_remote_tlbs(kvm);
}
@@ -794,6 +928,8 @@ static void kvm_mmu_zap_page(struct kvm
}
kvm_mmu_page_unlink_children(kvm, sp);
if (!sp->root_count) {
+ if (!sp->role.metaphysical)
+ unaccount_shadowed(kvm, sp->gfn);
hlist_del(&sp->hash_link);
kvm_mmu_free_page(kvm, sp);
} else
@@ -894,12 +1030,28 @@ struct page *gva_to_page(struct kvm_vcpu
static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
unsigned pt_access, unsigned pte_access,
int user_fault, int write_fault, int dirty,
- int *ptwrite, gfn_t gfn, struct page *page)
+ int *ptwrite, int largepage, gfn_t gfn,
+ struct page *page)
{
u64 spte;
int was_rmapped = is_rmap_pte(*shadow_pte);
int was_writeble = is_writeble_pte(*shadow_pte);
+ /*
+ * If we overwrite a PTE page pointer with a 2MB PMD, unlink
+ * the parent of the now unreachable PTE.
+ */
+ if (largepage) {
+ if (was_rmapped && !is_large_pte(*shadow_pte)) {
+ struct kvm_mmu_page *child;
+ u64 pte = *shadow_pte;
+
+ child = page_header(pte & PT64_BASE_ADDR_MASK);
+ mmu_page_remove_parent_pte(child, shadow_pte);
+ }
+ was_rmapped = is_large_pte(*shadow_pte);
+ }
+
pgprintk("%s: spte %llx access %x write_fault %d"
" user_fault %d gfn %lx\n",
__FUNCTION__, *shadow_pte, pt_access,
@@ -919,6 +1071,8 @@ static void mmu_set_spte(struct kvm_vcpu
spte |= PT_PRESENT_MASK;
if (pte_access & ACC_USER_MASK)
spte |= PT_USER_MASK;
+ if (largepage)
+ spte |= PT_PAGE_SIZE_MASK;
spte |= page_to_phys(page);
@@ -933,7 +1087,8 @@ static void mmu_set_spte(struct kvm_vcpu
}
shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
- if (shadow) {
+ if (shadow ||
+ (largepage && has_wrprotected_page(vcpu->kvm, gfn))) {
pgprintk("%s: found shadow page for %lx, marking ro\n",
__FUNCTION__, gfn);
pte_access &= ~ACC_WRITE_MASK;
@@ -941,6 +1096,18 @@ static void mmu_set_spte(struct kvm_vcpu
spte &= ~PT_WRITABLE_MASK;
kvm_x86_ops->tlb_flush(vcpu);
}
+ /*
+ * Largepage creation is susceptible to an upper-level
+ * table being shadowed and write-protected in the
+ * area being mapped. If that is the case, invalidate
+ * the entry and let the instruction fault again
+ * and use 4K mappings.
+ */
+ if (largepage) {
+ spte = shadow_trap_nonpresent_pte;
+ kvm_x86_ops->tlb_flush(vcpu);
+ goto unshadowed;
+ }
if (write_fault)
*ptwrite = 1;
}
@@ -952,10 +1119,17 @@ unshadowed:
mark_page_dirty(vcpu->kvm, gfn);
pgprintk("%s: setting spte %llx\n", __FUNCTION__, spte);
+ pgprintk("instantiating %s PTE (%s) at %d (%llx)\n",
+ (spte&PT_PAGE_SIZE_MASK)? "2MB" : "4kB",
+ (spte&PT_WRITABLE_MASK)?"RW":"R", gfn, spte);
set_shadow_pte(shadow_pte, spte);
+ if (!was_rmapped && (spte & PT_PAGE_SIZE_MASK)
+ && (spte & PT_PRESENT_MASK))
+ ++vcpu->kvm->stat.lpages;
+
page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
if (!was_rmapped) {
- rmap_add(vcpu, shadow_pte, gfn);
+ rmap_add(vcpu, shadow_pte, gfn, largepage);
if (!is_rmap_pte(*shadow_pte))
kvm_release_page_clean(page);
} else {
@@ -973,7 +1147,8 @@ static void nonpaging_new_cr3(struct kvm
}
static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
- gfn_t gfn, struct page *page, int level)
+ int largepage, gfn_t gfn, struct page *page,
+ int level)
{
hpa_t table_addr = vcpu->arch.mmu.root_hpa;
int pt_write = 0;
@@ -987,7 +1162,13 @@ static int __direct_map(struct kvm_vcpu
if (level == 1) {
mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
- 0, write, 1, &pt_write, gfn, page);
+ 0, write, 1, &pt_write, 0, gfn, page);
+ return pt_write;
+ }
+
+ if (largepage && level == 2) {
+ mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
+ 0, write, 1, &pt_write, 1, gfn, page);
return pt_write;
}
@@ -1017,12 +1198,19 @@ static int __direct_map(struct kvm_vcpu
static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
{
int r;
+ int largepage = 0;
struct page *page;
down_read(&vcpu->kvm->slots_lock);
down_read(¤t->mm->mmap_sem);
+ if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))
+ && is_physical_memory(vcpu->kvm, gfn)) {
+ gfn &= ~(KVM_PAGES_PER_HPAGE-1);
+ largepage = 1;
+ }
+
page = gfn_to_page(vcpu->kvm, gfn);
up_read(¤t->mm->mmap_sem);
@@ -1035,7 +1223,8 @@ static int nonpaging_map(struct kvm_vcpu
spin_lock(&vcpu->kvm->mmu_lock);
kvm_mmu_free_some_pages(vcpu);
- r = __direct_map(vcpu, v, write, gfn, page, PT32E_ROOT_LEVEL);
+ r = __direct_map(vcpu, v, write, largepage, gfn, page,
+ PT32E_ROOT_LEVEL);
spin_unlock(&vcpu->kvm->mmu_lock);
up_read(&vcpu->kvm->slots_lock);
@@ -1166,6 +1355,8 @@ static int tdp_page_fault(struct kvm_vcp
{
struct page *page;
int r;
+ int largepage = 0;
+ gfn_t gfn = gpa >> PAGE_SHIFT;
ASSERT(vcpu);
ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
@@ -1175,7 +1366,12 @@ static int tdp_page_fault(struct kvm_vcp
return r;
down_read(¤t->mm->mmap_sem);
- page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+ if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))
+ && is_physical_memory(vcpu->kvm, gfn)) {
+ gfn &= ~(KVM_PAGES_PER_HPAGE-1);
+ largepage = 1;
+ }
+ page = gfn_to_page(vcpu->kvm, gfn);
if (is_error_page(page)) {
kvm_release_page_clean(page);
up_read(¤t->mm->mmap_sem);
@@ -1184,7 +1380,7 @@ static int tdp_page_fault(struct kvm_vcp
spin_lock(&vcpu->kvm->mmu_lock);
kvm_mmu_free_some_pages(vcpu);
r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
- gpa >> PAGE_SHIFT, page, TDP_ROOT_LEVEL);
+ largepage, gfn, page, TDP_ROOT_LEVEL);
spin_unlock(&vcpu->kvm->mmu_lock);
up_read(¤t->mm->mmap_sem);
@@ -1383,7 +1579,8 @@ static void mmu_pte_write_zap_pte(struct
pte = *spte;
if (is_shadow_present_pte(pte)) {
- if (sp->role.level == PT_PAGE_TABLE_LEVEL)
+ if (sp->role.level == PT_PAGE_TABLE_LEVEL ||
+ is_large_pte(pte))
rmap_remove(vcpu->kvm, spte);
else {
child = page_header(pte & PT64_BASE_ADDR_MASK);
@@ -1391,6 +1588,8 @@ static void mmu_pte_write_zap_pte(struct
}
}
set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+ if (is_large_pte(pte))
+ --vcpu->kvm->stat.lpages;
}
static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
@@ -1398,7 +1597,8 @@ static void mmu_pte_write_new_pte(struct
u64 *spte,
const void *new)
{
- if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
+ if ((sp->role.level != PT_PAGE_TABLE_LEVEL)
+ && !vcpu->arch.update_pte.largepage) {
++vcpu->kvm->stat.mmu_pde_zapped;
return;
}
@@ -1446,6 +1646,8 @@ static void mmu_guess_page_from_pte_writ
u64 gpte = 0;
struct page *page;
+ vcpu->arch.update_pte.largepage = 0;
+
if (bytes != 4 && bytes != 8)
return;
@@ -1474,6 +1676,10 @@ static void mmu_guess_page_from_pte_writ
gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
down_read(¤t->mm->mmap_sem);
+ if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) {
+ gfn &= ~(KVM_PAGES_PER_HPAGE-1);
+ vcpu->arch.update_pte.largepage = 1;
+ }
page = gfn_to_page(vcpu->kvm, gfn);
up_read(¤t->mm->mmap_sem);
Index: kvm.largepages/arch/x86/kvm/paging_tmpl.h
===================================================================
--- kvm.largepages.orig/arch/x86/kvm/paging_tmpl.h
+++ kvm.largepages/arch/x86/kvm/paging_tmpl.h
@@ -248,6 +248,7 @@ static void FNAME(update_pte)(struct kvm
pt_element_t gpte;
unsigned pte_access;
struct page *npage;
+ int largepage = vcpu->arch.update_pte.largepage;
gpte = *(const pt_element_t *)pte;
if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
@@ -264,7 +265,8 @@ static void FNAME(update_pte)(struct kvm
return;
get_page(npage);
mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
- gpte & PT_DIRTY_MASK, NULL, gpte_to_gfn(gpte), npage);
+ gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte),
+ npage);
}
/*
@@ -272,8 +274,8 @@ static void FNAME(update_pte)(struct kvm
*/
static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
struct guest_walker *walker,
- int user_fault, int write_fault, int *ptwrite,
- struct page *page)
+ int user_fault, int write_fault, int largepage,
+ int *ptwrite, struct page *page)
{
hpa_t shadow_addr;
int level;
@@ -302,6 +304,10 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
shadow_ent = ((u64 *)__va(shadow_addr)) + index;
if (level == PT_PAGE_TABLE_LEVEL)
break;
+
+ if (largepage && level == PT_DIRECTORY_LEVEL)
+ break;
+
if (is_shadow_present_pte(*shadow_ent)) {
shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
continue;
@@ -340,7 +346,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access,
user_fault, write_fault,
walker->ptes[walker->level-1] & PT_DIRTY_MASK,
- ptwrite, walker->gfn, page);
+ ptwrite, largepage, walker->gfn, page);
return shadow_ent;
}
@@ -370,6 +376,7 @@ static int FNAME(page_fault)(struct kvm_
int write_pt = 0;
int r;
struct page *page;
+ int largepage = 0;
pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code);
kvm_mmu_audit(vcpu, "pre page fault");
@@ -397,6 +404,15 @@ static int FNAME(page_fault)(struct kvm_
}
down_read(¤t->mm->mmap_sem);
+ if (walker.level == PT_DIRECTORY_LEVEL) {
+ gfn_t large_gfn;
+ large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1);
+ if (is_largepage_backed(vcpu, large_gfn)
+ && is_physical_memory(vcpu->kvm, walker.gfn)) {
+ walker.gfn = large_gfn;
+ largepage = 1;
+ }
+ }
page = gfn_to_page(vcpu->kvm, walker.gfn);
up_read(¤t->mm->mmap_sem);
@@ -411,7 +427,7 @@ static int FNAME(page_fault)(struct kvm_
spin_lock(&vcpu->kvm->mmu_lock);
kvm_mmu_free_some_pages(vcpu);
shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
- &write_pt, page);
+ largepage, &write_pt, page);
pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__,
shadow_pte, *shadow_pte, write_pt);
Index: kvm.largepages/arch/x86/kvm/x86.c
===================================================================
--- kvm.largepages.orig/arch/x86/kvm/x86.c
+++ kvm.largepages/arch/x86/kvm/x86.c
@@ -86,6 +86,7 @@ struct kvm_stats_debugfs_item debugfs_en
{ "mmu_recycled", VM_STAT(mmu_recycled) },
{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
+ { "lpages", VM_STAT(lpages) },
{ NULL }
};
Index: kvm.largepages/include/asm-x86/kvm_host.h
===================================================================
--- kvm.largepages.orig/include/asm-x86/kvm_host.h
+++ kvm.largepages/include/asm-x86/kvm_host.h
@@ -38,6 +38,13 @@
#define INVALID_PAGE (~(hpa_t)0)
#define UNMAPPED_GVA (~(gpa_t)0)
+/* shadow tables are PAE even on non-PAE hosts */
+#define KVM_HPAGE_SHIFT 21
+#define KVM_HPAGE_SIZE (1UL << KVM_HPAGE_SHIFT)
+#define KVM_HPAGE_MASK (~(KVM_HPAGE_SIZE - 1))
+
+#define KVM_PAGES_PER_HPAGE (KVM_HPAGE_SIZE / PAGE_SIZE)
+
#define DE_VECTOR 0
#define UD_VECTOR 6
#define NM_VECTOR 7
@@ -228,6 +235,7 @@ struct kvm_vcpu_arch {
struct {
gfn_t gfn; /* presumed gfn during guest pte update */
struct page *page; /* page corresponding to that gfn */
+ int largepage;
} update_pte;
struct i387_fxsave_struct host_fx_image;
@@ -298,6 +306,7 @@ struct kvm_vm_stat {
u32 mmu_recycled;
u32 mmu_cache_miss;
u32 remote_tlb_flush;
+ u32 lpages;
};
struct kvm_vcpu_stat {
Index: kvm.largepages/include/linux/kvm_host.h
===================================================================
--- kvm.largepages.orig/include/linux/kvm_host.h
+++ kvm.largepages/include/linux/kvm_host.h
@@ -102,6 +102,10 @@ struct kvm_memory_slot {
unsigned long flags;
unsigned long *rmap;
unsigned long *dirty_bitmap;
+ struct {
+ unsigned long rmap_pde;
+ int write_count;
+ } *lpage_info;
unsigned long userspace_addr;
int user_alloc;
};
@@ -168,6 +172,7 @@ int kvm_arch_set_memory_region(struct kv
int user_alloc);
gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn);
struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
+unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn);
void kvm_release_page_clean(struct page *page);
void kvm_release_page_dirty(struct page *page);
int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
Index: kvm.largepages/virt/kvm/kvm_main.c
===================================================================
--- kvm.largepages.orig/virt/kvm/kvm_main.c
+++ kvm.largepages/virt/kvm/kvm_main.c
@@ -189,9 +189,13 @@ static void kvm_free_physmem_slot(struct
if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
vfree(free->dirty_bitmap);
+ if (!dont || free->lpage_info != dont->lpage_info)
+ vfree(free->lpage_info);
+
free->npages = 0;
free->dirty_bitmap = NULL;
free->rmap = NULL;
+ free->lpage_info = NULL;
}
void kvm_free_physmem(struct kvm *kvm)
@@ -301,6 +305,22 @@ int __kvm_set_memory_region(struct kvm *
new.user_alloc = user_alloc;
new.userspace_addr = mem->userspace_addr;
}
+ if (npages && !new.lpage_info) {
+ int largepages = npages / KVM_PAGES_PER_HPAGE;
+ if (npages % KVM_PAGES_PER_HPAGE)
+ largepages++;
+ new.lpage_info = vmalloc(largepages * sizeof(*new.lpage_info));
+
+ if (!new.lpage_info)
+ goto out_free;
+
+ memset(new.lpage_info, 0, largepages * sizeof(*new.lpage_info));
+ /* large page crosses memslot boundary */
+ if (npages % KVM_PAGES_PER_HPAGE) {
+ new.lpage_info[0].write_count = 1;
+ new.lpage_info[largepages-1].write_count = 1;
+ }
+ }
/* Allocate page dirty bitmap if needed */
if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
@@ -444,7 +464,7 @@ int kvm_is_visible_gfn(struct kvm *kvm,
}
EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
-static unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
+unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
{
struct kvm_memory_slot *slot;
@@ -454,6 +474,7 @@ static unsigned long gfn_to_hva(struct k
return bad_hva();
return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE);
}
+EXPORT_SYMBOL(gfn_to_hva);
/*
* Requires current->mm->mmap_sem to be held
[-- Attachment #2: qemu-largepage-hack.patch --]
[-- Type: text/plain, Size: 1302 bytes --]
Index: kvm-userspace/qemu/vl.c
===================================================================
--- kvm-userspace.orig/qemu/vl.c
+++ kvm-userspace/qemu/vl.c
@@ -8501,6 +8501,31 @@ void qemu_get_launch_info(int *argc, cha
*opt_incoming = incoming;
}
+#define HPAGE_SIZE 2*1024*1024
+
+void *alloc_huge_area(unsigned long memory)
+{
+ void *area;
+ int fd;
+ char path[] = "/mnt/kvm.XXXXXX";
+
+ mkstemp(path);
+ fd = open(path, O_RDWR);
+ if (fd < 0) {
+ perror("open");
+ exit(0);
+ }
+ memory = (memory+HPAGE_SIZE-1) & ~(HPAGE_SIZE-1);
+
+ area = mmap(0, memory, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
+ if (area == MAP_FAILED) {
+ perror("mmap");
+ exit(0);
+ }
+
+ return area;
+}
+
int main(int argc, char **argv)
{
#ifdef CONFIG_GDBSTUB
@@ -9330,9 +9355,9 @@ int main(int argc, char **argv)
ret = kvm_qemu_check_extension(KVM_CAP_USER_MEMORY);
if (ret) {
- printf("allocating %d MB\n", phys_ram_size/1024/1024);
- phys_ram_base = qemu_vmalloc(phys_ram_size);
- if (!phys_ram_base) {
+ //phys_ram_base = qemu_vmalloc(phys_ram_size);
+ phys_ram_base = alloc_huge_area(phys_ram_size);
+ if (!phys_ram_base) {
fprintf(stderr, "Could not allocate physical memory\n");
exit(1);
}
[-- Attachment #3: Type: text/plain, Size: 228 bytes --]
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
[-- Attachment #4: Type: text/plain, Size: 158 bytes --]
_______________________________________________
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel
next prev parent reply other threads:[~2008-02-14 23:17 UTC|newest]
Thread overview: 14+ messages / expand[flat|nested] mbox.gz Atom feed top
2008-01-29 17:20 large page support for kvm Avi Kivity
[not found] ` <479F604C.20107-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2008-01-30 18:40 ` Joerg Roedel
[not found] ` <20080130184035.GS6960-5C7GfCeVMHo@public.gmane.org>
2008-01-31 5:44 ` Avi Kivity
2008-02-11 15:49 ` Marcelo Tosatti
2008-02-12 11:55 ` Avi Kivity
2008-02-13 0:15 ` Marcelo Tosatti
2008-02-13 6:45 ` Avi Kivity
2008-02-14 23:17 ` Marcelo Tosatti [this message]
2008-02-15 7:40 ` Roedel, Joerg
2008-02-17 9:38 ` Avi Kivity
2008-02-19 20:37 ` Marcelo Tosatti
2008-02-20 14:25 ` Avi Kivity
2008-02-22 2:01 ` Marcelo Tosatti
2008-02-22 7:16 ` Avi Kivity
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20080214231739.GA7787@dmt \
--to=marcelo@kvack.org \
--cc=avi@qumranet.com \
--cc=kvm-devel@lists.sourceforge.net \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.