* [RFC PATCH 0/4] KVM: ARM: Support transparent huge pages and hugetlbfs
From: Christoffer Dall @ 2012-11-02 10:03 UTC
To: kvmarm; +Cc: kvm, Christoffer Dall
The following series implements support for transparent huge pages and
hugetlbfs for KVM/ARM. The patch series is based on
kvm-arm-v13-vgic-timers with Will Deacon's hugetlb branch series merged.
These patches can also be fetched from here:
git://github.com/virtualopensystems/linux-kvm-arm.git kvm-arm-hugetlb
Christoffer Dall (4):
KVM: ARM: Report support of mmu notifiers to user space
KVM: ARM: Fixup trace ipa printing
KVM: ARM: Improve stage2_clear_pte
KVM: ARM: Transparent huge pages and hugetlbfs support
arch/arm/include/asm/kvm_host.h | 6 +-
arch/arm/kvm/arm.c | 1 +
arch/arm/kvm/mmu.c | 248 +++++++++++++++++++++++++++++----------
arch/arm/kvm/trace.h | 8 +-
4 files changed, 191 insertions(+), 72 deletions(-)
--
1.7.9.5
* [RFC PATCH 1/4] KVM: ARM: Report support of mmu notifiers to user space
From: Christoffer Dall @ 2012-11-02 10:03 UTC
To: kvmarm; +Cc: kvm, Christoffer Dall
This should have been added a long time ago, and is at least required
for user space to take advantage of hugetlbfs.
Signed-off-by: Christoffer Dall <c.dall@virtualopensystems.com>
---
arch/arm/kvm/arm.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 69bec17..9a7d2d6 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -199,6 +199,7 @@ int kvm_dev_ioctl_check_extension(long ext)
break;
#endif
case KVM_CAP_USER_MEMORY:
+ case KVM_CAP_SYNC_MMU:
case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
case KVM_CAP_ONE_REG:
r = 1;
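As a point of reference, here is a minimal sketch of how user space would probe this capability (a hypothetical test program, not part of this patch; KVM_CHECK_EXTENSION returns a positive value when a capability is present):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);

	if (kvm < 0)
		return 1;
	/* a positive return means the kernel reports KVM_CAP_SYNC_MMU */
	if (ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_SYNC_MMU) > 0)
		printf("KVM_CAP_SYNC_MMU supported\n");
	return 0;
}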
--
1.7.9.5
* [RFC PATCH 2/4] KVM: ARM: Fixup trace ipa printing
From: Christoffer Dall @ 2012-11-02 10:03 UTC
To: kvmarm; +Cc: kvm, Christoffer Dall
The arguments were passed in the wrong order, and a 64-bit integer was
printed as a 32-bit integer.
Signed-off-by: Christoffer Dall <c.dall@virtualopensystems.com>
---
arch/arm/kvm/trace.h | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/arch/arm/kvm/trace.h b/arch/arm/kvm/trace.h
index c3d05f4..cd52640 100644
--- a/arch/arm/kvm/trace.h
+++ b/arch/arm/kvm/trace.h
@@ -42,14 +42,14 @@ TRACE_EVENT(kvm_exit,
TRACE_EVENT(kvm_guest_fault,
TP_PROTO(unsigned long vcpu_pc, unsigned long hsr,
unsigned long hxfar,
- unsigned long ipa),
+ unsigned long long ipa),
TP_ARGS(vcpu_pc, hsr, hxfar, ipa),
TP_STRUCT__entry(
__field( unsigned long, vcpu_pc )
__field( unsigned long, hsr )
__field( unsigned long, hxfar )
- __field( unsigned long, ipa )
+ __field( unsigned long long, ipa )
),
TP_fast_assign(
@@ -60,9 +60,9 @@ TRACE_EVENT(kvm_guest_fault,
),
TP_printk("guest fault at PC %#08lx (hxfar %#08lx, "
- "ipa %#08lx, hsr %#08lx",
+ "ipa %#16llx, hsr %#08lx",
__entry->vcpu_pc, __entry->hxfar,
- __entry->hsr, __entry->ipa)
+ __entry->ipa, __entry->hsr)
);
TRACE_EVENT(kvm_irq_line,
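To see why both changes are needed, consider a hypothetical user-space analogue of the old and new format strings (with LPAE the IPA can exceed 32 bits, so both the field width and the argument order mattered):

#include <stdio.h>

int main(void)
{
	unsigned long long ipa = 0x123456789ULL; /* IPA above 4 GB */

	/* old: a 32-bit field and %#08lx silently drop the high bits;
	 * on 32-bit ARM this prints 0x23456789 */
	printf("ipa %#08lx\n", (unsigned long)ipa);
	/* new: a 64-bit field and %#16llx keep the full address */
	printf("ipa %#16llx\n", ipa);
	return 0;
}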
--
1.7.9.5
* [RFC PATCH 3/4] KVM: ARM: Improve stage2_clear_pte
From: Christoffer Dall @ 2012-11-02 10:03 UTC
To: kvmarm; +Cc: kvm, Christoffer Dall
Factor out parts of the functionality to make the code more readable,
and rename stage2_clear_pte to unmap_stage2_range, which now supports
unmapping a whole range in one go.
Signed-off-by: Christoffer Dall <c.dall@virtualopensystems.com>
---
arch/arm/kvm/mmu.c | 122 +++++++++++++++++++++++++++++++++++-----------------
1 file changed, 83 insertions(+), 39 deletions(-)
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index cb03d45..96ab6a8 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -365,59 +365,103 @@ void kvm_free_stage2_pgd(struct kvm *kvm)
kvm->arch.pgd = NULL;
}
+static void clear_pud_entry(pud_t *pud)
+{
+ pmd_t *pmd_table = pmd_offset(pud, 0);
+ pud_clear(pud);
+ pmd_free(NULL, pmd_table);
+ put_page(virt_to_page(pud));
+}
+
+static void clear_pmd_entry(pmd_t *pmd)
+{
+ if (pmd_huge(*pmd)) {
+ pmd_clear(pmd);
+ } else {
+ pte_t *pte_table = pte_offset_kernel(pmd, 0);
+ pmd_clear(pmd);
+ pte_free_kernel(NULL, pte_table);
+ }
+ put_page(virt_to_page(pmd));
+}
+
+static bool pmd_empty(pmd_t *pmd)
+{
+ struct page *pmd_page = virt_to_page(pmd);
+ return page_count(pmd_page) == 1;
+}
+
+static void clear_pte_entry(pte_t *pte)
+{
+ set_pte_ext(pte, __pte(0), 0);
+ put_page(virt_to_page(pte));
+}
+
+static bool pte_empty(pte_t *pte)
+{
+ struct page *pte_page = virt_to_page(pte);
+ return page_count(pte_page) == 1;
+}
+
/**
- * stage2_clear_pte -- Clear a stage-2 PTE.
- * @kvm: The VM pointer
- * @addr: The physical address of the PTE
+ * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
+ * @kvm: The VM pointer
+ * @start: The intermediate physical base address of the range to unmap
+ * @size: The size of the area to unmap
*
- * Clear a stage-2 PTE, lowering the various ref-counts. Also takes
- * care of invalidating the TLBs. Must be called while holding
- * mmu_lock, otherwise another faulting VCPU may come in and mess
- * things behind our back.
+ * Clear a range of stage-2 mappings, lowering the various ref-counts. Also
+ * takes care of invalidating the TLBs. Must be called while holding
+ * mmu_lock, otherwise another faulting VCPU may come in and mess with things
+ * behind our backs.
*/
-static void stage2_clear_pte(struct kvm *kvm, phys_addr_t addr)
+static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, size_t size)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
- struct page *page;
-
- pgd = kvm->arch.pgd + pgd_index(addr);
- pud = pud_offset(pgd, addr);
- if (pud_none(*pud))
- return;
+ phys_addr_t addr = start, end = start + size;
+ size_t range;
- pmd = pmd_offset(pud, addr);
- if (pmd_none(*pmd))
- return;
+ while (addr < end) {
+ pgd = kvm->arch.pgd + pgd_index(addr);
+ pud = pud_offset(pgd, addr);
+ if (pud_none(*pud)) {
+ addr += PUD_SIZE;
+ continue;
+ }
- pte = pte_offset_kernel(pmd, addr);
- set_pte_ext(pte, __pte(0), 0);
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none(*pmd)) {
+ addr += PMD_SIZE;
+ continue;
+ }
- page = virt_to_page(pte);
- put_page(page);
- if (page_count(page) != 1) {
- kvm_tlb_flush_vmid(kvm);
- return;
- }
+ if (pmd_huge(*pmd)) {
+ clear_pmd_entry(pmd);
+ if (pmd_empty(pmd))
+ clear_pud_entry(pud);
+ addr += PMD_SIZE;
+ continue;
+ }
- /* Need to remove pte page */
- pmd_clear(pmd);
- pte_free_kernel(NULL, (pte_t *)((unsigned long)pte & PAGE_MASK));
+ pte = pte_offset_kernel(pmd, addr);
+ clear_pte_entry(pte);
+ range = PAGE_SIZE;
+
+ /* If we emptied the pte, walk back up the ladder */
+ if (pte_empty(pte)) {
+ clear_pmd_entry(pmd);
+ range = PMD_SIZE;
+ if (pmd_empty(pmd)) {
+ clear_pud_entry(pud);
+ range = PUD_SIZE;
+ }
+ }
- page = virt_to_page(pmd);
- put_page(page);
- if (page_count(page) != 1) {
- kvm_tlb_flush_vmid(kvm);
- return;
+ addr += range;
}
- pud_clear(pud);
- pmd_free(NULL, (pmd_t *)((unsigned long)pmd & PAGE_MASK));
-
- page = virt_to_page(pud);
- put_page(page);
kvm_tlb_flush_vmid(kvm);
}
@@ -693,7 +737,7 @@ static void handle_hva_to_gpa(struct kvm *kvm,
static void kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, void *data)
{
- stage2_clear_pte(kvm, gpa);
+ unmap_stage2_range(kvm, gpa, PAGE_SIZE);
}
int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
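A hedged sketch of how callers are expected to use the new interface (slot_base_ipa and slot_size are illustrative names, not taken from this series; both calls must be made while holding mmu_lock, as the comment above requires):

	/* the old single-PTE behaviour is just the one-page case ... */
	unmap_stage2_range(kvm, gpa, PAGE_SIZE);

	/* ... but an entire region can now be torn down in one call */
	unmap_stage2_range(kvm, slot_base_ipa, slot_size);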
--
1.7.9.5
* [RFC PATCH 4/4] KVM: ARM: Transparent huge pages and hugetlbfs support
From: Christoffer Dall @ 2012-11-02 10:03 UTC
To: kvmarm; +Cc: kvm, Christoffer Dall
Support transparent huge pages in KVM/ARM. This requires quite a bit of
checking, and for qemu to take advantage of this, you need to make sure
qemu allocates its guest memory aligned to the PMD size.
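A minimal sketch of what PMD-aligned allocation could look like on the user-space side (assuming 2 MB PMDs with 4K pages; alloc_guest_ram is an illustrative name, not an actual qemu function):

#include <stdlib.h>
#include <sys/mman.h>

#define PMD_ALIGN	(2 * 1024 * 1024)

static void *alloc_guest_ram(size_t size)
{
	void *ram;

	/* posix_memalign guarantees the PMD-aligned start address */
	if (posix_memalign(&ram, PMD_ALIGN, size))
		return NULL;
	/* hint to the kernel that THP backing is welcome here */
	madvise(ram, size, MADV_HUGEPAGE);
	return ram;
}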
Signed-off-by: Christoffer Dall <c.dall@virtualopensystems.com>
---
arch/arm/include/asm/kvm_host.h | 6 +-
arch/arm/kvm/mmu.c | 126 +++++++++++++++++++++++++++++++--------
2 files changed, 103 insertions(+), 29 deletions(-)
diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index 7127fe7..4eea228 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -34,9 +34,9 @@
#define KVM_VCPU_MAX_FEATURES 0
/* We don't currently support large pages. */
-#define KVM_HPAGE_GFN_SHIFT(x) 0
-#define KVM_NR_PAGE_SIZES 1
-#define KVM_PAGES_PER_HPAGE(x) (1UL<<31)
+#define KVM_HPAGE_GFN_SHIFT(_level) (((_level) - 1) * 21)
+#define KVM_HPAGE_SIZE (1UL << KVM_HPAGE_GFN_SHIFT(1))
+#define KVM_PAGES_PER_HPAGE (KVM_HPAGE_SIZE / PAGE_SIZE)
struct kvm_vcpu;
u32 *kvm_vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num, u32 mode);
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 96ab6a8..762647c 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -19,6 +19,7 @@
#include <linux/mman.h>
#include <linux/kvm_host.h>
#include <linux/io.h>
+#include <linux/hugetlb.h>
#include <trace/events/kvm.h>
#include <asm/idmap.h>
#include <asm/pgalloc.h>
@@ -302,8 +303,7 @@ static void free_stage2_ptes(pmd_t *pmd, unsigned long addr)
pmd_page = virt_to_page(pmd);
for (i = 0; i < PTRS_PER_PMD; i++, addr += PMD_SIZE) {
- BUG_ON(pmd_sect(*pmd));
- if (!pmd_none(*pmd) && pmd_table(*pmd)) {
+ if (pmd_table(*pmd)) {
pte = pte_offset_kernel(pmd, addr);
free_guest_pages(pte, addr);
pte_free_kernel(NULL, pte);
@@ -470,7 +470,7 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
{
pgd_t *pgd;
pud_t *pud;
- pmd_t *pmd;
+ pmd_t *pmd, old_pmd;
pte_t *pte, old_pte;
/* Create 2nd stage page table mapping - Level 1 */
@@ -486,7 +486,22 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
} else
pmd = pmd_offset(pud, addr);
- /* Create 2nd stage page table mapping - Level 2 */
+ /* Create 2nd stage section mappings (huge tlb pages) - Level 2 */
+ if (pte_huge(*new_pte) || pmd_huge(*pmd)) {
+ pte_t *huge_pte = (pte_t *)pmd;
+ BUG_ON(pmd_present(*pmd) && !pmd_huge(*pmd));
+
+ old_pmd = *pmd;
+ set_pte_ext(huge_pte, *new_pte, 0); /* new_pte really new_pmd */
+ if (pmd_present(old_pmd))
+ __kvm_tlb_flush_vmid(kvm);
+ else
+ get_page(virt_to_page(pmd));
+ return 0;
+ }
+
+ /* Create 2nd stage page mappings - Level 2 */
+ BUG_ON(pmd_present(*pmd) && pmd_huge(*pmd));
if (pmd_none(*pmd)) {
if (!cache)
return 0; /* ignore calls from kvm_set_spte_hva */
@@ -551,7 +566,8 @@ out:
return ret;
}
-static void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn)
+static void coherent_icache_guest_page(struct kvm *kvm, hva_t hva,
+ unsigned long size)
{
/*
* If we are going to insert an instruction page and the icache is
@@ -563,24 +579,64 @@ static void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn)
* damn shame - as written in the ARM ARM (DDI 0406C - Page B3-1384)
*/
if (icache_is_pipt()) {
- unsigned long hva = gfn_to_hva(kvm, gfn);
- __cpuc_coherent_user_range(hva, hva + PAGE_SIZE);
+ __cpuc_coherent_user_range(hva, hva + size);
} else if (!icache_is_vivt_asid_tagged()) {
/* any kind of VIPT cache */
__flush_icache_all();
}
}
+static bool transparent_hugepage_adjust(struct kvm *kvm, pfn_t *pfnp,
+ phys_addr_t *ipap)
+{
+ pfn_t pfn = *pfnp;
+ gfn_t gfn = *ipap >> PAGE_SHIFT;
+
+ if (PageTransCompound(pfn_to_page(pfn))) {
+ unsigned long mask;
+ kvm_err("transparent huge page at: %#18llx\n",
+ (unsigned long long)*ipap);
+ /*
+ * mmu_notifier_retry was successful and we hold the
+ * mmu_lock here, so the pmd can't become splitting
+ * from under us, and in turn
+ * __split_huge_page_refcount() can't run from under
+ * us and we can safely transfer the refcount from
+ * PG_tail to PG_head as we switch the pfn from tail to
+ * head.
+ */
+ mask = KVM_PAGES_PER_HPAGE - 1;
+ VM_BUG_ON((gfn & mask) != (pfn & mask));
+ if (pfn & mask) {
+ gfn &= ~mask;
+ *ipap &= ~(KVM_HPAGE_SIZE - 1);
+ kvm_release_pfn_clean(pfn);
+ pfn &= ~mask;
+ kvm_get_pfn(pfn);
+ *pfnp = pfn;
+ }
+
+ return true;
+ }
+
+ return false;
+}
+
static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
- gfn_t gfn, struct kvm_memory_slot *memslot,
+ struct kvm_memory_slot *memslot,
bool is_iabt, unsigned long fault_status)
{
- pte_t new_pte;
- pfn_t pfn;
int ret;
- bool write_fault, writable;
+ bool write_fault, writable, hugetlb = false, force_pte = false;
unsigned long mmu_seq;
+ gfn_t gfn = fault_ipa >> PAGE_SHIFT;
+ unsigned long hva = gfn_to_hva(vcpu->kvm, gfn);
+ struct kvm *kvm = vcpu->kvm;
struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
+ struct vm_area_struct *vma;
+ pfn_t pfn;
+ pte_t new_pte;
+ unsigned long psize;
if (is_iabt)
write_fault = false;
@@ -594,32 +650,51 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
return -EFAULT;
}
+ /* Let's check if we will get back a huge page */
+ down_read(&current->mm->mmap_sem);
+ vma = find_vma_intersection(current->mm, hva, hva + 1);
+ if (is_vm_hugetlb_page(vma)) {
+ hugetlb = true;
+ hva &= PMD_MASK;
+ gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT;
+ psize = PMD_SIZE;
+ } else {
+ psize = PAGE_SIZE;
+ if (vma->vm_start & ~PMD_MASK)
+ force_pte = true;
+ }
+ up_read(&current->mm->mmap_sem);
+
+ coherent_icache_guest_page(kvm, hva, psize);
+
+ pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
+ if (is_error_pfn(pfn))
+ return -EFAULT;
+
/* We need minimum second+third level pages */
ret = mmu_topup_memory_cache(memcache, 2, KVM_NR_MEM_OBJS);
if (ret)
return ret;
- mmu_seq = vcpu->kvm->mmu_notifier_seq;
+ mmu_seq = kvm->mmu_notifier_seq;
smp_rmb();
- pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write_fault, &writable);
- if (is_error_pfn(pfn))
- return -EFAULT;
-
- new_pte = pfn_pte(pfn, PAGE_S2);
- coherent_icache_guest_page(vcpu->kvm, gfn);
-
- spin_lock(&vcpu->kvm->mmu_lock);
- if (mmu_notifier_retry(vcpu, mmu_seq))
+ spin_lock(&kvm->mmu_lock);
+ if (mmu_notifier_retry(kvm, mmu_seq))
goto out_unlock;
+ if (!hugetlb && !force_pte)
+ hugetlb = transparent_hugepage_adjust(kvm, &pfn, &fault_ipa);
+ new_pte = pfn_pte(pfn, PAGE_S2);
+ if (hugetlb)
+ new_pte = pte_mkhuge(new_pte);
if (writable) {
pte_val(new_pte) |= L_PTE_S2_RDWR;
kvm_set_pfn_dirty(pfn);
}
- stage2_set_pte(vcpu->kvm, memcache, fault_ipa, &new_pte, false);
+ ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, false);
out_unlock:
- spin_unlock(&vcpu->kvm->mmu_lock);
+ spin_unlock(&kvm->mmu_lock);
/*
* XXX TODO FIXME:
- * This is _really_ *weird* !!!
@@ -628,7 +703,7 @@ out_unlock:
* guests under heavy memory pressure on the host and heavy swapping.
*/
kvm_release_pfn_dirty(pfn);
- return 0;
+ return ret;
}
/**
@@ -693,8 +768,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
return -EINVAL;
}
- ret = user_mem_abort(vcpu, fault_ipa, gfn, memslot,
- is_iabt, fault_status);
+ ret = user_mem_abort(vcpu, fault_ipa, memslot, is_iabt, fault_status);
return ret ? ret : 1;
}
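For the hugetlbfs side, a hedged user-space sketch that exercises the new path (requires huge pages reserved via /proc/sys/vm/nr_hugepages; illustrative only, and MAP_HUGETLB may need recent headers):

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t size = 2 * 1024 * 1024; /* one 2 MB huge page */
	void *mem = mmap(NULL, size, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

	if (mem == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	/* guest RAM backed by this region faults in through the
	 * is_vm_hugetlb_page() branch of user_mem_abort() above */
	memset(mem, 0, size);
	munmap(mem, size);
	return 0;
}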
--
1.7.9.5