[RFC PATCH 4/4] KVM: ARM: Transparent huge pages and hugetlbfs support

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Christoffer Dall <c.dall@virtualopensystems.com>
To: kvmarm@lists.cs.columbia.edu
Cc: kvm@vger.kernel.org, Christoffer Dall <c.dall@virtualopensystems.com>
Subject: [RFC PATCH 4/4] KVM: ARM: Transparent huge pages and hugetlbfs support
Date: Fri,  2 Nov 2012 11:03:22 +0100	[thread overview]
Message-ID: <1351850602-4781-5-git-send-email-c.dall@virtualopensystems.com> (raw)
In-Reply-To: <1351850602-4781-1-git-send-email-c.dall@virtualopensystems.com>

Support transparent huge pages in KVM/ARM. This requires quite a bit of
checking, and for qemu to take advantage of this, you need to
make sure qemu allocates pages aligned to the PMD size.

Signed-off-by: Christoffer Dall <c.dall@virtualopensystems.com>
---
 arch/arm/include/asm/kvm_host.h |    6 +-
 arch/arm/kvm/mmu.c              |  126 +++++++++++++++++++++++++++++++--------
 2 files changed, 103 insertions(+), 29 deletions(-)

diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index 7127fe7..4eea228 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -34,9 +34,9 @@
 #define KVM_VCPU_MAX_FEATURES 0
 
 /* We don't currently support large pages. */
-#define KVM_HPAGE_GFN_SHIFT(x)	0
-#define KVM_NR_PAGE_SIZES	1
-#define KVM_PAGES_PER_HPAGE(x)	(1UL<<31)
+#define KVM_HPAGE_GFN_SHIFT(_level)	(((_level) - 1) * 21)
+#define KVM_HPAGE_SIZE			(1UL << KVM_HPAGE_GFN_SHIFT(1))
+#define KVM_PAGES_PER_HPAGE		(KVM_HPAGE_SIZE / PAGE_SIZE)
 
 struct kvm_vcpu;
 u32 *kvm_vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num, u32 mode);
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 96ab6a8..762647c 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -19,6 +19,7 @@
 #include <linux/mman.h>
 #include <linux/kvm_host.h>
 #include <linux/io.h>
+#include <linux/hugetlb.h>
 #include <trace/events/kvm.h>
 #include <asm/idmap.h>
 #include <asm/pgalloc.h>
@@ -302,8 +303,7 @@ static void free_stage2_ptes(pmd_t *pmd, unsigned long addr)
 	pmd_page = virt_to_page(pmd);
 
 	for (i = 0; i < PTRS_PER_PMD; i++, addr += PMD_SIZE) {
-		BUG_ON(pmd_sect(*pmd));
-		if (!pmd_none(*pmd) && pmd_table(*pmd)) {
+		if (pmd_table(*pmd)) {
 			pte = pte_offset_kernel(pmd, addr);
 			free_guest_pages(pte, addr);
 			pte_free_kernel(NULL, pte);
@@ -470,7 +470,7 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
 {
 	pgd_t *pgd;
 	pud_t *pud;
-	pmd_t *pmd;
+	pmd_t *pmd, old_pmd;
 	pte_t *pte, old_pte;
 
 	/* Create 2nd stage page table mapping - Level 1 */
@@ -486,7 +486,22 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
 	} else
 		pmd = pmd_offset(pud, addr);
 
-	/* Create 2nd stage page table mapping - Level 2 */
+	/* Create 2nd stage section mappings (huge tlb pages) - Level 2 */
+	if (pte_huge(*new_pte) || pmd_huge(*pmd)) {
+		pte_t *huge_pte = (pte_t *)pmd;
+		BUG_ON(pmd_present(*pmd) && !pmd_huge(*pmd));
+
+		old_pmd = *pmd;
+		set_pte_ext(huge_pte, *new_pte, 0); /* new_pte really new_pmd */
+		if (pmd_present(old_pmd))
+			__kvm_tlb_flush_vmid(kvm);
+		else
+			get_page(virt_to_page(pmd));
+		return 0;
+	}
+
+	/* Create 2nd stage page mappings - Level 2 */
+	BUG_ON(pmd_present(*pmd) && pmd_huge(*pmd));
 	if (pmd_none(*pmd)) {
 		if (!cache)
 			return 0; /* ignore calls from kvm_set_spte_hva */
@@ -551,7 +566,8 @@ out:
 	return ret;
 }
 
-static void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn)
+static void coherent_icache_guest_page(struct kvm *kvm, hva_t hva,
+				       unsigned long size)
 {
 	/*
 	 * If we are going to insert an instruction page and the icache is
@@ -563,24 +579,64 @@ static void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn)
 	 * damn shame - as written in the ARM ARM (DDI 0406C - Page B3-1384)
 	 */
 	if (icache_is_pipt()) {
-		unsigned long hva = gfn_to_hva(kvm, gfn);
-		__cpuc_coherent_user_range(hva, hva + PAGE_SIZE);
+		__cpuc_coherent_user_range(hva, hva + size);
 	} else if (!icache_is_vivt_asid_tagged()) {
 		/* any kind of VIPT cache */
 		__flush_icache_all();
 	}
 }
 
+static bool transparent_hugepage_adjust(struct kvm *kvm, pfn_t *pfnp,
+					phys_addr_t *ipap)
+{
+	pfn_t pfn = *pfnp;
+	gfn_t gfn = *ipap >> PAGE_SHIFT;
+
+	if (PageTransCompound(pfn_to_page(pfn))) {
+		unsigned long mask;
+		kvm_err("transparent huge page at: %#18llx\n",
+			(unsigned long long)*ipap);
+		/*
+		 * mmu_notifier_retry was successful and we hold the
+		 * mmu_lock here, so the pmd can't become splitting
+		 * from under us, and in turn
+		 * __split_huge_page_refcount() can't run from under
+		 * us and we can safely transfer the refcount from
+		 * PG_tail to PG_head as we switch the pfn from tail to
+		 * head.
+		 */
+		mask = KVM_PAGES_PER_HPAGE - 1;
+		VM_BUG_ON((gfn & mask) != (pfn & mask));
+		if (pfn & mask) {
+			gfn &= ~mask;
+			*ipap &= ~(KVM_HPAGE_SIZE - 1);
+			kvm_release_pfn_clean(pfn);
+			pfn &= ~mask;
+			kvm_get_pfn(pfn);
+			*pfnp = pfn;
+		}
+
+		return true;
+	}
+
+	return false;
+}
+
 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
-			  gfn_t gfn, struct kvm_memory_slot *memslot,
+			  struct kvm_memory_slot *memslot,
 			  bool is_iabt, unsigned long fault_status)
 {
-	pte_t new_pte;
-	pfn_t pfn;
 	int ret;
-	bool write_fault, writable;
+	bool write_fault, writable, hugetlb = false, force_pte = false;
 	unsigned long mmu_seq;
+	gfn_t gfn = fault_ipa >> PAGE_SHIFT;
+	unsigned long hva = gfn_to_hva(vcpu->kvm, gfn);
+	struct kvm *kvm = vcpu->kvm;
 	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
+	struct vm_area_struct *vma;
+	pfn_t pfn;
+	pte_t new_pte;
+	unsigned long psize;
 
 	if (is_iabt)
 		write_fault = false;
@@ -594,32 +650,51 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		return -EFAULT;
 	}
 
+	/* Let's check if we will get back a huge page */
+	down_read(&current->mm->mmap_sem);
+	vma = find_vma_intersection(current->mm, hva, hva + 1);
+	if (is_vm_hugetlb_page(vma)) {
+		hugetlb = true;
+		hva &= PMD_MASK;
+		gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT;
+		psize = PMD_SIZE;
+	} else {
+		psize = PAGE_SIZE;
+		if (vma->vm_start & ~PMD_MASK)
+			force_pte = true;
+	}
+	up_read(&current->mm->mmap_sem);
+
+	coherent_icache_guest_page(kvm, hva, psize);
+
+	pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
+	if (is_error_pfn(pfn))
+		return -EFAULT;
+
 	/* We need minimum second+third level pages */
 	ret = mmu_topup_memory_cache(memcache, 2, KVM_NR_MEM_OBJS);
 	if (ret)
 		return ret;
 
-	mmu_seq = vcpu->kvm->mmu_notifier_seq;
+	mmu_seq = kvm->mmu_notifier_seq;
 	smp_rmb();
 
-	pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write_fault, &writable);
-	if (is_error_pfn(pfn))
-		return -EFAULT;
-
-	new_pte = pfn_pte(pfn, PAGE_S2);
-	coherent_icache_guest_page(vcpu->kvm, gfn);
-
-	spin_lock(&vcpu->kvm->mmu_lock);
-	if (mmu_notifier_retry(vcpu, mmu_seq))
+	spin_lock(&kvm->mmu_lock);
+	if (mmu_notifier_retry(kvm, mmu_seq))
 		goto out_unlock;
+	if (!hugetlb && !force_pte)
+		hugetlb = transparent_hugepage_adjust(kvm, &pfn, &fault_ipa);
+	new_pte = pfn_pte(pfn, PAGE_S2);
+	if (hugetlb)
+		new_pte = pte_mkhuge(new_pte);
 	if (writable) {
 		pte_val(new_pte) |= L_PTE_S2_RDWR;
 		kvm_set_pfn_dirty(pfn);
 	}
-	stage2_set_pte(vcpu->kvm, memcache, fault_ipa, &new_pte, false);
+	ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, false);
 
 out_unlock:
-	spin_unlock(&vcpu->kvm->mmu_lock);
+	spin_unlock(&kvm->mmu_lock);
 	/*
 	 * XXX TODO FIXME:
 -        * This is _really_ *weird* !!!
@@ -628,7 +703,7 @@ out_unlock:
 	 * guests under heavy memory pressure on the host and heavy swapping.
 	 */
 	kvm_release_pfn_dirty(pfn);
-	return 0;
+	return ret;
 }
 
 /**
@@ -693,8 +768,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
 		return -EINVAL;
 	}
 
-	ret = user_mem_abort(vcpu, fault_ipa, gfn, memslot,
-			     is_iabt, fault_status);
+	ret = user_mem_abort(vcpu, fault_ipa, memslot, is_iabt, fault_status);
 	return ret ? ret : 1;
 }
 
-- 
1.7.9.5

     prev parent reply	other threads:[~2012-11-02 10:03 UTC|newest]

Thread overview: 5+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2012-11-02 10:03 [RFC PATCH 0/4] KVM: ARM: Support transparent huge pages and hugetlbfs Christoffer Dall
2012-11-02 10:03 ` [RFC PATCH 1/4] KVM: ARM: Report support of mmu notifiers to user space Christoffer Dall
2012-11-02 10:03 ` [RFC PATCH 2/4] KVM: ARM: Fixup trace ipa printing Christoffer Dall
2012-11-02 10:03 ` [RFC PATCH 3/4] KVM: ARM: Improve stage2_clear_pte Christoffer Dall
2012-11-02 10:03 ` Christoffer Dall [this message]

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:7127fe7 dfblob:4eea228 dfblob:96ab6a8 dfblob:762647c )
 OR (
bs:"[RFC PATCH 4/4] KVM: ARM: Transparent huge pages and hugetlbfs support" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1351850602-4781-5-git-send-email-c.dall@virtualopensystems.com \
    --to=c.dall@virtualopensystems.com \
    --cc=kvm@vger.kernel.org \
    --cc=kvmarm@lists.cs.columbia.edu \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.