From: c.dall@virtualopensystems.com (Christoffer Dall)
To: linux-arm-kernel@lists.infradead.org
Subject: [PATCH 13/15] KVM: ARM: Handle guest faults in KVM
Date: Sat, 15 Sep 2012 11:35:52 -0400 [thread overview]
Message-ID: <20120915153552.21241.8656.stgit@ubuntu> (raw)
In-Reply-To: <20120915153359.21241.86002.stgit@ubuntu>
From: Christoffer Dall <cdall@cs.columbia.edu>
Handles the guest faults in KVM by mapping in corresponding user pages
in the 2nd stage page tables.
We invalidate the instruction cache by MVA whenever we map a page to the
guest (no, we cannot only do it when we have an iabt because the guest
may happily read/write a page before hitting the icache) if the hardware
uses VIPT or PIPT. In the latter case, we can invalidate only that
physical page. In the first case, all bets are off and we simply must
invalidate the whole affair. Not that VIVT icaches are tagged with
vmids, and we are out of the woods on that one. Alexander Graf was nice
enough to remind us of this massive pain.
There may be a subtle bug hidden in, which we currently hide by marking
all pages dirty even when the pages are only mapped read-only. The
current hypothesis is that marking pages dirty may exercise the IO system
and data cache more and therefore we don't see stale data in the guest,
but it's purely guesswork. The bug is manifested by seemingly random
kernel crashes in guests when the host is under extreme memory pressure and
swapping is enabled.
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <c.dall@virtualopensystems.com>
---
arch/arm/include/asm/kvm_arm.h | 9 ++
arch/arm/include/asm/kvm_asm.h | 2 +
arch/arm/kvm/mmu.c | 155 ++++++++++++++++++++++++++++++++++++++++
arch/arm/kvm/trace.h | 28 +++++++
4 files changed, 193 insertions(+), 1 deletion(-)
diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h
index ae586c1..4cff3b7 100644
--- a/arch/arm/include/asm/kvm_arm.h
+++ b/arch/arm/include/asm/kvm_arm.h
@@ -158,11 +158,20 @@
#define HSR_ISS (HSR_IL - 1)
#define HSR_ISV_SHIFT (24)
#define HSR_ISV (1U << HSR_ISV_SHIFT)
+#define HSR_FSC (0x3f)
+#define HSR_FSC_TYPE (0x3c)
+#define HSR_WNR (1 << 6)
#define HSR_CV_SHIFT (24)
#define HSR_CV (1U << HSR_CV_SHIFT)
#define HSR_COND_SHIFT (20)
#define HSR_COND (0xfU << HSR_COND_SHIFT)
+#define FSC_FAULT (0x04)
+#define FSC_PERM (0x0c)
+
+/* Hyp Prefetch Fault Address Register (HPFAR/HDFAR) */
+#define HPFAR_MASK (~0xf)
+
#define HSR_EC_UNKNOWN (0x00)
#define HSR_EC_WFI (0x01)
#define HSR_EC_CP15_32 (0x03)
diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h
index 201ec1f..40ee099 100644
--- a/arch/arm/include/asm/kvm_asm.h
+++ b/arch/arm/include/asm/kvm_asm.h
@@ -43,6 +43,8 @@ extern char __kvm_hyp_vector[];
extern char __kvm_hyp_code_start[];
extern char __kvm_hyp_code_end[];
+extern void __kvm_tlb_flush_vmid(struct kvm *kvm);
+
extern void __kvm_flush_vm_context(void);
extern void __kvm_tlb_flush_vmid(struct kvm *kvm);
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index ea17a97..52cc280 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -21,10 +21,16 @@
#include <linux/io.h>
#include <asm/idmap.h>
#include <asm/pgalloc.h>
+#include <asm/cacheflush.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_asm.h>
+#include <asm/kvm_emulate.h>
#include <asm/mach/map.h>
+#include <asm/kvm_asm.h>
+#include <trace/events/kvm.h>
+
+#include "trace.h"
static DEFINE_MUTEX(kvm_hyp_pgd_mutex);
@@ -490,9 +496,156 @@ out:
return ret;
}
+static void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn)
+{
+ /*
+ * If we are going to insert an instruction page and the icache is
+ * either VIPT or PIPT, there is a potential problem where the host
+ * (or another VM) may have used this page at the same virtual address
+ * as this guest, and we read incorrect data from the icache. If
+ * we're using a PIPT cache, we can invalidate just that page, but if
+ * we are using a VIPT cache we need to invalidate the entire icache -
+ * damn shame - as written in the ARM ARM (DDI 0406C - Page B3-1384)
+ */
+ if (icache_is_pipt()) {
+ unsigned long hva = gfn_to_hva(kvm, gfn);
+ __cpuc_coherent_user_range(hva, hva + PAGE_SIZE);
+ } else if (!icache_is_vivt_asid_tagged()) {
+ /* any kind of VIPT cache */
+ __flush_icache_all();
+ }
+}
+
+static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
+ gfn_t gfn, struct kvm_memory_slot *memslot,
+ bool is_iabt, unsigned long fault_status)
+{
+ pte_t new_pte;
+ pfn_t pfn, pfn_existing = KVM_PFN_ERR_BAD;
+ int ret;
+ bool write_fault, writable;
+ struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
+
+ if (is_iabt)
+ write_fault = false;
+ else if ((vcpu->arch.hsr & HSR_ISV) && !(vcpu->arch.hsr & HSR_WNR))
+ write_fault = false;
+ else
+ write_fault = true;
+
+ if (fault_status == FSC_PERM && !write_fault) {
+ kvm_err("Unexpected L2 read permission error\n");
+ return -EFAULT;
+ }
+
+ /*
+ * If this is a write fault (think COW) we need to make sure the
+ * existing page, which other CPUs might still read, doesn't go away
+ * from under us, by calling gfn_to_pfn_prot(write_fault=true).
+ * Therefore, we call gfn_to_pfn_prot(write_fault=false), which will
+ * pin the existing page, then we get a new page for the user space
+ * pte and map this in the stage-2 table where we also make sure to
+ * flush the TLB for the VM, if there was an existing entry (the entry
+ * was updated setting the write flag to the potentially new page).
+ */
+ if (fault_status == FSC_PERM) {
+ pfn_existing = gfn_to_pfn_prot(vcpu->kvm, gfn, false, NULL);
+ if (is_error_pfn(pfn_existing))
+ return -EFAULT;
+ }
+
+ pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write_fault, &writable);
+ if (is_error_pfn(pfn)) {
+ ret = -EFAULT;
+ goto out_put_existing;
+ }
+
+ /* We need minimum second+third level pages */
+ ret = mmu_topup_memory_cache(memcache, 2, KVM_NR_MEM_OBJS);
+ if (ret)
+ goto out;
+ new_pte = pfn_pte(pfn, PAGE_KVM_GUEST);
+ if (writable)
+ pte_val(new_pte) |= L_PTE2_WRITE;
+ coherent_icache_guest_page(vcpu->kvm, gfn);
+
+ spin_lock(&vcpu->kvm->arch.pgd_lock);
+ stage2_set_pte(vcpu->kvm, memcache, fault_ipa, &new_pte);
+ spin_unlock(&vcpu->kvm->arch.pgd_lock);
+
+out:
+ /*
+ * XXX TODO FIXME:
+ * This is _really_ *weird* !!!
+ * We should only be calling the _dirty verison when we map something
+ * writable, but this causes memory failures in guests under heavy
+ * memory pressure on the host and heavy swapping.
+ */
+ kvm_release_pfn_dirty(pfn);
+out_put_existing:
+ if (!is_error_pfn(pfn_existing))
+ kvm_release_pfn_clean(pfn_existing);
+ return ret;
+}
+
+/**
+ * kvm_handle_guest_abort - handles all 2nd stage aborts
+ * @vcpu: the VCPU pointer
+ * @run: the kvm_run structure
+ *
+ * Any abort that gets to the host is almost guaranteed to be caused by a
+ * missing second stage translation table entry, which can mean that either the
+ * guest simply needs more memory and we must allocate an appropriate page or it
+ * can mean that the guest tried to access I/O memory, which is emulated by user
+ * space. The distinction is based on the IPA causing the fault and whether this
+ * memory region has been registered as standard RAM by user space.
+ */
int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
{
- return -EINVAL;
+ unsigned long hsr_ec;
+ unsigned long fault_status;
+ phys_addr_t fault_ipa;
+ struct kvm_memory_slot *memslot = NULL;
+ bool is_iabt;
+ gfn_t gfn;
+ int ret;
+
+ hsr_ec = vcpu->arch.hsr >> HSR_EC_SHIFT;
+ is_iabt = (hsr_ec == HSR_EC_IABT);
+ fault_ipa = ((phys_addr_t)vcpu->arch.hpfar & HPFAR_MASK) << 8;
+
+ trace_kvm_guest_fault(*vcpu_pc(vcpu), vcpu->arch.hsr,
+ vcpu->arch.hdfar, vcpu->arch.hifar, fault_ipa);
+
+ /* Check the stage-2 fault is trans. fault or write fault */
+ fault_status = (vcpu->arch.hsr & HSR_FSC_TYPE);
+ if (fault_status != FSC_FAULT && fault_status != FSC_PERM) {
+ kvm_err("Unsupported fault status: EC=%#lx DFCS=%#lx\n",
+ hsr_ec, fault_status);
+ return -EFAULT;
+ }
+
+ gfn = fault_ipa >> PAGE_SHIFT;
+ if (!kvm_is_visible_gfn(vcpu->kvm, gfn)) {
+ if (is_iabt) {
+ /* Prefetch Abort on I/O address */
+ kvm_inject_pabt(vcpu, vcpu->arch.hifar);
+ return 1;
+ }
+
+ kvm_pr_unimpl("I/O address abort...");
+ return 0;
+ }
+
+ memslot = gfn_to_memslot(vcpu->kvm, gfn);
+ if (!memslot->user_alloc) {
+ kvm_err("non user-alloc memslots not supported\n");
+ return -EINVAL;
+ }
+
+ ret = user_mem_abort(vcpu, fault_ipa, gfn, memslot,
+ is_iabt, fault_status);
+ return ret ? ret : 1;
}
static void handle_hva_to_gpa(struct kvm *kvm, unsigned long hva,
diff --git a/arch/arm/kvm/trace.h b/arch/arm/kvm/trace.h
index 772e045..40606c9 100644
--- a/arch/arm/kvm/trace.h
+++ b/arch/arm/kvm/trace.h
@@ -39,6 +39,34 @@ TRACE_EVENT(kvm_exit,
TP_printk("PC: 0x%08lx", __entry->vcpu_pc)
);
+TRACE_EVENT(kvm_guest_fault,
+ TP_PROTO(unsigned long vcpu_pc, unsigned long hsr,
+ unsigned long hdfar, unsigned long hifar,
+ unsigned long ipa),
+ TP_ARGS(vcpu_pc, hsr, hdfar, hifar, ipa),
+
+ TP_STRUCT__entry(
+ __field( unsigned long, vcpu_pc )
+ __field( unsigned long, hsr )
+ __field( unsigned long, hdfar )
+ __field( unsigned long, hifar )
+ __field( unsigned long, ipa )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_pc = vcpu_pc;
+ __entry->hsr = hsr;
+ __entry->hdfar = hdfar;
+ __entry->hifar = hifar;
+ __entry->ipa = ipa;
+ ),
+
+ TP_printk("guest fault at PC %#08lx (hdfar %#08lx, hifar %#08lx, "
+ "ipa %#08lx, hsr %#08lx",
+ __entry->vcpu_pc, __entry->hdfar, __entry->hifar,
+ __entry->hsr, __entry->ipa)
+);
+
TRACE_EVENT(kvm_irq_line,
TP_PROTO(unsigned int type, int vcpu_idx, int irq_num, int level),
TP_ARGS(type, vcpu_idx, irq_num, level),
next prev parent reply other threads:[~2012-09-15 15:35 UTC|newest]
Thread overview: 82+ messages / expand[flat|nested] mbox.gz Atom feed top
2012-09-15 15:34 [PATCH 00/15] KVM/ARM Implementation Christoffer Dall
2012-09-15 15:34 ` [PATCH 01/15] ARM: add mem_type prot_pte accessor Christoffer Dall
2012-09-18 12:23 ` Will Deacon
2012-09-18 19:18 ` Christoffer Dall
2012-09-18 21:04 ` Russell King - ARM Linux
2012-09-18 21:53 ` Christoffer Dall
2012-09-20 10:01 ` Marc Zyngier
2012-09-20 13:21 ` Christoffer Dall
2012-09-15 15:34 ` [PATCH 02/15] ARM: Add page table and page defines needed by KVM Christoffer Dall
2012-09-18 12:47 ` Will Deacon
2012-09-18 14:06 ` Catalin Marinas
2012-09-18 15:05 ` Christoffer Dall
2012-09-18 15:07 ` Catalin Marinas
2012-09-18 15:10 ` Christoffer Dall
2012-09-18 22:01 ` Christoffer Dall
2012-09-19 9:21 ` Will Deacon
2012-09-20 0:10 ` Christoffer Dall
2012-09-15 15:34 ` [PATCH 03/15] ARM: Section based HYP idmap Christoffer Dall
2012-09-18 13:00 ` Will Deacon
2012-10-01 2:19 ` Christoffer Dall
2012-09-15 15:34 ` [PATCH 04/15] ARM: idmap: only initialize HYP idmap when HYP mode is available Christoffer Dall
2012-09-18 13:03 ` Will Deacon
2012-09-20 0:11 ` Christoffer Dall
2012-09-15 15:35 ` [PATCH 05/15] ARM: Expose PMNC bitfields for KVM use Christoffer Dall
2012-09-18 13:08 ` Will Deacon
2012-09-18 22:13 ` Christoffer Dall
2012-09-19 4:09 ` [kvmarm] " Rusty Russell
2012-09-19 9:30 ` Will Deacon
2012-09-15 15:35 ` [PATCH 06/15] KVM: ARM: Initial skeleton to compile KVM support Christoffer Dall
2012-09-25 15:20 ` Will Deacon
2012-09-26 1:43 ` Christoffer Dall
2012-09-27 14:13 ` Will Deacon
2012-09-27 14:39 ` Marc Zyngier
2012-09-27 14:45 ` [kvmarm] " Peter Maydell
2012-09-27 15:20 ` Will Deacon
2012-09-30 19:21 ` Christoffer Dall
2012-10-01 13:03 ` [kvmarm] " Marc Zyngier
2012-10-04 13:02 ` Min-gyu Kim
2012-10-04 13:35 ` Christoffer Dall
2012-10-05 6:28 ` Rusty Russell
2012-10-04 13:44 ` [kvmarm] " Avi Kivity
2012-09-15 15:35 ` [PATCH 07/15] KVM: ARM: Hypervisor inititalization Christoffer Dall
2012-09-15 15:35 ` [PATCH 08/15] KVM: ARM: Memory virtualization setup Christoffer Dall
2012-09-15 15:35 ` [PATCH 09/15] KVM: ARM: Inject IRQs and FIQs from userspace Christoffer Dall
2012-09-25 15:55 ` Will Deacon
2012-09-29 15:50 ` Christoffer Dall
2012-09-30 12:48 ` Will Deacon
2012-09-30 14:34 ` Christoffer Dall
2012-09-15 15:35 ` [PATCH 10/15] KVM: ARM: World-switch implementation Christoffer Dall
2012-09-25 17:00 ` Will Deacon
2012-09-25 17:15 ` [kvmarm] " Peter Maydell
2012-09-25 17:42 ` Marc Zyngier
2012-09-30 0:33 ` Christoffer Dall
2012-09-30 9:48 ` Peter Maydell
2012-09-30 14:31 ` Christoffer Dall
2012-09-30 17:47 ` Christoffer Dall
2012-09-15 15:35 ` [PATCH 11/15] KVM: ARM: Emulation framework and CP15 emulation Christoffer Dall
2012-09-15 15:35 ` [PATCH 12/15] KVM: ARM: User space API for getting/setting co-proc registers Christoffer Dall
2012-09-15 15:35 ` Christoffer Dall [this message]
2012-09-25 11:11 ` [PATCH 13/15] KVM: ARM: Handle guest faults in KVM Min-gyu Kim
2012-09-25 12:38 ` Christoffer Dall
2012-09-27 3:11 ` Min-gyu Kim
2012-09-27 5:35 ` Christoffer Dall
2012-09-27 15:26 ` [kvmarm] " Marc Zyngier
2012-09-27 12:39 ` Catalin Marinas
2012-09-27 17:15 ` Christoffer Dall
2012-09-27 17:21 ` Catalin Marinas
2012-09-15 15:35 ` [PATCH 14/15] KVM: ARM: Handle I/O aborts Christoffer Dall
2012-09-27 15:11 ` Will Deacon
2012-09-30 21:49 ` Christoffer Dall
2012-10-01 12:53 ` Dave Martin
2012-10-01 15:12 ` Jon Medhurst (Tixy)
2012-10-01 16:07 ` Dave Martin
2012-10-05 9:00 ` Russell King - ARM Linux
2012-10-08 10:04 ` Dave Martin
2012-10-08 21:52 ` Christoffer Dall
2012-09-15 15:36 ` [PATCH 15/15] KVM: ARM: Guest wait-for-interrupts (WFI) support Christoffer Dall
2012-09-25 17:04 ` Will Deacon
2012-09-29 23:00 ` Christoffer Dall
2012-09-18 12:21 ` [PATCH 00/15] KVM/ARM Implementation Will Deacon
2012-09-18 12:32 ` Christoffer Dall
2012-09-19 12:44 ` Avi Kivity
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20120915153552.21241.8656.stgit@ubuntu \
--to=c.dall@virtualopensystems.com \
--cc=linux-arm-kernel@lists.infradead.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).