[PATCH RFC 13/44] x86/pt-shadow: Shadow L4 tables from 64bit PV guests

From: Andrew Cooper <andrew.cooper3@citrix.com>
To: Xen-devel <xen-devel@lists.xen.org>
Cc: Andrew Cooper <andrew.cooper3@citrix.com>
Subject: [PATCH RFC 13/44] x86/pt-shadow: Shadow L4 tables from 64bit PV guests
Date: Thu, 4 Jan 2018 20:21:38 +0000	[thread overview]
Message-ID: <1515097329-31902-14-git-send-email-andrew.cooper3@citrix.com> (raw)
In-Reply-To: <1515097329-31902-1-git-send-email-andrew.cooper3@citrix.com>

See the code comments for reasoning and the algorithm description.

This is a very simplistic algorithm, which comes with a substantial
performance overhead.  The algorithm will be improved in a later patch, once
more infrastructure is in place.

Some of the code (particularly in pt_maybe_shadow()) is structured oddly.
This is deliberate to simplify the patch for the later algorithm improvement,
to avoid unnecessary code motion getting in the way of the logical change.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
v3:
 * Rebase over change to using ptsh
 * Rework to be as close to the eventual algorithm as is possible before we
   get a map_domain_page() which is usable in context switch context.
---
 xen/arch/x86/mm.c                  |   5 +-
 xen/arch/x86/mm/shadow/multi.c     |   2 +
 xen/arch/x86/pv/mm.h               |  16 +++-
 xen/arch/x86/pv/pt-shadow.c        | 164 +++++++++++++++++++++++++++++++++++++
 xen/include/asm-x86/fixmap.h       |   1 +
 xen/include/asm-x86/pv/pt-shadow.h |  24 ++++++
 6 files changed, 209 insertions(+), 3 deletions(-)

diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index f85ef6c..375565f 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -126,6 +126,7 @@
 #include <asm/hvm/grant_table.h>
 #include <asm/pv/grant_table.h>
 #include <asm/pv/mm.h>
+#include <asm/pv/pt-shadow.h>
 
 #include "pv/mm.h"
 
@@ -501,13 +502,15 @@ DEFINE_PER_CPU(unsigned long, curr_ptbase);
 
 void do_write_ptbase(struct vcpu *v, bool tlb_maintenance)
 {
-    unsigned long new_cr3 = v->arch.cr3;
+    unsigned long new_cr3;
     unsigned int cpu = smp_processor_id();
     unsigned long *this_curr_ptbase = &per_cpu(curr_ptbase, cpu);
 
     /* Check that %cr3 isn't being shuffled under our feet. */
     ASSERT(*this_curr_ptbase == read_cr3());
 
+    new_cr3 = pt_maybe_shadow(v);
+
     if ( tlb_maintenance )
         write_cr3(new_cr3);
     else
diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c
index c4e954e..9c929ed 100644
--- a/xen/arch/x86/mm/shadow/multi.c
+++ b/xen/arch/x86/mm/shadow/multi.c
@@ -39,6 +39,7 @@ asm(".file \"" __OBJECT_FILE__ "\"");
 #include <asm/hvm/cacheattr.h>
 #include <asm/mtrr.h>
 #include <asm/guest_pt.h>
+#include <asm/pv/pt-shadow.h>
 #include <public/sched.h>
 #include "private.h"
 #include "types.h"
@@ -952,6 +953,7 @@ static int shadow_set_l4e(struct domain *d,
 
     /* Write the new entry */
     shadow_write_entries(sl4e, &new_sl4e, 1, sl4mfn);
+    pt_shadow_l4_write(d, mfn_to_page(sl4mfn), pgentry_ptr_to_slot(sl4e));
     flags |= SHADOW_SET_CHANGED;
 
     if ( shadow_l4e_get_flags(old_sl4e) & _PAGE_PRESENT )
diff --git a/xen/arch/x86/pv/mm.h b/xen/arch/x86/pv/mm.h
index a10b09a..7c66ca7 100644
--- a/xen/arch/x86/pv/mm.h
+++ b/xen/arch/x86/pv/mm.h
@@ -1,6 +1,8 @@
 #ifndef __PV_MM_H__
 #define __PV_MM_H__
 
+#include <asm/pv/pt-shadow.h>
+
 l1_pgentry_t *map_guest_l1e(unsigned long linear, mfn_t *gl1mfn);
 
 int new_guest_cr3(mfn_t mfn);
@@ -38,7 +40,7 @@ static inline l1_pgentry_t guest_get_eff_l1e(unsigned long linear)
  */
 static inline bool update_intpte(intpte_t *p, intpte_t old, intpte_t new,
                                  unsigned long mfn, struct vcpu *v,
-                                 bool preserve_ad)
+                                 bool preserve_ad, unsigned int level)
 {
     bool rv = true;
 
@@ -77,6 +79,11 @@ static inline bool update_intpte(intpte_t *p, intpte_t old, intpte_t new,
             old = t;
         }
     }
+
+    if ( level == 4 )
+        pt_shadow_l4_write(v->domain, mfn_to_page(mfn),
+                           pgentry_ptr_to_slot(p));
+
     return rv;
 }
 
@@ -87,7 +94,12 @@ static inline bool update_intpte(intpte_t *p, intpte_t old, intpte_t new,
 #define UPDATE_ENTRY(_t,_p,_o,_n,_m,_v,_ad)                         \
     update_intpte(&_t ## e_get_intpte(*(_p)),                       \
                   _t ## e_get_intpte(_o), _t ## e_get_intpte(_n),   \
-                  (_m), (_v), (_ad))
+                  (_m), (_v), (_ad), _t ## _LEVEL)
+
+#define l1_LEVEL 1
+#define l2_LEVEL 2
+#define l3_LEVEL 3
+#define l4_LEVEL 4
 
 static inline l1_pgentry_t adjust_guest_l1e(l1_pgentry_t l1e,
                                             const struct domain *d)
diff --git a/xen/arch/x86/pv/pt-shadow.c b/xen/arch/x86/pv/pt-shadow.c
index 7db8efb..46a0251 100644
--- a/xen/arch/x86/pv/pt-shadow.c
+++ b/xen/arch/x86/pv/pt-shadow.c
@@ -22,8 +22,32 @@
 #include <xen/mm.h>
 #include <xen/numa.h>
 
+#include <asm/fixmap.h>
 #include <asm/pv/pt-shadow.h>
 
+/*
+ * To use percpu linear ranges, we require that no two pcpus have %cr3
+ * pointing at the same L4 pagetable at the same time.
+ *
+ * Guests however might choose to use the same L4 pagetable on multiple vcpus
+ * at once, e.g. concurrently scheduling two threads from the same process.
+ * In practice, all HVM guests and 32bit PV guests run on Xen-provided
+ * per-vcpu monitor tables, so it is only 64bit PV guests which are an issue.
+ *
+ * To resolve the issue, we shadow L4 pagetables from 64bit PV guests when
+ * they are in context.
+ *
+ * The algorithm is fairly simple.
+ *
+ *   - When a pcpu is switching to a new vcpu cr3 and shadowing is necessary,
+ *     perform a full 4K copy of the guest's frame into a percpu frame, and run
+ *     on that.
+ *   - When a write to a guest's L4 pagetable occurs, the update must be
+ *     propagated to all existing shadows.  An IPI is sent to the domain's
+ *     dirty mask indicating which frame/slot was updated, and each pcpu
+ *     checks to see whether it needs to sync the update into its shadow.
+ */
+
 struct pt_shadow {
     /*
      * A frame used to shadow a vcpus intended pagetable.  When shadowing,
@@ -31,6 +55,17 @@ struct pt_shadow {
      */
     paddr_t shadow_l4;
     l4_pgentry_t *shadow_l4_va;
+
+    /*
+     * Domain to which the shadowed state belongs, or NULL if no state is
+     * being cached.  IPIs for updates to cached information are based on the
+     * domain dirty mask, which can race with the target of the IPI switching
+     * to a different context.
+     */
+    const struct domain *domain;
+
+    /* If nonzero, a guest's pagetable which we are shadowing. */
+    paddr_t shadowing;
 };
 
 static DEFINE_PER_CPU(struct pt_shadow, ptsh);
@@ -76,6 +111,135 @@ void pt_shadow_free(unsigned int cpu)
 }
 
 /*
+ * We only need to shadow 4-level PV guests with more than one vcpu.  Other
+ * guest types have per-vcpu monitor tables which are never scheduled on
+ * concurrent pcpus.  Care needs to be taken not to shadow d0v0 during
+ * construction, as it writes its L4 directly.
+ */
+static bool pt_need_shadow(const struct domain *d)
+{
+    return (system_state >= SYS_STATE_active && is_pv_domain(d) &&
+            !is_idle_domain(d) && !is_pv_32bit_domain(d) && d->max_vcpus > 1);
+}
+
+unsigned long pt_maybe_shadow(struct vcpu *v)
+{
+    unsigned int cpu = smp_processor_id();
+    struct pt_shadow *ptsh = &per_cpu(ptsh, cpu);
+    unsigned long flags, new_cr3 = v->arch.cr3;
+
+    /*
+     * IPIs for updates are based on the domain dirty mask.  If we ever switch
+     * out of the currently shadowed context (even to idle), the cache will
+     * become stale.
+     */
+    if ( ptsh->domain &&
+         ptsh->domain != v->domain )
+    {
+        ptsh->domain = NULL;
+        ptsh->shadowing = 0;
+    }
+
+    /* No shadowing necessary? Run on the intended pagetable. */
+    if ( !pt_need_shadow(v->domain) )
+        return new_cr3;
+
+    ptsh->domain = v->domain;
+
+    /* Fastpath, if we are already shadowing the intended pagetable. */
+    if ( ptsh->shadowing == new_cr3 )
+        return ptsh->shadow_l4;
+
+    /*
+     * We may be called with interrupts disabled (e.g. context switch), or
+     * interrupts enabled (e.g. new_guest_cr3()).
+     *
+     * Reads and modifications of ptsh-> are only on the local cpu, but must
+     * be excluded against reads and modifications in _pt_shadow_ipi().
+     */
+    local_irq_save(flags);
+
+    {
+        l4_pgentry_t *l4t, *vcpu_l4t;
+
+        set_percpu_fixmap(cpu, PERCPU_FIXSLOT_SHADOW,
+                          l1e_from_paddr(new_cr3, __PAGE_HYPERVISOR_RO));
+        ptsh->shadowing = new_cr3;
+        local_irq_restore(flags);
+
+        l4t = ptsh->shadow_l4_va;
+        vcpu_l4t = percpu_fix_to_virt(cpu, PERCPU_FIXSLOT_SHADOW);
+
+        copy_page(l4t, vcpu_l4t);
+    }
+
+    return ptsh->shadow_l4;
+}
+
+struct ptsh_ipi_info
+{
+    const struct domain *d;
+    const struct page_info *pg;
+    enum {
+        PTSH_IPI_WRITE,
+    } op;
+    unsigned int slot;
+};
+
+static void _pt_shadow_ipi(void *arg)
+{
+    unsigned int cpu = smp_processor_id();
+    struct pt_shadow *ptsh = &per_cpu(ptsh, cpu);
+    const struct ptsh_ipi_info *info = arg;
+    unsigned long maddr = page_to_maddr(info->pg);
+
+    /* No longer shadowing state from this domain?  Nothing to do. */
+    if ( info->d != ptsh->domain )
+        return;
+
+    /* Not shadowing this frame?  Nothing to do. */
+    if ( ptsh->shadowing != maddr )
+        return;
+
+    switch ( info->op )
+    {
+        l4_pgentry_t *l4t, *vcpu_l4t;
+
+    case PTSH_IPI_WRITE:
+        l4t = ptsh->shadow_l4_va;
+
+        /* Reuse the mapping established in pt_maybe_shadow(). */
+        ASSERT(l1e_get_paddr(*percpu_fixmap_l1e(cpu, PERCPU_FIXSLOT_SHADOW)) ==
+               maddr);
+        vcpu_l4t = percpu_fix_to_virt(cpu, PERCPU_FIXSLOT_SHADOW);
+
+        l4t[info->slot] = vcpu_l4t[info->slot];
+        break;
+
+    default:
+        ASSERT_UNREACHABLE();
+    }
+}
+
+void pt_shadow_l4_write(const struct domain *d, const struct page_info *pg,
+                        unsigned int slot)
+{
+    struct ptsh_ipi_info info;
+
+    if ( !pt_need_shadow(d) )
+        return;
+
+    info = (struct ptsh_ipi_info){
+        .d = d,
+        .pg = pg,
+        .op = PTSH_IPI_WRITE,
+        .slot = slot,
+    };
+
+    on_selected_cpus(d->domain_dirty_cpumask, _pt_shadow_ipi, &info, 1);
+}
+
+/*
  * Local variables:
  * mode: C
  * c-file-style: "BSD"
diff --git a/xen/include/asm-x86/fixmap.h b/xen/include/asm-x86/fixmap.h
index d46939a..748219f 100644
--- a/xen/include/asm-x86/fixmap.h
+++ b/xen/include/asm-x86/fixmap.h
@@ -28,6 +28,7 @@
 #include <acpi/apei.h>
 
 #define NR_PERCPU_SLOTS 1
+#define PERCPU_FIXSLOT_SHADOW 0
 
 /*
  * Here we define all the compile-time 'special' virtual
diff --git a/xen/include/asm-x86/pv/pt-shadow.h b/xen/include/asm-x86/pv/pt-shadow.h
index ff99c85..6e71e99 100644
--- a/xen/include/asm-x86/pv/pt-shadow.h
+++ b/xen/include/asm-x86/pv/pt-shadow.h
@@ -21,6 +21,8 @@
 #ifndef __X86_PV_PT_SHADOW_H__
 #define __X86_PV_PT_SHADOW_H__
 
+#include <xen/sched.h>
+
 #ifdef CONFIG_PV
 
 /*
@@ -30,11 +32,33 @@
 int pt_shadow_alloc(unsigned int cpu);
 void pt_shadow_free(unsigned int cpu);
 
+/*
+ * Called for context switches, and when a vcpu explicitly changes cr3.  The
+ * PT shadow logic returns the cr3 hardware should run on, which is either
+ * v->arch.cr3 (no shadowing necessary), or a local frame (which is a suitable
+ * shadow of v->arch.cr3).
+ */
+unsigned long pt_maybe_shadow(struct vcpu *v);
+
+/*
+ * Called when a write occurs to an L4 pagetable.  The PT shadow logic brings
+ * any shadows of this page up-to-date.
+ */
+void pt_shadow_l4_write(
+    const struct domain *d, const struct page_info *pg, unsigned int slot);
+
 #else /* !CONFIG_PV */
 
 static inline int pt_shadow_alloc(unsigned int cpu) { return 0; }
 static inline void pt_shadow_free(unsigned int cpu) { }
 
+static inline unsigned long pt_maybe_shadow(struct vcpu *v)
+{
+    return v->arch.cr3;
+}
+static inline void pt_shadow_l4_write(
+    const struct domain *d, const struct page_info *pg, unsigned int slot) { }
+
 #endif /* CONFIG_PV */
 
 #endif /* __X86_PV_PT_SHADOW_H__ */
-- 
2.1.4


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel